def buildDmlCorpus(config, language):
    """Build a DmlCorpus from `config`, persist it, and return it.

    The corpus is populated from `config` (with shuffling enabled), its
    dictionary is built and pruned of too rare / too frequent words, and
    the result is stored twice: once as a binary file and once as text.

    `config` must provide `resultFile(suffix)`, which is used to obtain the
    output path for the binary dump.

    `language` is accepted but not used anywhere in this function body.

    Returns the constructed corpus object.
    """
    corpus = DmlCorpus()
    corpus.processConfig(config, shuffle=True)
    corpus.buildDictionary()

    # Ignore too (in)frequent words.
    # NOTE(review): presumably noBelow is a minimum document count and
    # noAbove a maximum fraction of documents -- confirm against
    # the dictionary's filterExtremes implementation.
    corpus.dictionary.filterExtremes(noBelow=5, noAbove=0.3)

    # Save the mappings as binary data (actual documents are not saved,
    # only their uris).
    picklePath = config.resultFile('.pkl')
    corpus.save(picklePath)

    # Save id mappings and documents as text data (matrix market format).
    corpus.saveAsText()

    return corpus