Example #1
0
def buildDmlCorpus(config, language):
    """Build a DmlCorpus from `config`, persist it, and return it.

    Steps, in order: process the configuration (with shuffling enabled),
    build the dictionary, prune the vocabulary, then write the corpus out
    in both its binary and its text form.

    Args:
        config: configuration object; it is handed to
            `DmlCorpus.processConfig` and must provide `resultFile(suffix)`,
            which supplies the target of the binary save.
        language: not referenced anywhere in this function body; kept so
            that existing callers keep working.

    Returns:
        The populated `DmlCorpus` instance.
    """
    corpus = DmlCorpus()
    corpus.processConfig(config, shuffle=True)
    corpus.buildDictionary()

    # Prune words that are too rare or too common.
    # NOTE(review): presumably noBelow is a minimum document count and
    # noAbove a maximum document fraction -- confirm against the dictionary
    # implementation.
    vocabulary = corpus.dictionary
    vocabulary.filterExtremes(noBelow=5, noAbove=0.3)

    # Binary dump of the mappings; the documents themselves are not stored,
    # only their URIs.
    corpus.save(config.resultFile('.pkl'))

    # Text dump of the id mappings and the documents (matrix market format).
    corpus.saveAsText()

    return corpus