import os.path
import re

from gensim.corpora import sources, dmlcorpus


PREFIX = 'dmlcz'

# Selects which of the two hard-coded path sets below is used:
# True -> the /Users/kofola/... locations, False -> the /data/dmlcz/... ones.
AT_HOME = False

if AT_HOME:
    SOURCE_LIST = [
        sources.DmlCzSource('dmlcz', '/Users/kofola/workspace/dml/data/dmlcz/'),
        sources.DmlSource('numdam', '/Users/kofola/workspace/dml/data/numdam/'),
        sources.ArxmlivSource('arxmliv', '/Users/kofola/workspace/dml/data/arxmliv/'),
    ]

    # Alternative single-directory source list, kept disabled.
    # NOTE(review): the text this was restored from also carried a variant in
    # which this list was the active one and the three-entry list above was
    # commented out -- confirm which of the two is intended for AT_HOME.
    # SOURCE_LIST = [
    #     sources.DmlCzSource('dmlcz', '/Users/kofola/workspace/dml/data/dmlcz/CzechMathJ'),
    # ]

    RESULT_DIR = '/Users/kofola/workspace/dml/data/results'
else:
    SOURCE_LIST = [
        sources.DmlCzSource('dmlcz', '/data/dmlcz/data/share'),
        sources.DmlSource('numdam', '/data/dmlcz/data/numdam'),
        sources.ArxmlivSource('arxmliv', '/data/dmlcz/data/arxmliv'),
    ]

    RESULT_DIR = '/data/dmlcz/xrehurek/results'


def buildDmlCorpus(config):
    """Build a ``DmlCorpus`` from ``config`` and save it.

    Steps, in order: create an empty ``dmlcorpus.DmlCorpus``, feed it
    ``config`` via ``processConfig`` (with ``shuffle=True``), build its
    dictionary, filter the dictionary with ``noBelow=5`` / ``noAbove=0.3``,
    and save the corpus object to ``config.resultFile('.pkl')``.

    Args:
        config: configuration object; it is passed to
            ``DmlCorpus.processConfig`` and must provide a
            ``resultFile(suffix)`` method whose result is handed to
            ``DmlCorpus.save``.

    Returns:
        None -- the code shown here saves the corpus but has no ``return``.
    """
    dml = dmlcorpus.DmlCorpus()
    dml.processConfig(config, shuffle=True)
    dml.buildDictionary()

    # ignore too (in)frequent words
    # (presumably noBelow is a minimum document count and noAbove a maximum
    # document fraction -- confirm against the gensim version in use)
    dml.dictionary.filterExtremes(noBelow=5, noAbove=0.3)

    # save the mappings as binary data (actual documents are not saved,
    # only their uris)
    dml.save(config.resultFile('.pkl'))