Exemple #1
0
import sys
import os.path
import re


from gensim.corpora import sources, dmlcorpus


# NOTE(review): PREFIX is not used anywhere in the visible part of this file;
# presumably it names the generated result files -- confirm against its users.
PREFIX = 'dmlcz'

# Selects which set of hard-coded paths is used below: True picks the
# locations under /Users/kofola/workspace, False picks those under /data/dmlcz.
AT_HOME = False

if AT_HOME:
    # Input collections to process: each entry pairs a source name with the
    # directory that holds its data.
    SOURCE_LIST = [
                   sources.DmlCzSource('dmlcz', '/Users/kofola/workspace/dml/data/dmlcz/'),
                   sources.DmlSource('numdam', '/Users/kofola/workspace/dml/data/numdam/'),
                   sources.ArxmlivSource('arxmliv', '/Users/kofola/workspace/dml/data/arxmliv/'),
                   ]

# Disabled alternative: a single DmlCzSource restricted to the CzechMathJ
# subdirectory instead of the three sources above.
#    SOURCE_LIST = [
#                   sources.DmlCzSource('dmlcz', '/Users/kofola/workspace/dml/data/dmlcz/CzechMathJ'),
#                   ]

    # Results directory paired with the /Users/kofola path set.
    RESULT_DIR = '/Users/kofola/workspace/dml/data/results'

else:

    SOURCE_LIST = [
                   sources.DmlCzSource('dmlcz', '/data/dmlcz/data/share'),
                   sources.DmlSource('numdam', '/data/dmlcz/data/numdam'),
                   sources.ArxmlivSource('arxmliv', '/data/dmlcz/data/arxmliv'),
Exemple #2
0
    #    SOURCE_LIST = [
    #                   sources.DmlCzSource('dmlcz', '/Users/kofola/workspace/dml/data/dmlcz/'),
    #                   sources.DmlSource('numdam', '/Users/kofola/workspace/dml/data/numdam/'),
    #                   sources.ArxmlivSource('arxmliv', '/Users/kofola/workspace/dml/data/arxmliv/'),
    #                   ]

    SOURCE_LIST = [
        sources.DmlCzSource(
            'dmlcz', '/Users/kofola/workspace/dml/data/dmlcz/CzechMathJ'),
    ]

    RESULT_DIR = '/Users/kofola/workspace/dml/data/results'
else:
    SOURCE_LIST = [
        sources.DmlCzSource('dmlcz', '/data/dmlcz/data/share'),
        sources.DmlSource('numdam', '/data/dmlcz/data/numdam'),
        sources.ArxmlivSource('arxmliv', '/data/dmlcz/data/arxmliv'),
    ]

    RESULT_DIR = '/data/dmlcz/xrehurek/results'


def buildDmlCorpus(config):
    dml = dmlcorpus.DmlCorpus()
    dml.processConfig(config, shuffle=True)
    dml.buildDictionary()
    dml.dictionary.filterExtremes(noBelow=5,
                                  noAbove=0.3)  # ignore too (in)frequent words

    dml.save(
        config.resultFile('.pkl')