Code example #1
def croelectModelsContext():
    """Build a Context holding the four wrapped croelect topic models."""
    context = Context('croelect_models')
    location = FolderLocation(croelect_models_folder)
    modelNames = ['model1', 'model2', 'model3', 'model4']
    for name in modelNames:
        # Model id is prefixed with 'croelect_'; folder resolved via location.
        wrapped = createWrappedCroelectModel('croelect_%s' % name, location(name))
        context.add(wrapped)
    return context
Code example #2
def croelectMiscResourceContext():
    """Build a Context with miscellaneous croelect resources:
    the croatian wiki palmetto lucene index and a stopword-removing tokenizer."""
    from doc_topic_coh.settings import cro_wiki_lucene
    from doc_topic_coh.resources.croelect_resources.preprocess import CroelectSwRemover
    from pytopia.nlp.text2tokens.gtar.text2tokens import alphanumStopwordsTokenizer
    context = Context('croelect_resources')
    context['crowiki_palmetto_index'] = cro_wiki_lucene
    tokenizer = alphanumStopwordsTokenizer(CroelectSwRemover())
    tokenizer.id = 'croelect_alphanum_stopword_tokenizer'
    context.add(tokenizer)
    return context
Code example #3
def getGtarCorpusContext():
    '''
    Create context with all corpora produced by GtarCorpusBuilder.

    Every attribute of the builder whose name starts with 'corpus' is
    treated as a zero-argument factory method; each is invoked and the
    resulting corpus is added to the context.
    '''
    ctx = Context('gtar_corpus_context')
    builder = GtarCorpusBuilder('rsssucker_topus1_13042015', resfolder,
                                '2015-01-26 00:00:00')
    for name in dir(builder):
        # fix: dropped the redundant explicit start index 0 from
        # startswith('corpus', 0) -- 0 is already the default.
        if name.startswith('corpus'):
            method = getattr(builder, name)
            corpus = method()
            ctx.add(corpus)
    return ctx
Code example #4
def getCroelectCorpusContext():
    """Build a Context holding the filtered croelect news corpus
    ('iter0_cronews_final'), assembled from a date-bounded feedset corpus."""
    context = Context('croelect_corpus_context')
    database = 'rsssucker_croelect_14012015_iter0filled'
    feeds = Feedlist(path.join(resfolder, 'iter0_cronews_feeds.txt'))
    rawCorpus = FeedsetCorpus(database,
                              feeds,
                              startDate='2015-09-30 00:00:00',
                              endDate='2015-12-28 23:59:59')
    # NOTE(review): 40 presumably is a minimum token count per text -- confirm.
    sizeFilter = TextsizeFilter(40, alphanumTokenizer())
    duplicateFilter = DuplicateTextFilter(FeedsuckerCorpus(database))
    filtered = FilteredCorpus('iter0_cronews_final', rawCorpus,
                              [sizeFilter, duplicateFilter])
    context.add(filtered)
    return context
Code example #5
def gtarText2TokensContext():
    """Build a Context with the gtar text-to-tokens resources:
    the rsssucker tokenizer, a plain alphanumeric tokenizer, and an
    alphanumeric tokenizer with gtar stopword removal."""
    from pytopia.nlp.text2tokens.gtar.text2tokens import RsssuckerTxt2Tokens
    from pytopia.nlp.text2tokens.regexp import alphanumTokenizer
    from pytopia.nlp.text2tokens.gtar.text2tokens import alphanumStopwordsTokenizer
    from pytopia.nlp.text2tokens.gtar.stopwords import RsssuckerSwRemover
    context = Context('gtar_text2tokens_context')
    stopwordTokenizer = alphanumStopwordsTokenizer(RsssuckerSwRemover())
    stopwordTokenizer.id = 'alphanum_gtar_stopword_tokenizer'
    for tokenizer in (RsssuckerTxt2Tokens(), alphanumTokenizer(), stopwordTokenizer):
        context.add(tokenizer)
    return context
Code example #6
def basicBuildersContext(cacheFolder):
    '''
    Create context with cached basic resource builders.
    Cache folders of the builders are subfolders of the cacheFolder
     named after the resource type.
    '''
    ctx = Context('pytopia_basic_builders_context')
    cf = FolderLocation(cacheFolder)
    # (builder, cache subfolder name, builder id)
    # NOTE(review): CorpusTextVectorsBuilder and InverseTokenizerBuilder are
    # registered as classes while the rest are instances -- preserved as-is,
    # confirm whether this asymmetry is intentional.
    builderSpecs = [
        (CorpusIndexBuilder(), 'corpus_index', 'corpus_index_builder'),
        (WordDocIndexBuilder(), 'worddoc_index', 'worddoc_index_builder'),
        (BowCorpusBuilder(), 'bow_corpus', 'bow_corpus_builder'),
        (CorpusTfidfBuilder(), 'corpus_tfidf_index', 'corpus_tfidf_builder'),
        (CorpusTextVectorsBuilder, 'corpus_text_vectors',
         'corpus_text_vectors_builder'),
        (CorpusTopicIndexBuilder(), 'corpus_topic_index',
         'corpus_topic_index_builder'),
        (InverseTokenizerBuilder, 'inverse_tokenization',
         'inverse_tokenizer_builder'),
    ]
    for builder, subfolder, builderId in builderSpecs:
        ctx.add(cachedResourceBuilder(builder, cf(subfolder), id=builderId))
    return ctx
Code example #7
def palmettoContext():
    """Build a Context exposing the english wikipedia lucene index
    under the 'wiki_docs' key (used for palmetto coherence)."""
    from doc_topic_coh.settings import english_wiki_lucene
    context = Context('palmetto_context')
    context['wiki_docs'] = english_wiki_lucene
    return context
Code example #8
def __pycoverexpContext():
    """Build the top-level doc-topic-coherence Context by merging the gtar
    sub-contexts, registering resource builders, and merging the croelect
    sub-contexts. Merge/add order matches the original registration order."""
    ctx = Context('doc_topic_coherence_context')
    # gtar resources
    gtarContexts = (getGtarCorpusContext, gtarDictionaryContext,
                    gtarModelsContext, gtarText2TokensContext,
                    palmettoContext)
    for makeContext in gtarContexts:
        ctx.merge(makeContext())
    builderFactories = (corpusIndexBuilder, corpusTfidfBuilder,
                        corpusTopicIndexBuilder, wordDocIndexBuilder,
                        bowCorpusBuilder, word2vecBuilder, gloveVecBuilder,
                        inverseTokenizerBuilder, corpusTextVectorsBuilder)
    for makeBuilder in builderFactories:
        ctx.add(makeBuilder())
    # croelect resources
    croelectContexts = (getCroelectCorpusContext, croelectModelsContext,
                        croelectMiscResourceContext)
    for makeContext in croelectContexts:
        ctx.merge(makeContext())
    for makeResource in (croelectDictionary, croelectText2Tokens):
        ctx.add(makeResource())
    return ctx
Code example #9
def basicTokenizersContext():
    """Build a Context holding the three basic tokenizers:
    whitespace, alphanumeric and word."""
    context = Context('basic_tokenizers')
    for tokenizer in (whitespaceTokenizer(), alphanumTokenizer(), wordTokenizer()):
        context.add(tokenizer)
    return context
Code example #10
def gtarModelsContext():
    '''
    Create context with all wrapped gensim uspol models.

    Model folders are taken from the modelId2Folder mapping and resolved
    relative to uspol_models_folder.
    '''
    ctx = Context('gtar_models_context')
    # fix: items() instead of the python-2-only iteritems(), so the
    # function also runs under python 3; behavior is otherwise identical.
    for mid, mfolder in modelId2Folder.items():
        f = path.join(uspol_models_folder, mfolder)
        ctx.add(createWrappedGensimModel(mid, f))
    return ctx