Ejemplo n.º 1
0
 def expand(self, query):
     """Return ``query`` with extra terms from its LDA topics appended.

     The query is lower-cased and tokenized, then mapped to latent topics
     with ``self.lda.tokens2latent``.  Each topic contributes a number of
     terms proportional to its weight: ``round(self.k * weight)``.

     Args:
         query: The original query string.

     Returns:
         The original query, a single space, and the space-joined extra
         terms.  Terms are de-duplicated and kept in first-seen order so
         that the result is deterministic for a given model output.
     """
     tokens = tokenize(query.lower())
     latent = self.lda.tokens2latent(tokens)
     extra_terms = []
     seen = set()
     for topic in latent:
         # round() yields a float on Python 2; the term count must be an
         # integer, so convert explicitly.
         topn = int(round(self.k * topic[1]))
         if topn <= 0:
             # This topic's weight is too small to contribute any term.
             continue
         entries = self.lda.model.show_topic(topicid=topic[0], topn=topn)
         for entry in entries:
             # NOTE(review): index 1 is assumed to hold the term string;
             # the tuple order of show_topic depends on the library
             # version -- confirm.
             term = entry[1]
             if term not in seen:
                 seen.add(term)
                 extra_terms.append(term)
     new_query = query + " " + " ".join(extra_terms)
     return new_query
Ejemplo n.º 2
0
 def expand(self, query):
     """Expand ``query`` with terms drawn from the LDA topics it activates.

     The lower-cased query is tokenized and passed to
     ``self.lda.tokens2latent``; every returned topic is asked for
     ``round(self.k * weight)`` terms via ``self.lda.model.show_topic``.

     Args:
         query: The original query string.

     Returns:
         ``query`` followed by a space and the space-joined extra terms.
         Duplicate terms are dropped and the remaining ones keep the order
         in which they were first produced, making the output reproducible.
     """
     tokens = tokenize(query.lower())
     latent = self.lda.tokens2latent(tokens)
     extra_terms = []
     seen = set()
     for topic in latent:
         # On Python 2 round() returns a float, which is not a valid
         # count; cast it to int before requesting terms.
         topn = int(round(self.k * topic[1]))
         if topn <= 0:
             # Nothing to request for a topic whose scaled weight rounds
             # to zero.
             continue
         for entry in self.lda.model.show_topic(topicid=topic[0],
                                                topn=topn):
             # NOTE(review): assumes element 1 of each entry is the term
             # string (tuple order varies between library versions) --
             # confirm.
             term = entry[1]
             if term in seen:
                 continue
             seen.add(term)
             extra_terms.append(term)
     new_query = query + " " + " ".join(extra_terms)
     return new_query
Ejemplo n.º 3
0
def create_corpus(max_count=50000):
    """Build a dictionary and bag-of-words corpus from the case reports.

    Iterates over ``CaseReportLibrary()``, tokenizes the text of each case,
    and saves the resulting dictionary as ``raw.dict`` and the corpus as
    ``raw.mm`` in the ``data/corpora`` folder next to this file.

    Args:
        max_count: Maximum number of case reports to read.  Expected to be
            a positive integer; defaults to 50000.
    """
    data_folder = os.path.join(os.path.dirname(__file__), 'data', 'corpora')

    docs = []
    # Start counting at 1 so ``count`` is the number of documents collected
    # so far; this lets exactly ``max_count`` documents through.
    for count, case in enumerate(CaseReportLibrary(), 1):
        # NOTE(review): an earlier comment claimed the text is lower-cased
        # here, but no lower-casing happens in this function -- confirm
        # whether tokenize() takes care of it.
        text = case.get_text()
        docs.append(tokenize(text))
        if count % 100 == 0:
            # A single pre-formatted string prints the same under both the
            # Python 2 print statement and the Python 3 print function.
            print("%d / %d" % (count, max_count))
        if count >= max_count:
            break

    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    dictionary.save(os.path.join(data_folder, 'raw.dict'))
    corpora.MmCorpus.serialize(os.path.join(data_folder, 'raw.mm'), corpus)
Ejemplo n.º 4
0
def create_corpus(max_count=50000):
    """Create and persist the raw dictionary and corpus of case reports.

    Reads cases from ``CaseReportLibrary()``, tokenizes each case's text,
    then writes ``raw.dict`` (dictionary) and ``raw.mm`` (bag-of-words
    corpus) into the ``data/corpora`` folder beside this file.

    Args:
        max_count: Upper bound on the number of case reports to read.
            Expected to be a positive integer; defaults to 50000.
    """
    data_folder = os.path.join(os.path.dirname(__file__), 'data', 'corpora')

    docs = []
    # ``count`` is 1-based, i.e. the number of documents gathered so far,
    # so the limit check below admits exactly ``max_count`` documents.
    for count, case in enumerate(CaseReportLibrary(), 1):
        # NOTE(review): a previous comment said the text gets lower-cased
        # at this point, yet nothing here does so -- verify whether
        # tokenize() handles it.
        tokens = tokenize(case.get_text())
        docs.append(tokens)
        if count % 100 == 0:
            # Formatting into one string keeps the output identical on
            # Python 2 (print statement) and Python 3 (print function).
            print("%d / %d" % (count, max_count))
        if count >= max_count:
            break

    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    dictionary.save(os.path.join(data_folder, 'raw.dict'))
    corpora.MmCorpus.serialize(os.path.join(data_folder, 'raw.mm'), corpus)