def make_lda(stem):
    ps = PorterStemmer()
    if stem:
        lda = ldamodel.LdaModel.load(get_file_base() + 'lda_data/lda_model_stemmed')
        dictionary = Dictionary.load_from_text(get_file_base() + 'lda_data/dict_stemmed')
        sem_raw, sem_in, sem_out = read_in(get_seminal_s(), 'seminal')
        sur_raw, sur_in, sur_out = read_in(get_survey_s(), 'survey')
        uni_raw, uni_in, uni_out = read_in(get_uninfluential_s(), 'uninfluential')
    else:
        lda = ldamodel.LdaModel.load(get_file_base() + 'lda_data/lda_model_unstemmed')
        dictionary = Dictionary.load_from_text(get_file_base() + 'lda_data/dict_unstemmed')
        sem_raw, sem_in, sem_out = read_in(get_seminal_u(), 'seminal')
        sur_raw, sur_in, sur_out = read_in(get_survey_u(), 'survey')
        uni_raw, uni_in, uni_out = read_in(get_uninfluential_u(), 'uninfluential')

    # write lda information to file
    if stem:
        write_to_file(
            get_file_base() + 'lda_data/sem_lda_stemmed.json',
            get_file_base() + 'lda_data/sem_lda_stemmed_one_doc_rep.json',
            sem_raw, sem_in, sem_out, 'seminal', '0', lda, dictionary)
        write_to_file(
            get_file_base() + 'lda_data/sur_lda_stemmed.json',
            get_file_base() + 'lda_data/sur_lda_stemmed_one_doc_rep.json',
            sur_raw, sur_in, sur_out, 'survey', '1', lda, dictionary)
        write_to_file(
            get_file_base() + 'lda_data/uni_lda_stemmed.json',
            get_file_base() + 'lda_data/uni_lda_stemmed_one_doc_rep.json',
            uni_raw, uni_in, uni_out, 'uninfluential', '2', lda, dictionary)
    else:
        write_to_file(
            get_file_base() + 'lda_data/sem_lda_unstemmed.json',
            get_file_base() + 'lda_data/sem_lda_unstemmed_one_doc_rep.json',
            sem_raw, sem_in, sem_out, 'seminal', '0', lda, dictionary)
        write_to_file(
            get_file_base() + 'lda_data/sur_lda_unstemmed.json',
            get_file_base() + 'lda_data/sur_lda_unstemmed_one_doc_rep.json',
            sur_raw, sur_in, sur_out, 'survey', '1', lda, dictionary)
        write_to_file(
            get_file_base() + 'lda_data/uni_lda_unstemmed.json',
            get_file_base() + 'lda_data/uni_lda_unstemmed_one_doc_rep.json',
            uni_raw, uni_in, uni_out, 'uninfluential', '2', lda, dictionary)
def user_lda(lda, dictionary_path, text_yielder):
    # note: the parameter name must match the call below (was `textyielder`)
    id2word = Dictionary.load_from_text(dictionary_path)
    ret = {}
    for user, text in text_yielder():
        bow = id2word.doc2bow(UserCorpus.text2tokens(text))
        ret[user] = lda[bow]
    return ret
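# Hedged usage sketch for user_lda: the generator below and the 'dict.txt'
# path are illustrative assumptions; a trained `lda` and the project's
# UserCorpus.text2tokens are taken as given.
def yield_user_texts():  # hypothetical stand-in for the caller's iterator
    yield 'alice', 'topic models are fun'
    yield 'bob', 'gensim builds dictionaries'

per_user_topics = user_lda(lda, 'dict.txt', yield_user_texts)
# per_user_topics maps each user to a sparse [(topic_id, weight), ...] list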
def __post_init__(self) -> None:
    pairs: pd.DataFrame = self.load_dataset_file(self.dataset_file)
    dct = Dictionary.load_from_text("data/processed/dictionary.txt")
    self.phrase_a = self.preprocess_phrase(pairs["phrase_a"], dct)
    self.phrase_b = self.preprocess_phrase(pairs["phrase_b"], dct)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--use_domain", action="store_true")
    parser.add_argument("--update", action="store_true")
    args = parser.parse_args()
    common_dict = Dictionary.load_from_text("./common_dict.txt")
    f = open("url2bow_map.csv", "a")
    for i, url in enumerate(sys.stdin):
        print("url " + str(i))
        text = fetch_contents_from_url(url.strip(), use_domain=args.use_domain)
        if not text:
            continue
        word_list = doc2word_list(text)
        bow = common_dict.doc2bow(word_list)
        if bow:
            print(bow)
            for b in bow:
                f.write(url.strip() + "," + str(b[0]) + "," + str(b[1]) + "\n")
        if i % 100 == 99:
            # flush to disk periodically by closing and reopening the file
            f.close()
            f = open("url2bow_map.csv", "a")
    f.close()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-dir', default='./data/test_arxiv_plain.txt',
                        help='Path to directory where the data is stored')
    parser.add_argument('--model-dir', default='../model',
                        help='Path to directory where the model is stored')
    parser.add_argument('--train', default=True,
                        help='True for train, False for test mode')
    parser.add_argument('--n_topic', type=int, default=20,
                        help='Number of topics')
    args = parser.parse_args()

    model_dir = './model/model'
    dict_dir = './model/dict.txt'
    if args.train == True:
        print('Reading texts')
        with open(args.data_dir) as f_in:
            texts = f_in.read().split('\n')
        del texts[-1]
        for i in tqdm(range(len(texts))):
            texts[i] = texts[i].split()
        print('Generating corpora')
        dictionary = Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        dictionary.save_as_text(dict_dir)
        print('Loading model')
        lda = LdaModel(corpus, num_topics=args.n_topic)
        lda.save(model_dir)
    else:
        lda = LdaModel.load(model_dir, mmap='r')
        # load_from_text is a static method that returns a new Dictionary;
        # calling it on an empty instance would discard the loaded mapping
        dictionary = Dictionary.load_from_text(dict_dir)

    print('Processing results')
    topics = lda.print_topics()
    with open('./report.txt', 'w') as f_out:
        for topic_id, topic_pair in topics:
            print(topic_id, end=': ', file=f_out)
            topic_words = topic_pair.split('"')[1::2]
            topic_words = list(map(int, topic_words))
            topic_words = [dictionary.get(word) for word in topic_words]
            print(topic_words, file=f_out)
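# Minimal self-contained sketch of the save_as_text / load_from_text
# round-trip used above ('demo_dict.txt' is a placeholder path). Note that
# load_from_text is a static method returning a brand-new Dictionary.
from gensim.corpora import Dictionary

texts = [['human', 'computer', 'interaction'], ['graph', 'trees', 'computer']]
demo_dict = Dictionary(texts)
demo_dict.save_as_text('demo_dict.txt')  # one "id<TAB>token<TAB>docfreq" per line

loaded = Dictionary.load_from_text('demo_dict.txt')
assert loaded.token2id == demo_dict.token2id
print(loaded.doc2bow(['computer', 'graph', 'computer']))  # sparse (id, count) pairs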
def SNAP_id2word(self):
    path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        'snap_data',
        'gensim_snap_dict.txt'
    )
    # self.myLoadFromText(path)
    ret = Dictionary.load_from_text(path)
    return ret
def load(self):
    """Load the corpora created by `make_corpus.py`."""
    self.corpus = MmCorpus(self.corpus_file)
    self.dictionary = Dictionary.load_from_text(self.dict_file)
    self.titles = load_titles(self.title_file)
    self.tfidf_model = TfidfModel.load(self.tfidf_model_file)
    self.index = MatrixSimilarity(self.tfidf_model[self.corpus])
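# Hedged follow-up sketch: `searcher` is an assumed instance of the class
# above with its *_file attributes set; the rest is plain gensim API.
searcher.load()
query_bow = searcher.dictionary.doc2bow('some query text'.lower().split())
sims = searcher.index[searcher.tfidf_model[query_bow]]
for doc_id in sims.argsort()[::-1][:5]:  # five most similar documents
    print(searcher.titles[doc_id], sims[doc_id])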
def saveLDACorpus(train_data_path, test_data_path, model_file, dictionary_file, corpus_file):
    """Attach LDA features to the corpus and save train/test splits."""
    lda = LdaModel.load(model_file)
    dictionary = Dictionary.load_from_text(dictionary_file)
    dictionary.id2token = utils.revdict(dictionary.token2id)
    src_df = pd.read_csv(corpus_file)
    src_df = parallelize(src_df, data_fram_proc1, dictionary, lda)  # add LDA features
    train_data, test_data = train_test_split(
        src_df[['label', 'multiLabels', 'item']], test_size=0.2, random_state=42)
    train_data.to_csv(train_data_path, index=None)  # , header=None
    test_data.to_csv(test_data_path, index=None)  # , header=None
def __init__(self, data_path: str, prefix: str = None, iterator: str = 'token',
             parsing: str = 'simple', word_up_limit: float = 0.75,
             word_low_limit: int = 20, dictionary: str = None,
             shuffle: bool = False, seed: int = 42,
             document_minimum_length: int = 5, stopwords: str = None):
    iter_map = dict(token=self.tokenize, bow=self.bowize, sentences=self.sentences)
    self.iterator = iter_map[iterator]
    self.word_low_limit = word_low_limit
    self.word_up_limit = word_up_limit
    if stopwords:
        self.stopwords = [w.strip() for w in open(stopwords).readlines()]
    else:
        self.stopwords = []
    if not dictionary:
        self.dictionary = Dictionary()
    else:
        self.dictionary = Dictionary.load_from_text(dictionary)
        if self.stopwords:
            self.dictionary.filter_tokens(
                bad_ids=self.dictionary.doc2idx(self.stopwords))
        self.is_built = True
    self.shuffle = shuffle
    if self.shuffle:
        np.random.seed(seed)
    self.document_minimum_length = document_minimum_length
    corpus = self.init_corpus(data_path, prefix, parsing)
    super(Corpora, self).__init__(corpus=corpus)
def main():
    # load the cached dictionary if present, otherwise build and save it
    try:
        dictionary = Dictionary.load_from_text("dictionary.txt")
    except Exception:
        dictionary = Dictionary(rcv1_train)
        dictionary.filter_extremes()
        dictionary.save_as_text("dictionary.txt")

    class RCV1BowCorpus(object):
        def __iter__(self):
            for document in rcv1_train:
                yield dictionary.doc2bow(document)

    ln.debug("Training model on %s documents" % len(rcv1_train))
    try:
        vector_model = LsiModel.load("lsi_model")
    except Exception:
        vector_model = LsiModel(corpus=RCV1BowCorpus(), num_topics=100,
                                id2word=dictionary)
        vector_model.save("lsi_model")

    def get_lsi_features(text):
        """ Must return either numpy array or dictionary """
        res = vector_model[dictionary.doc2bow(text)]
        return dict(res)

    def get_bow_features(text):
        return dict(dictionary.doc2bow(text))

    clf = train_classifier(train_samples=rcv1_train, train_targets=rcv1_train_target,
                           get_features=get_lsi_features, classifier="sgd")
    evaluate_classifier(clf, rcv1_test, rcv1_test_target,
                        get_features=get_lsi_features)
def create_tfidf_corpus(corpus_file, dict_file, outputs_dir):
    # Load back the id->word mapping directly from file.
    # This seems to save more memory, compared to keeping the
    # wiki.dictionary object from above.
    dictionary = Dictionary.load_from_text(dict_file)
    # initialize corpus reader and word->id mapping
    mm = MmCorpus(corpus_file)

    tfidf_model_file = os.path.join(outputs_dir, "wikipedia.tfidf_model")
    tfidf_corpus_file = os.path.join(outputs_dir, "wikipedia_tfidf.mm")

    # build TF-IDF, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(tfidf_model_file)

    # save tfidf vectors in matrix market format
    # ~4h; result file is 15GB! bzip2'ed down to 4.5GB
    MmCorpus.serialize(tfidf_corpus_file, tfidf[mm], progress_cnt=10000)
    return tfidf_model_file, tfidf_corpus_file
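# Hypothetical follow-up: loading the artifacts back for downstream use
# (input paths here are placeholders for the caller's actual files).
model_file, corpus_file = create_tfidf_corpus('wiki_bow.mm', 'wiki_wordids.txt', '.')
tfidf = TfidfModel.load(model_file)
tfidf_corpus = MmCorpus(corpus_file)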
def main():
    # path = os.path.join("../../outputs", "gpt2_generated.csv")
    path = os.path.join("../../outputs", "gpt2_with_prompt.csv")
    length = 0
    # dct = Dictionary(common_texts)
    # model_path = os.path.join(os.getcwd(), "..", "lda_model", "model")
    # lda = models.ldamodel.LdaModel.load(model_path)
    lda = models.ldamodel.LdaModel.load(os.path.join("../lda_model", "model"))
    dct = Dictionary.load_from_text(os.path.join("../lda_model", "dictionary"))
    jsd_sum = 0.0
    with open(path, 'r') as file:
        csv_file = csv.DictReader(file)
        for row in csv_file:
            row = dict(row)
            text = row['generated']
            target_text = row['reference']

            text_vector = np.zeros(50)
            # text_tokenized = text.split()[:512]
            text_tokenized = text.split()[:80]
            text_processed = dct.doc2bow(text_tokenized)
            for elem in lda[text_processed]:
                text_vector[elem[0]] += elem[1]

            target_vector = np.zeros(50)
            # target_tokenized = target_text.split()[:512]
            target_tokenized = target_text.split()[:80]
            target_processed = dct.doc2bow(target_tokenized)
            for elem in lda[target_processed]:
                target_vector[elem[0]] += elem[1]

            length += 1
            jsd_sum += distance.jensenshannon(text_vector, target_vector)
    print(jsd_sum / length)
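# Sketch of an equivalent densification using gensim's own helper:
# matutils.sparse2full turns a sparse [(topic_id, weight), ...] list into a
# fixed-length dense vector, replacing the manual np.zeros loops above.
# `lda` and `dct` are assumed loaded as in main().
from gensim import matutils
bow = dct.doc2bow('a generated sample text'.split())
dense_vector = matutils.sparse2full(lda[bow], 50)  # length = number of topics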
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--use_domain", action="store_true")
    parser.add_argument("--update", action="store_true")
    parser.add_argument("--save_interval", type=int, default=100)
    args = parser.parse_args()
    if args.update:
        common_dict = Dictionary.load_from_text("./common_dict.txt")
    else:
        common_dict = Dictionary()
    for i, url in enumerate(sys.stdin):
        print("url " + str(i))
        text = fetch_contents_from_url(url.strip(), use_domain=args.use_domain)
        if not text:
            continue
        word_list = doc2word_list(text)
        common_dict.add_documents([word_list])
        if i % args.save_interval == args.save_interval - 1:
            common_dict.save_as_text("./common_dict.txt")
    common_dict.save_as_text("./common_dict.txt")
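# Pipeline sketch (script names are placeholders): this builder is the
# companion of the url2bow main() earlier -- build or update common_dict.txt
# first, then map URLs to bag-of-words rows against the frozen dictionary:
#   cat urls.txt | python build_common_dict.py --save_interval 100
#   cat urls.txt | python url2bow.py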
def main(model, weights, data_path, data_prefix, result_path, dictionary_path,
         size, window, min_count, batch_size, epochs, normalize):
    dictionary = None
    if data_path:
        corpora = initialize_corpora(data_path, data_prefix, dictionary_path, 'token')
        dictionary = corpora.dictionary
    elif not data_path and dictionary_path:
        dictionary = Dictionary.load_from_text(dictionary_path)

    MAP = dict(word2vec=(Word2VecWrapper,
                         dict(weights=weights, size=size, window=window,
                              min_count=min_count, normalize=normalize,
                              dictionary=dictionary, batch_size=batch_size)),
               )
    model_class, params = MAP[model]
    model = model_class(**params)

    if weights and not data_path:
        # Not training
        vector_dict = model.vectors
    elif data_path and not weights:
        model.fit(corpora, epochs=epochs)
        vector_dict = model.transform(corpora)
    else:
        raise ValueError('Need to define either data_path or weights.')

    vectors_path = os.path.join(result_path, str(model) + '.csv')
    pd.DataFrame(vector_dict).to_csv(vectors_path, index=False)
    print(f'Vectors stored to path: {vectors_path}')
# optional argv[3] = keep_words
if len(sys.argv) < 3:
    print globals()['__doc__'] % locals()
    sys.exit(1)  # without this, the unpacking below fails on too few arguments
input, output = sys.argv[1:3]
if len(sys.argv) > 3:
    keep_words = int(sys.argv[3])
else:
    keep_words = DEFAULT_DICT_SIZE

# build dictionary. only keep 100k most frequent words (out of total ~900k unique tokens)
enron = EnronCorpus(input, keep_words=keep_words)

# save dictionary and bag-of-words (term-document frequency matrix)
enron.dictionary.save_as_text(output + '_wordids.txt')
MmCorpus.serialize(output + '_bow.mm', enron, progress_cnt=10000)
del enron

# initialize corpus reader and word->id mapping
id2token = Dictionary.load_from_text(output + '_wordids.txt')
mm = MmCorpus(output + '_bow.mm')

# build tfidf
from gensim.models import TfidfModel
tfidf = TfidfModel(mm, id2word=id2token, normalize=True)

# save tfidf vectors in matrix market format
MmCorpus.serialize(output + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

logger.info("finished running %s" % program)
from news.document import *

if len(sys.argv) != 2:
    print 'Usage: {0} rcv1_data_dir'.format(sys.argv[0])
    raise SystemExit(1)

data_dir = sys.argv[1]
mapping_file = data_dir + '/token_id_idf'
dictionary_file = data_dir + '/id_token_df'
token_file = data_dir + '/tokens'
lda_file = data_dir + '/lda_model'

print 'creating dictionary...'
N = 23307  # supplied idfs from rcv1/lyrl2004 were based on 23307 training docs
create_dictionary_file(mapping_file, dictionary_file, N)
dictionary = Dictionary.load_from_text(dictionary_file)

print 'creating corpus...'
corpus = SimpleLowCorpus(token_file, dictionary)

print 'training model...'
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
lda = LdaModel(corpus, id2word=dictionary, num_topics=200)
print 'done!'

print '\n' * 3
print '======final topics======'
topics = lda.show_topics(topics=-1, topn=4)
for i, topic in enumerate(topics):
    print i, topic

print 'saving model...'
# pre- and post-correction spacing
def spacer(text):
    spacer = ChatSpace()
    result = spacer.space(text)
    return result

# tokenize on spacing units
def tokenizer(text):
    return list(text.split(" "))

# load the dictionary
fn = pathlib.Path(__file__).parent / 'dictionary.txt'
loaded_dct = Dictionary.load_from_text(fn)

# return the indices and words of misspelled entries
def check_error(word_list):
    wrong_ids = []
    wrong_words = []
    pattern = re.compile('[ㄱ-ㅣa-zA-Z0-9]+')
    for i in range(len(word_list)):
        word = word_list[i]
        matched = pattern.search(word)
        hangul = re.compile('[^가-힣]+')
        word = hangul.sub('', word)  # strip special characters and emoji
        # skip words containing Hangul jamo, English letters, or digits
input, output = sys.argv[1:3]
if len(sys.argv) > 3:
    keep_words = int(sys.argv[3])
else:
    keep_words = DEFAULT_DICT_SIZE

# build dictionary. only keep 100k most frequent words (out of total ~8.2m unique tokens)
# takes about 9h on a macbook pro, for 3.5m articles (june 2011 wiki dump)
wiki = WikiCorpus(input, keep_words=keep_words)

# save dictionary and bag-of-words (term-document frequency matrix)
# another ~9h
wiki.dictionary.save_as_text(output + "_wordids.txt")
MmCorpus.serialize(output + "_bow.mm", wiki, progress_cnt=10000)
del wiki

# initialize corpus reader and word->id mapping
id2token = Dictionary.load_from_text(output + "_wordids.txt")
mm = MmCorpus(output + "_bow.mm")

# build tfidf, ~30min
from gensim.models import TfidfModel
tfidf = TfidfModel(mm, id2word=id2token, normalize=True)

# save tfidf vectors in matrix market format
# ~2h; result file is 15GB! bzip2'ed down to 4.5GB
MmCorpus.serialize(output + "_tfidf.mm", tfidf[mm], progress_cnt=10000)

logger.info("finished running %s" % program)
fname = sys.argv[1]
prefix = fname.split('/')[0]
if len(sys.argv) > 2 and sys.argv[2][0:2] != '--':
    suffix = sys.argv[2]
lemmatizer, filter_words = parse_args(sys.argv)
if lemmatizer is None:
    LEMMATIZE = False
    suffix = '_tokenized_tfidf'
else:
    suffix = '_lemmatized_tfidf'

lda = None
with open(prefix + suffix + '.ldamodel') as f:
    lda = cPickle.load(f)
id2token = Dictionary.load_from_text(prefix + suffix + '_wordids.txt')

if DEBUG:
    print "prefix:", prefix
    print "suffix:", suffix
    print "using dict:", prefix + suffix + '_wordids.txt'
    print id2token

docs = []
with open(fname) as f:
    print("splitting %s" % fname)
    tmp = []
    for line in f:
        # bufferize into docs list
        if line[0] == '@':
            docs.append(tmp)
            tmp = [line]
def __init__(self):
    self.cc_dict = Dictionary.load_from_text(LDA_DICT_PATH)
    self.tfidf = tfidfmodel.TfidfModel.load(LDA_TFIDF_PATH)
    self.lda = LdaMulticore.load(LDA_MODEL_PATH)
try:
    quote_identifier = config.get('quote', 'quote').lower()
    text_identifier = config.get('quote', 'text').lower()
    quote_identifiers = (quote_identifier, text_identifier)
except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
    print("The configuration file needs to contain a [quote]"
          " section with 'quote' and 'text' options with the"
          " keywords in a Quote template")
    sys.exit(1)
logger.info('finished loading configuration information')

logger.info('loading dictionary mappings, this can take up to several minutes')
dictionary = Dictionary.load_from_text(wordids)
logger.info('done loading dictionary mappings')

logger.info('searching for unsubstantiated claims')
finderWiki = FinderWikiCorpus(inp, dictionary, article_count, set_citation,
                              quote_identifiers)
article_claims = finderWiki.get_claims()
base_url = finderWiki.base_url
logger.info('done searching for unsubstantiated claims')

logger.info('searching for probable sources, this can take a while')
result = _get_response_data(article_claims, config_file)
logger.info('done searching for probable sources')
def deserialize(self):
    temp_file = get_tmpfile('lda_dict_deserialize_tmp')
    with open(temp_file, 'w') as te:
        te.write(self.obj['corpus'])
    return Dictionary.load_from_text(temp_file)
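# Sketch of the matching serialize direction (hypothetical helper name):
# dump the dictionary with save_as_text, then read the file back as the
# string that deserialize() above writes out again.
def serialize_dictionary(dictionary):
    tmp = get_tmpfile('lda_dict_serialize_tmp')
    dictionary.save_as_text(tmp)
    with open(tmp) as fh:
        return fh.read()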
else:
    LEMMATIZE = False
if not LEMMATIZE:
    ONLY_NOUN_VERBS = False
    ONLY_NOUNS = False
if LEMMATIZE:
    print "we will lemmatize ('you were'->'be/VB')"
    mname = prefix + '_lemmatized_tfidf'
else:
    print "you don't have pattern: we will tokenize ('you were'->'you','were')"
    mname = prefix + '_tokenized_tfidf'

try:
    id2token = Dictionary.load_from_text(mname + '_wordids.txt')
    mm = MmCorpus(mname + '_bow.mm')
    print ">>> Loaded corpus from serialized files"
except:
    print ">>> Extracting articles..."
    corpus = CDS_Corpus(FOLDER)
    corpus.dictionary.save_as_text(mname + '_wordids.txt')
    print ">>> Saved dictionary as " + mname + "_wordids.txt"
    MmCorpus.serialize(mname + '_bow.mm', corpus, progress_cnt=1000)
    print ">>> Saved MM corpus as " + mname + "_bow.mm"
    id2token = Dictionary.load_from_text(mname + '_wordids.txt')
    mm = MmCorpus(mname + '_bow.mm')
    del corpus

print ">>> Using TF-IDF"
tfidf = models.TfidfModel(mm, id2word=id2token, normalize=True)
# read in
with open(get_survey_u(), encoding='latin-1') as s:
    survey_hlp = json.load(s)
survey_hlp = survey_hlp['survey']
with open(get_seminal_u(), encoding='latin-1') as s:
    seminal_hlp = json.load(s)
seminal_hlp = seminal_hlp['seminal']
with open(get_uninfluential_u(), encoding='latin-1') as s:
    uninfluential_hlp = json.load(s)
uninfluential_hlp = uninfluential_hlp['uninfluential']

lda = ldamodel.LdaModel.load(get_file_base() + 'lda_data/lda_model_unstemmed')
dictionary = Dictionary.load_from_text(get_file_base() + 'lda_data/dict_unstemmed')

sem = []
sur = []
uni = []
for p in seminal_hlp:
    sem.append(lda[dictionary.doc2bow(p['abs'].split())])
for p in survey_hlp:
    sur.append(lda[dictionary.doc2bow(p['abs'].split())])
for p in uninfluential_hlp:
    uni.append(lda[dictionary.doc2bow(p['abs'].split())])

fin_sem = []
fin_sur = []
fin_uni = []
def run_lda(corpus_file, dictionary_path, topics=10):
    id2word = Dictionary.load_from_text(dictionary_path)
    mm = MmCorpus(corpus_file)
    print mm
    lda = LdaModel(corpus=mm, id2word=id2word, num_topics=topics)
    return lda
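# Hedged usage sketch (file names are placeholders): train on a serialized
# bag-of-words corpus plus its saved dictionary, then persist the model.
lda = run_lda('corpus_bow.mm', 'wordids.txt', topics=20)
lda.save('lda_model')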