def make_lda(stem):
    ps = PorterStemmer()
    if stem:
        lda = ldamodel.LdaModel.load(get_file_base() + 'lda_data/lda_model_stemmed')
        dictionary = Dictionary.load_from_text(get_file_base() + 'lda_data/dict_stemmed')
        sem_raw, sem_in, sem_out = read_in(get_seminal_s(), 'seminal')
        sur_raw, sur_in, sur_out = read_in(get_survey_s(), 'survey')
        uni_raw, uni_in, uni_out = read_in(get_uninfluential_s(), 'uninfluential')
    else:
        lda = ldamodel.LdaModel.load(get_file_base() + 'lda_data/lda_model_unstemmed')
        dictionary = Dictionary.load_from_text(get_file_base() + 'lda_data/dict_unstemmed')
        sem_raw, sem_in, sem_out = read_in(get_seminal_u(), 'seminal')
        sur_raw, sur_in, sur_out = read_in(get_survey_u(), 'survey')
        uni_raw, uni_in, uni_out = read_in(get_uninfluential_u(), 'uninfluential')

    # write lda information to file
    if stem:
        write_to_file(
            get_file_base() + 'lda_data/sem_lda_stemmed.json',
            get_file_base() + 'lda_data/sem_lda_stemmed_one_doc_rep.json',
            sem_raw, sem_in, sem_out, 'seminal', '0', lda, dictionary)
        write_to_file(
            get_file_base() + 'lda_data/sur_lda_stemmed.json',
            get_file_base() + 'lda_data/sur_lda_stemmed_one_doc_rep.json',
            sur_raw, sur_in, sur_out, 'survey', '1', lda, dictionary)
        write_to_file(
            get_file_base() + 'lda_data/uni_lda_stemmed.json',
            get_file_base() + 'lda_data/uni_lda_stemmed_one_doc_rep.json',
            uni_raw, uni_in, uni_out, 'uninfluential', '2', lda, dictionary)
    else:
        write_to_file(
            get_file_base() + 'lda_data/sem_lda_unstemmed.json',
            get_file_base() + 'lda_data/sem_lda_unstemmed_one_doc_rep.json',
            sem_raw, sem_in, sem_out, 'seminal', '0', lda, dictionary)
        write_to_file(
            get_file_base() + 'lda_data/sur_lda_unstemmed.json',
            get_file_base() + 'lda_data/sur_lda_unstemmed_one_doc_rep.json',
            sur_raw, sur_in, sur_out, 'survey', '1', lda, dictionary)
        write_to_file(
            get_file_base() + 'lda_data/uni_lda_unstemmed.json',
            get_file_base() + 'lda_data/uni_lda_unstemmed_one_doc_rep.json',
            uni_raw, uni_in, uni_out, 'uninfluential', '2', lda, dictionary)
def user_lda(lda, dictionary_path, text_yielder):
    # note: the parameter name must match the call below (was `textyielder`)
    id2word = Dictionary.load_from_text(dictionary_path)
    ret = {}
    for user, text in text_yielder():
        bow = id2word.doc2bow(UserCorpus.text2tokens(text))
        ret[user] = lda[bow]
    return ret
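# Hedged usage sketch for user_lda: the generator below and the 'dict.txt'
# path are illustrative assumptions; a trained `lda` and the project's
# UserCorpus.text2tokens are taken as given.
def yield_user_texts():  # hypothetical stand-in for the caller's iterator
    yield 'alice', 'topic models are fun'
    yield 'bob', 'gensim builds dictionaries'

per_user_topics = user_lda(lda, 'dict.txt', yield_user_texts)
# per_user_topics maps each user to a sparse [(topic_id, weight), ...] list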
def __post_init__(self) -> None:
    pairs: pd.DataFrame = self.load_dataset_file(self.dataset_file)
    dct = Dictionary.load_from_text("data/processed/dictionary.txt")
    self.phrase_a = self.preprocess_phrase(pairs["phrase_a"], dct)
    self.phrase_b = self.preprocess_phrase(pairs["phrase_b"], dct)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--use_domain", action="store_true")
    parser.add_argument("--update", action="store_true")
    args = parser.parse_args()
    common_dict = Dictionary.load_from_text("./common_dict.txt")
    f = open("url2bow_map.csv", "a")
    for i, url in enumerate(sys.stdin):
        print("url " + str(i))
        text = fetch_contents_from_url(url.strip(), use_domain=args.use_domain)
        if not text:
            continue
        word_list = doc2word_list(text)
        bow = common_dict.doc2bow(word_list)
        if bow:
            print(bow)
            for b in bow:
                f.write(url.strip() + "," + str(b[0]) + "," + str(b[1]) + "\n")
        if i % 100 == 99:
            # flush to disk periodically by closing and reopening the file
            f.close()
            f = open("url2bow_map.csv", "a")
    f.close()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-dir', default='./data/test_arxiv_plain.txt',
                        help='Path to directory where the data is stored')
    parser.add_argument('--model-dir', default='../model',
                        help='Path to directory where the model is stored')
    parser.add_argument('--train', default=True,
                        help='True for train, False for test mode')
    parser.add_argument('--n_topic', type=int, default=20,
                        help='Number of topics')
    args = parser.parse_args()

    model_dir = './model/model'
    dict_dir = './model/dict.txt'
    if args.train == True:
        print('Reading texts')
        with open(args.data_dir) as f_in:
            texts = f_in.read().split('\n')
        del texts[-1]
        for i in tqdm(range(len(texts))):
            texts[i] = texts[i].split()
        print('Generating corpora')
        dictionary = Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        dictionary.save_as_text(dict_dir)
        print('Loading model')
        lda = LdaModel(corpus, num_topics=args.n_topic)
        lda.save(model_dir)
    else:
        lda = LdaModel.load(model_dir, mmap='r')
        # load_from_text is a static method that returns a new Dictionary;
        # calling it on an empty instance would discard the loaded mapping
        dictionary = Dictionary.load_from_text(dict_dir)

    print('Processing results')
    topics = lda.print_topics()
    with open('./report.txt', 'w') as f_out:
        for topic_id, topic_pair in topics:
            print(topic_id, end=': ', file=f_out)
            topic_words = topic_pair.split('"')[1::2]
            topic_words = list(map(int, topic_words))
            topic_words = [dictionary.get(word) for word in topic_words]
            print(topic_words, file=f_out)
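# Minimal self-contained sketch of the save_as_text / load_from_text
# round-trip used above ('demo_dict.txt' is a placeholder path). Note that
# load_from_text is a static method returning a brand-new Dictionary.
from gensim.corpora import Dictionary

texts = [['human', 'computer', 'interaction'], ['graph', 'trees', 'computer']]
demo_dict = Dictionary(texts)
demo_dict.save_as_text('demo_dict.txt')  # one "id<TAB>token<TAB>docfreq" per line

loaded = Dictionary.load_from_text('demo_dict.txt')
assert loaded.token2id == demo_dict.token2id
print(loaded.doc2bow(['computer', 'graph', 'computer']))  # sparse (id, count) pairs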
def SNAP_id2word(self):
    path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        'snap_data',
        'gensim_snap_dict.txt'
    )
    # self.myLoadFromText(path)
    ret = Dictionary.load_from_text(path)
    return ret
def load(self):
    """Load the corpora created by `make_corpus.py`."""
    self.corpus = MmCorpus(self.corpus_file)
    self.dictionary = Dictionary.load_from_text(self.dict_file)
    self.titles = load_titles(self.title_file)
    self.tfidf_model = TfidfModel.load(self.tfidf_model_file)
    self.index = MatrixSimilarity(self.tfidf_model[self.corpus])
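# Hedged follow-up sketch: `searcher` is an assumed instance of the class
# above with its *_file attributes set; the rest is plain gensim API.
searcher.load()
query_bow = searcher.dictionary.doc2bow('some query text'.lower().split())
sims = searcher.index[searcher.tfidf_model[query_bow]]
for doc_id in sims.argsort()[::-1][:5]:  # five most similar documents
    print(searcher.titles[doc_id], sims[doc_id])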
def saveLDACorpus(train_data_path, test_data_path, model_file, dictionary_file, corpus_file):
    """Attach LDA features to the corpus and save train/test splits."""
    lda = LdaModel.load(model_file)
    dictionary = Dictionary.load_from_text(dictionary_file)
    dictionary.id2token = utils.revdict(dictionary.token2id)
    src_df = pd.read_csv(corpus_file)
    src_df = parallelize(src_df, data_fram_proc1, dictionary, lda)  # add LDA features
    train_data, test_data = train_test_split(
        src_df[['label', 'multiLabels', 'item']], test_size=0.2, random_state=42)
    train_data.to_csv(train_data_path, index=None)  # , header=None
    test_data.to_csv(test_data_path, index=None)  # , header=None
def __init__(self, data_path: str, prefix: str = None, iterator: str = 'token',
             parsing: str = 'simple', word_up_limit: float = 0.75,
             word_low_limit: int = 20, dictionary: str = None,
             shuffle: bool = False, seed: int = 42,
             document_minimum_length: int = 5, stopwords: str = None):
    iter_map = dict(token=self.tokenize, bow=self.bowize, sentences=self.sentences)
    self.iterator = iter_map[iterator]
    self.word_low_limit = word_low_limit
    self.word_up_limit = word_up_limit
    if stopwords:
        self.stopwords = [w.strip() for w in open(stopwords).readlines()]
    else:
        self.stopwords = []
    if not dictionary:
        self.dictionary = Dictionary()
    else:
        self.dictionary = Dictionary.load_from_text(dictionary)
        if self.stopwords:
            self.dictionary.filter_tokens(
                bad_ids=self.dictionary.doc2idx(self.stopwords))
        self.is_built = True
    self.shuffle = shuffle
    if self.shuffle:
        np.random.seed(seed)
    self.document_minimum_length = document_minimum_length
    corpus = self.init_corpus(data_path, prefix, parsing)
    super(Corpora, self).__init__(corpus=corpus)
def main():
    # load the cached dictionary if present, otherwise build and save it
    try:
        dictionary = Dictionary.load_from_text("dictionary.txt")
    except Exception:
        dictionary = Dictionary(rcv1_train)
        dictionary.filter_extremes()
        dictionary.save_as_text("dictionary.txt")

    class RCV1BowCorpus(object):
        def __iter__(self):
            for document in rcv1_train:
                yield dictionary.doc2bow(document)

    ln.debug("Training model on %s documents" % len(rcv1_train))
    try:
        vector_model = LsiModel.load("lsi_model")
    except Exception:
        vector_model = LsiModel(corpus=RCV1BowCorpus(), num_topics=100,
                                id2word=dictionary)
        vector_model.save("lsi_model")

    def get_lsi_features(text):
        """ Must return either numpy array or dictionary """
        res = vector_model[dictionary.doc2bow(text)]
        return dict(res)

    def get_bow_features(text):
        return dict(dictionary.doc2bow(text))

    clf = train_classifier(train_samples=rcv1_train, train_targets=rcv1_train_target,
                           get_features=get_lsi_features, classifier="sgd")
    evaluate_classifier(clf, rcv1_test, rcv1_test_target,
                        get_features=get_lsi_features)
def create_tfidf_corpus(corpus_file, dict_file, outputs_dir):
    # Load back the id->word mapping directly from file.
    # This seems to save more memory, compared to keeping the
    # wiki.dictionary object from above.
    dictionary = Dictionary.load_from_text(dict_file)
    # initialize corpus reader and word->id mapping
    mm = MmCorpus(corpus_file)

    tfidf_model_file = os.path.join(outputs_dir, "wikipedia.tfidf_model")
    tfidf_corpus_file = os.path.join(outputs_dir, "wikipedia_tfidf.mm")

    # build TF-IDF, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(tfidf_model_file)

    # save tfidf vectors in matrix market format
    # ~4h; result file is 15GB! bzip2'ed down to 4.5GB
    MmCorpus.serialize(tfidf_corpus_file, tfidf[mm], progress_cnt=10000)
    return tfidf_model_file, tfidf_corpus_file
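# Hypothetical follow-up: loading the artifacts back for downstream use
# (input paths here are placeholders for the caller's actual files).
model_file, corpus_file = create_tfidf_corpus('wiki_bow.mm', 'wiki_wordids.txt', '.')
tfidf = TfidfModel.load(model_file)
tfidf_corpus = MmCorpus(corpus_file)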
def main():
    # path = os.path.join("../../outputs", "gpt2_generated.csv")
    path = os.path.join("../../outputs", "gpt2_with_prompt.csv")
    length = 0
    # dct = Dictionary(common_texts)
    # model_path = os.path.join(os.getcwd(), "..", "lda_model", "model")
    # lda = models.ldamodel.LdaModel.load(model_path)
    lda = models.ldamodel.LdaModel.load(os.path.join("../lda_model", "model"))
    dct = Dictionary.load_from_text(os.path.join("../lda_model", "dictionary"))
    jsd_sum = 0.0
    with open(path, 'r') as file:
        csv_file = csv.DictReader(file)
        for row in csv_file:
            row = dict(row)
            text = row['generated']
            target_text = row['reference']

            text_vector = np.zeros(50)
            # text_tokenized = text.split()[:512]
            text_tokenized = text.split()[:80]
            text_processed = dct.doc2bow(text_tokenized)
            for elem in lda[text_processed]:
                text_vector[elem[0]] += elem[1]

            target_vector = np.zeros(50)
            # target_tokenized = target_text.split()[:512]
            target_tokenized = target_text.split()[:80]
            target_processed = dct.doc2bow(target_tokenized)
            for elem in lda[target_processed]:
                target_vector[elem[0]] += elem[1]

            length += 1
            jsd_sum += distance.jensenshannon(text_vector, target_vector)
    print(jsd_sum / length)
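# Sketch of an equivalent densification using gensim's own helper:
# matutils.sparse2full turns a sparse [(topic_id, weight), ...] list into a
# fixed-length dense vector, replacing the manual np.zeros loops above.
# `lda` and `dct` are assumed loaded as in main().
from gensim import matutils
bow = dct.doc2bow('a generated sample text'.split())
dense_vector = matutils.sparse2full(lda[bow], 50)  # length = number of topics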
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--use_domain", action="store_true")
    parser.add_argument("--update", action="store_true")
    parser.add_argument("--save_interval", type=int, default=100)
    args = parser.parse_args()
    if args.update:
        common_dict = Dictionary.load_from_text("./common_dict.txt")
    else:
        common_dict = Dictionary()
    for i, url in enumerate(sys.stdin):
        print("url " + str(i))
        text = fetch_contents_from_url(url.strip(), use_domain=args.use_domain)
        if not text:
            continue
        word_list = doc2word_list(text)
        common_dict.add_documents([word_list])
        if i % args.save_interval == args.save_interval - 1:
            common_dict.save_as_text("./common_dict.txt")
    common_dict.save_as_text("./common_dict.txt")
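# Pipeline sketch (script names are placeholders): this builder is the
# companion of the url2bow main() earlier -- build or update common_dict.txt
# first, then map URLs to bag-of-words rows against the frozen dictionary:
#   cat urls.txt | python build_common_dict.py --save_interval 100
#   cat urls.txt | python url2bow.py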
def main(model, weights, data_path, data_prefix, result_path, dictionary_path,
         size, window, min_count, batch_size, epochs, normalize):
    dictionary = None
    if data_path:
        corpora = initialize_corpora(data_path, data_prefix, dictionary_path, 'token')
        dictionary = corpora.dictionary
    elif not data_path and dictionary_path:
        dictionary = Dictionary.load_from_text(dictionary_path)

    MAP = dict(word2vec=(Word2VecWrapper,
                         dict(weights=weights, size=size, window=window,
                              min_count=min_count, normalize=normalize,
                              dictionary=dictionary, batch_size=batch_size)),
               )
    model_class, params = MAP[model]
    model = model_class(**params)

    if weights and not data_path:
        # Not training
        vector_dict = model.vectors
    elif data_path and not weights:
        model.fit(corpora, epochs=epochs)
        vector_dict = model.transform(corpora)
    else:
        raise ValueError('Need to define either data_path or weights.')

    vectors_path = os.path.join(result_path, str(model) + '.csv')
    pd.DataFrame(vector_dict).to_csv(vectors_path, index=False)
    print(f'Vectors stored to path: {vectors_path}')
# optional argv[3] = keep_words
if len(sys.argv) < 3:
    print globals()['__doc__'] % locals()
    sys.exit(1)  # without this, the unpacking below fails on too few arguments
input, output = sys.argv[1:3]
if len(sys.argv) > 3:
    keep_words = int(sys.argv[3])
else:
    keep_words = DEFAULT_DICT_SIZE

# build dictionary. only keep 100k most frequent words (out of total ~900k unique tokens)
enron = EnronCorpus(input, keep_words=keep_words)

# save dictionary and bag-of-words (term-document frequency matrix)
enron.dictionary.save_as_text(output + '_wordids.txt')
MmCorpus.serialize(output + '_bow.mm', enron, progress_cnt=10000)
del enron

# initialize corpus reader and word->id mapping
id2token = Dictionary.load_from_text(output + '_wordids.txt')
mm = MmCorpus(output + '_bow.mm')

# build tfidf
from gensim.models import TfidfModel
tfidf = TfidfModel(mm, id2word=id2token, normalize=True)

# save tfidf vectors in matrix market format
MmCorpus.serialize(output + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

logger.info("finished running %s" % program)
from news.document import *

if len(sys.argv) != 2:
    print 'Usage: {0} rcv1_data_dir'.format(sys.argv[0])
    raise SystemExit(1)

data_dir = sys.argv[1]
mapping_file = data_dir + '/token_id_idf'
dictionary_file = data_dir + '/id_token_df'
token_file = data_dir + '/tokens'
lda_file = data_dir + '/lda_model'

print 'creating dictionary...'
N = 23307  # supplied idfs from rcv1/lyrl2004 were based on 23307 training docs
create_dictionary_file(mapping_file, dictionary_file, N)
dictionary = Dictionary.load_from_text(dictionary_file)

print 'creating corpus...'
corpus = SimpleLowCorpus(token_file, dictionary)

print 'training model...'
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
lda = LdaModel(corpus, id2word=dictionary, num_topics=200)
print 'done!'

print '\n' * 3
print '======final topics======'
topics = lda.show_topics(topics=-1, topn=4)
for i, topic in enumerate(topics):
    print i, topic

print 'saving model...'
# pre- and post-correction spacing
def spacer(text):
    spacer = ChatSpace()
    result = spacer.space(text)
    return result

# tokenize on spacing units
def tokenizer(text):
    return list(text.split(" "))

# load the dictionary
fn = pathlib.Path(__file__).parent / 'dictionary.txt'
loaded_dct = Dictionary.load_from_text(fn)

# return the indices and words of misspelled entries
def check_error(word_list):
    wrong_ids = []
    wrong_words = []
    pattern = re.compile('[ㄱ-ㅣa-zA-Z0-9]+')
    for i in range(len(word_list)):
        word = word_list[i]
        matched = pattern.search(word)
        hangul = re.compile('[^가-힣]+')
        word = hangul.sub('', word)  # strip special characters and emoji
        # skip words containing Hangul jamo, English letters, or digits
input, output = sys.argv[1:3]
if len(sys.argv) > 3:
    keep_words = int(sys.argv[3])
else:
    keep_words = DEFAULT_DICT_SIZE

# build dictionary. only keep 100k most frequent words (out of total ~8.2m unique tokens)
# takes about 9h on a macbook pro, for 3.5m articles (june 2011 wiki dump)
wiki = WikiCorpus(input, keep_words=keep_words)

# save dictionary and bag-of-words (term-document frequency matrix)
# another ~9h
wiki.dictionary.save_as_text(output + "_wordids.txt")
MmCorpus.serialize(output + "_bow.mm", wiki, progress_cnt=10000)
del wiki

# initialize corpus reader and word->id mapping
id2token = Dictionary.load_from_text(output + "_wordids.txt")
mm = MmCorpus(output + "_bow.mm")

# build tfidf, ~30min
from gensim.models import TfidfModel
tfidf = TfidfModel(mm, id2word=id2token, normalize=True)

# save tfidf vectors in matrix market format
# ~2h; result file is 15GB! bzip2'ed down to 4.5GB
MmCorpus.serialize(output + "_tfidf.mm", tfidf[mm], progress_cnt=10000)

logger.info("finished running %s" % program)
fname = sys.argv[1]
prefix = fname.split('/')[0]
if len(sys.argv) > 2 and sys.argv[2][0:2] != '--':
    suffix = sys.argv[2]
lemmatizer, filter_words = parse_args(sys.argv)
if lemmatizer is None:
    LEMMATIZE = False
    suffix = '_tokenized_tfidf'
else:
    suffix = '_lemmatized_tfidf'

lda = None
with open(prefix + suffix + '.ldamodel') as f:
    lda = cPickle.load(f)
id2token = Dictionary.load_from_text(prefix + suffix + '_wordids.txt')

if DEBUG:
    print "prefix:", prefix
    print "suffix:", suffix
    print "using dict:", prefix + suffix + '_wordids.txt'
    print id2token

docs = []
with open(fname) as f:
    print("splitting %s" % fname)
    tmp = []
    for line in f:
        # bufferize into docs list
        if line[0] == '@':
            docs.append(tmp)
            tmp = [line]
def __init__(self):
    self.cc_dict = Dictionary.load_from_text(LDA_DICT_PATH)
    self.tfidf = tfidfmodel.TfidfModel.load(LDA_TFIDF_PATH)
    self.lda = LdaMulticore.load(LDA_MODEL_PATH)
try:
    quote_identifier = config.get('quote', 'quote').lower()
    text_identifier = config.get('quote', 'text').lower()
    quote_identifiers = (quote_identifier, text_identifier)
except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
    print("The configuration file needs to contain a [quote]"
          " section with 'quote' and 'text' options with the"
          " keywords in a Quote template")
    sys.exit(1)
logger.info('finished loading configuration information')

logger.info('loading dictionary mappings, this can take up to several minutes')
dictionary = Dictionary.load_from_text(wordids)
logger.info('done loading dictionary mappings')

logger.info('searching for unsubstantiated claims')
finderWiki = FinderWikiCorpus(inp, dictionary, article_count, set_citation,
                              quote_identifiers)
article_claims = finderWiki.get_claims()
base_url = finderWiki.base_url
logger.info('done searching for unsubstantiated claims')

logger.info('searching for probable sources, this can take a while')
result = _get_response_data(article_claims, config_file)
logger.info('done searching for probable sources')
def deserialize(self):
    temp_file = get_tmpfile('lda_dict_deserialize_tmp')
    with open(temp_file, 'w') as te:
        te.write(self.obj['corpus'])
    return Dictionary.load_from_text(temp_file)
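# Sketch of the matching serialize direction (hypothetical helper name):
# dump the dictionary with save_as_text, then read the file back as the
# string that deserialize() above writes out again.
def serialize_dictionary(dictionary):
    tmp = get_tmpfile('lda_dict_serialize_tmp')
    dictionary.save_as_text(tmp)
    with open(tmp) as fh:
        return fh.read()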
else:
    LEMMATIZE = False
if not LEMMATIZE:
    ONLY_NOUN_VERBS = False
    ONLY_NOUNS = False
if LEMMATIZE:
    print "we will lemmatize ('you were'->'be/VB')"
    mname = prefix + '_lemmatized_tfidf'
else:
    print "you don't have pattern: we will tokenize ('you were'->'you','were')"
    mname = prefix + '_tokenized_tfidf'

try:
    id2token = Dictionary.load_from_text(mname + '_wordids.txt')
    mm = MmCorpus(mname + '_bow.mm')
    print ">>> Loaded corpus from serialized files"
except:
    print ">>> Extracting articles..."
    corpus = CDS_Corpus(FOLDER)
    corpus.dictionary.save_as_text(mname + '_wordids.txt')
    print ">>> Saved dictionary as " + mname + "_wordids.txt"
    MmCorpus.serialize(mname + '_bow.mm', corpus, progress_cnt=1000)
    print ">>> Saved MM corpus as " + mname + "_bow.mm"
    id2token = Dictionary.load_from_text(mname + '_wordids.txt')
    mm = MmCorpus(mname + '_bow.mm')
    del corpus

print ">>> Using TF-IDF"
tfidf = models.TfidfModel(mm, id2word=id2token, normalize=True)
# read in
with open(get_survey_u(), encoding='latin-1') as s:
    survey_hlp = json.load(s)
survey_hlp = survey_hlp['survey']
with open(get_seminal_u(), encoding='latin-1') as s:
    seminal_hlp = json.load(s)
seminal_hlp = seminal_hlp['seminal']
with open(get_uninfluential_u(), encoding='latin-1') as s:
    uninfluential_hlp = json.load(s)
uninfluential_hlp = uninfluential_hlp['uninfluential']

lda = ldamodel.LdaModel.load(get_file_base() + 'lda_data/lda_model_unstemmed')
dictionary = Dictionary.load_from_text(get_file_base() + 'lda_data/dict_unstemmed')

sem = []
sur = []
uni = []
for p in seminal_hlp:
    sem.append(lda[dictionary.doc2bow(p['abs'].split())])
for p in survey_hlp:
    sur.append(lda[dictionary.doc2bow(p['abs'].split())])
for p in uninfluential_hlp:
    uni.append(lda[dictionary.doc2bow(p['abs'].split())])

fin_sem = []
fin_sur = []
fin_uni = []
def run_lda(corpus_file, dictionary_path, topics=10):
    id2word = Dictionary.load_from_text(dictionary_path)
    mm = MmCorpus(corpus_file)
    print mm
    lda = LdaModel(corpus=mm, id2word=id2word, num_topics=topics)
    return lda
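# Hedged usage sketch (file names are placeholders): train on a serialized
# bag-of-words corpus plus its saved dictionary, then persist the model.
lda = run_lda('corpus_bow.mm', 'wordids.txt', topics=20)
lda.save('lda_model')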