'project_subject_subcategories': str, 'project_title': str, 'project_essay_1': str, 'project_essay_2': str, 'project_essay_3': str, 'project_essay_4': str, 'project_resource_summary': str, 'teacher_number_of_previously_posted_projects': int, 'project_is_approved': np.uint8, } # Read data and store in DataFrame. train_data = pd.read_csv(train_file_path, sep=',', dtype=dtype, low_memory=True).sample(10000) essay1 = train_data['project_essay_1'] ids = train_data['id'] ess1_list = [] for index, row in train_data.iterrows(): ess1_list.append( LabeledSentence(row['project_essay_1'].split(" "), [row['id']])) #size is the vector length, window means how many words are included in one paragraph model = models.Doc2Vec(size=100, window=200, min_count=3, workers=1) vocab = model.build_vocab(ess1_list) model.train(ess1_list, epochs=10, total_words=100) model.save("ess1_model.doc2vec") # model_loaded = models.Doc2Vec.load('ess1_model.doc2vec') # print "the first vector is: " # print model.docvecs[0]
self.labels = labels def __iter__(self): for i, words in enumerate(self.words_list): yield models.doc2vec.LabeledSentence(words, ['%s' % self.labels[i]]) # ラベル付けを行う morph_list, docs = set_folder_morph(corpus + folder) sentences = LabeledListSentence(morph_list, docs) # doc2vec の学習条件設定 # alpha: 学習率 / min_count: X回未満しか出てこない単語は無視 # size: ベクトルの次元数 / iter: 反復回数 / workers: 並列実行数 model = models.Doc2Vec(alpha=0.025, min_count=5, size=100, iter=20, workers=4) # doc2vec の学習前準備(単語リスト構築) model.build_vocab(sentences) # Wikipedia から学習させた単語ベクトルを無理やり適用して利用することも出来ます # model.intersect_word2vec_format('./data/wiki/wiki2vec.bin', binary=True) print("training...") # 学習実行 model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) if not os.path.isdir("./model"): os.mkdir("./model") if not os.path.isdir("./model/doc2vec"): os.mkdir("./model/doc2vec")
# Pre-trained word embeddings handed to the trainer; set to None to train
# without pre-trained embeddings.
pretrained_emb = "toy_data/sg.word2vec.300d"

# Input corpus and destination of the trained model.
train_corpus = "toy_data/train_docs.txt"
saved_path = "toy_data/model.bin"

# Enable logging.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Hyper-parameters forwarded verbatim to the Doc2Vec constructor.
# NOTE(review): ``pretrained_emb`` is not a keyword of stock gensim --
# presumably ``g`` is a patched build; confirm before upgrading gensim.
doc2vec_options = dict(
    size=vector_size,
    window=window_size,
    min_count=min_count,
    sample=sampling_threshold,
    workers=worker_count,
    hs=0,
    dm=dm,
    negative=negative_size,
    dbow_words=1,
    dm_concat=1,
    pretrained_emb=pretrained_emb,
    iter=train_epoch,
)

# Train the doc2vec model on the corpus file.
docs = g.doc2vec.TaggedLineDocument(train_corpus)
model = g.Doc2Vec(docs, **doc2vec_options)

# Persist the trained model.
model.save(saved_path)
data = pd.read_csv('./labeledTrainData.tsv', sep='\t') stop_words = [" "] sentences = [] for index in data.index: id = data.loc[index]['id'] sentiment = data.loc[index]['sentiment'] review = data.loc[index]['review'] #去掉HTML标签 review = re.sub(r'<.*?>', '', review) review_list = [ w.strip('.,?!\\"\'') for w in review.split(' ') if w not in stop_words ] sentences.append(models.doc2vec.TaggedDocument(review_list, [id])) #转换为向量 doc2vec = models.Doc2Vec(sentences) #取30% 作为测试数据 total = len(data.index) testIndexs = random.sample(range(total), int(total * 0.3)) trainData = [] trainLabel = [] testData = [] testLabel = [] for index in range(total): id = data.loc[index]['id'] if index in testIndexs: testData.append(doc2vec.docvecs[id]) testLabel.append(data.loc[index]['sentiment']) else:
for sent in self.lists: self.j =self.j +1 yield gensim.models.doc2vec.LabeledSentence(words= sent, tags=["sent_"+ str(self.j)]) path = "/home/raksha/FIRE-2016/CHIS_testSet/final_fire_test_data_xls/skincancer.xlsx" book = xlrd.open_workbook(path) first_sheet = book.sheet_by_index(0) print first_sheet.nrows for i in range(1,first_sheet.nrows): #print first_sheet.row_values(i) cell = first_sheet.cell(i,0) sent= cell.value.split() sentences.append(sent) it = LabeledLineSentence(sentences) #contains one file of all appended doc for that categoty doc2vecmodel = models.Doc2Vec(it,size = 200, window = 5, min_count = 0, dm = 0) index2wordcollection = doc2vecmodel.index2word env = lmdb.open('wikipedia-pubmed-and-PMC-w2v') txn = env.begin(buffers=True) wordvector=[] for i in range(len(doc2vecmodel.syn0)): #pdb.set_trace() #if index2wordcollection[i].startswith("SENT_"): # continue word = index2wordcollection[i] try: word = index2wordcollection[i] text = word.encode('UTF-8')
for idx, (doc, name) in enumerate(zip(docs, corpus)): sys.stdout.write('\r前処理中 {}/{}'.format(idx, len(corpus))) yield doc_to_sentence(doc, name) corpus = corpus_files() sentences = corpus_to_sentences(corpus) if not os.path.exists(MODEL_DIR): os.mkdir(MODEL_DIR) if isfile(PRE_TRAIN_MODEL_PATH): print('訓練済みモデルを使用します') model = models.Doc2Vec.load(PRE_TRAIN_MODEL_PATH) else: model = models.Doc2Vec(dm=0, size=300, window=15, alpha=.025, min_alpha=.025, min_count=1, sample=1e-6 ) model.build_vocab(sentences) print('\n訓練開始') for epoch in range(20): print('Epoch: {}'.format(epoch + 1)) model.train(sentences, total_examples=model.corpus_count, epochs=1) model.alpha -= (0.025 - 0.0001) / 19 model.min_alpha = model.alpha model.save(PRE_TRAIN_MODEL_PATH) predict_file = './text/livedoor-homme/livedoor-homme-5625149.txt' print('類似度検索対象 : ' + predict_file) predict_text = read_document(predict_file) predict_results = model.docvecs.most_similar([model.infer_vector(split_into_words(predict_text))], topn=5) for result in predict_results:
# レポートの各行から、動詞・形容詞・名詞(数を除く)の情報のみを取り出す if len(chunks) > 3 and ( chunks[3].startswith('動詞') or chunks[3].startswith('形容詞') or (chunks[3].startswith('名詞') and not chunks[3].startswith('名詞-数'))): words.append(chunks[0]) sentences.append(TaggedDocument(words=words, tags=doc.tags)) # 学習モデルを生成 # size: ベクトル化した際の次元数 # alpha: 学習率 # sample:単語を無視する際の頻度の閾値 # min_count:学習に使う単語の最低出現回数 # workers:学習時のスレッド数 model = models.Doc2Vec(vector_size=400, alpha=0.0015, sample=1e-4, min_count=3, workers=4) # Doc2Vecに単語を登録 model.build_vocab(sentences) # 学習評価に使用するサンプル数と閾値を設定 # ※Python3の四捨五入は「最近接偶数への丸め」となっていることに注意(1.5も2.5も2に丸められる) sample_num = int(round(len(sentences) * SAMPLE_PROPORTION, 0)) passing_thres = int(round(sample_num * PASSING_PRECISION, 0)) print('sample_num:' + str(sample_num)) print('passing_thres:' + str(passing_thres)) endFlg = False
def train_doc_model(corpus, file):
    """Build a 100-dimensional Doc2Vec model from ``corpus`` and save it.

    Args:
        corpus: Documents passed directly to the ``Doc2Vec`` constructor.
        file: Destination handed to ``model.save``.
    """
    print("Training Doc2Vec model")
    models.Doc2Vec(corpus, size=100).save(file)
'it', 'doesn', 'an', 'as', 'itself', 'at', 'have', 'in', 'any', 'if', 'again', 'no', 'when', 'same', 'how', 'other', 'which', 'yo', 'shan', 'needn', 'haven', 'after', 'most', 'such', 'why', 'a', 'off', 'i', 'm', 'yours', 'so', 'y', 'the', 'having', 'once' ] sentences = [] for i in range(len(your_list)): words = your_list[i][6].lower().split() words = [word for word in words if word not in stops] sentence = models.doc2vec.LabeledSentence( words=words, tags=["SENT_" + str(your_list[i][0])]) sentences.append(sentence) model = models.Doc2Vec(size=300, window=20, alpha=.025, min_alpha=.025, min_count=2, workers=10) model.build_vocab(sentences) for epoch in range(12): model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) model.alpha -= 0.0018 # decrease the learning rate` model.min_alpha = model.alpha # fix the learning rate, no decay print('DONE WITH TRAINING') model.save('questiondupemodelA') search_phrase = [ 'what', 'code', 'analysis', 'tools', 'do', 'you', 'use', 'on', 'your',
docs.append(l[:-1].split(' ')) sentences = [] for i, title in enumerate(titles): sentences.append(models.doc2vec.LabeledSentence(docs[i], title)) #print(sentences) #model = models.Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, alpha=.025, min_alpha=.025, min_count=0) #model.build_vocab(sentences) #for epoch in range(100): # model.train(sentences) # model.alpha -= 0.002 # decrease the learning rate` # model.min_alpha = model.alpha # fix the learning rate, no decay model = models.Doc2Vec(sentences, dm=1, dm_mean=1, size=100, window=2, negative=5, min_count=0) model.save("my_model.doc2vec") model_loaded = models.Doc2Vec.load('my_model.doc2vec') #print(model.docvecs.most_similar(["SENT_1"])) #print(model_loaded.docvecs.most_similar(["SENT_2"])) print(model_loaded.docvecs.most_similar(["言語"]))
# Item indices 0..len-1 cycled four times (presumably one slot per row of
# total_input -- TODO confirm against where total_input is built).
item_index = np.arange(0, len(images_df_up_using) * 4) % len(images_df_up_using)
input_x = images_df_using.to_numpy()
total_input = drop_none(total_input)

# Drop empty lines while keeping every surviving line paired with its index.
use_input = [
    (line, i_idx)
    for line, i_idx in zip(total_input, item_index)
    if len(line) != 0
]
input_d2v_x = [line for line, _ in use_input]
item_index = [idx for _, idx in use_input]
using_att_idx = drop_none(images_df_using.to_numpy())

sentences = LabeledLineSentenceByAttribute(input_d2v_x, item_index, suffle=True)

# Passing ``documents`` to the constructor builds the vocabulary and trains in
# one step. The earlier throw-away ``Doc2Vec(...)`` + ``build_vocab`` pair was
# removed: its model was rebound immediately, so it only cost an extra scan of
# the corpus.
model = models.Doc2Vec(
    documents=sentences,
    min_count=1,
    size=50,
    window=1,
    iter=30,
    workers=10,
    # callbacks=[callback()]
)
model.save("doc2vec_using_itemsplitidx_ustyle9")

# ver2 : extend each item to att
model_loaded = models.Doc2Vec.load('doc2vec_using_itemsplitidx_ustyle9')
#_*_coding:utf-8_*_
from gensim import models,corpora
import jieba
import codecs
import logging
from langconv import *

# Enable logging.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

zhwiki = '/Users/yangyang/Desktop/NLP/data/zhwiki-latest-pages-articles.xml.bz2'
wiki = corpora.WikiCorpus(zhwiki, lemmatize=False, dictionary={})


class TaggedWikiDocument(object):
    """Stream wiki articles as gensim ``LabeledSentence`` objects.

    Tokenised text and its tag are trained together so that the model learns
    one vector per document. Each article is tagged with its title.
    """

    def __init__(self, wiki):
        self.wiki = wiki
        # With metadata enabled, get_texts() yields
        # (content, (page_id, title)) -- see the unpacking in __iter__.
        self.wiki.metadata = True

    def __iter__(self):
        for content, (_page_id, title) in self.wiki.get_texts():
            # Converter('zh-hans') presumably normalises the text to
            # Simplified Chinese before jieba segments it -- verify against
            # langconv.
            words = [
                w
                for c in content
                for w in jieba.cut(Converter('zh-hans').convert(c))
            ]
            yield models.doc2vec.LabeledSentence(words=words, tags=[title])


documents = TaggedWikiDocument(wiki)
# ``min_count=19`` replaces the original ``min_alpha=19``: min_alpha is the
# final learning rate, so 19 would make the rate climb from 0.025 to 19 and
# training diverge. The vocabulary-frequency cutoff was clearly intended.
model = models.Doc2Vec(documents, dm=0, window=8, dbow_words=1, size=192,
                       min_count=19, iter=5, workers=6)
model.save('./data/zhiwiki_news.doc2vec')
# print word.encode('utf-8') if (word not in stopwords and flag[0] in [u'n', u'f', u'a', u'z']): #去停用词和其他词性,比如非名词动词等 result += word.encode('utf-8') # +"/"+str(w.flag)+" " #去停用词 return result input = [] labels = [] uid = 1 for sentence in sentences: sentence = delNOTNeedWords(sentence, stopwords) input.append(jieba.lcut(sentence)) labels.append('SENT_%s' % uid) uid = uid + 1 documents = LabeledSentence(words=input, labels=labels) # bigram_transformer = models.Phrases(input) model = models.Doc2Vec(documents, size=feature_size, window=content_window, min_count=freq_min_count, negative=negative, iter=iter, workers=multiprocessing.cpu_count()) # print model.index2word model.save(save_filename) f = model.most_similar([u'奥迪']) for k in f: print k[0].encode('utf-8'), k[1]
LabeledSentence = gensim.models.doc2vec.LabeledSentence

# NOTE(review): the directory and the token prefix are concatenated without a
# path separator ("Data/doc2vec/not_trump" + "zub_"); kept as-is, confirm that
# this prefix is intended.
doc2vec_dir = "Data/doc2vec/not_trump"

# --- "zub_" tokenisation ----------------------------------------------------
token_type = "zub_"
#sentences = []
#with open(doc2vec_dir+token_type+"doc2vec_train_corpus.txt",'r')as corpfile:
#    sentences=[sent.split() for sent in corpfile.readlines()]
with open("{}{}id_text_dic.json".format(doc2vec_dir, token_type), 'r') as corpfile:
    sent_dic = json.load(corpfile)

# One tagged document per dictionary entry; the key (as text) is the tag.
sentences = [
    LabeledSentence(text.split(), [str(doc_id)])
    for doc_id, text in sent_dic.items()
]
#sentences = models.doc2vec.TaggedLineDocument(doc2vec_dir+token_type+"doc2vec_train_corpus.txt")#yelp_data_small(words="sent_doc2vec", labels="label_doc2vec")
model_zub = models.Doc2Vec(sentences, size=dims, window=8, min_count=0,
                           workers=4)

# The dimensionality is part of the file name; it is turned into text for the
# two saves and converted back to an int afterwards.
dims = str(dims)
model_zub.save(
    "{}{}rumorEval_doc2vec{}.model".format(doc2vec_dir, token_type, dims))
# Second copy is saved after init_sims(replace=True).
model_zub.init_sims(replace=True)
model_zub.save(
    "{}{}rumorEval_doc2vec_set{}.model".format(doc2vec_dir, token_type, dims))
dims = int(dims)

# --- "twit_" tokenisation ---------------------------------------------------
token_type = "twit_"
with open("{}{}id_text_dic.json".format(doc2vec_dir, token_type), 'r') as corpfile:
    sent_dic = json.load(corpfile)
sentences = [
    LabeledSentence(text.split(), [str(doc_id)])
    for doc_id, text in sent_dic.items()
]
#sentences = models.doc2vec.TaggedLineDocument(doc2vec_dir+token_type+"doc2vec_train_corpus.txt")#yelp_data_small(words="sent_doc2vec", labels="label_doc2vec")
model_twit = models.Doc2Vec(sentences, size=dims, window=8, min_count=0,
                            workers=4)
for train in trains: if train["html_url"].split("/")[-1] == comment["issue_url"].split( "/")[-1]: train["body"] = f"{train['body']} {comment['body']}" # add label terms = [ TaggedDocument(f"{train['title']} {train['body']}", [str(i)]) for i, train in enumerate(trains) ] # model train model = models.Doc2Vec(terms, dm=0, vector_size=100, window=2, min_count=0, workers=4, epoch=20) # model.save('doc2vec_model') model = Doc2Vec.load("doc2vec_model") # output results results = model.docvecs.most_similar(len(trains) - 1) suggestions = [] for result in results: index = int(result[0]) suggestion = {} suggestion["html_url"] = trains[index]["html_url"] suggestion["title"] = trains[index]["title"]
wakati_words.append(node.surface) elif hinshi in ["動詞", "形容詞"]: wakati_words.append(node.feature.split(",")[6]) node = node.next return wakati_words #作品リストをDoc2Vecが読めるTaggedDocument形式にし、配列に追加する --- (*5) documents = [] #作品リストをループで回す for auther, book in book_list(): #作品の文字列を取得 words = read_book(auther, book) #作品の文字列を分かち書きに wakati_words = split_words(words) #TaggedDocumentの作成 文書=分かち書きにした作品 タグ=作者:作品名 document = TaggedDocument(wakati_words, [auther["name"] + ":" + book["name"]]) documents.append(document) #TaggedDocumentの配列を使ってDoc2Vecの学習モデルを作成 --- (*6) model = models.Doc2Vec(documents, dm=0, vector_size=300, window=15, min_count=1) #Doc2Vecの学習モデルを保存 model.save('aozora.model') print("モデル作成完了")
def get_doc2vec_model(self, build_model):
    """Train (or load) the Doc2Vec model, fit an SVC on it and report results.

    Train and test documents are both converted to tagged documents and, when
    a model is built, both are used for vocabulary building and training.
    The classifier is then fitted on the training arrays and evaluated on the
    test arrays.

    Args:
        build_model: When true, always retrain and overwrite the saved model;
            otherwise the saved model is loaded if its file exists.

    Side effects:
        Sets ``self.tagged_docs_*`` and ``self.doc2vec_model``, may write the
        model file, and either writes predictions via
        ``data_transformation.write_to_file`` (real test) or generates a
        ``Visualization``.
    """
    self.tagged_docs_pos_train, self.tagged_docs_neg_train = \
        self.data_transformation.transform_sents(
            self.train_x, self.train_y, True)

    # In real-test mode test_x is a mapping (its keys are used as file ids
    # further down), so only its values are transformed.
    x = self.test_x if not self.is_real_test else self.test_x.values()
    self.tagged_docs_pos_test, self.tagged_docs_neg_test = \
        self.data_transformation.transform_sents(x, self.test_y, False)

    model_path = model_location + self.doc2vec_model_name
    if build_model or not os.path.exists(model_path):
        self.doc2vec_model = models.Doc2Vec(min_count=1, window=10, size=400,
                                            sample=1e-4, negative=5, workers=7)
        tagged_docs_train = (self.tagged_docs_pos_train +
                             self.tagged_docs_neg_train)
        tagged_docs_test = (self.tagged_docs_pos_test +
                            self.tagged_docs_neg_test)
        tagged_docs = tagged_docs_train + tagged_docs_test
        self.doc2vec_model.build_vocab(tagged_docs)

        # Train on a shuffled copy so the original ordering is left intact.
        shuffled = list(tagged_docs)
        random.shuffle(shuffled)
        self.doc2vec_model.train(
            shuffled,
            total_examples=self.doc2vec_model.corpus_count,
            epochs=self.iter)
        self.doc2vec_model.save(model_path)
    else:
        self.doc2vec_model = models.Doc2Vec.load(model_path)

    train_arrays, train_labels = \
        self.data_transformation.create_classifier_arrays(
            self.doc2vec_model, True,
            len(self.tagged_docs_pos_train), len(self.tagged_docs_neg_train))
    test_arrays, test_labels = \
        self.data_transformation.create_classifier_arrays(
            self.doc2vec_model, False,
            len(self.tagged_docs_pos_test), len(self.tagged_docs_neg_test))

    # The former ``clf = LogisticRegression(penalty='l2')`` was overwritten on
    # the next line and never used; only the SVC is fitted.
    clf = SVC()
    clf.fit(train_arrays, train_labels)

    # C_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    # gamma_range = [0.01, 0.02, 0.03, 0.04, 0.05, 0.10, 0.2, 0.3, 0.4, 0.5]
    # param_grid = dict(gamma=gamma_range, C=C_range)
    # cv = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=42)
    # clf = RandomizedSearchCV(SVC(), param_distributions=param_grid, cv=cv, n_iter=5)
    # clf.fit(train_arrays, train_labels)
    #
    # print(clf.best_params_)
    # print(clf.best_estimator_)
    # print(clf.best_score_)

    logging.info("Finished training classifier.")

    # Approach when not training doc2vec on test reviews
    # tvecs = []
    #
    # for i in range(len(self.test_x)):
    #     tdt = TaggedDocument(self.remove_stopwords(self.test_x[i]), ["test_" + str(i)])
    #     tvecs.append(self.doc2vec_model.infer_vector(tdt.words, steps=200))
    #
    # logging.info("Created TaggedDocuments for Training data.")
    # print(classifier.score(test_arrays, test_labels))

    if self.is_real_test:
        file_ids = self.test_x.keys()
        pred = clf.predict(test_arrays)
        self.data_transformation.write_to_file(dict(zip(file_ids, pred)),
                                               "doc2vec")
    else:
        # NOTE(review): the title still says "Logistik Regression" although
        # the predictions come from the SVC; left unchanged because the title
        # may be used downstream -- confirm and rename.
        v = Visualization(test_labels, clf.predict(test_arrays),
                          "doc2vec - Logistik Regression")
        v.generate()
def train_doc_model_manual(corpus, file):
    """Train a PV-DM Doc2Vec model step by step and save it to ``file``.

    Unlike passing the corpus to the constructor, the vocabulary is built and
    the training pass is started explicitly.

    Args:
        corpus: Documents used for both ``build_vocab`` and ``train``; it is
            consumed more than once, so it must be re-iterable.
        file: Destination handed to ``model.save``.
    """
    # Fixed the status message, which previously read "Dord2Vec".
    print("Training Doc2Vec model")
    model = models.Doc2Vec(dm=1, iter=5, alpha=0.1, min_alpha=0.025, size=100)
    model.build_vocab(corpus)
    model.train(corpus, total_examples=model.corpus_count, epochs=model.iter)
    model.save(file)
# Tokenise the second field of every row: candidates first, originals after.
all_text.extend(row[1].strip().split(" ") for row in result)
all_text.extend(row[1].strip().split(" ") for row in original)

count = 0
doc = []

# One labelled document per text, labelled "DOC_<position>".
sentences = [
    models.doc2vec.LabeledSentence(tokens, labels=["DOC_" + str(position)])
    for position, tokens in enumerate(all_text)
]

d2v = models.Doc2Vec(sentences, size=100, window=5, min_count=0, dm=1)

# Five additional Doc2Vec training passes.
for _ in range(5):
    d2v.train(sentences)

# Copy every document vector out of the model as a plain list.
features = [
    list(d2v["DOC_" + str(position)])
    for position in range(len(sentences))
]

# The first 1000 vectors are the candidates, the remainder the originals.
candidate_text = features[:1000]
original_text = features[1000:]
from gensim import models
from gensim.models.doc2vec import TaggedLineDocument
from os import path
import argparse


def main():
    """Train a doc2vec model from a corpus file unless one is already saved.

    Command line: ``fname`` (corpus file) and ``model`` (model name; the
    model is stored as ``<model>.doc2vec``). An existing model file is loaded
    instead of being retrained.
    """
    cli = argparse.ArgumentParser(description="Train doc2vec.")
    cli.add_argument("fname", help="filename", type=str)
    cli.add_argument("model", help="modelname", type=str)
    options = cli.parse_args()

    corpus = TaggedLineDocument(options.fname)
    model_file = options.model + ".doc2vec"

    if not path.isfile(model_file):
        model = models.Doc2Vec(size=100, window=5, min_count=5, workers=4)
        model.build_vocab(corpus)
        model.train(corpus)
        model.save(model_file)
    else:
        model = models.Doc2Vec.load(model_file)

    # Emit a trailing blank line, as before.
    print("")


if __name__ == "__main__":
    main()
#sentence3 = models.doc2vec.LabeledSentence(
#    words=[u'魚', u'泳ぐ', u'海'], tags=["SENT_3"])
#sentences = [sentence, sentence1, sentence2, sentence3]
#print sentences


class LabeledLineSentence(object):
    """Iterate over a text file, yielding one labelled sentence per line.

    Each line is split on whitespace and labelled ``SENT_<line index>``.
    The object can be iterated repeatedly; the file is reopened every time.
    """

    def __init__(self, filename):
        self.filename = filename

    def __iter__(self):
        # Use the path stored on the instance. The original read a bare
        # ``filename`` name, which is not defined in this scope, and never
        # closed the file it opened.
        with open(self.filename) as handle:
            for uid, line in enumerate(handle):
                yield LabeledSentence(words=line.split(),
                                      labels=['SENT_%s' % uid])


model = models.Doc2Vec(alpha=.025, min_alpha=.025, min_count=1)
model.build_vocab(sentences)

for epoch in range(10):
    model.train(sentences)
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay

model.save(my_model)
#model_loaded = models.Doc2Vec.load(my_model)

# Show the documents that are similar to a given document.
#print ("SENT_0")
#print (model.docvecs.most_similar(["SENT_0"]) )
#print ("SENT_3")
#print (model.docvecs.most_similar(["SENT_3"]) )
def calc_similarity(folder, doc1, doc2):
    """Score every ``doc1`` text file against each ``doc2`` sub-folder.

    For every pair (sub-folder ``sp`` of ``folder/doc1``, sub-folder ``tsp``
    of ``folder/doc2``) the ``.txt`` files of both are copied into
    ``folder/tmp/<sp>_<tsp>``, a separate Doc2Vec model is trained on that
    folder and saved under ``./model/similarity``. Each ``doc1`` file then
    receives, per ``tsp``, the mean document-vector similarity to the files
    of that ``tsp``. The table is written to
    ``./similarity/<doc1>_<doc2>.xlsx``.

    Args:
        folder: Base folder containing ``doc1`` and ``doc2``.
        doc1: Name of the folder whose files become the rows.
        doc2: Name of the folder whose sub-folders become the columns.

    Side effects:
        Deletes and recreates ``folder/tmp``, creates ``./model/similarity``
        and ``./similarity``, temporarily changes the working directory, and
        calls ``quit()`` when ``doc1`` or ``doc2`` does not exist.
    """
    # Report the missing folder and stop if either input folder is absent.
    if not os.path.isdir("%s/%s" % (folder, doc1)):
        print("Not exist " + doc1)
        quit()
    if not os.path.isdir("%s/%s" % (folder, doc2)):
        print("Not exist " + doc2)
        quit()

    # Remember the starting directory so it can be restored after chdir().
    directory = os.getcwd() + "/"
    # model_path = "model/doc2vec/"

    # List the sub-folders of both inputs, ignoring macOS metadata.
    spot = os.listdir("%s/%s" % (folder, doc1))
    spot = [sp for sp in spot if not sp == ".DS_Store"]
    tourist_spot = os.listdir("%s/%s" % (folder, doc2))
    tourist_spot = [tsp for tsp in tourist_spot if not tsp == ".DS_Store"]

    # Create a fresh tmp folder (wiping it first when it already exists).
    if os.path.isdir("%s/tmp" % folder):
        shutil.rmtree("%s/tmp" % folder)
    os.mkdir("%s/tmp" % folder)

    # Collect the visible .txt files of every sub-folder.
    spot_list = []
    tourist_spot_list = []
    for sp in spot:
        tmp = os.listdir("%s/%s/%s" % (folder, doc1, sp))
        tmp = [fn for fn in tmp if fn[-4:] == ".txt"]
        tmp = [fn for fn in tmp if not fn[0] == "."]
        spot_list.append(tmp)
    for tsp in tourist_spot:
        tmp = os.listdir("%s/%s/%s" % (folder, doc2, tsp))
        tmp = [fn for fn in tmp if fn[-4:] == ".txt"]
        tmp = [fn for fn in tmp if not fn[0] == "."]
        tourist_spot_list.append(tmp)

    print("copying...")
    # Copy the text files of each pair into its tmp folder and keep the first
    # 100 characters of every doc1 file for the report.
    # file_list = []
    text_list = []
    for sp, sp_l in zip(spot, spot_list):
        end = len(sp_l)
        # Counts the previews taken for this spot; once it reaches ``end``
        # (i.e. after the first tourist spot) no more previews are read.
        flag = 0
        for tsp, tsp_l in zip(tourist_spot, tourist_spot_list):
            os.mkdir("%s/tmp/%s_%s" % (folder, sp, tsp))
            for s in sp_l:
                shutil.copy("%s/%s/%s/%s" % (folder, doc1, sp, s),
                            "%s/tmp/%s_%s/%s" % (folder, sp, tsp, s))
                if flag < end:
                    # ``with`` closes the file even if reading fails.
                    with open("%s/%s/%s/%s" % (folder, doc1, sp, s),
                              encoding="utf-8") as f:
                        text = f.read()
                    text_list.append(text[:100])
                    flag += 1
            for t in tsp_l:
                shutil.copy("%s/%s/%s/%s" % (folder, doc2, tsp, t),
                            "%s/tmp/%s_%s/%s" % (folder, sp, tsp, t))

    print("training...")
    morph_list, docs = [], []
    sentences = []
    model = []

    # Label the documents and define one doc2vec model per pair.
    # alpha: learning rate / min_count: ignore words seen fewer than X times
    # size: vector dimensionality / iter: number of iterations
    # workers: number of parallel workers
    # dm: 1 selects DMPV, anything else DBOW
    # window: how many words before and after are used as context
    for sp in spot:
        for tsp in tourist_spot:
            ml, dc = set_folder_morph("%s/tmp/%s_%s" % (folder, sp, tsp))
            morph_list.append(ml)
            docs.append(dc)
            sentences.append(LabeledListSentence(ml, dc))
            model.append(
                models.Doc2Vec(alpha=0.025,
                               dm=1,
                               window=10,
                               min_count=0,
                               size=50,
                               iter=100,
                               workers=4))

    # Build the vocabulary and train each model, in the order it was created.
    index = 0
    for i in range(len(spot)):
        for j in range(len(tourist_spot)):
            model[index].build_vocab(sentences[index])
            model[index].train(sentences[index],
                               total_examples=model[index].corpus_count,
                               epochs=model[index].iter)
            index += 1

    if not os.path.isdir("./model"):
        os.mkdir("./model")
    if not os.path.isdir("./model/similarity"):
        os.mkdir("./model/similarity")

    # Save every model and load it back.
    index = 0
    for sp in spot:
        for tsp in tourist_spot:
            model[index].save("./model/similarity/%s_%s.model" % (sp, tsp))
            model[index] = models.Doc2Vec.load(
                "./model/similarity/%s_%s.model" % (sp, tsp))
            index += 1

    print("calculating...")
    # Build the row metadata and a zero matrix for the results:
    # one row per doc1 file, one column per tourist spot.
    spot_len = 0
    file_name = []
    label = []
    index = 0
    for sp_l in spot_list:
        spot_len += len(sp_l)
        for fn in sp_l:
            file_name.append(fn)
            label.append(spot[index])
        index += 1
    DOC_SIM = np.zeros((spot_len, len(tourist_spot)))

    # Compute the similarities. ``start``/``end`` delimit the rows of the
    # current spot; ``index`` walks the models in creation order.
    # Presumably the documents are tagged with their file names, since file
    # names are passed as d1/d2 -- verify against LabeledListSentence.
    index = 0
    start = 0
    end = 0
    for i, sp in enumerate(spot):
        start = end
        end += len(spot_list[i])
        for j, tsp in enumerate(tourist_spot):
            os.chdir(directory + "%s/tmp/%s_%s" % (folder, sp, tsp))
            flag = start
            for f1 in spot_list[i]:
                for f2 in tourist_spot_list[j]:
                    # Accumulate the mean over the tourist spot's files.
                    DOC_SIM[flag, j] += model[index].docvecs.similarity(
                        d1=f1, d2=f2) / len(tourist_spot_list[j])
                flag += 1
            index += 1
    os.chdir(directory)

    if not os.path.isdir("./similarity"):
        os.mkdir("./similarity")

    # Assemble the output table; np.c_ yields strings, so the score columns
    # are converted back to float.
    DOC_SIM_DF = pd.DataFrame(np.c_[file_name, text_list, DOC_SIM])
    DOC_SIM_DF.index = label
    DOC_SIM_DF.columns = ["file_name", "text"] + tourist_spot
    for tsp in tourist_spot:
        DOC_SIM_DF[[tsp]] = DOC_SIM_DF[[tsp]].astype(float)

    # Write the similarities of the document sets:
    # the closer to 1 the more similar, the closer to 0 the less similar.
    DOC_SIM_DF.to_excel("./similarity/%s_%s.xlsx" % (doc1, doc2),
                        encoding="shift-jis")
    print("Done.")
def similarity(self, label, items):
    """Return the ``items`` entries most similar to ``label``.

    The Doc2Vec model is created on first use from
    ``self._gen_docs(self.docs)`` and cached on ``self.model``; later calls
    reuse it.

    Args:
        label: Query forwarded to ``most_similar``.
        items: Number of results to request (``topn``).
    """
    trained = self.model
    if not trained:
        trained = models.Doc2Vec(self._gen_docs(self.docs),
                                 min_count=20,
                                 workers=4)
        self.model = trained
    return trained.most_similar(label, topn=items)
# レポートの各行から、動詞・形容詞・名詞(数を除く)の情報のみを取り出す if len(chunks) > 3 and ( chunks[3].startswith('動詞') or chunks[3].startswith('形容詞') or (chunks[3].startswith('名詞') and not chunks[3].startswith('名詞-数'))): words.append(chunks[0]) sentences.append(TaggedDocument(words=words, tags=doc.tags)) # 学習モデルを生成 # size: ベクトル化した際の次元数 # alpha: 学習率 # sample:単語を無視する際の頻度の閾値 # min_count:学習に使う単語の最低出現回数 # workers:学習時のスレッド数 model = models.Doc2Vec(size=400, alpha=0.0015, sample=1e-4, min_count=1, workers=4) # Doc2Vecに単語を登録 model.build_vocab(sentences) # 学習評価に使用するサンプル数と閾値を設定 # ※Python3の四捨五入は「最近接偶数への丸め」となっていることに注意(1.5も2.5も2に丸められる) sample_num = int(round(len(sentences) * SAMPLE_PROPORTION, 0)) passing_thres = int(round(sample_num * PASSING_PRECISION, 0)) for x in range(TRAIN_MAX): print(x) # 学習実施 model.train(sentences,
def _read_tokenized_docs(tokenized_path):
    """Return one flat list of tokens for every file in ``tokenized_path``."""
    texts = []
    for filename in os.listdir(tokenized_path):
        with codecs.open(os.path.join(tokenized_path, filename), 'r',
                         'utf-8') as f:
            doc_tokens = []
            for line in f.readlines():
                if len(line) > 0:
                    doc_tokens += line.split()
        texts.append(doc_tokens)
    return texts


def main():
    """Command-line driver: train, retrieve or infer doc2vec embeddings.

    Modes:
        train: read every file in ``tokenized_path`` as one document, train a
            model and save it to ``<output_path>/models/<name>.bin``.
        retrieve: load the saved model and store all document vectors in
            ``<output_path>/<name>_vectors`` via ``np.save``.
        infer: load the saved model and print one inferred vector per file in
            ``tokenized_path``.

    ``word_embeddings_path`` and ``tokenized_path`` may be omitted for modes
    that do not need them; a missing required one ends the program with an
    explanatory message.
    """
    parser = argparse.ArgumentParser(description='Doc2vec driver.')
    parser.add_argument('mode', choices=['train', 'retrieve', 'infer'],
                        help='Training, retrieve trained embeddings '
                             'or inference mode')
    parser.add_argument('name', type=str, help='Model name')
    parser.add_argument('output_path', type=str, help='Output path')
    # nargs='?' makes these two optional on the command line so that the
    # per-mode checks below are actually reachable; invocations that pass all
    # five arguments behave as before.
    parser.add_argument('word_embeddings_path', type=str, nargs='?',
                        help='Pre-trained word embeddings path')
    parser.add_argument('tokenized_path', type=str, nargs='?',
                        help='Directory with tokenized plain text documents')
    args = parser.parse_args()

    if None in [args.mode, args.name, args.output_path]:
        exit('Arguments mode, name, output_path are required')
    if args.mode == 'train' and (args.word_embeddings_path is None
                                 or args.tokenized_path is None):
        exit('word_embeddings_path and tokenized_path arguments are required if mode is set to train')
    if args.mode == 'infer' and args.tokenized_path is None:
        # The message used to be a copy of the train-mode one.
        exit('tokenized_path argument is required if mode is set to infer')

    if args.mode == 'train':
        print("Entering training mode")
        pretrained_emb = args.word_embeddings_path
        texts = _read_tokenized_docs(args.tokenized_path)
        print(len(texts))

        # doc2vec parameters
        vector_size = 300
        window_size = 15
        min_count = 1
        sampling_threshold = 1e-5
        negative_size = 5
        train_epoch = 5
        dm = 0  # 0 = dbow; 1 = dmpv
        worker_count = 6  # number of parallel processes

        saved_path = os.path.join(args.output_path, 'models',
                                  args.name + '.bin')

        # enable logging
        logging.basicConfig(
            format='%(asctime)s : %(levelname)s : %(message)s',
            level=logging.INFO)

        mkdir_p(os.path.join(args.output_path, 'models'))
        print("created models directory")

        # Every document is tagged with its position in ``texts``.
        docs = [g.doc2vec.TaggedDocument(doc, [i])
                for i, doc in enumerate(texts)]
        print("Let's start training")
        # NOTE(review): ``pretrained_emb`` is not a keyword of stock gensim --
        # presumably ``g`` is a patched build; confirm.
        model = g.Doc2Vec(docs, size=vector_size, window=window_size,
                          min_count=min_count, sample=sampling_threshold,
                          workers=worker_count, hs=0, dm=dm,
                          negative=negative_size, dbow_words=1, dm_concat=1,
                          pretrained_emb=pretrained_emb, iter=train_epoch)
        print("Trained doc2vec")
        model.save(saved_path)

    elif args.mode == 'retrieve':
        saved_path = os.path.join(args.output_path, 'models',
                                  args.name + '.bin')
        model = g.Doc2Vec.load(saved_path)
        vectors = np.array(
            [model.docvecs[i] for i in range(len(model.docvecs))])
        np.save(os.path.join(args.output_path, args.name + '_vectors'),
                vectors)

    else:  # infer
        # inference hyper-parameters
        start_alpha = 0.01
        infer_epoch = 1000

        # load model
        m = g.Doc2Vec.load(
            os.path.join(args.output_path, 'models', args.name + '.bin'))

        # Tokenise exactly as in training: one flat token list per document.
        # The previous per-line nesting handed infer_vector a list of lists
        # instead of a list of words.
        test_docs = _read_tokenized_docs(args.tokenized_path)

        # infer test vectors
        for d in test_docs:
            vector = m.infer_vector(d, alpha=start_alpha, steps=infer_epoch)
            # Single-argument print() behaves the same on Python 2 and 3.
            print(' '.join([str(x) for x in vector]) + '\n')
(len(doc_list), len(train_docs), len(test_docs))) name = sys.argv[1] dist_mem = int(sys.argv[2]) hier_soft = int(sys.argv[3]) neg = int(sys.argv[4]) logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) fname = str(name) + ".doc2vec" if path.isfile(fname): model = models.Doc2Vec.load(fname) else: model = models.Doc2Vec(size=100, window=5, min_count=5, workers=4, negative=neg, hs=hier_soft, dm=dist_mem) model.save(fname) model.build_vocab(alldocs) model.train(alldocs) doc_id = 24 # np.random.randint(model.docvecs.count) # pick random doc, re-run cell for more examples # model = np.random.choice(model) # and a random model sims = model.docvecs.most_similar( doc_id, topn=model.docvecs.count) # get *all* similar documents f = open(name + '.out', 'w') f.write(u'TARGET (%d): <%s>\n' % (doc_id, ' '.join(alldocs[doc_id].words))) f.write(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
originals.append(cols[0]) sentences.append( models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess( mecab.parse(cols[0]).strip(), min_len=1), tags=["SENT_" + str(j)])) j += 1 stop_words = [] if args.stop_words: for line in open(args.stop_words, "r", encoding="utf-8"): stop_words.append(line.strip()) vectorizer = TfidfVectorizer(token_pattern="(?u)\\b\\w+\\b", stop_words=stop_words) model = models.Doc2Vec(vector_size=400, windows=5, min_count=5, epochs=100) model.build_vocab(sentences) """ print('\n訓練開始') for epoch in range(51): print('Epoch: {}'.format(epoch + 1)) model.train(sentences, epochs=model.epochs, total_examples=model.corpus_count) if epoch%5==0: model_str="jamQ_model400_doc2vec_"+str(epoch) model.save(model_str) """ model_str = "jamQ_model400_doc2vec_50" model = models.Doc2Vec.load(model_str) doc_vecs = []