def wordVector(doc):
    sentences = doc2vec.TaggedLineDocument(doc)
    model = doc2vec.Doc2Vec(vector_size=100, window=300, min_count=5, workers=10)
    model.build_vocab(sentences)
    model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)
    return model
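# Minimal usage sketch for wordVector above (the corpus file name is
# hypothetical): TaggedLineDocument expects one whitespace-tokenized document
# per line, and documents are tagged with their line numbers.
with open("corpus.txt", "w") as f:
    for _ in range(6):  # repeat so every token clears the min_count=5 threshold
        f.write("gensim doc2vec example line\n")
model = wordVector("corpus.txt")
print(model.dv[0])  # embedding of the first line (tag 0)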
def train_doc2vec(model_dir, train_file, train_params):
    import logging
    import multiprocessing
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    documents = doc2vec.TaggedLineDocument(train_file)
    size = train_params["size"]
    window = train_params["window"]
    min_count = train_params["min_count"]
    workers = multiprocessing.cpu_count() // 2
    epochs = train_params["epochs"]
    alpha = train_params["alpha"]
    min_alpha = train_params["min_alpha"]
    model = doc2vec.Doc2Vec(documents, vector_size=size, window=window,
                            min_count=min_count, workers=workers, epochs=epochs,
                            alpha=alpha, min_alpha=min_alpha)
    if not os.path.exists(os.path.dirname(model_dir)):
        try:
            os.makedirs(os.path.dirname(model_dir))
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise
    model.save(model_dir)
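# A hedged usage sketch for train_doc2vec; every path and parameter value here
# is an illustrative assumption, not taken from the original source.
train_params = {
    "size": 100,
    "window": 5,
    "min_count": 5,
    "epochs": 20,
    "alpha": 0.025,
    "min_alpha": 0.0001,
}
train_doc2vec("models/doc2vec.model", "data/train.txt", train_params)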
def neighborhood_embedding(args):
    inputDir = args.preprocessed_input
    # outputFile = args.output
    iterations = args.iter
    dimensions = args.d
    window = args.windowSize
    dm = 1 if args.model == 'dm' else 0
    indexToName = generateWalkFile(inputDir, args.walkLength)
    sentences = doc.TaggedLineDocument(inputDir + '.walk')
    with open('log', 'a+') as f:
        results = []
        # for epochs in range(10, 110, 10):  # (disabled epoch sweep)
        model = doc.Doc2Vec(sentences, vector_size=dimensions, epochs=iterations,
                            dm=dm, window=window)
        vectors = model.dv
        embeddings = [[] for _ in range(len(vectors))]
        for i in range(len(vectors)):
            embeddings[int(indexToName[i])] = vectors[i]
        from preprocess import evaluate
        res = evaluate(args.input, embeddings)
        results.append(str(res))
        print(res)
        f.write(inputDir + ',n,' + ','.join(results) + '\n')
def writeDoc2vecSimMatrix(outfile, allTweets, results, create):
    if create:
        outfile1 = os.path.dirname(outfile) + "/Doc2vecModelTokens.txt"
        pos_tweets = tagger.runtagger_parse(allTweets)  # tokenizer and POS-tagger
        tokens = makeDoc2vecFile(pos_tweets, outfile1, False)
        sentence = doc2vec.TaggedLineDocument(outfile1)  # imports in doc2vec format
        model = doc2vec.Doc2Vec(sentence, vector_size=100, window=300,
                                min_count=10, workers=4)  # builds the doc2vec model
        model_name = os.path.dirname(outfile) + "/Doc2vecModel.txt"
        model.save(model_name)
    else:
        model_name = os.path.dirname(outfile) + "/Doc2vecModel.txt"
        model = doc2vec.Doc2Vec.load(model_name)
    for i in range(len(allTweets)):
        x = []
        for result in results:
            k = allTweets.index(result)
            x.append(str(model.dv.similarity(i, k)))
        with open(outfile, "a+") as f:
            writeFile = csv.writer(f)
            writeFile.writerow(x)
def structural_embedding(input_dir, iterations=20, dimensions=128, windowSize=2,
                         dm=1, walkLength=64):
    # index_to_name = generate_walk_file(input_dir, walkLength, 0.5)
    walk_dir_path = input_dir.replace('sub_graphs', 'walks')
    walk_dir_path = os.path.join(walk_dir_path, 'walk_file.walk')
    sentences = doc.TaggedLineDocument(walk_dir_path)
    model = doc.Doc2Vec(vector_size=dimensions, epochs=iterations, dm=dm,
                        window=windowSize, min_count=1)
    model.build_vocab(sentences)
    model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)
    return list(model.dv.vectors)
def main():
    logging.basicConfig(format='[%(levelname)s] : %(message)s', level=logging.INFO)
    sentences = doc2vec.TaggedLineDocument('output/fb_article_seg.txt')
    # passing the corpus to the constructor already builds the vocab and trains
    model = doc2vec.Doc2Vec(sentences, vector_size=100, window=3)
    model.save('output/doc2vec.model')
def make_model(self):
    documents = doc2vec.TaggedLineDocument(self.text_path)
    model = doc2vec.Doc2Vec(documents, vector_size=self.dimension, dm=0,
                            min_count=1, epochs=10, window=2)
    model.save(self.text_model_path)
def kmeans_doc2vec(file_name):
    sentences = doc2vec.TaggedLineDocument(file_name)
    model = doc2vec.Doc2Vec(sentences,       # training corpus
                            vector_size=40,  # dimensionality of the document vectors
                            window=3)        # max distance between the current and predicted word
    # the constructor already trained the model, so no extra train() call is needed
    model.save("doc2vec_result.txt")
    num_clusters = 10
    km = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=300,
                n_init=1, verbose=False, random_state=0)
    result_doc2vec = list(km.fit_predict(model.dv.vectors))
    return result_doc2vec
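# Usage sketch (the input file name is hypothetical): each line of the file is
# one document, and the returned list holds one cluster id per line.
clusters = kmeans_doc2vec("segmented_corpus.txt")
print(clusters[:10])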
def generate_doc2vec_model(target_game_name):
    print("Training Start")
    # load the card text
    card_text = doc2vec.TaggedLineDocument(target_game_name + ".txt")
    # train
    model = doc2vec.Doc2Vec(card_text, vector_size=300, window=8, min_count=1,
                            workers=4, epochs=400, dbow_words=1, negative=5)
    # save the model
    model.save(target_game_name + ".model")
    print("Training Finish")
    return model
def gen_d2v_corpus(lines, savemodel, istran=False):
    total_examples = len(lines)
    with open("../data/ques2_result.txt", "wb") as fw:
        for line in lines:
            txt = " ".join(jieba.lcut(line)) + "\n"
            fw.write(txt.encode('utf-8'))
    sents = doc2vec.TaggedLineDocument("../data/ques2_result.txt")
    if os.path.exists(savemodel):
        print('loading model', savemodel, time.asctime())
        model = doc2vec.Doc2Vec.load(savemodel)
        print('loaded model', savemodel, time.asctime())
        if istran:
            count = 0
            while True:  # keep refining the loaded model
                count += 1
                epoches = 20
                model.train(sents, total_examples=total_examples, epochs=epoches)
                if count % 10 == 0:  # checkpoint every tenth round
                    model.save(savemodel + "." + str(count))
                model.save(savemodel)
                print('trained', count * epoches)
    else:
        print('train new model')
        model = doc2vec.Doc2Vec(sents, vector_size=300, window=12, min_count=2,
                                workers=4, dm=0)
        print('train', time.asctime())
        model.train(sents, total_examples=total_examples, epochs=200)
        print('train', time.asctime())
        model.save(savemodel)
    save_path = '../data/query.doc2vec.txt'
    write_to_file(save_path, "".encode('utf-8'), mode='wb+')
    for i in range(100):
        vs = model.dv.most_similar(i)
        for result_indx, distance in vs[:10]:
            txt = '{} {} {} {} {} {}\n'.format(i, lines[i], "->", result_indx,
                                               lines[result_indx], distance)
            write_to_file(save_path, txt.encode('utf-8'))
        write_to_file(save_path, "\n".encode('utf-8'))
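# write_to_file is not defined in this snippet; a minimal sketch consistent
# with how it is called above (bytes payloads, optional open mode, defaulting
# to binary append) might look like this:
def write_to_file(path, data, mode='ab'):
    with open(path, mode) as f:
        f.write(data)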
def neighborhood_embedding(input_dir, iterations=20, dimensions=128, windowSize=2,
                           dm=1, walkLength=64):
    index_to_name = generate_walk_file(input_dir, walkLength)
    walk_dir_path = input_dir.replace("\\sub_graphs\\", "\\walks\\")
    sentences = doc.TaggedLineDocument(walk_dir_path + '.walk')
    model = doc.Doc2Vec(sentences, vector_size=dimensions, epochs=iterations,
                        dm=dm, window=windowSize)
    return list(model.dv.vectors), index_to_name
def structural_embedding(args):
    inputDir = args.input
    outputFile = args.output
    iterations = args.iter
    dimensions = args.d
    window = args.windowSize
    dm = 1 if args.model == 'dm' else 0
    indexToName = generateWalkFile(inputDir, args.walkLength, args.p)
    sentences = doc.TaggedLineDocument(inputDir + '.walk')
    model = doc.Doc2Vec(sentences, vector_size=dimensions, epochs=iterations,
                        dm=dm, window=window)
    saveVectors(list(model.dv.vectors), outputFile, indexToName)
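# saveVectors is defined elsewhere in this project; a plausible minimal sketch,
# assuming the output format is one "name dim1 dim2 ..." line per graph (this
# layout is an assumption, not confirmed by the source):
def saveVectors(vectors, outputFile, indexToName):
    with open(outputFile, 'w') as out:
        out.write(str(len(vectors)) + "\n")
        for i, vec in enumerate(vectors):
            out.write(str(indexToName[i]) + " " +
                      " ".join(str(x) for x in vec) + "\n")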
def get_testset_update_trainning(para=True):
    doc_test_labels_preprocess = 'test_labels_level1_with_label_preprocess.txt'
    if para:
        with open(path_main + doc_test_labels, 'r', encoding='utf-8') as f:
            contents = f.readlines()
        with open(path_main + doc_test_labels_preprocess, 'w', encoding='utf-8') as outf:
            for line in contents:
                if len(line.strip()) < 5:
                    continue
                outf.write(line)
    sentences = d2v.TaggedLineDocument(path_main + doc_test_labels_preprocess)
    return sentences
def neighborhood_embedding(args):
    inputDir = args.input
    outputFile = args.output
    iterations = args.iter
    dimensions = args.d
    window = args.windowSize
    dm = 1 if args.model == 'dm' else 0
    indexToName = generateWalkFile(inputDir, args.walkLength)
    sentences = doc.TaggedLineDocument(inputDir + '.walk')
    model = doc.Doc2Vec(sentences, vector_size=dimensions, epochs=iterations,
                        dm=dm, window=window)
    print("Neighborhood output generated at the end of neighborhood.py -- "
          "example embedding of the first subgraph:")
    print(model.dv[0])
def build_d2v_model(file_location, model_name, do_train="True"):
    documents = doc2vec.TaggedLineDocument(file_location)
    if do_train == "True":
        model = doc2vec.Doc2Vec(documents, vector_size=200, window=5,
                                min_count=3, workers=8, epochs=20)
        model.save(model_name)
    else:
        model = doc2vec.Doc2Vec.load(model_name)
    model.init_sims(replace=True)
    return model
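# Usage sketch (the paths are illustrative): train and save once, then reload
# without retraining. Note that do_train is compared as a string, so pass
# "True"/"False" rather than booleans.
model = build_d2v_model("data/docs.txt", "doc2vec.model", do_train="True")
model = build_d2v_model("data/docs.txt", "doc2vec.model", do_train="False")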
def run(self, conf_data):
    try:
        # init params for the doc2vec node
        self._init_node_parm(conf_data['node_id'])
        self.cls_pool = conf_data['cls_pool']
        # get the previous node to load data from
        data_node_name = self._get_backward_node_with_type(conf_data['node_id'],
                                                           'preprocess')
        train_data_set = self.cls_pool[data_node_name[0]]
        # load an existing model for incremental training, if one was saved
        update_flag = False
        model = doc2vec.Doc2Vec(vector_size=self.vector_size, window=self.window_size)
        if os.path.exists(''.join([self.md_store_path, '/model.bin'])):
            model = doc2vec.Doc2Vec.load(''.join([self.md_store_path, '/model.bin']))
            update_flag = True
        # build the vocab and train the model
        while train_data_set.has_next():
            train_data = doc2vec.TaggedLineDocument(train_data_set.train_file_path())
            for x in range(0, self.iter_size):
                if not update_flag:
                    model.build_vocab(train_data, update=False)
                    update_flag = True
                else:
                    model.build_vocab(train_data, update=True)
                model.train(train_data, total_examples=model.corpus_count,
                            epochs=model.epochs)
            train_data_set.next()
        os.makedirs(self.md_store_path, exist_ok=True)
        model.save(''.join([self.md_store_path, '/model.bin']))
        return len(model.wv)  # vocabulary size
    except Exception as e:
        logging.info("[Doc2Vector Train Process] : {0}".format(e))
        raise Exception(e)
def graph_structural_embedding(self, graphs, **kwargs):
    dirName = 'data/output/sub2vec_output/'
    if not os.path.isdir(dirName):
        os.makedirs(dirName)
    file_name = os.path.join(dirName, 'random_walk_file.walk')
    indexToName = structural.generateWalkFile(graphs, file_name,
                                              kwargs['walkLength'],
                                              kwargs['alpha'],
                                              kwargs['randomWalkCount'])
    sentences = doc.TaggedLineDocument(file_name)
    print('build model')
    model = doc.Doc2Vec(sentences, vector_size=kwargs['dimensions'],
                        epochs=kwargs['iterations'], dm=kwargs['dm'],
                        window=kwargs['window'])
    # outputfile = os.path.join(dirName, 'vectors.vec')
    # print('save vectores')
    # structural.saveVectors(model.dv, outputfile, indexToName)
    return model.dv
def structural_embedding(args):
    inputDir = args.input
    print(inputDir)
    outputFile = args.output
    iterations = args.iter
    dimensions = args.d
    window = args.windowSize
    dm = 1 if args.model == 'dm' else 0
    indexToName = generateWalkFile(inputDir, args.walkLength, args.p)  # just makes walks
    sentences = doc.TaggedLineDocument(inputDir + '.walk')
    model = doc.Doc2Vec(sentences, vector_size=dimensions, epochs=iterations,
                        dm=dm, window=window)
    print("Total vects ", len(model.dv))
    saveVectors(list(model.dv.vectors), outputFile, indexToName)
def train_doc2vec():
    conf = Config()
    sentences = doc2vec.TaggedLineDocument(conf.train_path)
    # build the vocab
    model = doc2vec.Doc2Vec(min_count=conf.word_min_count,
                            vector_size=conf.vector_size,
                            alpha=conf.learning_rate,
                            negative=conf.negative_size,
                            epochs=conf.train_epoch,
                            window=conf.window_size,
                            min_alpha=conf.learning_rate,
                            seed=1234,
                            workers=conf.worker_count)
    model.build_vocab(sentences)
    # train the document vectors
    model.train(sentences, epochs=model.epochs, total_examples=model.corpus_count)
    # save
    if not os.path.isdir(conf.model_path):
        os.mkdir(conf.model_path)
    model.save(conf.modelfile)
def calculateEmbeddings(self):
    '''Calculates a gensim doc2vec model and returns the resulting embedding vectors.'''
    if self.walks is None:
        print("Generate the walks first!")
        return None
    # save the walks to a file, one walk per line
    with open(self.output + '.walk', 'w') as walkFile:
        for walk in self.walks:
            walkFile.write(arr2str(walk) + "\n")
    sentences = doc.TaggedLineDocument(self.output + '.walk')
    dm = 1 if self.model == 'dm' else 0
    model = doc.Doc2Vec(sentences, vector_size=128, epochs=100, dm=dm, window=1)
    print("Total vects ", len(model.dv))
    return model.dv.vectors
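# arr2str is assumed to flatten one walk (a list of node ids) into a
# whitespace-separated line for TaggedLineDocument; a minimal sketch:
def arr2str(arr):
    return " ".join(str(node) for node in arr)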
    test_docvec_file = "20news-test-7532.svm-doc2vec.txt"
    all_count = 18791
    train_count = 11285
    test_count = 7506
else:
    all_words_file = "reuters-all-8025.gibbslda-bow.txt"
    train_label_file = "reuters-train-5770.slda-label.txt"
    train_docvec_file = "reuters-train-5770.svm-doc2vec.txt"
    test_label_file = "reuters-test-2255.slda-label.txt"
    test_docvec_file = "reuters-test-2255.svm-doc2vec.txt"
    all_count = 8025
    train_count = 5770
    test_count = 2255

dim = 400
corpus = doc2vec.TaggedLineDocument(all_words_file)
model = doc2vec.Doc2Vec(corpus, vector_size=dim, window=8, min_count=5, workers=4)

TRAIN_DOC2VEC = open(train_docvec_file, "w")
TRAIN_LABEL = open(train_label_file)
for d in range(1, train_count + 1):
    doc_vec = model.dv[d]
    label = int(TRAIN_LABEL.readline().strip())
    TRAIN_DOC2VEC.write("%d" % (label + 1))
    for k in range(dim):
        TRAIN_DOC2VEC.write(" %d:%.3f" % (k + 1, doc_vec[k]))
cores = multiprocessing.cpu_count()
vector_size = 300
window_size = 15
word_min_count = 2
sampling_threshold = 1e-5
negative_size = 5
train_epoch = 100
dm = 1  # 0 = dbow; 1 = dmpv
worker_count = cores  # number of parallel processes
inputfile = r"D:\user\Desktop\project\wiki_pos_tokenizer_without_taginfo.txt"
modelfile = r"D:\user\Desktop\project\wiki_pos_tokenizer_without_taginfo.doc2vec.model"
word2vec_file = modelfile + ".word2vec_format"

sentences = doc2vec.TaggedLineDocument(inputfile)

# build the vocab
model = doc2vec.Doc2Vec(min_count=word_min_count,
                        vector_size=vector_size,
                        window=window_size,
                        sample=sampling_threshold,
                        negative=negative_size,
                        epochs=train_epoch,
                        dm=dm,
                        alpha=0.025,
                        min_alpha=0.025,
                        seed=1234,
                        workers=worker_count)
model.build_vocab(sentences)

# train the document vectors
model.train(sentences, epochs=model.epochs, total_examples=model.corpus_count)

# to save
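# The snippet is truncated after "# to save"; a likely completion, assuming the
# modelfile and word2vec_file paths defined above are the intended targets:
model.save(modelfile)
model.wv.save_word2vec_format(word2vec_file, binary=False)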
# encoding=utf-8
import logging
import sys
import multiprocessing

import numpy as np
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from gensim.models import doc2vec
from config import *

if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    r = np.random.randint(100000, 999999, size=(1,))
    print(r[0])
    sents = doc2vec.TaggedLineDocument("./all.txt")
    print(sents)
    model = doc2vec.Doc2Vec(sents, vector_size=embedding_dims, window=9,
                            min_count=1, epochs=45, hs=0, negative=11,
                            seed=int(r[0]))
    model.wv.save_word2vec_format("w2v.txt", binary=False)
    model.save("d2v.model")
    # print(label_dict)
    # label_ = sorted(label_dict.items(), key=lambda x: x[1])
    # print(label_)
    y = zeros((12901, len(label_names)))
    f9 = codecs.open(filename, 'r', encoding="utf-8")
    i = 0
    for li in f9.readlines():
        for j in li.split():
            y[i, label_dict[j]] = 1
        i += 1
    return y


sentences = doc2vec.TaggedLineDocument("result.txt")
# the corpus is passed to the constructor, which already builds the vocab and trains
model = doc2vec.Doc2Vec(sentences, vector_size=280, window=5, min_count=1,
                        workers=8, epochs=168)
filename = "label_level01.txt"
corpus = model.dv
y = y_label(filename)
vector = []
    return np.concatenate((u, v, u - v))


def randvec(w, n=50, lower=-0.5, upper=0.5):
    """Returns a random vector of length `n`. `w` is ignored."""
    return np.array([random.uniform(lower, upper) for i in range(n)])


with open("parsed_data.txt", 'w') as c:
    count = 0
    for doc in vocab:
        c.write(doc + "\n")
        doc2vecContent[doc] = count
        count += 1

sentences = doc2vec.TaggedLineDocument("parsed_data.txt")
model = Doc2Vec(sentences, vector_size=100, window=8, min_count=5, workers=4,
                epochs=20)
model.save("model_name")


def get_vec_for_sentence(sentence):
    if sentence not in doc2vecContent:
        return "Error, sentence not found"
    return model.dv[doc2vecContent[sentence]]


def build_dataset(dataset, vector_func, vector_combo_func=vec_concatenate):
    """
    Parameters
    ----------
    dataset
import logging
import sys
import multiprocessing

import numpy as np
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from gensim.models import doc2vec

embedding_dims = 128

if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    r = np.random.randint(100000, 999999, size=(1,))
    print(r[0])
    sents = doc2vec.TaggedLineDocument("./fc.dat")
    print(sents)
    model = doc2vec.Doc2Vec(sents, vector_size=embedding_dims, window=9,
                            min_count=1, epochs=45, hs=0, negative=11,
                            seed=int(r[0]))
    model.wv.save_word2vec_format("w2v.txt", binary=False)
    # model.save("d2v.model")
df_file_records, nw_file_records = \
    read_file_info_records(train_ere_dir, train_entity_info_dir,
                           train_relation_info_dir, train_event_info_dir,
                           train_em_args_dir)
test_df_file_records = \
    read_file_info_records(test_df_ere_dir, test_df_entity_info_dir,
                           test_df_relation_info_dir, test_df_event_info_dir,
                           test_df_em_args_dir, False)
test_nw_file_records = \
    read_file_info_records(test_nw_ere_dir, test_nw_entity_info_dir,
                           test_nw_relation_info_dir, test_nw_event_info_dir,
                           test_nw_em_args_dir, False)
file_records = df_file_records + nw_file_records + test_df_file_records + test_nw_file_records
contexts = get_contexts(file_records)

# print('Read external data...')
imdb_texts, label = read_imdb_data(imdb_dir)

print('Write doctext...')
texts = get_doc2vec_dataform(contexts + imdb_texts)
# texts = get_doc2vec_dataform(contexts)
write_doc2vec_input(texts, doctext_path)

print('Doc2vec...')
docslist = doc2vec.TaggedLineDocument(doctext_path)
model = Doc2Vec(docslist, workers=multiprocessing.cpu_count(), min_count=1,
                vector_size=200)
model.save(docmodel_path)
model = Doc2Vec.load(docmodel_path)
doc2vec_model = model.dv
print(doc2vec_model[0])
md = doc2vec.Doc2Vec(
    dm=0,             # PV-DBOW / default 1
    dbow_words=1,     # train word vectors alongside DBOW doc vectors / default 0
    window=8,         # distance between the predicted word and context words
    vector_size=100,  # vector size
    alpha=0.025,      # learning rate
    seed=1234,
    min_count=1,      # ignore words with a lower frequency
    min_alpha=0.025,  # minimum learning rate
    workers=cores,    # number of parallel worker threads
    hs=1,             # hierarchical softmax / default 0
    negative=10,      # negative sampling / default 5
)

# extract vectors from the file's text and measure similarity between documents
sentences = doc2vec.TaggedLineDocument("news.json")
md.build_vocab(sentences)
print(str(md))

start = time.time()
# improve the similarity model by training on the words
md.train(sentences, epochs=md.epochs, total_examples=md.corpus_count)
# disabled loop for repeated re-training with a decaying learning rate
"""for epoch in range(10):
    md.train(sentences, total_examples=md.corpus_count, epochs=md.epochs)
    md.alpha -= 0.002        # decrease the learning rate
    md.min_alpha = md.alpha  # fix the learning rate, no decay"""
end = time.time()
print("During Time: {}".format(end - start))
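# A hedged follow-up: with TaggedLineDocument, document tags are line numbers,
# so the trained model can be queried by index (the save path is hypothetical).
print(md.dv.most_similar(0, topn=5))  # five documents most similar to line 0
md.save("d2v_news.model")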
# coding=utf-8
from gensim.models import doc2vec

num = '1'
documents = doc2vec.TaggedLineDocument('document5.txt')
model = doc2vec.Doc2Vec(documents, vector_size=500, window=1, min_count=500,
                        workers=4)
model.save('./document' + num + '.bin')
def structural_embedding(self, inputFile, outputFile):
    indexToName = self.generateWalkFile(inputFile, args.walkLength)
    sentences = doc.TaggedLineDocument(inputFile + '.walk')
    self.model = doc.Doc2Vec(sentences, vector_size=dimensions,
                             epochs=iterations, window=window)
    saveVectors(list(self.model.dv.vectors), outputFile, indexToName)
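# A hedged inference sketch: any trained Doc2Vec model from the snippets above
# can embed an unseen, pre-tokenized document without retraining. The model
# path and walk tokens here are hypothetical.
trained = doc.Doc2Vec.load("model_path")
vec = trained.infer_vector(["node_3", "node_7", "node_1"])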