def setUp(self):
    filename = datapath("alldata-id-10.txt")
    train_docs = read_sentiment_docs(filename)
    self.train_docs = train_docs
    self.source_doc_vec_file = datapath("small_tag_doc_5_iter50")
    self.target_doc_vec_file = datapath("large_tag_doc_10_iter50")
    self.source_doc_vec = Doc2Vec.load(self.source_doc_vec_file)
    self.target_doc_vec = Doc2Vec.load(self.target_doc_vec_file)
def __init__(self, size=300, window=8, min_count=2, workers=8, path_to_model=None, stream_train=False):
    '''
    Initializes the Doc2Vec_Wrapper class.

    Args:
        size (int): Specifies the size of the feature-vector. Defaults to 300
        window (int): Specifies the size of the context window from which the feature vector is learned
        min_count (int): Specifies the minimum number of instances of each word that is saved in the model
        workers (int): number of parallel processes
        path_to_model (str): Specifies model on disk
        stream_train (bool): If true, update word vectors with new sentences. If false, just get doc vecs
    '''
    self.stream_train = stream_train
    self.is_trained = False
    self.model = None

    ## if a path is passed, try to load from disk. Otherwise, retrain anyway
    if path_to_model:
        try:
            self.model = Doc2Vec.load(path_to_model)
            self.is_trained = True
        except Exception:
            pass

    ## params for Doc2Vec
    self.size = size  ## size of the vector
    self.window = window  ## size of the context window
    self.min_count = min_count  ## minimum count of vocab to store in binary tree
    self.workers = workers  ## number of parallel processes == number of cores on the computer
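# A minimal, hedged sketch of the load-or-retrain branch used by the wrapper
# above, expressed directly against gensim; 'my_doc2vec.model' is a placeholder
# path, not a file shipped with this code.
from gensim.models.doc2vec import Doc2Vec


def load_or_none(path_to_model):
    """Return a Doc2Vec model if it can be loaded from disk, else None."""
    try:
        return Doc2Vec.load(path_to_model)
    except (IOError, OSError):
        return None


# model = load_or_none('my_doc2vec.model')  # None means training is still needed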
def load_external(self, model_file_name):
    """
    load a doc2vec model from the file specified
    :param model_file_name: name of the model file
    :return:
    """
    self.model = Doc2Vec.load(model_file_name)
def test_category():
    from gensim.models.doc2vec import Doc2Vec
    from sematch.utility import FileIO
    from sematch.semantic.relatedness import ConceptRelatedness
    model_category = Doc2Vec.load(FileIO.filename('models/category/cat2vec'))
    cat2vec_rel = ConceptRelatedness(model_category)
    print(cat2vec_rel.word_similarity('happy', 'sad'))
def __init__(self, sentences, name, dataset_name, epochs=1, dimension=50, modelfile=None):
    self.inner_model = None

    # parameters
    self.dataset = dataset_name
    self.sentences = sentences
    self.name = name
    self.epochs = epochs
    self.dimension = dimension

    # data file path
    models_folder = os.path.join(*[os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data', 'models'])
    if modelfile is not None:
        filename = modelfile
    else:
        filename = "DOC2VEC_%s_%s_%s_%s" % (self.dataset, self.name, self.epochs, self.dimension)
    self.filepath = os.path.join(models_folder, filename)
    model_exists = os.path.isfile(self.filepath)

    # train initial model
    if model_exists:
        logging.info("found data file %s" % (self.filepath, ))
        self.inner_model = Doc2Vec.load(self.filepath)
    else:
        self.inner_model = Doc2Vec(sentences, size=self.dimension)
        print(self.inner_model.vocab.keys())
        self.inner_model.save(fname=self.filepath)
def do_command(args):
    # Load data
    data = load_data(args.input)
    #ids, documents = zip(*data)
    data = [(id, tokenize(doc)) for id, doc in data]
    ids = [id for id, _ in data]

    if not os.path.exists(args.modelfile):
        model = embed_documents(data)
        # Save model
        model.save(args.modelfile)
    else:
        model = Doc2Vec.load(args.modelfile)
        #map(model.infer_tokens, tokenized)
    print("Loaded model.")

    # Do k-nearest neighbors search.
    writer = csv.writer(args.output, delimiter='\t')
    writer.writerow(["id1", "id2", "score"])

    count = int(args.count) if args.count > 0 else len(model.docvecs)
    vectors = np.array([model.docvecs[i] for i in range(count)])
    del model  # clear up memory

    for i, j, score in find_nearest_neighbors(vectors):
        id1, id2 = ids[i], ids[j]
        writer.writerow([id1, id2, score])
def varify():
    from gensim.models.doc2vec import Doc2Vec
    model = Doc2Vec.load('data/doc2vec.d2v')
    documents = pickle.load(open('data/fedcorpus.pick', 'rb'))
    for i in range(3):
        inferred_docvec = model.infer_vector(documents[i].words)
        print(documents[i].tags)
        print('%s:\n %s' % (model, model.docvecs.most_similar([inferred_docvec], topn=3)))
def get_model(model_num, model_names):
    if model_num < 10:
        model = Word2Vec.load(model_path + model_names)
    elif model_num < 99:
        model = Doc2Vec.load(model_path + model_names)
    else:
        model = Word2Vec.load_word2vec_format(model_path + model_names, binary=True)  # C binary format
    return model
def create_and_train_models_d2vec(tag, cores=6):
    """
    Build vocabulary and train models
    :param tag: small or big
    :param cores: number of cores
    :return: the current models
    """
    simple_models = get_models_d2vec(cores)
    model_files = get_models_filename_d2vec(tag)
    if all([os.path.exists(file) for file in model_files]):
        print('Models exist, loading...')
        for i, fname in enumerate(model_files):
            simple_models[i] = Doc2Vec.load(fname)
        models_by_name = OrderedDict((str(model), model) for model in simple_models)
        return models_by_name
    else:
        print('Building models...')
        voc_model = build_vocab_d2vec(tag, cores)

        # Share vocabulary between models
        for model in simple_models:
            model.reset_from(voc_model)

        models_by_name = OrderedDict((str(model), model) for model in simple_models)

        print('Training models...')
        print("START %s" % datetime.datetime.now())

        best_error = defaultdict(lambda: 1.0)  # to selectively-print only best errors achieved

        alpha, min_alpha, passes = (0.025, 0.001, 20)
        alpha_delta = (alpha - min_alpha) / passes
        file = x_train_str.format(tag)
        x_train = pd.read_hdf(file)
        train_list = x_train.tolist()
        for epoch in range(passes):
            shuffle(train_list)  # shuffling gets best results
            for name, train_model in models_by_name.items():
                # train
                duration = 'na'
                train_model.alpha, train_model.min_alpha = alpha, alpha
                with elapsed_timer() as elapsed:
                    train_model.train(CorpusStream(train_list, 'train'),
                                      total_examples=train_model.corpus_count,
                                      epochs=train_model.iter)
                    duration = '%.1f' % elapsed()
            print('completed pass %i at alpha %f' % (epoch + 1, alpha))
            alpha -= alpha_delta

        print("END %s" % str(datetime.datetime.now()))
        for name, model in models_by_name.items():
            name = name.replace('/', '').replace(',', '_')
            model.save('models/{0}_{1}.m'.format(name, tag))
        return models_by_name
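# Hedged sketch of the manual learning-rate schedule used above, on a toy
# corpus; the documents, vector size and pass count are placeholders rather
# than this module's real configuration (which lives in get_models_d2vec and
# CorpusStream). Parameter names follow gensim >= 3.4 (vector_size); older
# snippets in this file use size= instead.
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

_toy_docs = [TaggedDocument(words=['some', 'words'], tags=[0]),
             TaggedDocument(words=['other', 'words'], tags=[1])]
_toy_model = Doc2Vec(vector_size=50, min_count=1, workers=1)
_toy_model.build_vocab(_toy_docs)

_alpha, _min_alpha, _passes = 0.025, 0.001, 5
_alpha_delta = (_alpha - _min_alpha) / _passes
for _epoch in range(_passes):
    # pin the learning rate for this pass, then decay it manually
    _toy_model.alpha = _toy_model.min_alpha = _alpha
    _toy_model.train(_toy_docs, total_examples=_toy_model.corpus_count, epochs=1)
    _alpha -= _alpha_delta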
def get_WordVector_matrix(label):
    model = Doc2Vec.load('./WordVector_model.d2v')
    size = len(label)
    vectors = np.zeros((size, depth))
    for i in range(size):
        try:
            doc_vector = model.docvecs[str(i)]
            vectors[i] = doc_vector[0]
        except KeyError:
            print(str(i) + ' occurs KeyError')
            pass
    return map(list, vectors)
def test_models(FULL_SIM, models_files):
    test_papers = pd.read_csv(TEST_FILEPATH)
    # NOTE: Only need for testing with AII:
    keywords_docsrels = populate_iks_dict()
    authorities = initialize_authorities()
    for mod_f in models_files:
        print('Testing ' + mod_f)
        model = Doc2Vec.load(mod_f)
        print('Model loaded.')
        test_model(FULL_SIM, model, test_papers, keywords_docsrels, authorities)
def build_model(x_train, x_test, iteration=5, save=True):
    if save:
        big_list = x_train + x_test
        model = Doc2Vec(min_count=2, window=10, size=100, sample=1e-4, negative=5, workers=8)
        model.build_vocab(big_list)
        for i in range(iteration):
            model.train(big_list)
        print('saving model to file.....')
        model.save('./sentim.d2v')
    else:
        print('loading model from file.....')
        model = Doc2Vec.load('./sentim.d2v')
    return model
def get_vec(vector_file, id_file, w_file):
    p2v = Doc2Vec.load(vector_file)
    fout = open(w_file, "w")
    index = 0
    with open(id_file) as f:
        for line in f:
            index += 1
            if index % 1000 == 0:
                logging.info("%d cases" % index)
            line = line.strip()
            vec = p2v.docvecs[line]
            line_w = line + "\t" + "\t".join([str(x) for x in vec]) + "\t" + "\n"
            fout.write(line_w)
    fout.close()
def datacluster(data):
    infered_vectors_list = []
    print("load model...")
    model_dm = Doc2Vec.load(model_path)
    print("load train vectors...")
    for text, label in data:
        vector = model_dm.infer_vector(text)
        infered_vectors_list.append(vector)
    '''
    print("Check the optimized parameter...")
    Nc = range(1, 50)
    pca_data = [PCA(n_components=i).fit(infered_vectors_list).transform(infered_vectors_list) for i in Nc]
    kmeans = cluster.KMeans(init='k-means++', n_clusters=20, max_iter=300)
    score = [kmeans.fit(pca_data[i]).score(pca_data[i]) for i in range(len(pca_data))]
    print(score)
    plt.plot(Nc, score)
    plt.xlabel('PCA components')
    plt.ylabel('Score')
    plt.title('Elbow Curve')
    plt.show()
    '''
    print("PCA decomposition...")
    pca = PCA(n_components=10).fit(infered_vectors_list)
    pca_data = pca.transform(infered_vectors_list)
    print("train K-Means model...")
    kmean_model = cluster.KMeans(init='k-means++', n_clusters=16, max_iter=300)
    kmean_model.fit(pca_data)
    # get the classified index
    result = kmean_model.fit_predict(pca_data)
    print("Predicting result:", result)
    # save the cluster result
    joblib.dump(kmean_model, cluster_path)
    # load the cluster result
    # new_km = joblib.load(cluster_path)
    numSamples = len(pca_data)
    print(numSamples)
    centroids = kmean_model.labels_
    # print(centroids, type(centroids))  # show the cluster centers
    # print(kmean_model.inertia_)        # show the clustering quality
    '''
    marker = ['o', '.', ',', 'x', '*', 'd', 's', 'p']
    color = ['r', 'g', 'b', 'c', 'm', 'k', 'y', 'w']
    for i in range(numSamples):
        plt.scatter(pca_data[i][0], pca_data[i][1],
                    marker=marker[centroids[i]], color=color[centroids[i]])
    plt.show()
    '''
    return centroids
def main():
    # load data set
    training_reviews = load_dataset(TRAIN_FILE)
    testing_reviews = load_dataset(TEST_FILE)

    # load doc2vec model
    doc2vec_model = Doc2Vec.load(DOC2VEC_MODEL)

    cate_index = get_all_categories(training_reviews)
    cates = dict2list(cate_index)
    n_cates = len(cates)

    train_X = get_X(training_reviews, doc2vec_model)
    test_X = get_X(testing_reviews, doc2vec_model)
    train_labels = get_labels(training_reviews, cate_index)
    test_labels = get_labels(testing_reviews, cate_index)

    labelwise_acc = []
    labelwise_output = []
    for cate in range(n_cates):
        # train a binary model
        train_Y = get_Y(train_labels, cate)
        prob = svm_problem(train_Y, train_X)
        param = svm_parameter("-s 0 -t 2 -b 1")
        m = svm_train(prob, param)
        # test
        test_Y = get_Y(test_labels, cate)
        p_label, p_acc, p_val = svm_predict(test_Y, test_X, m, '-b 1')
        labelwise_acc.append(p_acc)
        labelwise_output.append(p_label)

    # evaluation
    p, r, f = microF1(labelwise_output, test_labels)

    # output
    out_dir = "../data/use_doc2vec/"
    out_file = out_dir + "laptop.txt"
    labelwise_acc = [(cates[i], labelwise_acc[i][0]) for i in range(n_cates)]
    labelwise_acc = sorted(labelwise_acc, key=lambda x: x[1])
    with open(out_file, 'w') as out:
        out.write("Precision:\t{}\nRecall:\t{}\nF1:\t{}\n".format(p, r, f))
        print("{}\n{}\n{}".format(p, r, f))
        for cate_i in range(n_cates):
            out.write("{}:\t{}\n".format(labelwise_acc[cate_i][0], labelwise_acc[cate_i][1]))
def get_distances_subset(n_closest, category_hash_with_doc_ids, csv_path):
    # example
    # category_hash_with_doc_ids = {"cat1": ["us-1", "us-2"], "cat2": ["us-3"]}

    # loop over subjects and average docvecs belonging to subject.
    # place in dictionary
    model = Doc2Vec.load('../doc2vec_model')
    cpc_vectors = get_category_vectors_subset(model, category_hash_with_doc_ids)
    distance_mat = get_distance_mat(cpc_vectors)

    to_csv = []
    for subj_id in list(category_hash_with_doc_ids.keys()):
        relateds = get_n_closest(distance_mat, subj_id, n=n_closest)
        for related_id, dist in relateds.items():
            weight = round(1. / dist)
            # weight = round((1 - dist) * 10)
            row = (subj_id, related_id, weight, subj_id, related_id)
            to_csv.append(row)

    edges = pd.DataFrame(to_csv, columns=['source', 'target', 'weight', 'source_name', 'target_name'])
    edges.to_csv(csv_path, index=False)
def test():
    global english_punctuations, model_path
    new_model = Doc2Vec.load(model_path)
    # sentence = "reserve setup_data: [mem 0x000000008f889018-0x000000008f8bc057] usable"
    # sentence = "efi: mem14: type=2, attr=0xf, range=[0x000000008fa17000-0x000000008fb19000) (1MB)"
    # sentence = "pci 0000:07:08.2: [8086:208d] type 00 class 0x088000"
    # sentence = "i40e 0000:b0:00.2: irq 41 for MSI/MSI-X"
    sentence = "ata8: SATA link up 6.0 Gbps (SStatus 133 SControl 300)"
    # tokenize
    test_tokenized = [word.lower() for word in word_tokenize(sentence)]
    # remove stopwords
    english_stopwords = stopwords.words('english')
    test_stopwords = [word for word in test_tokenized if word not in english_stopwords]
    # remove punctuation
    test_punctuation = [word for word in test_stopwords if word not in english_punctuations]
    # stem words
    #st = PorterStemmer()
    #test_stemmed = [st.stem(word) for word in test_punctuation]
    test_text = test_punctuation
    print("===>Testing sentence:", test_text)
    inferred_vector_dm = new_model.infer_vector(test_text)
    sims = new_model.docvecs.most_similar(positive=[inferred_vector_dm])
    return sims
def __init__(self, model_name=None, corpus=None, stop_words=False, filename=None, **kwargs):
    """
    model_name: name of the model which has been trained and saved
    corpus: dictionary with 'question' and 'answer', where corpus['question'] is a list of TaggedDocuments
    filename: name of file containing the questions dataset
    """
    if corpus:
        self.corpus = corpus
    else:
        self.corpus = {}
        self.corpus['question'] = list(self.read_corpus(filename['question'], stop_words=stop_words))
        self.corpus['answer'] = list(self.read_corpus(filename['answer'], stop_words=stop_words))

    if model_name:
        self.model = Doc2Vec.load(model_name)
    else:
        size = kwargs.get('size', 50)
        min_count = kwargs.get('min_count', 5)
        alpha = kwargs.get('alpha', 0.025)
        min_alpha = kwargs.get('min_alpha', 0.025)
        iters = kwargs.get('iters', 10)
        self.train(size=size, min_count=min_count, alpha=alpha, min_alpha=min_alpha, iters=iters)
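# Hedged sketch of the load-vs-train switch above with a self-contained toy
# corpus; 'qa_doc2vec.model' is a placeholder path and the question text is
# illustrative only (gensim >= 3.4 parameter names).
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

_questions = [TaggedDocument(words=['how', 'do', 'i', 'reset', 'my', 'password'], tags=['q0'])]
try:
    _qa_model = Doc2Vec.load('qa_doc2vec.model')
except (IOError, OSError):
    # no saved model on disk, so train a tiny one and save it
    _qa_model = Doc2Vec(_questions, vector_size=50, min_count=1, epochs=10)
    _qa_model.save('qa_doc2vec.model')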
from flask import render_template
from flask import redirect
from flask import send_from_directory
from flaskexample import app
from flask import request
# some functions defined under doc2vec.py file
from flaskexample import doc2vec
import gensim
import os.path
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

mypath = os.path.abspath(os.path.dirname("data_clean.csv"))
path = os.path.join(mypath, "flaskexample/doc2vec_model")
model = Doc2Vec.load(path)


@app.route('/')
@app.route('/index')
def index():
    return render_template("index.html")


@app.route('/search_story')
def search_story():
    return render_template("search_story.html")


@app.route('/channel')
def channel():
    return redirect("https://www.youtube.com/channel/UCWENB1OaGA9402PKzEVl0ow")
filename = "IR_training_dump.txt" class LabeledLineSentence(object): def __init__(self, filename): self.filename = filename def __iter__(self): for line in open(filename, "r"): string = line.strip().split('\t') yield LabeledSentence(words=string[1].split(), tags=string[0]) #it = LabeledLineSentence(filename) fname = 'my_model2HUGE.doc2vec' model = Doc2Vec.load(fname) # you can continue training with the loaded model! cores = multiprocessing.cpu_count() #model =Doc2Vec(dm=1, dm_mean=1, size=200, window=8, min_count=19, iter =10, workers=cores) #model.build_vocab(it) #model.train(it, total_examples=model.corpus_count, epochs=model.iter)# #model.save('my_model2HUGE.doc2vec') answer = model.docvecs.most_similar(positive=["775DE74B"], topn=30) print(len(answer)) for i in answer: print(i[0])
def _ds2v_vector(requests, model_dir):
    model = Doc2Vec.load(model_dir)
    vectors = list(map(lambda x: model.infer_vector(x), requests))
    return vectors
def sen2VecAvg(algo=5):
    print("Using Avg of sentence vectors")
    model = Doc2Vec.load('my_model_sens.doc2vec')
    postrfiles = glob.glob("../asgn2data/aclImdb/train/pos/*.txt")
    negtrfiles = glob.glob("../asgn2data/aclImdb/train/neg/*.txt")
    postsfiles = glob.glob("../asgn2data/aclImdb/test/pos/*.txt")
    negtsfiles = glob.glob("../asgn2data/aclImdb/test/neg/*.txt")
    x = np.zeros((25000, 100))
    xt = np.zeros((25000, 100))
    y = np.zeros(25000)
    yt = np.zeros(25000)
    i = 0
    for f in postrfiles:
        with open(f, 'r') as curfile:
            data = curfile.read().decode("utf-8")
            data = sent_normalize_text(data)
            sens = nltk.sent_tokenize(data)
            for j in range(len(sens)):
                x[i] += model[f + 'SENT_{}'.format(j)]
            x[i] = x[i] / len(sens)
            y[i] = 10
            i += 1
    for f in negtrfiles:
        with open(f, 'r') as curfile:
            data = curfile.read().decode("utf-8")
            data = sent_normalize_text(data)
            sens = nltk.sent_tokenize(data)
            for j in range(len(sens)):
                x[i] += model[f + 'SENT_{}'.format(j)]
            x[i] = x[i] / len(sens)
            y[i] = 0
            i += 1
    i = 0
    for f in postsfiles:
        with open(f, 'r') as curfile:
            data = curfile.read().decode("utf-8")
            data = sent_normalize_text(data)
            sens = nltk.sent_tokenize(data)
            for j in range(len(sens)):
                xt[i] += model[f + 'SENT_{}'.format(j)]
            xt[i] = xt[i] / len(sens)
            yt[i] = 10
            i += 1
    for f in negtsfiles:
        with open(f, 'r') as curfile:
            data = curfile.read().decode("utf-8")
            data = sent_normalize_text(data)
            sens = nltk.sent_tokenize(data)
            for j in range(len(sens)):
                xt[i] += model[f + 'SENT_{}'.format(j)]
            xt[i] = xt[i] / len(sens)
            yt[i] = 0
            i += 1
    combined = list(zip(x, y))
    random.shuffle(combined)
    x[:], y[:] = zip(*combined)
    return x, xt, y, yt
def loadModel(filename="vec.model"): return Doc2Vec.load(filename)
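# Hedged usage sketch for the helper above: "vec.model" must already exist on
# disk, and the token list is illustrative only.
# model = loadModel("vec.model")
# vector = model.infer_vector(["new", "unseen", "document"])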
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(token_list)]
cores = multiprocessing.cpu_count()
model = Doc2Vec(dm=0, dbow_words=1, size=300, window=10, min_count=2, iter=10000, workers=cores)

fname = get_tmpfile("my_doc2vec_model")
try:
    model = Doc2Vec.load(fname)
except:
    model.build_vocab(documents)
    print("training start")
    model.train(documents, total_examples=model.corpus_count, epochs=model.iter)
    print("training finished")
    model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
    model.save(fname)

phrase = "Ciro"
tokens = nltk.word_tokenize(phrase)
rects1 = plt.barh(index, compareMeanDf['Best Group'], bar_width,
                  alpha=opacity, color='b', label='Best Group')

rects2 = plt.barh(index + bar_width, compareMeanDf['Worst Group'], bar_width,
                  alpha=opacity, color='g', label='Worst Group')

plt.ylabel('Course Materials', fontsize=20)
plt.xlabel('Average number of activities', fontsize=20)
plt.title('')
plt.yticks(index + bar_width, compareMeanDf.Material, fontsize=18)
plt.xticks(fontsize=20)
plt.legend(fontsize=25)

# --------------------------------------------------------------
# ======== Code analysis =======================================
# --------------------------------------------------------------
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity

model = Doc2Vec.load(basePath + "ca116_2vecSize50.model")
a = model.docvecs[0]
a = np.array(list(a) + list(a))

taskList = dataUpload['task'].unique()
transitionDataMatrixWeeks[10].index[1]


def similarityBetweenTwoStudent(studentId1, studentId2, doc2vecModel, taskList):
    vectorStudent1 = []
    vectorStudent2 = []
    for t in taskList:
        key1 = studentId1 + '*' + t
        key2 = studentId2 + '*' + t
        if (key1 in doc2vecModel.docvecs.index2entity) and (key2 in doc2vecModel.docvecs.index2entity):
            vectorStudent1 = vectorStudent1 + list(doc2vecModel.docvecs[key1])
            vectorStudent2 = vectorStudent2 + list(doc2vecModel.docvecs[key2])
    if len(vectorStudent1) and len(vectorStudent2) > 0:
from gensim.utils import tokenize
from gensim import utils


class MyIter(object):
    path = ""

    def __init__(self, fp):
        self.path = fp

    def __iter__(self):
        # path = datapath(self.path)
        with utils.open(self.path, 'r', encoding='utf-8') as fin:
            for line in fin:
                yield list(tokenize(line))


dataset_path = r"data\dataset_lower_clean_stem_sentence.csv"
model_path = r"model\doc2vec100.bin"

corpus = MyIter(dataset_path)
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(corpus)]
d2v_model = Doc2Vec(vector_size=100, window=2, min_count=1, workers=4)
d2v_model.build_vocab(documents)
d2v_model.train(documents, total_examples=d2v_model.corpus_count, epochs=d2v_model.epochs)
d2v_model.save(model_path)

model_path = r"model\doc2vec100.bin"
d2v_model = Doc2Vec.load(model_path)
print(d2v_model.wv.most_similar(["naskah", "dinas"]))
def readModel(datapath, modelName):
    global model
    print('Loading model from', datapath + '/model/' + modelName)
    model = Doc2Vec.load(datapath + '/model/' + modelName)
import hug
from gensim.models.doc2vec import Doc2Vec
from gensim.utils import simple_preprocess
import re

model = Doc2Vec.load('models/wiki-latest')


@hug.get('/topicos', examples='frase=Vingadores são um grupo de super-heróis')
@hug.local()
def topicos(frase: str):
    """Returns the topics of an arbitrary sentence"""
    tokens = simple_preprocess(frase)
    inferred_vector = model.infer_vector(tokens)
    similars = model.docvecs.most_similar([inferred_vector], topn=10)
    return {'topicos': similars}


@hug.get(examples="expressao=homem está para rei como mulher está para")
def analogia(expressao: str):
    """Computes an analogy between terms"""
    entry = '{0}'.format(expressao)
    math_symbol = r"\+"
    analogy_symbol = "está para"

    # Case 1: user wants to do word math: word1 - word2 + word3
    positive = []
    words_list = sentence.split(' ')
    array = np.array([w2vModel[word] for word in words_list if word in w2vModel])
    df_SentenceVec = pd.Series(array.mean(axis=0))
    return df_SentenceVec


def train_D2V(d2vCorpus, embedSize=200, epoch_num=1):
    model_dm = Doc2Vec(d2vCorpus, min_count=1, window=3, size=embedSize, sample=1e-3, negative=5, workers=4)
    model_dm.train(d2vCorpus, total_examples=model_dm.corpus_count, epochs=epoch_num)
    model_dm.save("doc2vec.model")
    return model_dm


model_dm = Doc2Vec.load("doc2vec.model")

### Global and local variables | references and copies ###########################
# Note: Python only distinguishes global from local variables inside functions
# and classes; if/for blocks do not introduce a new scope.

## Global and local variables
b = [1, 2, 3]


def func():
    # reads the global b, but does not change the outer b
    a = b + [4, 5, 6]
    return a
# output: a = [1, 2, 3, 4, 5, 6], b = [1, 2, 3]


def func(b):
    # only parameter passing; the outer b is not changed
    b = [0]
    a = b + [4, 5, 6]
def train(self, pairs, labels, verbose=False, cache=None):
    """Train doc2vec embeddings."""
    self.doc2vec = Doc2Vec.load(self.model_cache)
    super().train(pairs, labels, verbose, cache)
def load(self, cache):
    """Load trained model."""
    self.doc2vec = Doc2Vec.load(self.model_cache)
    super().load(cache)
def get_model(cls):
    """Get the model object for this instance."""
    modelfile = glob.glob('/opt/ml/model/*.pkl')[0]
    return Doc2Vec.load(modelfile)  # default model name of export.pkl
df = df[df.Solution.notnull()]
df = df[df.Neutrality.notnull()]
df = df[df.Localization.notnull()]

df_x = df.loc[:, ['Comments']]
headers.remove('Comments')
headers = ["Mitigation"]
df_y = df.loc[:, headers]
df_y.head()

df_y[df_y != 0] = 1
df_y = df_y.round(0).astype(int)
df_y['new'] = 1 - df_y

# load model
model = Doc2Vec.load(os.path.join("trained", "comments2vec.d2v"))

comments = []
for index, row in df.iterrows():
    line = row["Comments"]
    line = re.sub("[^a-zA-Z?!]", " ", line)
    words = [w.lower().decode('utf-8') for w in line.strip().split() if len(w) >= 3]
    comments.append(words)

x_train = []
for comment in comments:
    feature_vec = model.infer_vector(comment)
    #feature_vec = np.append(feature_vec, len(comment))
    x_train.append(feature_vec)
import os
import random
import logging
import pandas as pd
import numpy as np
import pickle as pk
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk import tokenize
from zipfile import ZipFile

# setting the working directory
os.chdir('/home/jcai/geometry_of_law/')

# loading the model
model = Doc2Vec.load('/home/jcai/geometry_of_law/doc2vec_v50k_d200_shuffled_opinion/ALL_opinion.d2v')

# calculate and export similarity to "regulation", "privacy", "labor"
list_of_issues = [
    "criminal-appeals", "civil-rights", "first-admendment", "due-process",
    "privacy", "labor", "regulation"
]
list_of_names = [
    "criminal appeals", "civil rights", "first admendment", "due process",
    "privacy", "labor", "regulation"
]
issue_dict = dict(zip(list_of_issues, list_of_names))
def pre_train_embedding(embed_type, pre_embed_model, train, test):
    # the 'x' column holds the tokenized words, the 'y' column the target label

    # corpus for CountVectorizer / TF-IDF: words joined by spaces
    train_corpus1 = []
    # corpus for word2vec / doc2vec: words kept as a list (comma-separated input)
    train_corpus2 = []
    for words in train['x']:
        sentence1 = ""
        sentence2 = []
        for word in words.split(","):
            sentence1 += word + " "
            sentence2.append(word)
        sentence1 = sentence1[:len(sentence1) - 1]
        train_corpus1.append(sentence1)
        train_corpus2.append(sentence2)

    # corpus for CountVectorizer / TF-IDF: words joined by spaces
    test_corpus1 = []
    # corpus for word2vec / doc2vec: words kept as a list (comma-separated input)
    test_corpus2 = []
    for words in test['x']:
        sentence1 = ""
        sentence2 = []
        for word in words.split(","):
            sentence1 += word + " "
            sentence2.append(word)
        sentence1 = sentence1[:len(sentence1) - 1]
        test_corpus1.append(sentence1)
        test_corpus2.append(sentence2)

    if embed_type == "CounterVector":
        start = time.time()
        count_vectorizer = load(
            open(
                "C:/Users/battl/PycharmProjects/cse_project/project list/Machine Learning Classification Model Visualization Web Service/embedding_model/"
                + pre_embed_model, "rb"))
        count_train_vectors = count_vectorizer.transform(train_corpus1)
        count_test_vectors = count_vectorizer.transform(test_corpus1)

        sparse_count_train_x = csr_matrix(count_train_vectors)
        sparse_count_test_x = csr_matrix(count_test_vectors)
        end = time.time()
        print('pre-train CounterVectorizer embedding time: {}'.format(end - start))
        return sparse_count_train_x, sparse_count_test_x, train['y'].values, test['y'].values
    elif embed_type == "TF-IDF":
        start = time.time()
        tfidf_vectorizer = load(
            open(
                "C:/Users/battl/PycharmProjects/cse_project/project list/Machine Learning Classification Model Visualization Web Service/embedding_model/"
                + pre_embed_model, "rb"))
        tf_train_vectors = tfidf_vectorizer.transform(train_corpus1)
        tf_test_vectors = tfidf_vectorizer.transform(test_corpus1)

        sparse_tf_train_x = csr_matrix(tf_train_vectors)
        sparse_tf_test_x = csr_matrix(tf_test_vectors)
        end = time.time()
        print('pre-train TfidfVectorizer embedding time: {}'.format(end - start))
        return sparse_tf_train_x, sparse_tf_test_x, train['y'].values, test['y'].values
    elif embed_type == "Doc2Vec":
        start = time.time()
        from collections import namedtuple
        TaggedDocument = namedtuple('TaggedDocument', 'words tags')
        doc2vec_train_tag = [
            TaggedDocument(doc, tag)
            for doc, tag in zip(train_corpus2, train['y'].values)
        ]
        doc2vec_test_tag = [
            TaggedDocument(doc, tag)
            for doc, tag in zip(test_corpus2, test['y'].values)
        ]

        from gensim.models.doc2vec import Doc2Vec
        doc_vectorizer = Doc2Vec.load(
            'C:/Users/battl/PycharmProjects/cse_project/project list/Machine Learning Classification Model Visualization Web Service/embedding_model/'
            + pre_embed_model)
        for epoch in range(10):
            doc_vectorizer.train(doc2vec_train_tag,
                                 total_examples=doc_vectorizer.corpus_count,
                                 epochs=10)
            doc_vectorizer.alpha -= 0.002  # decrease the learning rate
            doc_vectorizer.min_alpha = doc_vectorizer.alpha  # fix the learning rate, no decay

        doc_train_vectors = [doc_vectorizer.infer_vector(doc.words) for doc in doc2vec_train_tag]
        doc_train_tags = [doc.tags for doc in doc2vec_train_tag]
        doc_test_vectors = [doc_vectorizer.infer_vector(doc.words) for doc in doc2vec_test_tag]
        doc_test_tags = [doc.tags for doc in doc2vec_test_tag]

        import numpy as np
        doc_train_vectors_np = np.array(doc_train_vectors)
        doc_train_tags_np = np.array(doc_train_tags)
        doc_test_vectors_np = np.array(doc_test_vectors)
        doc_test_tags_np = np.array(doc_test_tags)

        sparse_doc_train_x = csr_matrix(doc_train_vectors_np)
        sparse_doc_test_x = csr_matrix(doc_test_vectors_np)
        end = time.time()
        print('pre-train Doc2Vec embedding time: {}'.format(end - start))
        return sparse_doc_train_x, sparse_doc_test_x, doc_train_tags_np, doc_test_tags_np
    elif embed_type == "user_defined_embedding":
        pass
def load_model(self):
    model = Doc2Vec.load(self.model_loc)
    self.model = model
def most_similar(new_text):
    plt.style.use('ggplot')

    # Load the trained model
    model = Doc2Vec.load('doc2vec_abstracts')

    # Load the awards data
    awds = pd.read_csv('NSF CHE 2015.csv', encoding='latin-1')
    awds['StartDate'] = pd.to_datetime(awds['StartDate']).apply(lambda x: x.year)
    awds['EndDate'] = pd.to_datetime(awds['EndDate'])
    awds['AwardedAmountToDate'] = [x.replace('$', '') for x in awds['AwardedAmountToDate']]
    awds['AwardedAmountToDate'] = [x.replace(',', '') for x in awds['AwardedAmountToDate']]
    awds['AwardedAmountToDate'] = pd.to_numeric(awds['AwardedAmountToDate'])

    # Load the papers data sheet
    papers = pd.read_csv('che_paper_data.csv')
    papers['year'] = pd.to_datetime(papers['year'])
    papers['citations per year'] = papers['citations'].divide(
        [((datetime.datetime.today() - x).days) / 365.2422 for x in papers['year']])
    papers['year'] = papers['year'].apply(lambda x: x.year)

    # Here we build up and instantiate the stop words and lemmatizer
    stop = set(stopwords.words('english'))
    exclude = set(string.punctuation)
    lemma = WordNetLemmatizer()
    boiler_plate = ("This award reflects NSF's statutory mission and has been deemed worthy of support "
                    "through evaluation using the Foundation's intellectual merit and broader impacts review criteria")

    # The function below cleans and tokenizes the input text
    def word_mod(doc):
        doc = re.sub('<.*?>', ' ', doc)
        doc = re.sub(boiler_plate, '', doc)
        punct_free = ''.join(ch for ch in doc if ch not in exclude)
        words = punct_free.lower().split()
        stop_free = " ".join([i for i in words if i not in stop])
        lemm = " ".join(lemma.lemmatize(word) for word in stop_free.split())
        word_list = lemm.split()
        # only take words which are greater than 2 characters
        cleaned = [word for word in word_list if len(word) > 2]
        return cleaned

    # Here the cleaned up text is fed to the model. The model returns the similarity of this text to all awards.
    # We print out the two most similar award numbers.
    new_text_clean = model.infer_vector(word_mod(new_text))
    sims = model.docvecs.most_similar([new_text_clean], topn=len(model.docvecs))
    sim1 = sims[0]
    sim2 = sims[1]
    print('The most similar award numbers are {0} and {1}, with similarity scores of {2} and {3}.'
          .format(sim1[0], sim2[0], round(sim1[1], 3), round(sim2[1], 3)))

    # Here we examine the awards with similarity score greater than 0.5. It matches
    # with other awards made, the amount of the award, and the publication data
    # from each award.
    sims = [sims[i][0] for i in range(len(sims)) if sims[i][1] > 0.5]
    sim_awards = awds[awds['AwardNumber'].isin(sims)].copy()
    sim_papers = papers[papers['award number'].isin(sims)].copy()

    # Here plots for different data and metrics are generated.
    fig1 = plt.figure()
    sim_awards.groupby('StartDate')['AwardNumber'].count().plot.bar(rot=0)
    plt.title('Awards per Year Similar to Text')
    plt.ylabel('Number of Awards')
    plt.xlabel('Year of Award')
    plt.show()

    fig2 = plt.figure()
    sim_awards.groupby('StartDate')['AwardedAmountToDate'].sum().plot.bar(rot=0)
    plt.title('Total Awarded Dollars per Year for Awards Similar to Text')
    plt.ylabel('Total Dollars Awarded')
    plt.xlabel('Year of Award')
    plt.show()

    fig3 = plt.figure()
    sim_papers.groupby('year')['title'].count().plot.bar(rot=0)
    plt.title('Number of Publications Each Year from Awards Similar to Text')
    plt.ylabel('Number of Publications')
    plt.xlabel('Year of Publication')
    plt.show()

    fig4 = plt.figure()
    sim_papers.boxplot(column=['citations per year'], by='year')
    plt.title('Citations per Year For \n Publications from Awards Similar to Text')
    plt.suptitle("")
    plt.ylabel('Citations per Year')
    plt.xlabel('Year of Publication')
    plt.show()
import itertools
import numpy as np
import torch
from torch.utils.data import Dataset
import os
import json
from gensim.models.doc2vec import Doc2Vec
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]
model = Doc2Vec.load('./model')
# model = Doc2Vec(documents, vector_size=100, window=2, min_count=1, workers=4)
# from gensim.test.utils import get_tmpfile
#
# fname = get_tmpfile("my_doc2vec_model")
#
# model.save(fname)
# model = Doc2Vec.load(fname)  # you can continue training with the loaded model!
# model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
# vector = model.infer_vector(["0.11", "0.31"])
# print(vector)


class IntegerSortDataset(Dataset):
    def __init__(self,
placeholder_model_name = 'doc2vec_size_{}_w_{}_type_{}_concat_{}_mean_{}_trainwords_{}_hs_{}_neg_{}_vocabsize_{}_model_{}'.format(
    DOC2VEC_SIZE, DOC2VEC_WINDOW,
    'dm' if DOC2VEC_TYPE == 1 else 'pv-dbow',
    DOC2VEC_CONCAT, DOC2VEC_MEAN,
    DOC2VEC_TRAIN_WORDS,
    DOC2VEC_HIERARCHICAL_SAMPLE, DOC2VEC_NEGATIVE_SAMPLE_SIZE,
    str(DOC2VEC_MAX_VOCAB_SIZE),
    str(part_level) + '_' + part_name)

GLOBAL_VARS.DOC2VEC_MODEL_NAME = placeholder_model_name
placeholder_model_name = os.path.join(placeholder_model_name, "epoch_{}")

epoch = DOC2VEC_EPOCH
GLOBAL_VARS.MODEL_NAME = placeholder_model_name.format(epoch)

info("Loading Doc2vec model: {}".format(GLOBAL_VARS.MODEL_NAME))
doc2vec_model = Doc2Vec.load(os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME, MODEL_PREFIX),
                             mmap=DOC2VEC_MMAP)

info("Loading Validation Dict")
validation_dict = dict(
    pickle.load(
        gzip.open(
            os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME,
                         VALIDATION_DICT + GZIP_EXTENSION))))
info("Loading Test Dict")
test_dict = dict(
    pickle.load(
        gzip.open(
            os.path.join(doc2vec_model_save_location, GLOBAL_VARS.MODEL_NAME,
                         TEST_DICT + GZIP_EXTENSION))))
out_df = pd.DataFrame(static_dict, columns=static_dict.keys())
out_df.to_csv('/users/votta/code/penn_apps/similarity.csv')
out_df.to_parquet('/users/votta/code/penn_apps/similarity.parquet.gzip')


if __name__ == '__main__':
    """the rest of the model building happens here"""
    # scrape_all()
    df = pd.read_pickle("./test.pkl")
    docs = generate_tagged_docs(df)
    # model = Doc2Vec(docs, vector_size=30, window=5, min_count=2, workers=4)
    # model.save(D2V_MODEL_NAME)
    model = Doc2Vec.load(D2V_MODEL_NAME)
    # print_random_similarity(df, docs, model)
    generate_lookup_table(df, model)

    # Experiment with SEC data
    #
    # Initialize a downloader instance.
    # If no argument is passed to the constructor, the package
    # will attempt to locate the user's downloads folder.
    # dl = Downloader('/users/votta/code/penn_apps')
    # Get all 8-K filings for Apple (ticker: AAPL)
    # dl.get_8k_filings("AAPL")
    # Get all 8-K filings for Apple, including filing amends (8-K/A)
    # dl.get_8k_filings("AAPL", include_amends=True)
    # Get all 8-K filings for Apple before March 25, 2017
def _d2v_vector(request, model_dir):
    model = Doc2Vec.load(model_dir)
    vector = model.infer_vector(request)
    return vector
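# Hedged usage sketch for the helper above: the token list and model path are
# placeholders for a pre-tokenised request and a saved Doc2Vec model file.
# request_vec = _d2v_vector(['example', 'request', 'tokens'], 'models/doc2vec.model')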
from pyspark import SparkContext
from pyspark.sql import SQLContext
from gensim.models.doc2vec import Doc2Vec

sc = SparkContext()
sqlContext = SQLContext(sc)

# this is a large object; we cache it on each worker node
gmod_broadcast = sc.broadcast(Doc2Vec.load("/root/doc2vec/doc2vec_model/hn"))

df = sqlContext.read.load("hdfs:///hndata/parquet_typed", format="parquet")

ids = df.where("score IS NOT NULL") \
        .where("type='story'") \
        .where("title IS NOT NULL") \
        .map(lambda row: row.id)


def mergeVec(id):
    gmod = gmod_broadcast.value
    vec = gmod.docvecs["TITLE_%d" % id]
    return (id, vec)


docvecs = ids.map(mergeVec)
docvecs.saveAsPickleFile("hdfs:///hndata/docvecs_glove_pickle")
if __name__ == '__main__':
    global model

    # ----------- Parsing Arguments ---------------
    p = argparse.ArgumentParser()
    p.add_argument("--model", help="Path to the trained model")
    p.add_argument("--binary", help="Specifies the loaded model is binary")
    p.add_argument("--host", help="Host name (default: localhost)")
    p.add_argument("--port", help="Port (default: 5000)")
    p.add_argument("--path", help="Path (default: /word2vec)")
    args = p.parse_args()

    model_path = args.model if args.model else "./model.bin.gz"
    binary = True if args.binary else False
    host = args.host if args.host else "localhost"
    path = args.path if args.path else "/word2vec"
    port = int(args.port) if args.port else 5000
    if not args.model:
        print("Usage: word2vec-apy.py --model path/to/the/model [--host host --port 1234]")
    model = w.load(model_path)
    api.add_resource(N_Similarity, path + '/n_similarity')
    api.add_resource(Similarity, path + '/similarity')
    api.add_resource(MostSimilar, path + '/most_similar')
    api.add_resource(Model, path + '/model')
    api.add_resource(Infer, path + '/infer')
    api.add_resource(ModelWordSet, '/word2vec/model_word_set')
    app.run(host=host, port=port)
#!/usr/bin/python
# -*- coding: UTF-8 -*-

from gensim.models.doc2vec import Doc2Vec
import pykeyvi

docvecs_process_input_keyvi_index_file = "docvecs_urlid_url.kv"
output_data_path = "/raid/ankit/doc2vec/out_s_p_1M"
doc2vec_trained_model = 'pages_with_spaces.doc2vec'
_alpha, _min_alpha, _passes = (0.020, 0.001, 20)

print("Loading keyvi dictionaries ...")
keyvi_dict = pykeyvi.Dictionary("{}/{}".format(output_data_path, docvecs_process_input_keyvi_index_file))
print("Finished Loading key-vi Dictionary.")

print("Loading Doc2Vec Model ... ")
model = Doc2Vec.load("{}/{}".format(output_data_path, doc2vec_trained_model))
print("Model Loaded Successfully!")


def get_similar_urls(sample_query, nearest_num):
    tokens = sample_query.lower().split()
    # note: may want to use many more steps than default
    dv = model.infer_vector(tokens, alpha=_alpha, min_alpha=_min_alpha, steps=_passes)
    sims = model.docvecs.most_similar(positive=[dv], topn=nearest_num)
    for url_id, distance in sims:
        url = ""
        for m in keyvi_dict.Get(str(url_id)):
            url = m.GetValueAsString()
        print("{}\t{}\t{}".format(url_id, url, distance))


def main():
    print("\nSimilar URLS for Queries - Doc2Vec Retrieval Interface [All URL's]")
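# Hedged usage sketch for the retrieval helper above: the query string and
# neighbour count are illustrative, and the call assumes the model and keyvi
# index loaded at import time are available.
# get_similar_urls("sata link up 6.0 gbps", 10)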
    text = text.split()
    return text


r_data_loaded = random.sample(data_loaded, len(data_loaded))
r_samples = r_data_loaded[:100000]
df = pd.DataFrame(r_samples)
train_df = df

# Prepare embedding
vocabulary = dict()
inverse_vocabulary = ['<unk>']  # '<unk>' will never be used, it is only a placeholder for the [0, 0, ....0] embedding

word2vec_cfg = Doc2Vec.load(CFG_EMBEDDING_FILE)
word2vec = KeyedVectors.load(SOURCE_CODE_EMBEDDING_FILE)
# gensim.models.Word2Vec.load_word2vec_format('/data5/momo-projects/user_interest_classification/code/word2vec/vectors_groups_1105.bin', binary=True, unicode_errors='ignore')

code_clones_cols = ['code_clone1', 'code_clone2']

# Iterate over the questions only of both training and test datasets
for dataset in [train_df]:
    for index, row in dataset.iterrows():
        # Iterate through the text of both questions of the row
        for code_clone in code_clones_cols:
            q2n = []  # q2n -> question numbers representation
            for word in source_code_to_tokens(row[code_clone]):
                # Check for unwanted words
def load(cls, model_file='synset2vec'):
    model = Doc2Vec.load(model_file)
    return cls(model)
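# Hedged usage sketch: the classmethod above wraps a saved gensim model in its
# enclosing class; that class's name is not shown in this snippet, so
# SynsetModel below is only a placeholder.
# synset_model = SynsetModel.load('synset2vec')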
f = open('pckl_df_problemen_16.pkl', 'rb')
problemen = pickle.load(f)
f.close()

print('Fetching LSTVRZ from pickle file . . .')
f = open('pckl_LSTVRZ_16.pkl', 'rb')
lstVRZ = pickle.load(f)
f.close()

print('Fetching LSTVRZID from pickle file . . .')
f = open('pckl_LSTVRZID_16.pkl', 'rb')
lstVRZID = pickle.load(f)
f.close()

print('Loading the KOD model . . . ')
modelKOD = Doc2Vec.load("KOD DOC2VEC PROBLEMS 6 MAAND_16.model")
print('Loading the VRZ model . . . ')
modelVRZ = Doc2Vec.load("VRZ DOC2VEC PROBLEMS 6 MAAND_16.model")

# We now have the basis for the analysis.
# Here we assume that the incidents and problems that were supplied
# relate to the half-year period.
# The data sets now have the following shape:
# incidenten (incidents)
# 1. Incident number
# 2. Short description (Details)
# 3. Request
# 4. LSTVRZ
# 5. VRZ
# 6. LSTKOD
TEST_INPUT_DATA = 'test_input.npy'
DATA_CONFIGS = 'data_configs.json'
SEQ_CONFIGS = 'seq_configs_bt.json'

# Train label save file name
TRAIN_LABEL_DATA = 'train_label.npy'
TRAIN_LABEL_SMALL = 'train_label_small.npy'
TEST_LABEL_DATA = 'test_label.npy'
TEST_LABEL_SMALL = 'test_label_small.npy'

# pre-trained model load
d2v_model_name = './model_save/embedding_model/Doc2vec_new.model'
w2v_model_name = './model_save/embedding_model/Word2vec1.model'
pre_trained_name = './model_save/embedding_model/trained_word2vec1.model'

doc_vectorizer = Doc2Vec.load(d2v_model_name)
word_vectorizer = Word2Vec.load(w2v_model_name)
pre_trained_w2v = Word2Vec.load(pre_trained_name)

train_X = np.load(open(DATA_IN_PATH + TRAIN_INPUT_DATA, 'rb'))
test_X = np.load(open(DATA_IN_PATH + TEST_INPUT_DATA, 'rb'))

if label_size == 'big':
    train_Y = np.load(open(DATA_IN_PATH + TRAIN_LABEL_DATA, 'rb'))
    train_YS = tf.one_hot(train_Y, 43)
    test_Y = np.load(open(DATA_IN_PATH + TEST_LABEL_DATA, 'rb'))
    test_YS = tf.one_hot(test_Y, 43)
else:
    train_Y = np.load(open(DATA_IN_PATH + TRAIN_LABEL_SMALL, 'rb'))
    train_YS = tf.one_hot(train_Y, 455)
    test_Y = np.load(open(DATA_IN_PATH + TEST_LABEL_SMALL, 'rb'))
    n closest subject_ids
    """
    s = distance_mat.loc[subject_id]
    closest = s.sort_values()[1:1 + n]
    return closest


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="produce similarity matrix")
    parser.add_argument('dbname', help="Database name")
    parser.add_argument('path_to_model', help="Model to test")
    parser.add_argument('n_closest', help="How many closest subjects to look into")
    args = parser.parse_args()

    model = Doc2Vec.load(args.path_to_model)
    subject_hash = get_subject_hash(args.dbname)
    subject_ids = list(subject_hash.keys())

    # loop over subjects and average docvecs belonging to subject.
    # place in dictionary
    subject_vectors = get_subject_vectors(subject_ids)
    distance_mat = get_distance_mat(subject_vectors)

    to_csv = []
    for subj_id in subject_ids:
        relateds = get_n_closest(distance_mat, subj_id, n=int(args.n_closest))
        for related_id, dist in relateds.items():
            weight = round(1. / dist)
            # weight = round((1 - dist) * 10)
    model_DM.train(training_doc)
    model_DBOW.train(training_doc)

    # Save the trained models:
    fout = 'DM.d2v'
    model_DM.save(most_recent + fout)
    model_DM.init_sims(replace=True)
    fout = 'DBOW.d2v'
    model_DBOW.init_sims(replace=True)
    model_DBOW.save(most_recent + fout)
else:
    # Load Doc2Vec model from disk:
    fout = 'DM.d2v'
    model_DM = Doc2Vec.load(most_recent + fout)
    fout = 'DBOW.d2v'
    model_DBOW = Doc2Vec.load(most_recent + fout)

# train the two different methods of the Doc2Vec algorithm:
# NB DBOW is more similar to the recommended skip-gram of
# Word2Vec by the original paper's authors.

print('nonmatch', model_DM.doesnt_match("delay government flooding lightning".split()))
print('nonmatch', model_DM.doesnt_match("euref voteout remain lightning".split()))
print('euref sim by word', model_DM.similar_by_word('euref'))
print('flood ', model_DM.similar_by_word('flood'))
if deps_model_file != "": has_deps_embeddings = True logging.info("Loading dependency embeddings from %s" % deps_model_file) deps_model = Embeddings.load(deps_model_file+".npy", deps_model_file+".vocab") logging.info("Deps Model loaded!") #deps_vocabulary = deps_model._vocab #deps_embeddings = deps_model._vecs # Load Models here is_doc2vec_model = False # load word2vec word2vec_model if doc2vec_model_file != '': model = Doc2Vec.load(doc2vec_model_file) is_doc2vec_model = True else: if word2vec_load_bin: model = Word2Vec.load_word2vec_format(word2vec_model_file, binary=True) # use this for google vectors else: model = Word2Vec.load(word2vec_model_file) use_id_for_vector = use_id_for_vector and is_doc2vec_model word2vec_num_features = len(model.syn0[0]) logging.info("Embeddings feature vectors length:%s" % word2vec_num_features) logging.info("Model syn0 len=%d" % (len(model.syn0))) # define classes class_mapping = dict([(val, idx) for idx, val in enumerate(valid_senses)])
elif i >= 500:
    total_count_total += 1

posInfo.sort()
total_pos_tags = list(set([pos for sent in posInfo for pos in sent]))
print([pos + "." + str(i) for i, pos in enumerate(total_pos_tags)])
pos_tag_vector = []
for pos in total_pos_tags:
    pos_tag_vector.append([1 if p.count(pos) > 0 else 0 for p in posInfo])
for i, pos in enumerate(total_pos_tags):
    dtm = base.cbind(dtm, pos=pos_tag_vector[i])
print(vb_count, vb_count_total, total_count, total_count_total)
# dtm_syntax = base.cbind(dtm, class_label=problem_class_labels)
# waikatoWriteArff(base.data_frame(dtm_syntax), file="problem_syntax.arff", class_col="class_label")

print("doc2vec")
doc2vecVectors = []
doc2vecModel = Doc2Vec.load("/home/kh562/Corpora/MODELS/acl_sent_doc2vec.model")
for s in problem_strings + non_problem_strings:
    doc2vecVectors.append(doc2vecModel.infer_vector(s.split()))
for i in range(0, len(doc2vecVectors[0])):
    dtm = base.cbind(dtm, doc2vec=list(float(docVec[i]) for docVec in doc2vecVectors))
# dtm_doc2vec = base.cbind(dtm, class_label=problem_class_labels)
# waikatoWriteArff(base.data_frame(dtm_doc2vec), file="problem_doc2vec.arff", class_col="class_label")

print("word2vec")
word2vec_model = Word2Vec.load("/home/kh562/Corpora/MODELS/fuse_word2vec.model")
word2vec_vector = []
for [head, pos] in problem_heads + non_problem_heads:
    try:
        word2vec_vector.append(word2vec_model[head])
    except:
        word2vec_vector.append(np.array([0] * 100, dtype=np.float32))
# model.build_vocab(tuples_list)
#
# for epoch in range(20):
#     print('iteration {0}'.format(epoch))
#     model.train(tuples_list,
#                 total_examples=model.corpus_count,
#                 epochs=model.iter)
#     # decrease the learning rate
#     model.alpha -= 0.002
#     # fix the learning rate, no decay
#     model.min_alpha = model.alpha
#
# model.save("d2v.model")
# print("Model Saved")

model = Doc2Vec.load("d2v.model")

# Get sentences embeddings of train and test data
docs_sentence_embeddings = np.zeros((len(docs_list), 50))
for i in range(len(docs_sentence_embeddings)):
    docs_sentence_embeddings[i] = model.docvecs[str(i)]

print(docs_sentence_embeddings.shape)

# x_train, x_test, y_train, y_test = train_test_split(docs_sentence_embeddings, labels, test_size=0.20, random_state=1)
#
# Train model
import pandas as pd
from gensim.models.doc2vec import Doc2Vec
from scipy.spatial import distance


def calculate_cosine_similarity(u, v):
    return 1 - distance.cosine(u, v)


def predict(model, item1, item2, threshold=0.5):
    vec1 = model.docvecs[item1]
    vec2 = model.docvecs[item2]
    return int(calculate_cosine_similarity(vec1, vec2) > threshold)


vector_size = '100-better-data-window-9'
model = Doc2Vec.load(f'models/doc2vec-{vector_size}.model')
threshold = 0.7

df_test = pd.read_csv('data/valid.csv')
df_test['temp_ltable_id'] = 'A_' + df_test['ltable_id'].astype(str)
df_test['temp_rtable_id'] = 'B_' + df_test['rtable_id'].astype(str)
df_test['label'] = df_test.apply(
    lambda row: predict(model, row['temp_ltable_id'], row['temp_rtable_id'], threshold), axis=1)
df_test = df_test[['ltable_id', 'rtable_id', 'label']]
df_test.to_csv(f'results/submission-{vector_size}.csv', index=False)
def __init__(self):
    self.model = Doc2Vec.load('./model.d2v')
    self.st = LancasterStemmer()
import os
import pandas as pd
import numpy as np
import sanalytics.algorithms.utils as sau
from gensim.models.doc2vec import Doc2Vec
from progressbar import progressbar
import re
import random
from glob import glob

print("READING D2V")

## Read D2V Model
d2v = Doc2Vec.load("datasets/rq3_d2v/sec1.0R100.model")

## Read D2V Model All
d2vall = Doc2Vec.load("datasets/rq3_d2v/sec1.0R100_all.model")

print("LOADED D2V")

## Read files
while True:
    files = list(os.walk("datasets/rq3_dataR100"))[0][2]
    filename = random.sample(files, 1)[0]
    if filename in set([".".join(i.split(".")[:-1]) for i in list(os.walk("datasets/rq3_vecdata_newR100"))[0][2]]):
        continue
    print("start {}".format(filename))
    X = pd.read_parquet("datasets/rq3_dataR100/{}".format(filename))
    if "all" not in filename:
        X["d2v"] = [d2v.infer_vector("{} {} {}".format(i.title, i.question, i.answers).split()) for i in progressbar(X.itertuples())]
    if "all" in filename:
import sys
import jieba
import gensim
import pandas as pd
from gensim.models.doc2vec import Doc2Vec
from Segmentation import *


def similarity(a_vect, b_vect):
    # compute the cosine of the angle between two vectors
    dot_val = 0.0
    a_norm = 0.0
    b_norm = 0.0
    cos = None
    for a, b in zip(a_vect, b_vect):
        dot_val += a * b
        a_norm += a ** 2
        b_norm += b ** 2
    if a_norm == 0.0 or b_norm == 0.0:
        cos = -1
    else:
        cos = dot_val / ((a_norm * b_norm) ** 0.5)
    return cos


model = Doc2Vec.load(sys.path[0] + '/model/modeltest', mmap='r')

# infer vectors for the texts
# model.random.seed(0)
Vector1 = model.infer_vector(['新华社', '报道', '出现', '偏差'], steps=500, alpha=0.025)
Vector2 = model.infer_vector(['新华社', '的', '报道', '出现', '错误'], steps=500, alpha=0.025)
Vector3 = model.infer_vector(['今天', '的', '天气', '非常', '好'], steps=500, alpha=0.025)

print(similarity(Vector1, Vector2))
print(similarity(Vector1, Vector3))
def get_distances_subset(n_closest, category_hash_with_doc_ids, csv_path):
    # example
    # category_hash_with_doc_ids = {"cat1": ["us-1", "us-2"], "cat2": ["us-3"]}

    # loop over subjects and average docvecs belonging to subject.
    # place in dictionary
    model = Doc2Vec.load('../doc2vec_model')
    cpc_vectors = get_category_vectors_subset(model, category_hash_with_doc_ids)
    distance_mat = get_distance_mat(cpc_vectors)

    to_csv = []
    for subj_id in list(category_hash_with_doc_ids.keys()):
        relateds = get_n_closest(distance_mat, subj_id, n=n_closest)
        for related_id, dist in relateds.items():
            weight = round(1. / dist)
            # weight = round((1 - dist) * 10)
            row = (subj_id, related_id, weight, subj_id, related_id)
            to_csv.append(row)

    edges = pd.DataFrame(to_csv, columns=['source', 'target', 'weight', 'source_name', 'target_name'])
    edges.to_csv(csv_path, index=False)


if __name__ == '__main__':
    print("main")
    model = Doc2Vec.load('../doc2vec_model')
    # db = MongoClient()
    # get_distances(db, model, int(sys.argv[1]))
    get_distances_subset(5, data, '../static/subject_distances1.csv')
def mkExistingTrainedModel(path):
    return Doc2Vec.load(path)
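# Hedged usage sketch: the path below is a placeholder for an existing trained
# model file, and the tokens are illustrative only.
# d2v = mkExistingTrainedModel('models/my_doc2vec.model')
# vec = d2v.infer_vector(['some', 'tokens'])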
def kaifang(self):
    sen = self.get_input()
    vec = self.sen2vec(self.model, sen)
    siml = self.sim(vec, self.num, self.ids)
    print(siml)
    siml_2 = []
    for sim in siml:
        siml_2.append(str(100 / (100 + sim[0])) + '\n' + self.ana2print(self.ids, sim[1] + 1))
    return siml_2


if __name__ == '__main__':
    out_1 = open('output_1.txt', 'w', encoding='utf-8')
    out_2 = open('output_2.txt', 'w', encoding='utf-8')

    # load the trained doc2vec model
    mod = Doc2Vec.load('model_4.0.1.md')

    # read the input file and find the closest prescriptions
    # Disease(input file, doc2vec model, number of results to return)
    # returns a list of [index, similarity, description]
    zd = Disease('input.txt', mod, 5)
    for line in zd.output():
        print(line)
        out_1.write('Score: ' + str(line[1]) + '\n')
        out_1.write(line[2] + '\n')
        out_1.write('\n')

    # classify the input: NaiveBayesPredict(input file, pre-trained naive Bayes model, i.e. the probability matrix)
    # returns the id of its class (currently between 2 and 23)
    nbp = NaiveBayesPredict('input.txt', 'result.model')
    classify = int(nbp.predict())