def gene():
    modelpre = Word2Vec.load('corpus/pretrain40.model')
    modelfield = Word2Vec.load('corpus/fieldtrained40.model')
    modelmerged = Word2Vec.load('corpus/mergedtrained40iter1.model')
    xlist = []
    ylist = []
    zlist = []
    labellist = []
    upperline = 0.016
    floor = 0.008  # 0.01 0.013
    upperlinefield = 0.06
    floorfield = 0.02
    upperlinepre = 0.019
    floorpre = 0.018
    with open('corpus/word2pic2.txt') as fp:
        for row in fp:
            word = unicode(row[:-1])
            # Polarity score per model: mean similarity to the positive words
            # minus mean similarity to the negative words.
            x = (modelmerged.similarity(word, u"好") + modelmerged.similarity(word, u"快乐") + modelmerged.similarity(word, u"开心")) / 3.0 \
                - (modelmerged.similarity(word, u"坏") + modelmerged.similarity(word, u"悲伤")) / 2.0
            y = (modelfield.similarity(word, u"好") + modelfield.similarity(word, u"快乐") + modelfield.similarity(word, u"开心")) / 3.0 \
                - (modelfield.similarity(word, u"坏") + modelfield.similarity(word, u"悲伤")) / 2.0
            z = (modelpre.similarity(word, u"好") + modelpre.similarity(word, u"快乐") + modelpre.similarity(word, u"开心")) / 3.0 \
                - (modelpre.similarity(word, u"坏") + modelpre.similarity(word, u"悲伤")) / 2.0
            labellist.append(word)
            # xlist.append(x - (upperline + floor) / 2.0)
            xlist.append(x - 0.016)
            ylist.append(y - (upperlinefield + floorfield) / 2.0)
            zlist.append(z - (upperlinepre + floorpre) / 2.0)
    # with open('corpus/word2picxyz.txt', 'w') as fp:
    #     pickle.dump((labellist, xlist, ylist, zlist), fp)
    return labellist, xlist, ylist, zlist

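# Hypothetical usage sketch (not part of the original file): scatter the
# per-model polarity coordinates returned by gene() in 3D and label each
# point with its word. Assumes matplotlib with 3D support is installed.
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 (registers the 3d projection)

labels, xs, ys, zs = gene()
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(xs, ys, zs)
for label, x, y, z in zip(labels, xs, ys, zs):
    ax.text(x, y, z, label)
ax.set_xlabel('merged model polarity')
ax.set_ylabel('field model polarity')
ax.set_zlabel('pretrained model polarity')
plt.show()
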
def dis(vectorsize):
    # print model.similarity("今天","在")
    model = Word2Vec.load('corpus/mergedtrained' + str(vectorsize) + 'iter1' + '.model')
    modelfield = Word2Vec.load('corpus/fieldtrained' + str(vectorsize) + '.model')
    print model.similarity(u"分手", u"好")
    print model.similarity(u"分手", u"坏")
    print modelfield.similarity(u"分手", u"好")
    print modelfield.similarity(u"分手", u"坏")

def main():
    # te()
    # teword()
    # intersect(40)
    # setwordwindow(40)
    # Word2Vec.load_word2vec_format('corpus/initindex40', binary=False)
    modelpre = Word2Vec.load('corpus/pretrain40.model')
    modelfield = Word2Vec.load('corpus/fieldtrained40.model')
    modelmerged = Word2Vec.load('corpus/mergedtrained40iter1.model')
    print 'finish load'
    classify(modelpre, modelfield, modelmerged, 40)

def create_partition_function(self, f_w2v, f_h5):
    print "Building the partition function"

    # Load the model from disk
    M = Word2Vec.load(f_w2v)
    words = M.index2word

    ZT = []
    INPUT_ITR = tqdm.tqdm(words)

    # Compute the partition function for each word
    for w in INPUT_ITR:
        UE = self.energy(M.syn0, M[w])
        z = compute_partition_stats(UE)
        ZT.append(z)

    # Save the partition function to disk
    # (special care needed for h5py unicode strings)
    dt = h5py.special_dtype(vlen=unicode)
    with h5py.File(f_h5, 'w') as h5:
        h5.create_dataset("words", (len(words),),
                          dtype=dt,
                          data=[w.encode('utf8') for w in words])
        h5.attrs['vocab_N'] = len(words)
        h5['Z'] = ZT

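# The helpers energy and compute_partition_stats are defined elsewhere in
# this project. A minimal sketch of one plausible reading, assuming the
# energy is the dot product of every word vector with the query vector and
# the partition statistic is its log-sum-exp; the real project may differ.
import numpy as np
from scipy.special import logsumexp

def energy(syn0, v):
    # One energy value per vocabulary word (bound as self.energy on the class).
    return syn0.dot(v)

def compute_partition_stats(UE):
    # log Z = log sum_w exp(E_w), computed stably.
    return logsumexp(UE)
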
def main():
    industry = sys.argv[1]
    vocab_file = "../data/" + industry + "/embed_vocab"
    model_file = "../data/" + industry + "/user_model"

    # load vocab list
    with open(vocab_file) as f:
        vocab_list = map(str.strip, f.readlines())

    # load model
    model = Word2Vec.load(model_file)

    # build vocab index dict
    vob_index_dict = {}
    for i, vob in enumerate(vocab_list):
        vob_index_dict[vob] = i

    # calc vocab dist
    logging.info("calculating vocab dist matrix")
    dm = get_vocab_dist_matrix(vocab_list, model)

    # get company domain list dict
    comp_domain_file = "../data/" + industry + "/company_file"
    comp_dict = get_comp_dict(comp_domain_file)
    logging.info("company dict generated : " + str(comp_dict.keys()))

    # delete domains that do not exist in the vocab list
    filter_company_by_vocab(comp_dict, vocab_list)

    # filter company domains by uv: default uv > 100
    filter_action_by_uv(comp_dict, 100)

    # calc dist between two companies
    res_file = "../data/" + industry + "/company_dist"
    calc_company_dist(res_file, comp_dict, dm, vob_index_dict)

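# get_vocab_dist_matrix is not shown in this snippet. A hypothetical sketch,
# assuming it returns a dense pairwise cosine-distance matrix over the
# vocabulary computed from the model's vectors; the original may differ.
import numpy as np
from scipy.spatial.distance import pdist, squareform

def get_vocab_dist_matrix(vocab_list, model):
    vecs = np.array([model[w] for w in vocab_list])
    return squareform(pdist(vecs, metric="cosine"))
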
def __init__(self):
    '''
    Training parameters:
    '''
    self.w2v_dim = 100
    self.num_feature = 400
    self.batch_size = 16
    self.num_epoch = 30

    # self.w2v_model = Word2Vec.load_word2vec_format('./data/word2vec/GoogleNews-vectors-negative300.bin', binary=True)
    self.w2v_model = Word2Vec.load('./data/word2vec/w2v.model')
    self.index2word_set = set(self.w2v_model.index2word)

    # self.bigram = None
    # self.trigram = None
    self.bigram = Phrases.load('./data/bigram.dat')
    self.trigram = Phrases.load('./data/trigram.dat')

    print('Build model...')
    self.model = Sequential()
    self.model.add(Dropout(0.2, input_shape=(self.num_feature,)))
    self.model.add(Dense(3, input_dim=self.num_feature, init='orthogonal'))
    self.model.add(Activation('softmax'))
    self.model.compile(loss='categorical_crossentropy',
                       optimizer='adam',
                       class_mode="categorical")
    print('Model has been built!')

def initialize(self):
    sys.stdout.write("Metric initialization\n")

    sys.stdout.write("1 - Word2vec model")
    self.model = Word2Vec.load(model_path)
    sys.stdout.write("...loaded\n")

    sys.stdout.write("2 - Stop words")
    self.stop_words = [line.strip('\n') for line in open(stop_words_path)]
    sys.stdout.write("...loaded\n")

    sys.stdout.write("3 - Word-Averages model: ")
    self.wordAverages = defaultdict()
    for i in self.files_list:
        sys.stdout.write(str(i) + " - ")
        sys.stdout.flush()
        tweetsFile = tweets_path + str(i) + ".csv"
        wAvgsFile = wAvgs_path + str(i) + ".csv"
        tweets = []
        values = []
        with open(tweetsFile, 'r') as f1:
            tweets = f1.readlines()
        with open(wAvgsFile, 'r') as f2:
            reader = csv.reader(f2)
            for r in reader:
                values.append(np.array([float(v) for v in r]))
        for j in range(len(tweets)):
            self.wordAverages[tweets[j].strip('\n')] = values[j]
    sys.stdout.write("loaded\n")

def get_predict_vecs(words):
    n_dim = 300
    imdb_w2v = Word2Vec.load('svm_data/w2v_model/w2v_model.pkl')
    # imdb_w2v.train(words)
    train_vecs = buildWordVector(words, n_dim, imdb_w2v)
    # print train_vecs.shape
    return train_vecs

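# buildWordVector is defined elsewhere in this project. In this familiar
# SVM/Word2Vec sentiment recipe it conventionally averages the vectors of
# the in-vocabulary words; a minimal sketch under that assumption:
import numpy as np

def buildWordVector(text, size, imdb_w2v):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in text:
        try:
            vec += imdb_w2v[word].reshape((1, size))
            count += 1.
        except KeyError:
            # skip out-of-vocabulary words
            continue
    if count != 0:
        vec /= count
    return vec
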
def load_external(self, model_file_name):
    """
    Load a word2vec model from the file specified.

    :param model_file_name: name of the model file
    :return:
    """
    self.model = Word2Vec.load(model_file_name)

def __init__(self, *args, **kwargs):
    super(affinity_mapping, self).__init__(*args, **kwargs)

    # Load the model from disk
    self.M = Word2Vec.load(kwargs["f_w2v"])
    self.shape = self.M.syn0.shape

    # Set parallel option
    self._PARALLEL = ast.literal_eval(kwargs["_PARALLEL"])
    self.damping = float(kwargs["damping"])

    if not os.path.exists(kwargs["f_affinity"]):
        h5 = h5py.File(kwargs["f_affinity"], 'w')
        h5.close()
    self.h5 = h5py.File(kwargs["f_affinity"], 'r+')

    global damping, M
    damping = self.damping
    M = self.M

    self.vocab_n = len(M.index2word)
    M.word2index = dict([(w, i) for w, i in zip(M.index2word, range(self.vocab_n))])

    # Increment this as we find more clusters
    self.cluster_n = 0

def build_word_graph(model_fname, limiar=0.2):
    """
    Builds a word graph weighted by the similarity between words
    according to the model.
    :param model_fname: name of the file the word2vec model was saved to
    :return: graph object
    """
    m = Word2Vec.load(model_fname)
    g = Graph()
    freq = g.new_vertex_property("int")
    weight = g.new_edge_property("float")
    i = 0
    vdict = {}
    for w1, w2 in combinations(m.vocab.keys(), 2):
        if w1 == '' or w2 == '':
            continue
        # print(w1, w2)
        v1 = g.add_vertex() if w1 not in vdict else vdict[w1]
        vdict[w1] = v1
        freq[v1] = m.vocab[w1].count
        v2 = g.add_vertex() if w2 not in vdict else vdict[w2]
        vdict[w2] = v2
        freq[v2] = m.vocab[w2].count
        sim = m.similarity(w1, w2)
        if sim > 0.1:
            e = g.add_edge(v1, v2)
            weight[e] = sim
        if i > 10000:
            break
        i += 1
    g.vertex_properties['freq'] = freq
    g.edge_properties['sim'] = weight
    return g

def __init__(self, word2vec_path=""):
    self.sentence = []
    self.tfidf_sparse = []
    self.bi_set = [-1 for i in range(1000000)]
    self.tfidf_model_dict = {}
    if word2vec_path != "":
        self.word2vec_model = Word2Vec.load(word2vec_path)

def vectorize(model_file, dictionary_file, corpus_file):
    seterr(all='raise')  # don't ignore numpy errors

    # load model, dictionary, corpus and tf-idf weights from the given files
    model = Word2Vec.load(model_file)
    d = corpora.Dictionary.load(dictionary_file)
    corpus = corpora.MmCorpus(corpus_file)
    tf = models.TfidfModel(corpus)

    vectorize = []
    for doc_no, tdoc in enumerate(tf[corpus]):
        tdoc.sort(key=lambda kv: kv[1], reverse=True)
        if doc_no % 100 == 0:
            logger.info("PROGRESS: vectorizing user #%i of %i" % (doc_no, len(corpus)))
        words_per_user = 8
        word_vecs = []
        for wordid, measure in tdoc:
            word = d[wordid]
            if word in model:
                word_vecs.append(model[word])
                print word
            if len(word_vecs) >= words_per_user:
                break
        if len(word_vecs) == words_per_user:
            avg = matutils.unitvec(array(word_vecs).mean(axis=0)).astype(REAL)
            vectorize.append(avg)
            # print [word for word, measure in model.most_similar_from_array(avg, topn=5)]
    return vectorize

def term_expansion(fpath, terms, knn):
    '''Expand a term list by collecting the nearest neighbors of each term
    in the provided embedding representation. This is usually very noisy and
    there is a fuzzy distinction between semantic similarity and
    "relatedness". Bacteria names, for example, often neighbor the diseases
    caused by those organisms.
    '''
    model = Word2Vec.load(fpath)
    model.init_sims()
    nbrs = NearestNeighbors(n_neighbors=knn + 1,
                            algorithm='ball_tree',
                            metric='l2')
    nbrs.fit(model.syn0norm)

    expansion = []
    for phrase in terms:
        # space replaced with underscore in PMC/PubMed embeddings
        phrase = phrase.replace(" ", "_")
        if phrase not in model.vocab:
            continue
        idx = model.vocab[phrase].index
        vec = model.syn0norm[idx]
        _, indices = nbrs.kneighbors(vec)
        neighbors = [model.index2word[j] for j in indices.flatten()]
        neighbors.remove(phrase)
        expansion += neighbors

    # transform words back to whitespace separators
    return map(lambda x: x.replace("_", " "), expansion)

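# Hypothetical usage: expand a small seed list with the 10 nearest
# neighbors of each term in a PubMed-style embedding. The model path and
# seed terms below are placeholders, not values from the original project.
seeds = ["staphylococcus aureus", "antibiotic resistance"]
expanded = term_expansion("pubmed_embeddings.model", seeds, knn=10)
print(expanded)
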
def __init__(self, *args, **kwargs):
    super(generic_document_score, self).__init__(*args, **kwargs)

    f_w2v = os.path.join(
        kwargs["embedding"]["output_data_directory"],
        kwargs["embedding"]["w2v_embedding"]["f_db"],
    )

    # Load the model from disk
    self.M = Word2Vec.load(f_w2v)
    self.shape = self.M.syn0.shape

    # Build the dictionary
    vocab_n = self.shape[0]
    self.word2index = dict(zip(self.M.index2word, range(vocab_n)))

    # Set parallel option (currently does nothing)
    self._PARALLEL = kwargs["_PARALLEL"]

    # Load the negative weights
    if "negative_weights" in kwargs:
        neg_W = kwargs["negative_weights"]
        self.neg_W = dict((k, float(v)) for k, v in neg_W.items())
    else:
        self.neg_W = {}

def query_word_similarity(model_file, word1, word2):
    seterr(all='raise')  # don't ignore numpy errors

    # load model from the given file
    model = Word2Vec.load(model_file + '.model')
    similarity = model.similarity(word1, word2)
    logging.info("similarity of '%s' and '%s' is %f" % (word1, word2, similarity))

def __init__(self, *args, **kwargs):
    '''
    Computes various measures of central tendency of a document.
    For Z_X scores, the raw word tokens are summed over the partition
    function. For I_X scores, the same statistics are computed over the
    similarity of all word pairs for words with top 10% Z values.

    This will precompute the partition function if it doesn't exist.
    '''
    cfg_embed = kwargs["embedding"]
    cfg_score = kwargs["score"]

    f_w2v = os.path.join(
        cfg_embed["output_data_directory"],
        cfg_embed["w2v_embedding"]["f_db"],
    )

    f_partition_function = os.path.join(
        cfg_embed["output_data_directory"],
        cfg_score["document_log_probability"]["f_partition_function"],
    )

    if not os.path.exists(f_partition_function):
        self.create_partition_function(f_w2v, f_partition_function)

    self.Z = self.load_partition_function(f_partition_function)
    self.scores = []

    val = cfg_score["document_log_probability"]["intra_document_cutoff"]
    self.intra_document_cutoff = float(val)

    self.model = Word2Vec.load(f_w2v)

def main():
    parser = argparse.ArgumentParser(description='Python Word2Vec Cluster')
    parser.add_argument('model',
                        action='store',
                        help='Name of word2vec binary modelfile.')
    parser.add_argument('-o', '--out',
                        action='store',
                        default='model.pkl',
                        help='Set output filename.')
    parser.add_argument('-k', '--K',
                        action='store',
                        type=int,
                        default=500,
                        help='Num of classes on KMeans.')
    parser.add_argument('-p', '--pre-trained-model',
                        action='store',
                        default=None,
                        help='Use pre-trained KMeans Model.')
    parser.add_argument('-w', '--words-to-pred',
                        action='store',
                        nargs='+',
                        type=str,
                        default=None,
                        help='List of words to predict.')
    args = parser.parse_args()

    model = Word2Vec.load(args.model)
    if not args.pre_trained_model:
        X = make_dataset(model)
        classifier = train(X, args.K)
        joblib.dump(classifier, args.out)
        reduced = reduce_dems(X)
        plot(classifier, reduced)
    else:
        classifier = joblib.load(args.pre_trained_model)

    if args.words_to_pred:
        X = [model[word] for word in args.words_to_pred if word in model]
        classes = classifier.predict(X)

        result = []
        i = 0
        for word in args.words_to_pred:
            if word in model:
                result.append(str(classes[i]))
                i += 1
            else:
                result.append(str(-1))
        print(' '.join(result))

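# make_dataset, train, reduce_dems and plot are defined elsewhere in this
# script. Minimal sketches of the first two, assuming make_dataset stacks
# every word vector and train fits KMeans with K clusters; the originals
# may differ.
import numpy as np
from sklearn.cluster import KMeans

def make_dataset(model):
    # One row per vocabulary word, in the model's index order.
    return np.array([model[w] for w in model.index2word])

def train(X, K):
    classifier = KMeans(n_clusters=K)
    classifier.fit(X)
    return classifier
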
def wordclasscification():
    model = Word2Vec.load('corpus/mergedtrained40iter1.model')
    modelfield = Word2Vec.load('corpus/fieldtrained40.model')
    modelpre = Word2Vec.load('corpus/pretrain40.model')
    # wordlist=[u"喝酒",u"竞赛",u"原生",u"警察",u"离婚",u"单身"]
    with open('corpus/wordlabelcorpuslarge.txt') as fp:
        with open('corpus/wordneulabelsepe3', 'w') as file:
            for i in fp:
                # print i[:-1]
                try:
                    word = unicode(i[:-1])
                    upperline = 0.016
                    floor = 0.008  # 0.01 0.013
                    upperlinefield = 0.06
                    floorfield = 0.02
                    upperlinepre = 0.019
                    floorpre = 0.018
                    try:
                        sub = (model.similarity(word, u"好") + model.similarity(word, u"快乐") + model.similarity(word, u"开心")) / 3.0 \
                            - (model.similarity(word, u"坏") + model.similarity(word, u"悲伤")) / 2.0
                        if sub > upperline:
                            modellabel = 1
                        elif sub < floor:
                            modellabel = -1
                        else:
                            modellabel = 0

                        sub = (modelfield.similarity(word, u"好") + modelfield.similarity(word, u"快乐") + modelfield.similarity(word, u"开心")) / 3.0 \
                            - (modelfield.similarity(word, u"坏") + modelfield.similarity(word, u"悲伤")) / 2.0
                        if sub > upperlinefield:
                            modelfieldlabel = 1
                        elif sub < floorfield:
                            modelfieldlabel = -1
                        else:
                            modelfieldlabel = 0

                        sub = (modelpre.similarity(word, u"好") + modelpre.similarity(word, u"快乐") + modelpre.similarity(word, u"开心")) / 3.0 \
                            - (modelpre.similarity(word, u"坏") + modelpre.similarity(word, u"悲伤")) / 2.0
                        if sub > upperlinepre:
                            modelprelabel = 1
                        elif sub < floorpre:
                            modelprelabel = -1
                        else:
                            modelprelabel = 0

                        file.write(i[:-1] + ' ' + str(modellabel) + ' ' + str(modelfieldlabel) + ' ' + str(modelprelabel) + '\n')
                    except KeyError:
                        print 'no key'
                        continue
                except UnicodeDecodeError:
                    print 'unicode error'
                    continue

def main():
    # te()
    # teword()
    # intersect(40)
    # setwordwindow(40)
    # Word2Vec.load_word2vec_format('corpus/initindex40', binary=False)
    model = Word2Vec.load('corpus/mergedtrained40iter1.model')
    # dis() expects the vector size, not the model object
    dis(40)

def intersect(vectorsize):
    model = Word2Vec.load('corpus/fieldtrained' + str(vectorsize) + '.model')
    # setwordwindow(vectorsize)
    print 'finish load'
    model.intersect_word2vec_format('corpus/initindex' + str(vectorsize), binary=False)
    print 'finish intersect'
    model.save('corpus/merged' + str(vectorsize) + '.model')
    model.save_word2vec_format('corpus/merged' + str(vectorsize), binary=False)
    print 'finish save'

def fieldtrain(vectorsize):
    model = Word2Vec.load('corpus/pretrain' + str(vectorsize) + '.model')
    print 'finish load'
    sentences = LineSentence('corpus/fieldcorpus')
    model.train(sentences)
    print 'finish fieldtrain'
    model.save('corpus/fieldtrained' + str(vectorsize) + '.model')
    model.save_word2vec_format('corpus/fieldtrained' + str(vectorsize), binary=False)
    print 'finish save'

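# Hedged driver sketch for the pipeline these two functions imply: continue
# training the pretrained model on the field corpus, then intersect the
# result with the exported index vectors. The call order is an assumption
# inferred from the file names, not confirmed by the source.
def pipeline(vectorsize=40):
    fieldtrain(vectorsize)  # corpus/pretrainN.model -> corpus/fieldtrainedN.model
    intersect(vectorsize)   # corpus/fieldtrainedN.model -> corpus/mergedN.model
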
def __init__(self, num_topics, window_size, dim_size,
             model_folder='../Data/models'):
    models_file_template = model_folder + "/{model}_{run_id}.{filetype}"
    self._run_id = "K{topics}_W{window}_D{dims}".format(topics=num_topics,
                                                        window=window_size,
                                                        dims=dim_size)
    w2v_filename = models_file_template.format(model='w2v',
                                               run_id=self._run_id,
                                               filetype='gensim')
    gmm_filename = models_file_template.format(model='gmm',
                                               run_id=self._run_id,
                                               filetype='pkl')

    self._w2v_model = Word2Vec.load(w2v_filename)
    self._gmm_model = joblib.load(gmm_filename)
    self.index2word = self._w2v_model.index2word

def main2():
    bow = cPickle.load(open('data/bow.pkl'))
    M = 10
    m = Word2Vec.load('data/word2vecmodels/model%d.mm' % (M))
    word_to_vec = np.array([m[bow[i]] for i in xrange(len(bow))])
    # word_to_vec = cPickle.load(open('data/word_to_vec_pkl'))
    # print word_to_vec[0][5]
    # print word_to_vec[0][6]
    np.savetxt('output/word2vec_vectors.10d', word_to_vec, delimiter=' ')

def get_model(model_num, model_names):
    if model_num < 10:
        model = Word2Vec.load(model_path + model_names)
    elif model_num < 99:
        model = Doc2Vec.load(model_path + model_names)
    else:
        # C binary format
        model = Word2Vec.load_word2vec_format(model_path + model_names,
                                              binary=True)
    return model

def retrain(orig_model_name, sentences, corpus_name, iter=10):
    orig_model_path = special_dir / orig_model_name
    model = Word2Vec.load(orig_model_path.as_posix())
    nb_sentences = len(sentences)
    (special_dir / corpus_name).mkdir(exist_ok=True)
    for i in range(1, iter + 1):
        dest_name = "{}_{}_{}".format(orig_model_name, corpus_name, i)
        dest_path = special_dir / corpus_name / dest_name
        model.train(sentences, total_examples=nb_sentences)
        model.save(dest_path.as_posix())

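# Hypothetical usage: retrain an existing model for 5 extra passes over a
# new corpus read with LineSentence. The file names are placeholders, and
# special_dir is assumed to be the pathlib.Path defined elsewhere in this
# module.
from gensim.models.word2vec import LineSentence

sentences = list(LineSentence("new_corpus.txt"))
retrain("base_w2v.model", sentences, "new_corpus", iter=5)
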
def filterLists(engList, freList):
    freModel = Word2Vec.load("../models/defFrePunct.model")
    engRetList = []
    freRetList = []
    for engWord, freWord in zip(engList, freList):
        try:
            # keep the pair only if the French word is in the model's vocabulary
            freModel[freWord.lower()]
            engRetList.append(engWord)
            freRetList.append(freWord)
        except KeyError:
            continue
    return engRetList, freRetList

def __init__(self, train_data, dev_data, test_data):
    self.train_data = train_data
    self.dev_data = dev_data
    self.test_data = test_data

    # Hyper-parameters
    self.learningRate = 0.01
    self.trainSize = 2000
    self.testSize = 1000
    self.totalSize = self.trainSize + self.testSize
    self.maxEpochs = 10000
    self.num_processed = -1
    self.w2v_model = Word2Vec.load('./data/word2vec/w2v.model')

def load_embeddings():
    '''
    Loads the gensim word embedding model.
    '''
    config = simple_config.load("embedding")

    from gensim.models.word2vec import Word2Vec
    f_w2v = os.path.join(
        config["output_data_directory"],
        config["w2v_embedding"]["f_db"],
    )
    return Word2Vec.load(f_w2v)

def getDistRep(words, modelPath, dims):
    '''
    Takes a list of words and returns the distributed representation of the
    words according to the model provided.
    '''
    # Load model
    model = Word2Vec.load(modelPath)
    numWords = len(words)
    retMat = np.zeros((numWords, dims))
    for idx, word in enumerate(words):
        # print word, chardet.detect(word)
        retMat[idx] = model[word]
    return retMat

def word_2_vec(self):
    print 'vectorization start'

    '''Format the data'''
    sentences = []
    sentences_word = []
    with open(self.__input_url, 'r') as f:
        for line in f.readlines():
            line = line.strip().decode('utf-8')  # drop the trailing '\n'
            sentences.append(line)
            line = line.split(' ')
            sentences_word.append(line)

    '''Build the tf-idf matrix'''
    vectorizer = TfidfVectorizer()
    tf_idf = vectorizer.fit_transform(sentences)
    word = vectorizer.get_feature_names()  # the list of terms
    tf_idf = tf_idf.toarray()  # the array of tf-idf values
    print 'sentences: %d terms: %d' % (len(sentences_word), len(word))

    '''Collect the tf-idf value of every word in every sentence'''
    sentences_tf_idf = []
    for si, sv in enumerate(sentences_word):
        sentences_tf_idf.append([])  # one tf-idf list per sentence
        for wi, wv in enumerate(sv):
            if wv in word:
                sentences_tf_idf[si].append(tf_idf[si][word.index(wv)])
            else:
                sentences_tf_idf[si].append(0.0)
        # print si + 1, len(sentences_tf_idf[si]), sentences_tf_idf[si]

    '''Train the word-vector model'''
    word2vec_size = 2000
    if not os.path.exists('./source/model'):
        os.mkdir('./source/model')
    if os.path.exists('./source/model/word.model'):
        model = Word2Vec.load('./source/model/word.model')
        print 'model loaded success'
    else:
        model = Word2Vec(sentences_word, size=word2vec_size, min_count=0)
        model.save('./source/model/word.model')
        print 'model saved success to ./source/model/word.model'

    '''Build per-word vectors and sentence vectors'''
    sentences_word_vec = []
    sentences_vec = []
    for si, sv in enumerate(sentences_word):
        sentences_word_vec.append([])  # one vector list per sentence
        sum_x = np.array([0.0 for x in range(0, word2vec_size)])
        for wi, wv in enumerate(sv):
            temp = []
            if wv in model.wv:
                for ci, cv in enumerate(model.wv[wv]):
                    temp.append(cv)
            else:
                temp.append(0.0)
            # sentence vector = sum of (tf_idf * word vector)
            sentences_word_vec[si].append(
                np.array(temp) * sentences_tf_idf[si][wi])
            sum_x += np.array(temp)
        # print si + 1, len(sentences_word_vec[si]), sum_x
        sentences_vec.append(sum_x)
    print 'vectorization done\n'

    '''K-means clustering'''
    print 'clustering start'
    # fit the KMeans estimator
    clf = KMeans(n_clusters=7)
    s = clf.fit(sentences_vec)
    print s
    print 'clustering done\n'

    # the 7 centroids
    # print 'centroids', clf.cluster_centers_
    # the cluster each sample belongs to
    # print len(clf.labels_), clf.labels_
    # used to judge whether the cluster count fits: the smaller the distance,
    # the better the split; pick the cluster count at the elbow
    # print clf.inertia_
    # predict
    # print clf.predict(sentences_vec)
    # # save the model
    # joblib.dump(clf, 'c:/km.pkl')
    #
    # # load the saved model
    # clf = joblib.load('c:/km.pkl')

    # elbow search over candidate cluster counts
    # clfinertia = []
    # for i in range(5, 30):
    #     clf = KMeans(n_clusters=i)
    #     s = clf.fit(sentences_vec)
    #     clfinertia.append(clf.inertia_)
    #     print i, clf.inertia_
    #
    # clfinertia_sum = 0
    # con = 4000000
    # for i in range(0, 23):
    #     # sum_i = math.atan(
    #     #     clfinertia[i] / con * (i + 3) - clfinertia[i + 1] / con * (i + 3)) - math.atan(
    #     #     clfinertia[i + 1] / con * (i + 3) - clfinertia[i + 2] / con * (i + 3))
    #     sum_i = clfinertia[i] - clfinertia[i + 1] - (clfinertia[i + 1] - clfinertia[i + 2])
    #     if clfinertia_sum < sum_i:
    #         clfinertia_sum = sum_i
    #         print i + 6, sum_i

    '''Write per-class tf-idf values and the classified files'''
    print 'class output start'
    word_class_vec = [[] for x in range(0, len(clf.cluster_centers_))]
    fr = open('./source/thulac_out.txt', 'r')
    line_x = []
    for line in fr.readlines():
        line_x.append(line)
    for i in range(0, len(clf.cluster_centers_)):
        class_filename = './source/classes/class_%d.txt' % i
        if os.path.exists(class_filename):
            os.remove(class_filename)
    for i in range(0, len(clf.labels_)):
        class_i = clf.labels_[i]
        word_class_vec[class_i].append(tf_idf[i])
        class_filename = './source/classes/class_%d.txt' % class_i
        with open(class_filename, 'a') as fw:
            fw.write(str(line_x[i]))
            # print line_x[i]
    print 'classes saved to ./source/classes/'
    fr.close()
    print 'class output done\n'

    '''Count keyword frequencies'''
    print 'word frequency count start'
    word_n = [[] for x in range(0, len(clf.cluster_centers_))]
    if not os.path.exists('./source/word_n'):
        os.mkdir('./source/word_n')
    for i in range(0, len(clf.cluster_centers_)):
        word_n[i] = [0.0 for x in range(0, len(word))]
        for si, sv in enumerate(word_class_vec[i]):
            for wi, wv in enumerate(sv):
                word_n[i][wi] += wv
        word_n_filename = './source/word_n/word_n%d.txt' % i
        if os.path.exists(word_n_filename):
            os.remove(word_n_filename)
        for si, sv in enumerate(word_n[i]):
            if int(sv) > 0:
                with open(word_n_filename, 'a') as fw:
                    fw.write('%s %d\n' % (word[si], int(sv)))
        # print len(word_n[i])
        print 'word frequencies saved to %s' % word_n_filename
    print 'word frequency count done\n'

    output = pd.DataFrame({'id': df.id, 'sentiment': result})
    output.to_csv(os.path.join('.', 'data', file_name), index=False)
    output.head()
    del df
    del test_data_features


if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    # Load stop words
    eng_stopwords = set(stopwords.words('english'))
    # eng_stopwords = {}.fromkeys([line.rstrip() for line in open('../stopwords.txt')])

    # Load the previously trained Word2Vec model
    model_name = '300features_40minwords_10context.model'
    model = Word2Vec.load(os.path.join('.', 'models', model_name))

    # Load the data
    df = load_dataset('labeled_train')

    # Turn the raw data into word vectors
    train_data_features = df.review.apply(to_review_vector)

    # Train the classifier
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest = forest.fit(train_data_features, df.sentiment)
    print('Train done.')

    # Free the memory-heavy variables
    del df
    del train_data_features

    # Run prediction
    predict(forest)
    print('Predict done.')

def load_w2v(dim=100):
    # load word vectors
    if dim == 100:
        return Word2Vec.load('../data/wordvec/model100_20180703')
    return None

DATA_CONFIGS = 'data_configs.json'
SEQ_CONFIGS = 'seq_configs_bt.json'

# Train label save file names
TRAIN_LABEL_DATA = 'train_label.npy'
TRAIN_LABEL_SMALL = 'train_label_small.npy'
TEST_LABEL_DATA = 'test_label.npy'
TEST_LABEL_SMALL = 'test_label_small.npy'

# pre-trained model load
d2v_model_name = './model_save/embedding_model/Doc2vec_new.model'
w2v_model_name = './model_save/embedding_model/Word2vec1.model'
pre_trained_name = './model_save/embedding_model/trained_word2vec1.model'

doc_vectorizer = Doc2Vec.load(d2v_model_name)
word_vectorizer = Word2Vec.load(w2v_model_name)
pre_trained_w2v = Word2Vec.load(pre_trained_name)

train_X = np.load(open(DATA_IN_PATH + TRAIN_INPUT_DATA, 'rb'))
test_X = np.load(open(DATA_IN_PATH + TEST_INPUT_DATA, 'rb'))

if label_size == 'big':
    train_Y = np.load(open(DATA_IN_PATH + TRAIN_LABEL_DATA, 'rb'))
    train_YS = tf.one_hot(train_Y, 43)
    test_Y = np.load(open(DATA_IN_PATH + TEST_LABEL_DATA, 'rb'))
    test_YS = tf.one_hot(test_Y, 43)
else:
    train_Y = np.load(open(DATA_IN_PATH + TRAIN_LABEL_SMALL, 'rb'))
    train_YS = tf.one_hot(train_Y, 455)
    test_Y = np.load(open(DATA_IN_PATH + TEST_LABEL_SMALL, 'rb'))
    test_YS = tf.one_hot(test_Y, 455)

tsne = TSNE(n_components=2, random_state=0)
np.set_printoptions(suppress=True)
Y = tsne.fit_transform(arr)
x_coords = Y[:, 0]
y_coords = Y[:, 1]

# display scatter plot
plt.scatter(x_coords, y_coords)
for label, x, y in zip(word_labels, x_coords, y_coords):
    plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
plt.xlim(x_coords.min() + 0.00005, x_coords.max() + 0.00005)
plt.ylim(y_coords.min() + 0.00005, y_coords.max() + 0.00005)
plt.show()


model = Word2Vec.load("ConversationalService/Twitter_Sentiment_model_W2V")
# model = Word2Vec.load("ConversationalService/updatedInsurance_word2vec_v3_18650_tri1")
# display_closestwords_tsnescatterplot(model, 'pay')


# vector building
def ClassifierOnload():
    vec = np.zeros(model.wv.syn0.shape[1]).reshape((1, model.wv.syn0.shape[1]))
    train_vecs_w2v = vec
    for utterence in list(dataset['Query']):
        print(utterence)
        words = Data_Cleaner(str(utterence))
        count = 0
        for word in words:
            try:
                print(word)

    modified_str = Date2Str(modified)
    created_str = Date2Str(created)

    # Build the lists of article numbers (number_list), sentences (docs),
    # and vocabulary (words)
    print('Preprocessing ...')
    docs, number_list, words = model_scdv.Preprocess_Mecab(number, data)

    # Train the Word2Vec model
    path = glob.glob('./model/*.model')
    if (len(path) == 0):
        path = glob.glob('./pretrained_model/*.model')
        print('Loading pre-trained W2V model (' + str(path[0]) + ')...')
        model = Word2Vec.load(path[0])
        print('Updating specified word vectors by Word2Vec...')
        model.train(sentences=docs,
                    total_examples=len(docs),
                    total_words=len(words),
                    word_count=len(model.wv.index2word),
                    epochs=1000)
        model.save("./model/" + dir_name + "_model.model")
    else:
        path = glob.glob('./model/*.model')
        print('Loading W2V model (' + str(path[0]) + ')...')
        model = Word2Vec.load(path[0])

    # Get the required word vectors from the W2V model

# for i in train_labels:
#     train_labels_trigrams.append(i)
#     train_labels_trigrams.append(i)
#     train_labels_trigrams.append(i)
# write_new_file(1,2,test_sents)
# print("DONE")
# print get_Ngrams("This sentence is for testing",1,3)

vectorizer = TfidfVectorizer(lowercase=True,
                             max_features=max_features,
                             ngram_range=(1, 1),
                             stop_words='english')
X_train = vectorizer.fit_transform(train_sents_bigrams[:num_of_docs])
vocab_w2v = vectorizer.get_feature_names()
X_test = vectorizer.transform(test_sents_bigrams)
print 'obtained a vocab of len: {} from the training + testing set'.format(len(vocab_w2v))

# model = Word2Vec([i.translate(string.maketrans('\n',' ')).split() for i in wv2_train[:num_of_docs]],size=w2v_vect_dim,min_count=1)
# model.save('C:/Users/admin/FYP/modelBigrams')
# print("DONE")
model = Word2Vec.load('modelBigrams')
w2v = dict(zip(model.wv.index2word, model.wv.syn0))

# bow_classify(X_train,train_labels,X_test,test_labels)

if deep_kernel_mode == 'diag':
    # word kernel is a DIAG matrix
    word_kernel = csr_matrix((len(vocab_w2v), len(vocab_w2v)))
    for i, w in enumerate(vocab_w2v):
        print i
        word_vec = w2v.get(w, np.zeros(shape=(w2v_vect_dim,)))
        word_kernel[i, i] = word_vec.dot(word_vec.T)
elif deep_kernel_mode == 'pairwise':
    # word kernel is pairwise similarity
    # word_vect_as_in_vocab = np.zeros(shape=(len(vocab_w2v), w2v_vect_dim))
    word_vect_as_in_vocab = csr_matrix((len(vocab_w2v), int(w2v_vect_dim)))

# python Word2Vec_AverageVectorsUtilities.py E:\semeval2016-task3-caq\qatarliving\qatarliving_qc_size100_win10_mincnt5_with_sent_repl_iter1.word2vec.bin
if __name__ == '__main__':
    import logging
    from gensim.models.word2vec import Word2Vec

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    # read publications file
    if len(sys.argv) > 1:
        word2vec_file_to_load = sys.argv[1]
        print('Word2vec file:\n\t%s' % word2vec_file_to_load)
    else:
        print('Error: missing input file parameter')
        quit()

    model = Word2Vec.load(word2vec_file_to_load)
    index2word = set(model.index2word)

    word2vec_num_features = len(model.syn0[0])
    print "Feature vectors length:%s" % word2vec_num_features
    print "Model syn0 len=%d" % (len(model.syn0))

    question_body = u'is there any place i can find scented massage oils in qatar?'
    answers = [u'Yes. It is right behind Kahrama in the National area.',
               u'whats the name of the shop?',
               u'It s called Naseem Al-Nadir. Right next to the Smartlink shop. You ll find the chinese salesgirls at affordable prices there.',
               u'dont want girls;want oil',
               u'Try Both ;) I am just trying to be helpful. On a serious note - Please go there. you ll find what you are looking for.',
               u'you mean oil and filter both',
               u'Yes Lawa...you couldn t be more right LOL',
               u'What they offer?',

import numpy as np
import pandas as pd
import tensorflow as tf
from gensim.models.word2vec import Word2Vec
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense
import time

start = time.time()

train_articles_df = pd.read_pickle('train_articles_df.pkl')
train_answers_df = pd.read_pickle('train_answers_df.pkl')
train_articles_df = train_articles_df.drop(['id'], 1)
train_answers_df = train_answers_df.drop(['id'], 1)

w2v_model = Word2Vec.load("w2v2019-02-10-23_19_06.model")

train_df = pd.concat([train_answers_df, train_articles_df], axis=1)
train_df = train_df.sample(frac=1)
# print(train_df.head)

# Build the embedding matrix: row 0 is reserved for padding, and each
# vocabulary word gets its vector at index i + 1.
embedding_matrix = np.zeros(
    (len(w2v_model.wv.vocab.items()) + 1, w2v_model.vector_size))
word2idx = {}
vocab_list = [(word, w2v_model.wv[word])
              for word, _ in w2v_model.wv.vocab.items()]
for i, vocab in enumerate(vocab_list):
    word, vec = vocab
    embedding_matrix[i + 1] = vec
    word2idx[word] = i + 1

# -*- coding: utf-8 -*-
# The text files must be UTF-8 without BOM
from gensim.models.word2vec import Word2Vec
import gensim

model = Word2Vec.load('./word2vecModel/word_embedding.model')
# model = gensim.models.KeyedVectors.load_word2vec_format('C:\\Users\\lanxum\\Desktop\\mymodel.model')
# The three files belong together: Word60.model, Word60.model.syn0.npy, Word60.model.syn1neg.npy
print("read model successful")

word_list = [
    '教育', '不存在的词', '的', '我', '你', '他', '个', '1', '完成', '吃',
    '苹果', '香蕉', '词汇', '物理', '地球', '黑死病', '瘟疫', '',
]
for word in word_list:
    if word in model.wv.index2word:

    # Shuffle final reviews and labels combined
    combined_lists = zip(final_reviews, final_labels)
    np.random.shuffle(combined_lists)
    final_reviews[:], final_labels[:] = zip(*combined_lists)
    print "Returning %d funny reviews and a total of %d reviews" % (
        num_funny_reviews, len(final_reviews))
    return (final_reviews, final_labels)


WORD2VEC_MODEL = "w2v_70_parts_100_vector_4_window"
PARTITIONS_TRAINING = range(1, 30)  # 15
PARTITIONS_TESTING = range(50, 53)  # 22

w2vmodel = Word2Vec.load(WORD2VEC_MODEL)
reviews_train, _, funny_votes_train, _, _ = BaseBowRegressor.get_reviews_data(
    PARTITIONS_TRAINING)
reviews_train, labels_train = give_balanced_classes(reviews_train,
                                                    funny_votes_train)

print "Tokenizing"
NUM_ELEMENTS_TRAIN = None
NUM_ELEMENTS_TEST = None
reviews_tokens_train = [
    language.tokenize_document((i, unicode(txt)))
    for (i, txt) in enumerate(reviews_train[:NUM_ELEMENTS_TRAIN])
]
X_train = tokens_to_word_vectors(reviews_tokens_train, w2vmodel)

def map_text_list_to_embedding(text_list, label_for_text_list, num_labels,
                               label_to_id):
    """
    Parameters
    ----------
    text_list: list of str
        List of text
    label_for_text_list: list of str
        List of labels, which is the ground truth for each text on the
        text_list
    num_labels: int
        Number of labels
    label_to_id: dict
        Label to integer id mapping

    Returns
    -------
    x: ndarray
        Numpy array of mean word embeddings for each text.
    y: ndarray
        Numpy array of indices representing labels.
    missing_words: set
        Set of words not in the Word2Vec model's dictionary.
    """
    model = Word2Vec.load(MODEL_PATH)

    missing_words = set()
    x_list = list()
    y_list = list()
    total_found_in_dict = 0
    total_not_in_dict = 0

    for i, text in enumerate(text_list):
        log.debug("Processing post: [%d]" % (i + 1))
        words_in_text = map_text_to_word_list(text)

        word_v_list = list()
        for w in words_in_text:
            try:
                v = model[w]
            except KeyError:
                missing_words.add(w)
                # log.warning("Skipping %s" % (w))
                total_not_in_dict += 1
                continue
            word_v_list.append(v)
            total_found_in_dict += 1

        if len(word_v_list) == 0:
            # log.warning("Did not find any words in vocabulary. Skipping the text.")
            continue

        # For now, do not change non-zero element to 1.
        label_id = label_to_id[label_for_text_list[i]]
        label_id = keras.utils.to_categorical(label_id,
                                              num_labels).astype(np.float32)
        label_id = label_id.reshape(1, num_labels)

        # Squish word_id_list
        word_v_np = np.array(word_v_list)
        word_count = word_v_np.shape[0]
        word_v_mean = np.sum(word_v_np, axis=0) / word_count
        word_v_sum = np.sum(word_v_np, axis=0)
        # log.info("word_v_mean.shape")
        # log.info(word_v_mean.shape)

        x_list.append(word_v_mean)
        # x_list.append(word_v_sum)
        y_list.append(label_id)

    x = np.array(x_list)
    print(x.shape)
    y = np.concatenate(y_list)
    assert x.shape[0] == y.shape[0]

    log.info("Number of words found in dict: %d" % (total_found_in_dict))
    log.info("Number of words not found in dict: %d" % (total_not_in_dict))
    return x, y, missing_words

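# map_text_to_word_list is defined elsewhere in this module. A minimal
# sketch, assuming a simple lowercased word tokenizer; the original may
# use different preprocessing.
import re

def map_text_to_word_list(text):
    # Lowercase and keep alphanumeric tokens (apostrophes allowed).
    return re.findall(r"[a-z0-9']+", text.lower())
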
def load_model(self, fpath):
    embeddings_file = fpath
    return Word2Vec.load(embeddings_file)

import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from gensim.models.word2vec import Word2Vec
import numpy as np
import pandas as pd
import time

from model_compare import ASTNN

TRAINING_SET_SIZE = 30000
VALIDATION_SET_SIZE = 10000
TEST_SET_SIZE = 10000

print('Reading data...')
w2v = Word2Vec.load('./data/c/w2v_128').wv
# Append a zero vector used for padding/unknown nodes.
embeddings = torch.tensor(np.vstack([w2v.vectors, [0] * 128]))
programs = pd.read_pickle('./data/c/id_code_label_ast_(index_tree).pkl')
training_set = programs[:TRAINING_SET_SIZE]
validation_set = programs[TRAINING_SET_SIZE:TRAINING_SET_SIZE + VALIDATION_SET_SIZE]
test_set = programs[TRAINING_SET_SIZE + VALIDATION_SET_SIZE:
                    TRAINING_SET_SIZE + VALIDATION_SET_SIZE + TEST_SET_SIZE]


def get_batch(dataset, i, batch_size):
    return dataset.iloc[i:i + batch_size]

def most_similar():
    model = Word2Vec.load("./result/embedding.model")
    print("Keywords related to 용돈: ", model.most_similar("용돈"))
    print("Keywords related to 졍이: ", model.most_similar("졍이"))
    print("Keywords related to 쭈니: ", model.most_similar("쭈니"))

from gensim.models.word2vec import Word2Vec
from scipy import spatial
from sklearn.metrics import confusion_matrix

# directory environment
root_path = ''
pd.set_option('display.width', 1000)        # character display width
pd.set_option('display.max_columns', 1000)  # max columns displayed
pd.set_option('display.max_rows', 1000)     # max rows displayed

# jieba dictionaries
jieba.set_dictionary(os.path.join(root_path, 'corpus/dict.txt.big'))
jieba.load_userdict(os.path.join(root_path, 'corpus/medical_term.txt'))

# load the pre-trained model
med_model = Word2Vec.load(os.path.join(root_path, "model/med_word2vec.model"))
index2word_set = set(med_model.wv.index2word)


# Read a txt file
# Input:  file_name: path of the file
# Output: lines: list of line contents
def loadfile(file_name):
    file_path = os.path.join(root_path, file_name)
    lines = []
    with open(file_path, 'r', encoding='utf-8') as f_stop:
        for line in f_stop:
            lines.append(line.replace('\n', ''))
    return lines

if not os.path.exists(vocab_path):
    vocab_model = Word2Vec(size=embedding_size,
                           max_vocab_size=max_vocab_size,
                           min_count=min_word_count,
                           workers=2,
                           seed=2245)
    print('{0}: Building own vocabulary'.format(datetime.datetime.now()))
    desc_generator = basic_desc_generator(train_path)
    vocab_model.build_vocab(desc_generator)
    print('{0}: Saving vocabulary to {1}'.format(datetime.datetime.now(),
                                                 vocab_path))
    vocab_model.save(vocab_path)

vocab_model = Word2Vec.load(vocab_path)

if use_google_word2vec and __name__ == "__main__":
    ## Google word2vec
    # Load pre-trained embeddings
    assert embedding_size == 300

    # Take the first bunch of words, these are sorted by decreasing count
    # so these will be the most important, and it saves a bunch of space/time
    # Save vocab for future use
    if not os.path.exists(word2vec_model_path):
        print('Loading word2vec embeddings from {0:}'.format(google_word2vec))
        model = KeyedVectors.load_word2vec_format(google_word2vec,
                                                  limit=max_vocab_size,
                                                  binary=True)
        model.init_sims(replace=True)

def __init__(self):
    self.model = Word2Vec.load('./Model/ko_en.mdl')

def raiseError(error):
    return error


if __name__ == '__main__':
    global model

    # ----------- Parsing Arguments ---------------
    p = argparse.ArgumentParser()
    p.add_argument("--model", help="Path to the trained model")
    p.add_argument("--binary", help="Specifies the loaded model is binary")
    p.add_argument("--host", help="Host name (default: localhost)")
    p.add_argument("--port", help="Port (default: 5000)")
    p.add_argument("--path", help="Path (default: /word2vec)")
    args = p.parse_args()

    model_path = args.model if args.model else "/home/fox/xavier_corpus/word2vec/sgns-50-tra.model"
    binary = True if args.binary else False
    host = args.host if args.host else "localhost"
    path = args.path if args.path else "/word2vec"
    port = int(args.port) if args.port else 5000
    if not args.model:
        print "Usage: word2vec-api.py --model path/to/the/model [--host host --port 1234]"

    model = w.load(model_path)

    api.add_resource(N_Similarity, path + '/n_similarity')
    api.add_resource(Similarity, path + '/similarity')
    api.add_resource(MostSimilar, path + '/most_similar')
    api.add_resource(Model, path + '/model')
    api.add_resource(ModelWordSet, '/word2vec/model_word_set')
    app.run(host=host, port=port)

from gensim.models.word2vec import Word2Vec
from multiprocessing import cpu_count
import gensim.downloader as api
from pprint import pprint

# Download dataset
dataset = api.load("text8")
data = [d for d in dataset]

# Split the data into 2 parts. Part 2 will be used later to update the model
data_part1 = data[:1000]
data_part2 = data[1000:]

# Train Word2Vec model. Default resulting vector size = 100
model = Word2Vec(data_part1, min_count=0, workers=cpu_count())

# Get the word vector for a given word
pprint(model['topic'])
pprint(model.most_similar('topic'))

# Save and load the model
model.save('newmodel')
model = Word2Vec.load('newmodel')

# Update the existing Word2Vec model with the new data
model.build_vocab(data_part2, update=True)
model.train(data_part2, total_examples=model.corpus_count, epochs=model.iter)
pprint(model['topic'])

    if len(data) > 900:
        pass
    if len(data) <= maxlen:
        data = data + [fill_0] * (maxlen - len(data))
    else:
        data = data[:maxlen]
    return data


if __name__ == "__main__":
    CORPUSPATH = "./data/NVD/corpus/"
    VECTORPATH = "./data/vector/"
    W2VPATH = "w2v_model/wordmodel_min_iter5.model"

    print("turn the corpus into vectors...")
    model = Word2Vec.load(W2VPATH)
    for testcase in os.listdir(CORPUSPATH):
        print("\r" + testcase, end='')
        if testcase not in os.listdir(VECTORPATH):
            folder_path = os.path.join(VECTORPATH, testcase)
            os.mkdir(folder_path)
            for corpusfile in os.listdir(CORPUSPATH + testcase):
                corpus_path = os.path.join(CORPUSPATH, testcase, corpusfile)
                f_corpus = open(corpus_path, 'rb')
                data = pickle.load(f_corpus)
                f_corpus.close()
                data.append(data[0])
                data[0] = generate_corpus(model, data[0])
                vector_path = os.path.join(VECTORPATH, testcase, corpusfile)
                f_vector = open(vector_path, 'wb')
                pickle.dump(data, f_vector)

        the_sample_index = TestBatchWordIndex[i]
        for j in range(maxSeqLength):
            the_sample_vec.append(model.wv[allVocabList[the_sample_index[j]]])
        TestBatchWordVec.append(the_sample_vec)

    TestBatchWordVec = np.array(TestBatchWordVec)
    TestBatchLabel = np.array(TestBatchLabel)
    return TestBatchSampleIndex, TestBatchWordVec, TestBatchLabel


if __name__ == "__main__":
    print("CASDMN_Model")
    model = Word2Vec.load(corpusWord2Vect)
    Pos_Txt_Index_List = list(np.load(Pos_Txt_Index_List_Path))
    Neg_Txt_Index_List = list(np.load(Neg_Txt_Index_List_Path))
    Train_Set, Valid_Set, Test_Set = getSplitSets()

    tf.reset_default_graph()
    labels = tf.placeholder(tf.float32, [batchSize, numClasses])
    input_text = tf.placeholder(tf.float32, [batchSize, maxSeqLength, wordDim])
    input_emoji = tf.placeholder(tf.float32, [batchSize, wordDim])

    # (Bi-)RNN layer(-s)
    seq_len_ph = []
    for i in range(batchSize):
        seq_len_ph.append(maxSeqLength)
    rnn_outputs, _ = bi_rnn(GRUCell(hiddenSize),

# Parameters
num_features = 300
min_word_count = 1
num_workers = 4
window_size = 6
subsampling = 1e-3

# Create the instance
model = Word2Vec(token_list,
                 workers=num_workers,
                 size=num_features,
                 min_count=min_word_count,
                 window=window_size,
                 sample=subsampling)

# Freeze the model, discarding the unneeded output weights
model.init_sims(replace=True)

# Save
model_name = "vk_comment_model"
model.save(model_name)

# Load
model_name = "vk_comment_model"
model = Word2Vec.load(model_name)

stemmer = SnowballStemmer('russian')
print(model.wv.similarity(stemmer.stem("поезд"), stemmer.stem("Пусан")))
print(model.wv[stemmer.stem("поезд")])

    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    # numerically stable log-sum-exp
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

####################################################################
# prepare datasets

import os
import csv
import torch
import numpy as np
from gensim.models.word2vec import Word2Vec

model_path = '/home/cl/jungmin-c/japanese-word2vec-model-builder/output/word2vec.gensim.model'
lang_model = Word2Vec.load(model_path)
directory_train = "/home/cl/jungmin-c/legal-i_corpus/AIL2019/RSC_original2/"
directory_test = "/home/cl/jungmin-c/legal-i_corpus/test_hanrei/"
tag_to_ix = {}


def build_emb(directory):
    data = []
    for item in os.listdir(directory):
        datafile = open(directory + item, 'r')
        reader = csv.reader(datafile)
        next(reader)
        sentences = []
        tags = []
        for row in reader:
            sentence = row[0].split()
            sentence = list(filter(lambda x: x in lang_model.wv.vocab, sentence))

print("taggedFile Not Saved.") # building model model = intent_vectorization.build_model(tagged_data) ''' # path to save/load model model_path = "Models\intentModelArxiveWord2Vec" # checking if the model got saved ''' if intent_vectorization.save_model(model, model_path): print("Model Saved") else: print("Model Not saved") ''' # loading saved model model = Word2Vec.load(model_path + ".model") # testing test_data = "Youtube" test_tokenized = [ token.lemma_ for token in intent_vectorization.nlp(test_data.lower()) if not token.is_stop and len(token.text) > 2 ] print(test_tokenized) v1 = model.wv.most_similar(test_tokenized) print("V1_infer", v1) # to find most similar doc using tags (returns 10 most similar docs according to cosine similarity) similar_doc = model.docvecs.most_similar(positive=[v1]) print(similar_doc)
# ================================================================================
# After finishing the training, unload useless data from the memory
# model.init_sims(replace=True)

# ================================================================================
# Checkpoint file name for trained W2V model
model_name = './Models/300features_40minwords_10text'
# model_name = './Models/300features_50minwords_20text'

model.save(model_name)

# train_W2V_model_and_save_checkpoint_file(sentences)

# ================================================================================
model = Word2Vec.load('./Models/300features_40minwords_10text')
# model = gensim.models.Word2Vec('./Models/300features_40minwords_10text')
# model = gensim.models.Word2Vec.load('model')

# ================================================================================
sample_words = 'man woman child kitchen'.split()
# print("sample_words", sample_words)
# ['man', 'woman', 'child', 'kitchen']

# ================================================================================
abnormal_word = model.wv.doesnt_match(sample_words)
# print("abnormal_word", abnormal_word)
# kitchen

# ================================================================================
country_names = "france england germany berlin".split()

evaluation of BalancedBaggingClassifiers trained on top of them.
"""
from w2v_vectorizers import MeanEmbeddingVectorizer
from gensim.models.word2vec import Word2Vec
from sklearn.externals import joblib
from multiprocess import Pool
import pandas as pd
import glob, os

if __name__ == "__main__":
    train_data = pd.read_csv(
        "/data/SO_data/downvoter/wv_train_processed_data.csv")
    val_data = pd.read_csv("/data/SO_data/downvoter/wv_val_processed_data.csv")
    wv_models = [Word2Vec.load(f) for f in glob.glob("./word_models/*.model*")]
    path = "/data/SO_data/downvoter/vectorized_data/"

    def process_model(wv_model):
        size = wv_model.vector_size
        window = wv_model.window
        print("Vectorizing s=%d, w=%d" % (size, window))
        vectorizer = MeanEmbeddingVectorizer(wv_model)
        ext = ".w2v.s%d.w%d.pkl" % (size, window)

        print("Body train set...")
        if not os.path.isfile("".join([path, "train_body", ext])):
            joblib.dump(vectorizer.transform(train_data.body),
                        "".join([path, "train_body", ext]))

def get_predict_vecs(words):
    n_dim = 300
    imdb_w2v = Word2Vec.load('../svm_data/w2v_model/w2v_model.pkl')
    train_vecs = buildWordVector(words, n_dim, imdb_w2v)
    return train_vecs

from multiprocessing import Process, Value, Queue, cpu_count
from time import sleep
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.word2vec import Word2Vec
from sklearn.cluster import KMeans
from gensim.models.keyedvectors import KeyedVectors
from math import inf
import torch
import pickle

args = NLP_args(k=30, min=0.0, random=0, min_cls=5, lr=0.0005)

labels_dict = pickle.load(open("labels_dict.pkl", "rb"))
word2vec_for_kmeans_model = Word2Vec.load("word2vec_for_kmeans_model.model")
tfidf_model = pickle.load(open("tfidf_model.pkl", "rb"))
word2vec_for_rnn_model = Word2Vec.load("word2vec_for_rnn_model.model")

rnn_model = RNN(args.word2vec_vec_size_for_rnn, args.hidden_layer,
                len(labels_dict))
rnn_model.load_state_dict(torch.load('w2v_5_rnn_model.pth'))
rnn_model.eval()

random_forest_model = pickle.load(open("random_forest_model.pkl", "rb"))

global number_of_free_processes
number_of_free_processes = Value('i', cpu_count(), lock=True)

def word2vec_train():
    model = Word2Vec.load('./vec_model/Word2vec_model.pkl')
    w2indx, w2vec = create_dictionaries(model)
    return w2indx, w2vec

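# create_dictionaries is not shown here. In this common Keras/LSTM sentiment
# recipe it maps each vocabulary word to an integer index (reserving 0 for
# padding) and to its vector. A minimal sketch under that assumption, for a
# gensim version where the vectors live under model.wv:
def create_dictionaries(model):
    w2indx = {w: i + 1 for i, w in enumerate(model.wv.index2word)}
    w2vec = {w: model.wv[w] for w in model.wv.index2word}
    return w2indx, w2vec
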