def get_w2v_labels(y_original, dim=200):
    y_new = np.zeros((y_original.shape[0], dim))
    if dim == 200:
        model = word2vec.load(root + 'word2vec/vectors.bin')
    elif dim in [100, 50, 25, 10]:
        model = word2vec.load(root + 'semantic-network/data/text8-%s.bin' % dim)
    else:
        raise NotImplementedError
    for i, label in enumerate(y_original):
        y_new[i, :] = model[classes[label]]
    return y_new
def get_char_embedding():
    """Extract the character embeddings and save them to ../data/char_embedding.npy."""
    print('getting the char_embedding.npy')
    wv = word2vec.load('../raw_data/char_embedding.txt')
    char_embedding = wv.vectors
    chars = wv.vocab
    n_special_sym = len(SPECIAL_SYMBOL)
    sr_id2char = pd.Series(chars, index=range(n_special_sym, n_special_sym + len(chars)))
    sr_char2id = pd.Series(range(n_special_sym, n_special_sym + len(chars)), index=chars)
    # Add the special symbols: <PAD>:0, <UNK>:1
    embedding_size = 256
    vec_special_sym = np.random.randn(n_special_sym, embedding_size)
    for i in range(n_special_sym):
        sr_id2char[i] = SPECIAL_SYMBOL[i]
        sr_char2id[SPECIAL_SYMBOL[i]] = i
    char_embedding = np.vstack([vec_special_sym, char_embedding])
    # Save the character embeddings
    save_path = '../data/'
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    np.save(save_path + 'char_embedding.npy', char_embedding)
    # Save the char <-> id mappings
    with open(save_path + 'sr_char2id.pkl', 'wb') as outp:
        pickle.dump(sr_id2char, outp)
        pickle.dump(sr_char2id, outp)
    print('Saved the char_embedding.npy to ../data/char_embedding.npy')
def predict():
    model = word2vec.load('./latents.bin')
    predictions = []
    with open('MSRParaphraseCorpus/MSR_easy.txt') as f:
        data = f.readlines()
    block = []
    for each in data:
        block.append(flex(getWords(each.lower())))
    i = 1
    while i + 1 < len(block):
        if int(block[i][0]) - int(block[i+1][0]) < 200 and int(block[i][0]) - int(block[i+1][0]) > -200:
            t1 = block[i][1:]
            t2 = block[i+1][1:]
            t = union(t1, t2)
            # -------------- semantic similarity between two sentences -------------
            similarity_ssv = ssv(t, t1, t2, model)
            #print 'ssv ', similarity_ssv
            # ----------------- word similarity between sentences ------------------
            similarity_wo = wo(t, t1, t2, model)
            #print 'wo ', similarity_wo
            alpha = 0.8
            similarity = alpha * similarity_ssv + (1 - alpha) * similarity_wo
            print similarity, str(block[i][0]), str(block[i+1][0])
            predictions.append([similarity, str(block[i][0]), str(block[i+1][0])])
            i = i + 2
        else:
            i = i + 1
    return predictions
def loadArg1():
    model = word2vec.load("/mnt/mint_share/text8.bin")
    data = np.empty((17572, 1, 100, 100), dtype='float64')
    label = np.empty((17472,), dtype='uint8')
    with codecs.open("/mnt/mint_share/train_pdtb.json", "rU", "utf-8") as f:
        for i, line in enumerate(f):
            unit = json.loads(line)
            len1 = len(unit['Arg1']['Word'])
            if len1 < 100:
                for j in range(len1):
                    try:
                        j_ = model[unit['Arg1']['Word'][j]]
                    except:
                        j_ = model['fillin']
                    data[i, :, j, :] = j_
                for j in range(100 - len1):
                    data[i, :, len1 + j, :] = model['fillin']
            else:
                for j in range(100):
                    try:
                        j_ = model[unit['Arg1']['Word'][j]]
                    except:
                        j_ = model['fillin']
                    data[i, :, j, :] = j_
    with open("arg1_image_100", "wb") as f1:
        # dill.dump(data, f1)
        cPickle.dump(data, f1, protocol=2)
def test():
    # ------------ common between two measurements ---------------------------
    t1 = "a quick brown dog jumps over the lazy fox"
    t2 = "a quick brown fox jumps over the lazy dog"
    t2 = "jumps over the lazy fox is a quick brown dog"
    #t1 = "Amrozi accused his brother, whom he called the witness, of deliberately distorting his evidence.".lower()
    #t2 = "Referring to him as only the witness, Amrozi accused his brother of deliberately distorting his evidence.".lower()
    #t1 = "i have to find you, tell me you need me."
    #t2 = "don't wanna know who is taking you home"
    t1 = getWords(t1)
    t2 = getWords(t2)
    t1 = flex(t1)
    t2 = flex(t2)
    t = union(t1, t2)
    #t = ["a", "brown", "jumps", "the", "fox", "dog", "quick", "over", "lazy"]
    print t
    model = word2vec.load('./latents.bin')
    # -------------- semantic similarity between two sentences ---------------
    similarity_ssv = ssv(t, t1, t2, model)
    print 'ssv ', similarity_ssv
    # ----------------- word similarity between sentences --------------------
    similarity_wo = wo(t, t1, t2, model)
    print 'wo ', similarity_wo
    alpha = 0.8
    print alpha * similarity_ssv + (1 - alpha) * similarity_wo
def embed(sentences):
    model = word2vec.load('~/word2vec_models/GoogleNews-vectors-negative300.bin')
    embedded_sentences = []
    tokenized_sentences = []
    max_len = 0
    for sentence in sentences:
        tokenized_sentence = sent_tokenize(sentence)
        tokenized_sentences.append(tokenized_sentence)
        if len(tokenized_sentence) > max_len:
            max_len = len(tokenized_sentence)
    for sentence in sentences:
        tokenized_sentence = sent_tokenize(sentence)
        embedded_words = []
        for word in tokenized_sentence:
            try:
                word = model[word]
            except:
                word = np.zeros(300)
            embedded_words.append(word)
        # padding
        for i in range(max_len - len(embedded_words)):
            embedded_words.append(np.zeros(300))
        embedded_sentences.append(embedded_words)
    embedded_sentences = np.array(embedded_sentences)
    return embedded_sentences
def save_latent_features_of_tagsjson():
    model = word2vec.load('../lib/word2vec/vectors.bin')
    all_tags = []
    with open('tags.json', 'r') as f:
        data = json.load(f)
    i = 0
    while i < len(data['item']):
        all_tags = all_tags + data['item'][i]['tag_text'].replace('"', '').lower().split('|')
        all_tags = all_tags + data['item'][i]['tag_query'].replace('"', '').lower().split('|')
        i = i + 1
    i = 0
    while i < len(all_tags):
        if all_tags[i][0] == ' ':
            all_tags[i] = all_tags[i][1:]
            i = i - 1
        i = i + 1
    print all_tags
    latent_tags = []
    latent_model = []
    for i in all_tags:
        try:
            a = model[str(i)]
            latent_tags.append(str(i))
            latent_model.append(a)
        except Exception, e:
            print i
            print e
def assign_pretrained_word_embedding(sess, vocabulary_index2word, vocab_size, model, word2vec_model_path=None):
    print("using pre-trained word embedding. started. word2vec_model_path:", word2vec_model_path)
    # word2vecc=word2vec.load('word_embedding.txt')  # load vocab-vector file. word2vecc['w91874']
    word2vec_model = word2vec.load(word2vec_model_path, kind='bin')
    word2vec_dict = {}
    for word, vector in zip(word2vec_model.vocab, word2vec_model.vectors):
        word2vec_dict[word] = vector
    word_embedding_2dlist = [[]] * vocab_size  # create an empty word_embedding list.
    word_embedding_2dlist[0] = np.zeros(FLAGS.embed_size)  # assign an all-zero vector for the first word: 'PAD'
    bound = np.sqrt(6.0) / np.sqrt(vocab_size)  # bound for random initialization.
    count_exist = 0
    count_not_exist = 0
    for i in range(1, vocab_size):  # loop over each word
        word = vocabulary_index2word[i]  # get the word
        embedding = None
        try:
            embedding = word2vec_dict[word]  # try to get its vector (an array).
        except Exception:
            embedding = None
        if embedding is not None:  # the word has a pre-trained embedding
            word_embedding_2dlist[i] = embedding
            count_exist = count_exist + 1  # assign the array to this word.
        else:  # no embedding for this word
            word_embedding_2dlist[i] = np.random.uniform(-bound, bound, FLAGS.embed_size)
            count_not_exist = count_not_exist + 1  # initialize a random vector for the word.
    word_embedding_final = np.array(word_embedding_2dlist)  # convert to a 2d array.
    word_embedding = tf.constant(word_embedding_final, dtype=tf.float32)  # convert to a tensor
    t_assign_embedding = tf.assign(model.Embedding, word_embedding)  # assign this value to the embedding variable of the model.
    sess.run(t_assign_embedding)
    print("words with embedding:", count_exist, "; words without embedding:", count_not_exist)
    print("using pre-trained word embedding. ended...")
def test_distance():
    model = word2vec.load(output_txt)
    metrics = model.distance("the", "the", "the")
    assert len(metrics) == 3
    for item in metrics:
        # There should be 3 items per record
        assert len(item) == 3
def create_voabulary(simple=None, word2vec_model_path='zhihu-word2vec-title-desc.bin-100', name_scope=''):  # zhihu-word2vec-multilabel.bin-100
    cache_path = 'cache_vocabulary_label_pik/' + name_scope + "_word_voabulary.pik"
    print("cache_path:", cache_path, "file_exists:", os.path.exists(cache_path))
    if os.path.exists(cache_path):  # if the cache file exists, read it directly
        with open(cache_path, 'rb') as data_f:
            vocabulary_word2index, vocabulary_index2word = pickle.load(data_f)
            return vocabulary_word2index, vocabulary_index2word
    else:
        vocabulary_word2index = {}
        vocabulary_index2word = {}
        if simple is not None:
            word2vec_model_path = 'zhihu-word2vec.bin-100'
        print("create vocabulary. word2vec_model_path:", word2vec_model_path)
        model = word2vec.load(word2vec_model_path, kind='bin')
        vocabulary_word2index['PAD_ID'] = 0
        vocabulary_index2word[0] = 'PAD_ID'
        special_index = 0
        if 'biLstmTextRelation' in name_scope:
            vocabulary_word2index['EOS'] = 1  # a special token for the biLstmTextRelation model, used between two sentences.
            vocabulary_index2word[1] = 'EOS'
            special_index = 1
        for i, vocab in enumerate(model.vocab):
            vocabulary_word2index[vocab] = i + 1 + special_index
            vocabulary_index2word[i + 1 + special_index] = vocab
        # save to the file system if the vocabulary does not exist yet.
        if not os.path.exists(cache_path):
            with open(cache_path, 'wb') as data_f:
                pickle.dump((vocabulary_word2index, vocabulary_index2word), data_f)
    return vocabulary_word2index, vocabulary_index2word
def test_load_txt():
    model = word2vec.load(output_txt)
    vocab = model.vocab
    vectors = model.vectors
    assert vectors.shape[0] == vocab.shape[0]
    assert vectors.shape[0] > 3000
    assert vectors.shape[1] == 10
def test_closest():
    model = word2vec.load(output_txt)
    indexes, metrics = model.closest(model["the"], n=30)
    assert indexes.shape == (30,)
    assert indexes.shape == metrics.shape
    py_response = model.generate_response(indexes, metrics).tolist()
    assert len(py_response) == 30
    assert len(py_response[0]) == 2
def load(modelpath):
    model = word2vec.load(modelpath)
    nvocab = [unicode(i, 'utf-8') for i in model.vocab]
    index = {v: n for n, v in enumerate(nvocab)}
    l2norm = model.l2norm
    return (index, l2norm)
def get_feats(seqs, train=False):
    print "get_feats"
    vec_model, dim = word2vec.load(vecfile)
    zero_vec = data_util.zero_vec(dim)
    feats = []
    labels = []
    global label_set
    label_set = set([])
    for s in seqs:
        s_feats = []
        s_labels = []
        for pair in s:
            word = pair[0]
            vector = word2vec.get(word, vec_model)
            s_feats.append(vector)
            s_labels.append(pair[1])
            label_set.add(pair[1])
        feats.append(s_feats)
        labels.append(s_labels)
    if train:
        num_labels = len(list(label_set))
        create_labelencoder(list(label_set), num_labels)
    global max_seq_len
    #max_seq_len = max([len(txt) for txt in feats])
    print "max_seq_len: " + str(max_seq_len)
    # Pad sequences
    #feats = pad_sequences(numpy.array(feats), maxlen=max_seq_len, dtype='float32', padding="pre")
    #labels = pad_sequences(numpy.array(labels), maxlen=max_seq_len, dtype='str', padding="pre", value='O')
    padded_feats = []
    padded_labels = []
    for feat in feats:
        #print "seq len: " + str(len(feat))
        while len(feat) > max_seq_len:
            feat_part = feat[0:max_seq_len]
            padded_feats.append(pad_feat(feat_part, max_seq_len, zero_vec))
            feat = feat[max_seq_len:]
        new_feat = pad_feat(feat, max_seq_len, zero_vec)
        padded_feats.append(new_feat)
    for labs in labels:
        while len(labs) > max_seq_len:
            labs_part = labs[0:max_seq_len]
            padded_labels.append(pad_feat(labs_part, max_seq_len, 'O'))
            labs = labs[max_seq_len:]
        padded_labels.append(pad_feat(labs, max_seq_len, 'O'))
    feats = padded_feats
    labels = padded_labels
    # Encode labels
    encoded_labels = encode_labels(labels, max_len=max_seq_len)
    print "labels[0]: " + str(encoded_labels[0])
    #for row in labels:
    #    encoded_row = encode_labels(row)
    #    encoded_labels.append(encoded_row)
    print "feats: " + str(len(feats)) + " labels: " + str(len(encoded_labels))
    return feats, encoded_labels
def load_matrix(bin_path, input2idx):
    model = word2vec.load(bin_path)
    vector_dim = model.vectors.shape[1]
    matrix = np.zeros((len(input2idx), vector_dim))
    for word, i in input2idx.items():
        try:
            matrix[i] = model[word]
        except KeyError:
            pass  # leave the row as zeros for out-of-vocabulary words
    return matrix
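# A hedged usage sketch for load_matrix above. 'vectors.bin' and the toy
# input2idx mapping are illustrative assumptions, not part of the original code;
# row i of the returned matrix is the vector for the word mapped to index i,
# so the matrix can seed an embedding layer that uses the same index mapping.
def demo_load_matrix():
    toy_input2idx = {'the': 0, 'quick': 1, 'fox': 2}
    emb_matrix = load_matrix('vectors.bin', toy_input2idx)
    print(emb_matrix.shape)  # (3, vector_dim), e.g. (3, 300) for 300-d vectors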
def get_embed(csv, col, embed_file):
    if os.path.exists(embed_file):
        return word2vec.load(embed_file)

    def csv2txt(text, voc):
        text = ' '.join([item for item in text.split() if item in voc])
        text += '\n'
        with open('word_token.txt', 'a') as f:
            f.write(text)

    os.system('rm word_token.txt')  # remove any stale token file before appending
    vec = TfidfVectorizer(max_df=0.9, min_df=3, smooth_idf=1, sublinear_tf=1)
    vec.fit(csv[col])
    voc = vec.vocabulary_
    csv[col].apply(csv2txt, args=[voc])
    word2vec.word2vec('word_token.txt', embed_file, 256, verbose=1)
    return word2vec.load(embed_file)
def test_prediction():
    model = word2vec.load(output_bin)
    indexes, metrics = model.cosine('the')
    assert indexes.shape == (10,)
    assert indexes.shape == metrics.shape
    py_response = model.generate_response(indexes, metrics).tolist()
    assert len(py_response) == 10
    assert len(py_response[0]) == 2
def save():
    model = word2vec.load('data/vec_google.bin')
    logging.info('loading word vectors to redis')
    for index, word in enumerate(model.vocab):
        save_word_vector(word, model[word])
        if index % 1000 == 0:
            logging.info(index)
def __init__(self, file_path):
    # w2v_file = os.path.join(base_path, "vectors_poem.bin")
    self.model = word2vec.load(file_path)
    if 'unknown' not in self.model.vocab_hash:
        unknown_vec = np.random.uniform(-0.1, 0.1, size=128)  # generate 128 values in [-0.1, 0.1)
        self.model.vocab_hash['unknown'] = len(self.model.vocab)
        self.model.vectors = np.row_stack((self.model.vectors, unknown_vec))  # append unknown_vec as a new row of the vector matrix
def test_analogy():
    model = word2vec.load(output_txt)
    indexes, metrics = model.analogy(pos=["the", "the"], neg=["the"], n=20)
    assert indexes.shape == (20,)
    assert indexes.shape == metrics.shape
    py_response = model.generate_response(indexes, metrics).tolist()
    assert len(py_response) == 20
    assert len(py_response[0]) == 2
def __init__(self,
             patchlength=3,
             maxlength=700,
             embedding_size=100,
             num_verbs=2,
             allinclude=False,
             shorten=False,
             shorten_front=False,
             testflag=False):
    # patchlength: number of additional preceding sentences fed in with each input.
    # maxlength: maximum sentence length (including the extra preceding sentences); longer sentences are dropped.
    # embedding_size: dimensionality of the word vectors.
    # shorten / shorten_front: whether the (preceding) sentences are shortened, i.e. output without tags, words only.
    self.url = 'http://166.111.139.15:9000'
    self.shorten = shorten
    self.shorten_front = shorten_front  # whether the preceding sentences are shortened (words only, no tags)
    self.patchlength = patchlength
    self.maxlength = maxlength
    self.embedding_size = embedding_size
    self.num_verbs = num_verbs
    self.allinclude = allinclude
    self.verbtags = ['VB', 'VBZ', 'VBP', 'VBD', 'VBN', 'VBG']  # all verb POS tags
    self.model = word2vec.load('tense/combine100.bin')  # load the word-vector model
    self.tagdict = {')': 0}
    print('loaded model')
    self.oldqueue = Queue()
    self.testflag = testflag
    if testflag == False:
        self.resp = open(r'tense/resp2').readlines()
        self.readlength = len(self.resp)
        print('readlength', self.readlength)
        # self.pointer = random.randint(0, self.readlength - 1)
        self.pointer = 0
        print('pointer', self.pointer)
        for _ in range(self.patchlength):
            self.oldqueue.put(self.resp[self.pointer])
            self.pointer += 1
    else:
        for _ in range(self.patchlength):
            if shorten_front == True:
                self.oldqueue.put(input())
            else:
                self.oldqueue.put(self.parse(input()))
    self.cldict = dict()
    # load the text and the lemma dictionary (maps each verb to its base form)
    with open('tense/ldict2', 'rb') as f:
        self.ldict = pickle.load(f)
    with open('tense/tagdict', 'rb') as f:
        self.tagdict = pickle.load(f)
    with open('tense/cldict', 'rb') as f:
        self.cldictori = pickle.load(f)
    print('loaded lemma')
def load_wv_model(word_vector_file, word_vector_type):
    if word_vector_type == WordVectorTypes.glove.name:
        #from glove import Glove
        glove_model = GloveWrapper.load(word_vector_file)
        wv_model = GloveWrapper(glove_model)
    else:
        import word2vec
        w2v_model = word2vec.load(word_vector_file)
        wv_model = W2VWrapper(w2v_model)
    return wv_model
def test_similar():
    model = word2vec.load(output_bin)
    indexes, metrics = model.similar("the")
    assert indexes.shape == (10,)
    assert indexes.shape == metrics.shape
    py_response = model.generate_response(indexes, metrics).tolist()
    print(py_response)
    assert len(py_response) == 10
    assert len(py_response[0]) == 2
def sentiment(test):
    model = CreateModel()
    #FitModel(model)
    Vector = word2vec.load("vectors.bin")
    print()
    vec = Vector[test]
    print(vec)
    t = model.predict(vec)
    print(t)
    return t
def __init__(self):
    self.word2vec_model = None
    self.cosine_similarity_map = {}
    self.word_vectors_map = {}
    # print 'Loading word vectors into the python model ...'
    start_time = time.time()
    self.word2vec_model = wv.load(cap.absolute_path + './wordvectors/pubmed.bin')
    print 'The execution time for the loading was ', time.time() - start_time
    print 'word2vec_model.vocab', self.word2vec_model.vocab
def main(em_file, em_result):
    '''
    Convert a word2vec embedding file to numpy arrays.
    '''
    em = word2vec.load(em_file)
    vec = em.vectors
    word2id = em.vocab_hash
    # d = dict(vector=vec, word2id=word2id)
    # t.save(d, em_result)
    np.savez_compressed(em_result, vector=vec, word2id=word2id)
def train(self):
    if not os.path.isfile(self.trained_fname):
        print("Previous training '" + self.trained_fname + "' not found. Begin training on input '" +
              self.input_fname + "' into " + str(self.train_dimensions) + " dimensions ...")
        self.trained_fname = 'src/resources/output' + str(self.train_dimensions)
        word2vec.word2vec(self.input_fname, self.trained_fname, size=self.train_dimensions)
    else:
        print("Trained data seems to exist at '" + self.trained_fname + "'")
    print("Loading training results...")
    self.model = word2vec.load(self.trained_fname, kind='bin')
def salt(self):
    print '\nfrom salt !!!!!', '\n'
    model = word2vec.load('./ActionsA/latents.bin')
    with open(self.conversation_filepath + 'conversation.csv') as fh:
        f = map(lambda x: x.split(","), filter(lambda x: (x != ""), fh.read().split("\n")))
    for each in f:
        print distance(each[0], 'very well said i bet but i need more beer', model)
def __init__(self, originData=None, w2vModelPath="vectors.w2v", vectorSize=100):
    self.__model = None
    self.__vectorSize = vectorSize
    if type(originData) is str:
        word2vec.word2vec(originData, w2vModelPath, size=vectorSize, verbose=True)
    self.__model = word2vec.load(w2vModelPath)
def __init__(self, coefficient: float = 0.4):
    self.coefficient = coefficient
    print("Starting loading model for word2vec...")
    self.model = load(filename_start)
    print("Successfully loaded!")
    self.tags = [
        "VERB", "NOUN", "ADV", "DET", "ADJ", "SCONJ", "INTJ", "X",
        "NUM", "PART", "ADP", "PRON", "X"
    ]
    self.commands = []
def extract(dim, data, trained):
    if not trained:
        word2vec.word2phrase(data, data + '-phrases', verbose=True)
        word2vec.word2vec(data + '-phrases', data + '.bin', size=dim, verbose=True)
    model = word2vec.load(data + '.bin')
    keys = model.vocab
    features = model.vectors
    dic = dict(zip(keys, features))
    print(len(dic))
    return dic
def __init__(self, embeddings_path=None):
    file_path = Path + '/conf/system.properties'
    self.props = propertyUtil.parse(file_path)
    if embeddings_path is None:
        embeddings_path = self.props.get("EMBEDDING_PATH")
    model = word2vec.load(Path + '/' + embeddings_path)
    self.model = model
    self.embeddings = model.vectors.tolist()
    self.vocab = model.vocab.tolist()
    self.wordsMap = self._build(self.vocab)
def emb2npz(emb_file_path, emb_dict_path):
    """Convert a txt-format embedding into dict form and append a <PAD> entry."""
    emb = word2vec.load(emb_file_path)
    vec = emb.vectors
    word2id = emb.vocab_hash
    word2id['<PAD>'] = len(word2id)
    pad_row = [0] * vec.shape[1]
    vec = np.row_stack((vec, pad_row))
    np.savez_compressed(emb_dict_path, vec=vec, word2id=word2id)
    print('word size: {}'.format(len(word2id)))
    print('emb shape: {}'.format(vec.shape))
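# A hedged companion sketch for emb2npz above, showing how the saved archive could
# be read back. The 'emb.npz' file name is an assumption; np.savez_compressed stores
# the word2id dict as a 0-d object array, hence allow_pickle=True and .item() on load.
def demo_load_emb_npz(emb_dict_path='emb.npz'):
    data = np.load(emb_dict_path, allow_pickle=True)
    vec = data['vec']                 # (vocab_size + 1, emb_dim) embedding matrix
    word2id = data['word2id'].item()  # recover the plain Python dict
    pad_id = word2id['<PAD>']
    return vec, word2id, pad_id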
def load_embedding(path):
    wv = word2vec.load(path)
    vocab = wv.vocab
    word2idx = {}
    word_embedding = wv.vectors
    for i in range(1, len(vocab) + 1):
        word2idx[vocab[i - 1]] = i
    word2idx['<0>'] = 0
    word_zero = np.zeros(len(word_embedding[0]))
    word_embedding = np.vstack([word_zero, word_embedding])
    return word2idx, word_embedding
def build_glove_dic():
    glove_path = 'glove.6B.50d.txt'
    wv = word2vec.load(glove_path)
    vocab = wv.vocab
    sr_word2id = pd.Series(range(1, len(vocab) + 1), index=vocab)
    sr_word2id['<unk>'] = 0
    word_embedding = wv.vectors
    word_mean = np.mean(word_embedding, axis=0)
    word_embedding = np.vstack([word_mean, word_embedding])
    return sr_word2id, word_embedding
def word_to_vec(config_path: str, dimension: int, T=""):
    folder = "data/word2vec"
    words_file = os.path.join(folder, f"{T}words-noisefiltered-{dimension}")
    phrases_file = os.path.join(folder, f"{T}phrases-noisefiltered-{dimension}")
    w2v_file = os.path.join(folder, f"{T}noisefiltered-{dimension}.bin")
    import word2vec
    word2vec.word2phrase(words_file, phrases_file, verbose=True)
    word2vec.word2vec(phrases_file, w2v_file, size=dimension)
    logging.info("wrote to " + w2v_file)
    return word2vec.load(w2v_file)
def tran(path):
    model = word2vec.load(path)
    vocab, vectors = model.vocab, model.vectors
    print(path)
    print('shape of word embeddings: ', vectors.shape)
    new_path = path.split('.')[0] + '.txt'
    print('transform start...')
    with open(new_path, "w") as f:
        for word, vector in tqdm(zip(vocab, vectors)):
            f.write(str(word) + ' ' + ' '.join(map(str, vector)) + '\n')
    print('Transform Complete!\n')
def train():
    movie_set = cornell_movie_set.MovieSet()
    movie_set.parse_movie_set('train')
    word2vec.word2phrase('cornell_movie_train.txt', 'movie_phrases_train.txt', verbose=True)
    word2vec.word2vec('movie_phrases_train.txt', 'movie_train.bin', size=100, verbose=True)
    model = word2vec.load('movie_train.bin')
    return model
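# A hedged usage sketch for train above: once the Cornell movie model is built, the
# returned object supports the usual word2vec queries. 'hello' is an illustrative
# word and is assumed to be in the trained vocabulary.
def demo_query_movie_model():
    model = train()
    indexes, metrics = model.cosine('hello', n=5)  # 5 nearest neighbours
    return model.generate_response(indexes, metrics).tolist()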
def create_model():
    in_file = open(sys.argv[1])
    out_file = open(sys.argv[2], "w")
    json_data = json.load(in_file)
    final_hash = {}
    model = word2vec.load(sys.argv[3])
    clusters = word2vec.load_clusters(sys.argv[4])
    for loc in json_data:
        count = 0
        keywords = []
        final_hash[loc] = {}
        final_hash[loc]["doc_length"] = json_data[loc]["len"]
        final_hash[loc]["keywords"] = []
        final_hash[loc]["centroids"] = []
        word_vectors = {}       # "word" => [vector]
        word_clusters = {}      # "cluster_no" => [words]
        cluster_centroids = {}
        for word in json_data[loc]["keywords"]:
            if len(word.split()) > 1:
                continue
            count += 1
            try:
                vec = model[word]
                cluster_no = clusters[word]
            except KeyError:
                #print("No entry in word2vec for " + word)
                continue
            word_vectors[word] = vec
            if cluster_no not in word_clusters:
                word_clusters[cluster_no] = []
                cluster_centroids[cluster_no] = len(vec) * [0.0]
            word_clusters[cluster_no].append(word)
            for i in range(len(vec)):
                cluster_centroids[cluster_no][i] += word_vectors[word][i]
        for cluster_no in word_clusters:
            cluster_len = len(word_clusters[cluster_no])
            for i in range(len(cluster_centroids[cluster_no])):
                cluster_centroids[cluster_no][i] = cluster_centroids[cluster_no][i] / cluster_len
        for cluster_no in word_clusters:
            keys = []
            for word in word_clusters[cluster_no]:
                keys.append((word, json_data[loc]["keywords"][word]))
            final_hash[loc]["keywords"].append(keys)
            final_hash[loc]["centroids"].append(cluster_centroids[cluster_no])
        #print(" Total keywords in " + loc + " : " + str(count))
        #print(" Total word vectors in " + loc + " : " + str(len(word_vectors)))
    json.dump(final_hash, out_file)
def load_data():
    papers = []
    filename = sys.argv[1]
    file = open(filename, 'r')
    for line in file.readlines():
        dic = json.loads(line)
        papers.append(dic)
    word_embedding = word2vec.load('word2vec_result.bin')  # load the pre-trained word2vec embedding
    print('load data finished')
    return papers, word_embedding
def test_model_with_clusters():
    clusters = word2vec.load_clusters(output_clusters)
    model = word2vec.load(output_txt)
    assert clusters.vocab.shape == model.vocab.shape
    model.clusters = clusters
    indexes, metrics = model.analogy(pos=["the", "the"], neg=["the"], n=30)
    assert indexes.shape == (30,)
    assert indexes.shape == metrics.shape
    py_response = model.generate_response(indexes, metrics).tolist()
    assert len(py_response) == 30
    assert len(py_response[0]) == 3
def init(path_to_we_model, path_to_relations):
    st = time.time()
    we_model = word2vec.load(path_to_we_model)
    et = time.time()
    we_loading_time = et - st
    st = time.time()
    relational_embedding = composition.compose_dataset(path_to_relations, we_model)
    et = time.time()
    relemb_build_time = et - st
    api = Fabric(we_model, relational_embedding, path_to_relations)
    print("Time to load WE model: " + str(we_loading_time))
    print("Time to build relemb: " + str(relemb_build_time))
    return api
def getanology(second, first, third):
    import word2vec
    # Import the word2vec binary file: dataset
    model = word2vec.load('/export/home/sysadmin/text8.bin')
    # We can do simple queries to retrieve words related to "word"
    indexes, metrics = model.analogy(pos=[first, third], neg=[second], n=10)
    #model.vocab[indexes]
    related_word = model.vocab[indexes[0]]
    return related_word
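# A hedged usage note for getanology above: the argument order is (second, first, third)
# and the query it builds is first - second + third. The classic king/man/woman example
# below is illustrative only and assumes all three words are in the model's vocabulary.
def demo_getanology():
    # computes 'king' - 'man' + 'woman', expected to land near 'queen'
    return getanology('man', 'king', 'woman')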
def embedding_transform(emb_file):
    model = word2vec.load(emb_file)
    vocab, vectors = model.vocab, model.vectors
    print(emb_file)
    print('shape of word embedding: {0}'.format(vectors.shape))
    new_file = emb_file.split('.')[0] + '_.txt'
    print('Transforming.....')
    with open(new_file, 'w') as f:
        for word, vec in zip(vocab, vectors):
            f.write(str(word) + ' ' + ' '.join(map(str, vec)) + '\n')
    print('Transform finished.')
def test_model_with_clusters():
    clusters = word2vec.load_clusters(output_clusters)
    model = word2vec.load(output_bin)
    assert clusters.vocab.shape == model.vocab.shape
    model.clusters = clusters
    indexes, metrics = model.analogy(pos=["the", "the"], neg=["the"], n=30)
    assert indexes.shape == (30,)
    assert indexes.shape == metrics.shape
    py_response = model.generate_response(indexes, metrics).tolist()
    assert len(py_response) == 30
    assert len(py_response[0]) == 3
def cosine_all():
    '''
    Use the model to compute cosine similarities for every name in the book.
    '''
    _name_list = name_list[:]
    _name_list = map(lambda _: _.encode('utf-8'), _name_list)
    model = word2vec.load('../tmp/book.bin')
    _ret = {}
    for _ in _name_list:
        try:
            _ret.update(model.cosine(_, n=10))
        except:
            print _ + ' not found'
    return _ret
def create_voabulary_labelO():
    model = word2vec.load('zhihu-word2vec-multilabel.bin-100', kind='bin')  # zhihu-word2vec.bin-100
    count = 0
    vocabulary_word2index_label = {}
    vocabulary_index2word_label = {}
    label_unique = {}
    for i, vocab in enumerate(model.vocab):
        if '__label__' in vocab:  # e.g. '__label__-2051131023989903826'
            label = vocab[vocab.index('__label__') + len('__label__'):]
            if label_unique.get(label, None) is None:  # if this label has not been seen yet, add it to the dictionaries
                vocabulary_word2index_label[label] = count
                vocabulary_index2word_label[count] = label  # ADD
                count = count + 1
                label_unique[label] = label
    return vocabulary_word2index_label, vocabulary_index2word_label
def matcher(line, context):
    model = word2vec.load('../lib/word2vec/vectors.bin')
    #clusters = word2vec.load_clusters('../lib/word2vec/text8-clusters.txt')
    a = numpy.loadtxt('latent_model.txt')
    with open('latent_tags.txt') as f:
        b = f.readlines()
    a = model['sports']
    print a
    b = model['sporting']
    print b
    result = 1 - spatial.distance.cosine(a, b)
    print result
    return 'jankiap50'
def genDB():
    global model
    con, cur = createDB()
    for corp in corpus:
        f, TBNAME = corp
        print f, TBNAME
        model = word2vec.load(word2vec_model[TBNAME])
        d, m = readPatterns(f)
        json.dump(m, open('syn_%s.json' % TBNAME, 'w+'))
        for word, data in d.iteritems():
            #print '.', word, TBNAME
            cur.execute("""INSERT INTO %s (word, data) VALUES('%s','%s');""" % (
                TBNAME, word.replace("'", "''"), json.dumps(data).replace("'", "''")))
        con.commit()
    #cur.execute("SELECT * FROM test;")
    cur.close()
    con.close()
def calculate_similarity(query, text, model_path):
    embeddingSize = 300
    query_embedding = np.zeros((1, embeddingSize))
    stop = stopwords.get_stopwords('english')
    model = word2vec.load(model_path)
    query_embedding = get_embedding(query, model, stop, query_embedding)
    nword = 0
    score = np.zeros(1)  # keep score as a length-1 array so score[0] is valid even when no word matches
    for word in nltk.tokenize.word_tokenize(text.decode('utf8')):
        if word in model and word not in stop:
            nword += 1
            wordNorm = linalg.norm(model[word])
            score += np.dot(query_embedding, model[word]) / wordNorm
    if nword != 0:
        score = score / nword
    print score[0]
    return score[0]
def inject_word2vec_embeddings_old(session, word2vec_path, input_size, dict_dir, source_vocab_size, target_vocab_size):
    # (100000, 300)
    word2vec_model = word2vec.load(word2vec_path, encoding="latin-1")  # automatically detects the format
    print("w2v model created!")
    source_vocab_path, target_vocab_path = get_source_target_vocab_path(dict_dir, source_vocab_size, target_vocab_size)
    w2v_vectors_source = get_w2v_pretrained_vectors(word2vec_model, source_vocab_path, source_vocab_size, input_size)
    w2v_vectors_target = get_w2v_pretrained_vectors(word2vec_model, target_vocab_path, target_vocab_size, input_size)
    print("pre-trained source shape " + str(w2v_vectors_source.shape))
    print(w2v_vectors_source)
    print(w2v_vectors_source.shape)  # (vocab_size, embedding_dim)
    with tf.variable_scope("embedding_attention_seq2seq"):
        with tf.variable_scope("RNN"):
            with tf.variable_scope("EmbeddingWrapper", reuse=True):
                # 1) getting the Variable containing the embeddings
                embedding = vs.get_variable("embedding", w2v_vectors_source.shape, trainable=False)
                # 2) using a placeholder to assign the embedding
                X = tf.placeholder(tf.float32, shape=w2v_vectors_source.shape)  # model.vectors.shape
                set_x = embedding.assign(X)
                session.run(tf.initialize_all_variables())
                session.run(set_x, feed_dict={X: w2v_vectors_source})
                v = session.run(embedding)
                print("After pre-trained")
                print(v)
    # embedding_attention_decoder | embedding_attention_seq2seq/embedding_attention_decoder/embedding:0
    with tf.variable_scope("embedding_attention_seq2seq"):
        with tf.variable_scope("embedding_attention_decoder", reuse=True):
            decoder_embedding = vs.get_variable("embedding", w2v_vectors_target.shape, trainable=False)
            # 2) using a placeholder to assign the embedding
            X = tf.placeholder(tf.float32, shape=w2v_vectors_target.shape)  # model.vectors.shape
            set_x = decoder_embedding.assign(X)
            session.run(tf.initialize_all_variables())
            session.run(set_x, feed_dict={X: w2v_vectors_target})
            v = session.run(decoder_embedding)
            print("After pre-trained")
            print(v)
def load_model(desc, tfidf_doc='split_plot', tfidf_wthr=1):
    """Load the appropriate model based on the descriptor type."""
    model = None
    if desc.startswith('tfidf'):
        model = encode_tfidf_model(tfidf_doc, tfidf_wthr)
        desc = desc + '-' + tfidf_doc + '-' + str(tfidf_wthr)
    elif desc == 'word2vec':
        model = w2v.load('models/movie_plots_1364.d-300.mc1.w2v', kind='bin')
    elif desc == 'skipthought':
        model = skipthoughts.load_model()
    elif desc == 'vis-text-embed':
        raise ValueError('Visual-Text embeddings are not yet supported.')
        # model = VisTextEncoder()
    return model, desc
def main():
    # model = word2vec.load('/home/lr/sasano/corpus/word2vec/jawiki-mecab.bin')
    model = word2vec.load('/home/lr/tsakaki/work/word2vec/w2v_jawiki_latest_mecab_baseform_s300_w500.bin')
    for line in sys.stdin:
        line = line.rstrip()
        lst = line.split('\t')
        arg_pred0 = lst[0]
        arg_pred1 = lst[1]
        clas = lst[2]
        clas_detail = lst[3] if len(lst) >= 4 else ""
        arg = cut_arg_pred(arg_pred0)[0]
        pred0 = cut_arg_pred(arg_pred0)[1]
        pred1 = cut_arg_pred(arg_pred1)[1]
        ans_vector = []
        try:
            ans_vector.extend(model[arg])
            pred0_vec = model[pred0]
            pred1_vec = model[pred1]
            diff_vec = []
            for i in xrange(0, len(pred0_vec)):
                diff_vec.append(pred0_vec[i] - pred1_vec[i])
            ans_vector.extend(diff_vec)
        except:
            continue
        label = 1 if (clas == "反義" and clas_detail == "属性反義") else -1
        # print arg
        # print pred0
        # print pred1
        # print label
        # sys.exit(1)
        sys.stdout.write("%d " % label)
        for ind, val in enumerate(ans_vector):
            sys.stdout.write("%d:%f " % (ind + 1, val))
        print
    return
def active_learn_main(engine, initial_term, user_id, concept_id=False):
    '''
    engine is the database engine used to run the concept-table queries
    initial_term is the seed term selected by the user
    user_id is the id of the user running the active-learning session
    concept_id is the id of an existing concept to continue from (False starts from scratch)
    '''
    # The user selects a term, which is run through the word2vec model to come up with similar terms.
    # If it is an existing concept, pull the existing data from the db; else start from scratch.
    if concept_id:
        # materialize the query results as sets so they can be updated below
        term_list = set(row[0] for row in engine.execute(select([ConceptTerms.c.term]).where(ConceptTerms.c.concept_id == concept_id)))
        term_exc = set(row[0] for row in engine.execute(select([ConceptTerms_reject.c.term]).where(ConceptTerms_reject.c.concept_id == concept_id)))
        pred_list = set(row[0] for row in engine.execute(select([ConceptPredictors.c.predictor]).where(ConceptPredictors.c.concept_id == concept_id)))
        pred_exc = set(row[0] for row in engine.execute(select([ConceptPredictorsReject.c.predictor]).where(ConceptPredictorsReject.c.concept_id == concept_id)))
    else:
        term_list = set([initial_term])
        term_exc = set()
        pred_list = set()
        pred_exc = set()
    # load in the model and clusters
    #model = word2vec.load('/groups/clinicaltrials/clinicaltrials/data/criteria.bin')
    #clusters = word2vec.load_clusters('/groups/clinicaltrials/clinicaltrials/data/criteria-clusters.txt')
    model = word2vec.load('../data/criteria.bin')
    clusters = word2vec.load_clusters('../data/criteria-clusters.txt')
    # add clusters to the model
    model.clusters = clusters
    # add skip terms to term_exc and pred_exc
    skip_term, skip_pred = skip_terms()
    term_exc.update(skip_term)
    pred_exc.update(skip_pred)
    term_list, pred_list = run_active_learning(term_list, term_exc, pred_list, pred_exc, engine, concept_id, user_id, model)