def loadWord2VecAndVectorizeInputs(X_train, X_test, word2vecURI):
    """Load a text-format Word2Vec model and vectorize both data splits.

    Args:
        X_train: training inputs, forwarded unchanged to ``vectorizeInput``.
        X_test: test inputs, forwarded unchanged to ``vectorizeInput``.
        word2vecURI: location of the word2vec text file (``binary=False``).

    Returns:
        tuple: ``(X_train_vectorized, X_test_vectorized, w2v_model, word2index)``.
    """
    # load Word2Vec model
    w2v_model = Word2VecKeyedVectors.load_word2vec_format(word2vecURI, binary=False)
    # The placeholder has to be filled with the % operator; passing the value
    # as a second print() argument leaves a literal "%s" in the output.
    print("vocab_size = %s" % len(w2v_model.vocab))

    # Ask the model for its dimensionality instead of probing one specific
    # word, which raises KeyError for any vocabulary that lacks that word.
    num_features = w2v_model.vector_size
    print("num_features = ", num_features)
    print("len(X_train) = ", len(X_train))
    print("len(X_test) = ", len(X_test))

    # vector handed to vectorizeInput together with the model
    # (presumably substituted for words the model does not know -- confirm
    # against vectorizeInput)
    empty_word = np.zeros(num_features, dtype=float)

    # Both containers are shared between the two calls so the reported counts
    # cover the train and the test split together.
    missedWords = []
    word2index = {}

    X_train_vectorized = vectorizeInput(X_train, w2v_model, empty_word, missedWords, word2index)
    X_test_vectorized = vectorizeInput(X_test, w2v_model, empty_word, missedWords, word2index)

    print("Number of used words = ", len(word2index))
    print("Number of words missing = ", len(set(missedWords)))
    return X_train_vectorized, X_test_vectorized, w2v_model, word2index
def tokenizeLemmatizeDataSet(X_train, X_test, word2vecURI):
    """Load a text-format Word2Vec model and run ``prepareDataSet`` on both splits.

    Args:
        X_train: training inputs, forwarded to ``prepareDataSet``.
        X_test: test inputs, forwarded to ``prepareDataSet``.
        word2vecURI: location of the word2vec text file (``binary=False``).

    Returns:
        tuple: ``(X_train, X_test, w2v_model, word2index)`` where both splits
        are the values returned by ``prepareDataSet``.
    """
    lemmatizer, tokenizer, stop_words = initTokenizers()

    # load Word2Vec model
    w2v_model = Word2VecKeyedVectors.load_word2vec_format(word2vecURI, binary=False)
    # The placeholder has to be filled with the % operator; passing the value
    # as a second print() argument leaves a literal "%s" in the output.
    print("vocab_size = %s" % len(w2v_model.vocab))

    # Ask the model for its dimensionality instead of probing one specific
    # word, which raises KeyError for any vocabulary that lacks that word.
    num_features = w2v_model.vector_size
    print("num_features = ", num_features)
    print("len(X_train) = ", len(X_train))
    print("len(X_test) = ", len(X_test))

    # Both containers are shared between the two calls so the reported counts
    # cover the train and the test split together.
    missedWords = []
    word2index = {}

    X_train = prepareDataSet(lemmatizer, tokenizer, stop_words, X_train,
                             missedWords, word2index, w2v_model)
    X_test = prepareDataSet(lemmatizer, tokenizer, stop_words, X_test,
                            missedWords, word2index, w2v_model)

    print("Number of used words = ", len(word2index))
    print("Number of words missing = ", len(set(missedWords)))
    return X_train, X_test, w2v_model, word2index
def load_wv(url):
    """
    Read word vectors stored in the (non-binary) word2vec format.

    Args:
        url: location of the word-vector file

    Returns:
        Word2VecKeyedVectors: the loaded vectors
    """
    keyed_vectors = Word2VecKeyedVectors.load_word2vec_format(url, binary=False)
    return keyed_vectors
def readorg(self):
    """Export the embeddings of all known food-related words to a text file.

    Words come from three sources: ``self.readcorpus()`` (a flat sequence of
    words), ``DataLoader().load_dish_name()`` and ``self.readregion()`` (both
    iterated as sequences of word groups). Every word present in the model
    loaded from ``self.txtfilepath`` is written to
    ``./data/ChineseFoodEmbedding.txt`` as one line: the word, a space, then
    the string form of its value list with brackets stripped (so the values
    stay comma-separated).

    The vectors of the first ten basic words that were found are printed as a
    quick preview.
    """
    vector_len = 200  # number of leading vector components exported per word

    dishnames = DataLoader().load_dish_name()
    basicwords = self.readcorpus()
    regionnames = self.readregion()
    model = Word2VecKeyedVectors.load_word2vec_format(self.txtfilepath, binary=False)

    dic = {}

    def _collect(words):
        # Store the leading components of every in-vocabulary word as plain floats.
        for word in words:
            # Membership is asked of the vectors object itself; going through
            # the deprecated ``.wv.vocab`` indirection is unnecessary.
            if word in model:
                vector = model[word]
                dic[word] = [float(vector[i]) for i in range(vector_len)]

    _collect(basicwords)
    for dishname in dishnames:
        _collect(dishname)
    for regionname in regionnames:
        _collect(regionname)

    # Preview only words that were actually found, so an out-of-vocabulary
    # word or a corpus shorter than ten entries no longer aborts the export.
    for word in basicwords[:10]:
        if word in dic:
            print(dic[word])

    # Compiled once; removes the list brackets (and any stray line breaks or
    # tabs) from the textual form of the value list.
    strip_chars = re.compile(r'[\[\]\n\r\t]')
    with open('./data/ChineseFoodEmbedding.txt', 'w', encoding='utf-8') as f:
        for key, values in dic.items():
            f.write(key + " " + strip_chars.sub("", str(values)) + '\n')
def from_pretrained(cls,  # type: ignore
                    embed_type: str, embed_path: Path, word_vocab: Vocabulary, embed_dim: int,
                    freeze=True, sparse=False) -> 'Embedding':
    r"""
    Build an :class:`Embedding` whose weights are copied from pretrained vectors on disk.

    :param embed_type: Kind of pretrained vectors: ``word2vec`` or ``fasttext``. ``glove`` is
        recognised but raises :class:`NotImplementedError`; anything else raises
        :class:`ValueError`.
    :param embed_path: Path to the saved vectors; it is resolved before loading.
    :param word_vocab: A vocabulary mapping words to row indices.
    :param embed_dim: Expected embedding dimension; asserted against the loaded vectors.
    :param freeze: Forwarded to the parent ``from_pretrained``.
    :param sparse: Forwarded to the parent ``from_pretrained``.
    """
    resolved_path = str(embed_path.resolve())

    # The gensim imports stay local so only the requested backend is imported.
    if embed_type == 'glove':
        raise NotImplementedError
    if embed_type == 'word2vec':
        from gensim.models.word2vec import Word2VecKeyedVectors as vectors_cls
    elif embed_type == 'fasttext':
        from gensim.models.fasttext import FastTextKeyedVectors as vectors_cls
    else:
        raise ValueError(f"Embedding type {embed_type} not supported.")
    pretrained = vectors_cls.load(resolved_path)

    assert pretrained.vector_size == embed_dim

    # One row per vocabulary entry, filled at the index the vocabulary assigns.
    # NOTE(review): every vocabulary word is looked up in the pretrained model;
    # presumably a word missing there raises -- confirm that is intended.
    weights = np.zeros((len(word_vocab), embed_dim))
    for token, row in word_vocab.items():
        weights[row] = pretrained.get_vector(token)

    # `cls` already refers to the subclass here, so the instance returned by
    # the parent constructor needs no re-binding of `forward`.
    return super().from_pretrained(weights, freeze=freeze, sparse=sparse)
def __init__(self, weights : str = None, size : int=100, window : int=5, min_count : int=1, normalize : str=None, dictionary=None, batch_size : int=10, **kwargs):
    """Set up the wrapper either from saved vectors or via the parent constructor.

    :param weights: When truthy, passed to ``load_word2vec_format`` with
        ``binary=True`` and the result is stored in ``self.obj``.
    :param size: Forwarded to the parent constructor when no weights are given.
    :param window: Stored on the instance and forwarded to the parent constructor.
    :param min_count: Forwarded to the parent constructor.
    :param normalize: Stored on the instance as-is.
    :param dictionary: Optional object exposing ``values()``; stored on the instance.
    :param batch_size: Stored on the instance as-is.
    :param kwargs: Extra keyword arguments forwarded to the parent constructor.
    """
    self.dictionary = dictionary
    self.window = window
    if weights:
        # Saved vectors in the binary word2vec format.
        self.obj = Word2VecKeyedVectors.load_word2vec_format(weights, binary=True)
    else:
        super(Word2VecWrapper, self).__init__(
            size=size, window=self.window, min_count=min_count, **kwargs)
        # NOTE(review): the nesting of this check under the no-weights branch is
        # reconstructed from whitespace-collapsed source, and `self.obj` is
        # presumably created by the parent constructor -- confirm both.
        if self.dictionary:
            # All dictionary values are passed as one single "sentence".
            self.obj.build_vocab([[v for v in self.dictionary.values()]])
    self.normalize = normalize
    self.batch_size = batch_size
from gensim.models.word2vec import Word2VecKeyedVectors

# Load the embedding file in word2vec text format (binary=False).
wv = Word2VecKeyedVectors.load_word2vec_format(
    "/hd/tecent_ew/Tencent_AILab_ChineseEmbedding.txt", binary=False)

# Query word whose nearest neighbours are printed below.
kw = "电话号码"
# Python 2 print statement: emits the query word followed by its ten most
# similar words joined with "/" (the similarity scores are dropped).
print kw, "/".join([word for word, sim in wv.most_similar(kw, topn=10)])
#https://www.cnblogs.com/bymo/p/8440722.html

# The bare string below is a recorded interactive session (Python 2.7, per the
# path in the warning it contains) showing sample output for two query words.
'''
>>> kw = u"电话号码"
>>> print kw, "/".join([word for word, sim in wv.most_similar(kw, topn=10)])
/usr/local/lib/python2.7/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):
电话号码 手机号码/手机号/电话号/座机号码/你的电话号码/联系号码/几个电话号码/座机号/电话/新号码
>>> print kw, "/".join([word for word, sim in wv.most_similar(kw, topn=100)])
电话号码 手机号码/手机号/电话号/座机号码/你的电话号码/联系号码/几个电话号码/座机号/电话/新号码/电话拨打/我的电话/私人号码/联系方式/家里的电话号码/电话信息/个人电话/手机通讯录/通话记录/手机电话/联系人电话/你的手机号/qq号码/家庭地址/办公电话/银行账号/电话记录/你的电话/银行卡号/联络方式/固定电话号码/电话本/短信内容/电话簿/电子邮箱地址/家庭电话/移动电话号码/通讯号码/号码显示/私人手机/父母的电话/姓名地址/卡号/办公室电话/短信/住址/私人电话/银行卡号码/电话薄/拨通/电子邮件地址/陌生号码/号码/qq号/其他联系方式/一串数字/住宅电话/显示号码/两个电话/回拨电话/短信信息/单位电话/手机联系人/家庭号码/拨通电话/联系电话号码/通信录/微信号码/通讯录/家庭住址/联系人姓名/联系人的姓名/常用号码/电话和短信/银行卡账号/手机拨打/短信息/拨打/座机电话号码/详细住址/银行卡密码/银行卡卡号/手机电话号码/办公室号码/信用卡号/座机电话/身份证信息/通讯记录/公用电话/通讯簿/留电话/打电话/收到的短信/空号/网络联系方式/骚扰电话/查号码/联络电话/电话通讯录/email地址
>>> kw = u"短信"
>>> print kw, "/".join([word for word, sim in wv.most_similar(kw, topn=100)])
短信 短息/短信息/一条短信/短信内容/电话和短信/手机短信/微信信息/微信消息/陌生号码/短信里/收到短信/短信回复/诈骗短信/陌生短信/通知短信/短信发送/群发短信/qq信息/打开短信/短信提醒/验证码短信/匿名短信/短信信息/qq消息/短消息/回复短信/垃圾短信/一条信息/发信息/骚扰短信/我的短信/手机号码/广告短信/信息回复/电话/语音留言/发送短信/收到的短信/回复信息/发短息/10086/问候短信/扣费短信/电话信息/群发/群发信息/qq留言/祝福短信/短信轰炸/语音电话/微信提醒/祝福信息/邮件/手机信息/微信短信/条微信/发短信/彩信/语音消息/短信提示/电话号码/那条短信/骗子短信/语音信息/信息提醒/转账信息/手机号/推送消息/看短信/骚扰电话/所有短信/短讯/电子邮件/以及短信/电话短信/我的电话/打电话/短信或电话/发消息/手机短信息/通讯录好友/发送失败/短信通知/短信电话/手机里/一则短信/陌生来电/微信发/陌生电话/道歉短信/接到的电话/短信消息/电话或短信/诈骗电话/回短信/未知号码/短信显示/未读信息/诈骗信息/新号码
'''
# -*- coding:UTF-8 -*-
"""
List the NER dictionary entries that have no vector in the embedding file.

@File    : test_wv.py
@Time    : 2019/4/17 22:21
@Author  : Blue Keroro
"""
from gensim.models.word2vec import Word2VecKeyedVectors

if __name__ == '__main__':
    from time import time

    start = time()

    print('加载词向量')
    wv_from_text = Word2VecKeyedVectors.load_word2vec_format(
        'C:/Tencent_AILab_ChineseEmbedding/Tencent_AILab_ChineseEmbedding.txt', binary=False)
    print('加载词向量 完毕')

    # Every stripped dictionary line without a vector is copied to the error file.
    with open('models/nerDict.txt', 'r', encoding='utf-8') as dict_file:
        with open('models/nerDict_tencent_error.txt', 'w', encoding='utf-8') as missing_file:
            for raw_line in dict_file:
                entry = raw_line.strip()
                try:
                    if entry in wv_from_text:
                        continue
                    missing_file.write(entry + '\n')
                except Exception as e:
                    # Report the offending entry and carry on with the next one.
                    print('出现问题', 'line:', entry, 'error:', e)

    elapsed = time() - start
    print('end', elapsed)
def __init__(self):
    """Load the saved word vectors from a fixed local path with ``mmap='r'``."""
    vectors_path = '/Users/linjliang/Learning/PROJECT/workspace/functional/Tencent_AILab_smallEmbedding/1M.bin'
    self.wv = Word2VecKeyedVectors.load(vectors_path, mmap='r')
import os
import tarfile

from gensim.models.word2vec import Word2VecKeyedVectors

# Downloaded embedding archive.
file = r"F:\NLP_learnings\词向量\腾讯中文词向量\Tencent_AILab_ChineseEmbedding.tar.gz"

# load_word2vec_format() parses a word2vec text file whose first line is
# "<vocab_size> <vector_size>". A tar archive does not start that way, so the
# .txt member is unpacked next to the archive first (skipped when a file of
# that name is already there).
target_dir = os.path.dirname(file)
with tarfile.open(file, "r:gz") as archive:
    # Iterating the archive lazily stops at the first match instead of
    # scanning the whole (multi-gigabyte) file up front.
    member = next((m for m in archive if m.isfile() and m.name.endswith(".txt")), None)
    if member is None:
        raise ValueError("no .txt embedding file found in %s" % file)
    # Flatten the member path so extraction cannot write outside target_dir.
    member.name = os.path.basename(member.name)
    vectors_path = os.path.join(target_dir, member.name)
    if not os.path.exists(vectors_path):
        archive.extract(member, target_dir)

# The embedding text is read as UTF-8 (gensim's default); decoding it as GBK
# garbles or rejects the Chinese tokens.
wv_from_text = Word2VecKeyedVectors.load_word2vec_format(
    vectors_path, binary=False, encoding='utf-8')
print(wv_from_text)
def __init__(self, path):
    """Load word vectors in (non-binary) word2vec format from ``path`` into ``self.model``."""
    keyed_vectors = Word2VecKeyedVectors.load_word2vec_format(path, binary=False)
    self.model = keyed_vectors
def load_model(filepath):
    """Return the vectors that ``Word2VecKeyedVectors.load`` reads from ``filepath``."""
    keyed_vectors = Word2VecKeyedVectors.load(filepath)
    return keyed_vectors
def read(self):
    """Load and return the (non-binary) word2vec file at ``self.foodfilepath``."""
    return Word2VecKeyedVectors.load_word2vec_format(self.foodfilepath, binary=False)