with open(r'D:/in_the_name_of_people_segment.txt', 'wb+') as f2:
    f2.write(result)
f.close()

import logging
import os
from gensim.models import word2vec

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
sentences = word2vec.LineSentence(r'D:/in_the_name_of_people_segment.txt')
model = word2vec.Word2Vec(sentences, hs=1, min_count=1, window=3, size=20)

req_count = 5
for key in model.wv.similar_by_word('沙瑞金', topn=20):  # topn=20 returns the 20 most similar words
    if len(key[0]) == 3:
        req_count -= 1
        print(key[0], key[1])
        if req_count == 0:
            break

end_time = time()
run_time = end_time - begin_time
print(run_time)  # elapsed time in seconds
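# --- Added sketch (not part of the original snippet): persist the trained model so the corpus
# --- does not have to be re-segmented and re-trained on every run; the .model path is hypothetical.
model.save(r'D:/in_the_name_of_people.model')
model = word2vec.Word2Vec.load(r'D:/in_the_name_of_people.model')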
train_data_features = vectorizer.fit_transform(questionList)
train_data_features = train_data_features.toarray()

# Get a list of all words from the feature list
vocab = vectorizer.get_feature_names()
# Sum the counts for each vocab word
dist = np.sum(train_data_features, axis=0)

num_features = 300
min_word_count = 30
num_workers = 4
context = 10
downsampling = 1e-3

model = word2vec.Word2Vec(questionList, workers=num_workers,
                          size=num_features, min_count=min_word_count,
                          window=context, sample=downsampling)
# No more training, makes the model more memory friendly
model.init_sims(replace=True)

def makeFeatureVec(words, model, num_features):
    # Preallocate a numpy array for speed
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0
    # Convert the model vocabulary to a set for speed
    index2word_set = set(model.wv.index2word)
    for word in words:
        if word in index2word_set:
            nwords = nwords + 1
            featureVec = np.add(featureVec, model.wv[word])
    # Average over the words found in the model vocabulary
    featureVec = np.divide(featureVec, nwords)
    return featureVec
def BuildSemanticModel(semantic_model_input_file, pretrained_input_file,
                       use_pretrained_vectors=True, high_sd_cutoff=3, low_n_cutoff=1):
    """
    Given an input file produced by the ALIGN Phase 1 functions, build a semantic
    model from all transcripts in all conversations in the target corpus after
    removing high- and low-frequency words. High-frequency words are determined
    by a user-defined number of SDs over the mean (by default, `high_sd_cutoff=3`).
    Low-frequency words must appear more than a specified number of raw
    occurrences (by default, `low_n_cutoff=1`). Frequency cutoffs can be removed
    with `high_sd_cutoff=None` and/or `low_n_cutoff=0`.
    """
    # build vocabulary list from transcripts
    data1 = pd.read_csv(semantic_model_input_file, sep='\t', encoding='utf-8')

    # get frequency count of all included words
    all_sentences = [re.sub('[^\w\s]+', '', str(row)).split(' ')
                     for row in list(data1['lemma'])]
    all_words = list([a for b in all_sentences for a in b])
    frequency = defaultdict(int)
    for word in all_words:
        frequency[word] += 1

    # keep only words that occur more often than the low-frequency cutoff (raw occurrences)
    frequency = {word: freq for word, freq in frequency.items() if freq > low_n_cutoff}

    # if desired, remove high-frequency words (over user-defined SDs above the mean)
    if high_sd_cutoff is None:
        contentWords = [word for word in frequency.keys()]
    else:
        getOut = np.mean(list(frequency.values())) + (np.std(list(frequency.values())) * high_sd_cutoff)
        contentWords = [word for word, freq in frequency.items() if freq < getOut]

    # decide whether to build the semantic model from scratch or load pretrained vectors
    if not use_pretrained_vectors:
        keepSentences = [[word for word in row if word in contentWords]
                         for row in all_sentences]
        semantic_model = word2vec.Word2Vec(keepSentences, min_count=low_n_cutoff)
    else:
        if pretrained_input_file is None:
            raise ValueError('Error! Specify path to pretrained vector file using the `pretrained_input_file` argument.')
        else:
            semantic_model = gensim.models.KeyedVectors.load_word2vec_format(
                pretrained_input_file, binary=True)

    # return all the content words and the trained word vectors
    return contentWords, semantic_model.wv
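# --- Hypothetical usage sketch (not part of the original code): call the function above with
# --- pretrained vectors; both file paths are placeholders.
content_words, vectors = BuildSemanticModel(
    semantic_model_input_file='phase1_output.txt',    # placeholder path
    pretrained_input_file='pretrained_vectors.bin',   # placeholder path
    use_pretrained_vectors=True,
    high_sd_cutoff=3,
    low_n_cutoff=1)
print(len(content_words), vectors.vector_size)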
is_test_train = np.zeros(N, dtype='int')
is_test_test = np.ones(M, dtype='int')
data_train['is_test'] = pd.Series(is_test_train, index=data_train.index)
data_test['is_test'] = pd.Series(is_test_test, index=data_test.index)

data_train_test = pd.concat([
    data_train[['question1', 'question2', 'is_test']],
    data_test[['question1', 'question2', 'is_test']]
], axis=0)

corpus = hr.build_corpus(data_train_test)
print("Corpus creado")

# Deliberate choice: word2vec is trained on the cleaned, combined set
model = word2vec.Word2Vec(corpus, size=100, window=20, min_count=200, workers=4)
model.save(path + 'mymodel')
print("Model word2vec creado y guardado")

del corpus  # hint to free up RAM

df_RF_train = hr.clean_dataframe_after_building_model(data_train)
df_RF_test = hr.clean_dataframe_after_building_model(data_test)
del model

df_RF_train.to_csv(path + 'df_RF_train.csv', index=False)
df_RF_test.to_csv(path + 'df_RF_test.csv', index=False)
sentences1 = ['this is a sentence', 'this is the second sentence']
# The correct input format: a list of tokenized sentences (a 2-D list of words)
sentences2 = [['this', 'is', 'a', 'sentence'],
              ['this', 'is', 'the', 'second', 'sentence']]
# A single long list of all words from the documents also works, as long as it stays 2-D.
# No deduplication is needed: Word2Vec expects word frequencies to be reflected in the training data.
sentences3 = [[
    'this', 'is', 'a', 'sentence', 'this', 'is', 'the', 'second', 'sentence'
]]

# Build the model
model = word2vec.Word2Vec(sentences2,
                          sg=1,
                          size=20,
                          window=1,
                          min_count=1,
                          negative=3,
                          sample=0.001,
                          hs=1,
                          workers=4)
'''
Word2Vec parameters
1. sentences: can be a plain list; for large corpora, BrownCorpus, Text8Corpus, or LineSentence is recommended.
2. sg: training algorithm. Default 0 uses CBOW; sg=1 uses skip-gram.
3. size: dimensionality of the output word vectors, default 100. Larger sizes need more training data but
   give better results; values from a few tens to a few hundred are typical.
4. window: training window size; 8 means 8 words before and 8 words after the target word are considered
   (the code also randomly samples the effective window size, up to the maximum). Default is 5.
5. alpha: the learning rate.
6. seed: seed for the random number generator, used when initializing the word vectors.
7. min_count: vocabulary truncation; words with frequency below min_count are dropped. Default is 5.
8. max_vocab_size: RAM limit while building the vocabulary. If the number of unique words exceeds it,
   the least frequent ones are pruned; roughly 1 GB of RAM per 10 million words. None means no limit.
9. sample: downsampling threshold; the more frequent a word is in the training data, the more likely it is
   to be downsampled. Default is 1e-3; useful values are in the range (0, 1e-5).
'''
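# --- Added follow-up sketch (not in the original): a few ways to query the model trained
# --- on sentences2 above.
print(model.wv['sentence'])                        # the 20-dimensional vector for one word
print(model.wv.most_similar('sentence', topn=3))   # nearest neighbours by cosine similarity
print(model.wv.similarity('this', 'is'))           # cosine similarity between two words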
# Split into nested lists so the data is in the format Word2Vec expects
product_names_word2vec = list()
for i in product_names_cleansing:
    a = i.split(" ")
    product_names_word2vec.append(a)

embedding_size = 300
# build the vocabulary from at most 40,000 words
min_count = 1             # only use words that occur at least n times
max_sentence_length = 38  # maximum sentence length

# Word2Vec embedding
w2v_model = word2vec.Word2Vec(product_names_word2vec, size=embedding_size, min_count=min_count)
print(w2v_model)

# similar words
# w2v_model.wv.most_similar("BANANA")

# word2vec weight matrix
w2v_weight = w2v_model.wv.vectors
w2v_weight.shape

# Build the {word: index} dictionaries, reserving indices for padding and out-of-vocabulary words
index2word = {i + 2: w for i, w in enumerate(w2v_model.wv.index2word)}
index2word[0] = 'PAD'
index2word[1] = 'UNK'
word2index = {w: i for i, w in index2word.items()}
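# --- Added sketch (not in the original): encode each tokenized product name as a fixed-length
# --- index sequence using word2index, mapping unseen words to 1 ('UNK') and padding with 0 ('PAD').
def encode(tokens, max_len=max_sentence_length):
    ids = [word2index.get(w, 1) for w in tokens][:max_len]
    return ids + [0] * (max_len - len(ids))

encoded_names = [encode(tokens) for tokens in product_names_word2vec]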
''' Building Word2Vec Model for Word Embeddings '''
print("Building Word2Vec model..")
# Initialize model-building timer
start = time.time()

# Check if a Word2Vec model name is specified
if Word2VecModelName:
    # Load a locally saved model
    v2wmodel = Word2Vec.load(Word2VecModelName)
else:
    # Build the Word2Vec model with the specified parameters
    v2wmodel = word2vec.Word2Vec(training_sentences,
                                 size=vector_dimensions,
                                 window=window_size,
                                 min_count=min_word_count,
                                 workers=number_of_workers)
    # Save the Word2Vec model under the specified name
    v2wmodel.save("Word2VecModel")

# End and display the time taken to build the Word2Vec model
print("Model built in : ", time.time() - start, "s.\n")

''' Embedding of Train Vectors '''
print("Creating Embedded Train Vectors..")
start = time.time()
tradeEnglish = pd.Series(data.loc[:, "Trade_English"])

# %% prepare for word2vec
tradeEnglish2 = tradeEnglish.apply(lambda x: [" ".join(x).strip()])
tradeEnglish3 = []
for item in tradeEnglish2:
    tradeEnglish3.append("".join(item))
pd.Series(tradeEnglish3).to_csv(dataPath + "sentences.csv", header=0, index=0)

# %% w2v
import logging
logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO)

from gensim.models import word2vec
sentences = word2vec.LineSentence(dataPath + "sentences.csv")
model = word2vec.Word2Vec(sentences, min_count=10, workers=4, size=300, window=5, iter=50)
model.save(dataPath + "word2vec.model")
from gensim.models import word2vec
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
sentences = word2vec.Text8Corpus('./test.txt')
model = word2vec.Word2Vec(sentences, size=200, min_count=20, window=15)
model.save("./test.model")
# Read the training data and convert it into corpus (list-of-token-lists) form first
from gensim.models import word2vec
from gensim.models import KeyedVectors, Word2Vec

sentences = []
with open("poem_for_embedding.txt") as f:
    for line in f.readlines():
        sentences.append(line.replace("\n", "").split(" "))

dim = 128
window = 5
min_count = 5
model = word2vec.Word2Vec(sentences, size=dim, window=window, min_count=min_count, workers=4)
model.save(f"vocab/w2v_{dim}.txt")
print(model.wv.most_similar("我", topn=10))
print(model.wv["我"])
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from gensim.models import word2vec
import logging

# main program
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
sentences = word2vec.Text8Corpus(r"C:\Users\CJ17\Desktop\text8")
model = word2vec.Word2Vec(sentences, sg=1, size=200, window=10, min_count=5, negative=10)

model.save(r"C:\Users\CJ17\Desktop\text8output\200dimension\text8.model")
model.wv.save_word2vec_format(
    r"C:\Users\CJ17\Desktop\text8output\200dimension\text8.model.vector")

y1 = model.wv.similarity("woman", "man")
print(u"woman and man :", y1)

'''
model.save("text8.model")  # add the desired path here
# corresponding way to load it back:
# model_2 = word2vec.Word2Vec.load("text8.model")

model.save_word2vec_format("text8.model.bin", binary=True)
# corresponding way to load it back:
# model_3 = word2vec.Word2Vec.load_word2vec_format("text8.model.bin", binary=True)
'''
# -*- coding: utf-8 -*-
from gensim.models import word2vec
from gensim.models.word2vec import LineSentence
import logging
import sys

reload(sys)
sys.setdefaultencoding('utf8')

if __name__ == "__main__":
    logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)
    txt = word2vec.Text8Corpus(u'train.data.model')
    model = word2vec.Word2Vec(txt, size=100, window=50, min_count=1)
    model.save('word2vec.model')
cleantext = " ".join(tokens) nlp = spacy.load('en_core_web_sm') # make sure to use larger model! doc = nlp(cleantext) list_of_lists = [] for sentence in doc.sents: inner_list = [] for token in sentence: inner_list.append(token.text) list_of_lists.append(inner_list) model = word2vec.Word2Vec(list_of_lists, size=200, window=5, min_count=4, workers=4) def tsne_plot(doc, myperplexity, title): "Creates and TSNE model and plots it" labels = [] tokens = [] #model.wv('trump') accesses the word vector for the word 'trump. #vocab is the list of words for word in model.wv.vocab: #model[word] is the matrix (word vector) of the word
sentences = []
for raw_sentence in raw_sentences:
    if len(raw_sentence) > 0:
        sentences.append(word_list(raw_sentence))

token_count = sum([len(sentence) for sentence in sentences])
print("\nToken count = {:,}".format(token_count))

# Build Word2Vec model
num_features = 300
min_word_count = 3
num_workers = multiprocessing.cpu_count()
context_size = 7

thrones2vec = w2v.Word2Vec(sg=1,
                           size=num_features,
                           min_count=min_word_count,
                           window=context_size,
                           workers=num_workers)
thrones2vec.build_vocab(sentences)

# # Train the model
# thrones2vec.train(sentences)
# if not os.path.exists(model_path):
#     os.mkdir(model_path)
# thrones2vec.save(model_name)

thrones2vec = w2v.Word2Vec.load(model_name)
man_sim = thrones2vec.wv.most_similar(positive=['man', 'woman'], negative=['girl'], topn=1)
print(man_sim)
def my_word2vec(cut_filename):
    mysetence = word2vec.Text8Corpus(cut_filename)
    # model = word2vec.Word2Vec(mysetence, size=300, min_count=1, window=5, hs=5)
    model = word2vec.Word2Vec(mysetence, size=100, min_count=1, window=5, hs=5)
    model.save('./model/zh_wiki_global.model')
    return model
def learn_embeddings(walks, output, dimensions=100, window_size=5, min_count=0, sg=1,
                     iterations=3, alpha=.1, min_alpha=.01, workers=4):
    model = word2vec.Word2Vec(sentences=walks,
                              size=dimensions,
                              window=window_size,
                              min_count=min_count,
                              sg=sg,
                              workers=workers,
                              iter=iterations,
                              alpha=alpha,
                              min_alpha=min_alpha)
    model.wv.save_word2vec_format(output)
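# --- Hypothetical usage sketch (not in the original): `walks` would normally come from a
# --- random-walk generator (e.g. node2vec); a tiny toy list of string node IDs is used here.
toy_walks = [['0', '1', '2', '1'], ['2', '1', '0', '3']]
learn_embeddings(toy_walks, output='toy_embeddings.txt', dimensions=16, window_size=2)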
twitter = Twitter()
results = []
lines = text.split("\r\n")
for line in lines:
    # morphological analysis --- (※3)
    # use the base form of each word
    malist = twitter.pos(line, norm=True, stem=True)
    r = []
    for word in malist:
        # exclude endings, particles, punctuation, etc.
        if not word[1] in ["Josa", "Eomi", "Punctuation"]:
            r.append(word[0])
    rl = (" ".join(r)).strip()
    results.append(rl)
    print(rl)

# write the results to a file --- (※4)
wakati_file = 'yesterday.model'
with open(wakati_file, 'w', encoding='utf-8') as fp:
    fp.write("\n".join(results))

# build the Word2Vec model --- (※5)
# read the tokenized text file with the LineSentence helper
data = word2vec.LineSentence(wakati_file)
model = word2vec.Word2Vec(data, size=200, window=10, hs=1, min_count=2, sg=1)
# save the model
model.save("yesterday.model")
print("ok")
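# --- Added follow-up sketch (not in the original): reload the saved model and query it;
# --- the probe word "눈물" ("tears") is only a hypothetical example and must exist in the vocabulary.
model = word2vec.Word2Vec.load("yesterday.model")
print(model.wv.most_similar("눈물", topn=5))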
def train_word2vec(inputFile, modelFile):
    sentences = word2vec.LineSentence(inputFile)
    model = word2vec.Word2Vec(sentences, size=300, min_count=1, sg=1)
    model.save(modelFile)
from gensim.models import word2vec
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
sentences = word2vec.Text8Corpus('text8')
model = word2vec.Word2Vec(sentences, size=200)
###############################################################
mpg = MetaPathGenerator()
mpg.read_data("gene")

###############################################################
## relational representation vectors for the papers
###############################################################
all_embs = []
rw_num = 10
cp = set()
for k in range(rw_num):
    mpg.generate_WMRW("gene/RW.txt", 5, 20)
    sentences = word2vec.Text8Corpus(r'gene/RW.txt')
    model = word2vec.Word2Vec(sentences, size=100, negative=25, min_count=1, window=10)
    embs = []
    for i, pid in enumerate(pubs):
        if pid in model.wv:
            embs.append(model.wv[pid])
        else:
            cp.add(i)
            embs.append(np.zeros(100))
    all_embs.append(embs)
all_embs = np.array(all_embs)
print(cp)

###############################################################
## semantic representation vectors for the papers
def word_2_vec(path):
    sentences = word2vec.LineSentence(path)
    model = word2vec.Word2Vec(sentences, size=300, min_count=20, window=5)
    return model
context_size = 7

# downsampling setting for frequent words
# 0 - 1e-5 is good for this
downsampling = 1e-3

# seed for the RNG, to make the results reproducible
# random number generator
# deterministic, good for debugging
seed = 1

thrones2vec = w2v.Word2Vec(
    sg=1,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)

thrones2vec.build_vocab(sentences)
print("Word2Vec vocabulary length:", len(thrones2vec.wv.vocab))

thrones2vec.train(sentences)

if not os.path.exists("trained"):
    os.makedirs("trained")
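# --- Added continuation sketch (a standard save/load pattern, not taken from the original snippet):
# --- persist the trained model into the "trained" directory created above and reload it later.
thrones2vec.save(os.path.join("trained", "thrones2vec.w2v"))
thrones2vec = w2v.Word2Vec.load(os.path.join("trained", "thrones2vec.w2v"))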
# -*- coding: utf-8 -*-
from gensim.models import word2vec
import sys

args = sys.argv
data = word2vec.Text8Corpus(args[1] + '.txt')
model = word2vec.Word2Vec(data, size=200, min_count=1)
model.save(args[1] + ".model")
sentences += review_to_sentences(review, tokenizer)

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

num_features = 50
min_word_count = 1
num_workers = 4
context = 10
downsampling = 1e-3

from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(sentences, workers=num_workers,
                          size=num_features, min_count=min_word_count,
                          window=context, sample=downsampling, sg=0)
model.init_sims(replace=True)

model_name = "50features_1minwords_1000context"
model.save(model_name)

def makeFeatureVec(words, model, num_features):
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0.
    index2word_set = set(model.wv.index2word)
    for word in words:
def generate_word2vec():
    s = word2vec.LineSentence(novel_seg_path)
    model = word2vec.Word2Vec(s, size=20, window=5, min_count=5, workers=4)
    model.save(novel_wzv_path)
    return model
    line = line.replace('\t', '').replace('\n', '').replace(' ', '')
    seg_list = jieba.cut(line, cut_all=False, HMM=True)
    f2.write(" ".join(seg_list))
f1.close()
f2.close()

##### train the model
from gensim.models import word2vec
import logging

# main program
logging.basicConfig(format='%(asctime)s:%(levelname)s: %(message)s', level=logging.INFO)
sentences = word2vec.Text8Corpus(u"Word2vec_jieba.txt")
model = word2vec.Word2Vec(sentences, size=50)  # train the model (default window=5)
print(model)

# 1. compute the similarity/relatedness of two words
try:
    y1 = model.similarity("阿里", "万达")
except:
    y1 = 0
print("Similarity between [阿里] and [万达]: %s" % y1)
print("-----\n")

# 2. get the list of words most related to a given word
y2 = model.most_similar("阿里", topn=30)  # the 30 most related words
print("Words most related to [阿里]:\r\n")
for item in y2:
    print(item[0], item[1])
from gensim.models import word2vec

# read the corpus --- (※1)
sentences = word2vec.Text8Corpus('./wiki_wakati.txt')
# build the model --- (※2)
model = word2vec.Word2Vec(sentences, sg=1, size=100, window=5)
# save the model --- (※3)
model.save("./wiki.model")
        if line == '\n':
            continue
        temp = line.replace('\n', '').split('\t')
        temp[1] = ''.join(temp[1].split())
        temp[1] = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？、~@#¥%……&*()~-]+|[A-Za-z0-9]+",
                         "", temp[1])
        # sentences.append(temp[1])
        sentences.append(jieba.lcut(temp[1]))
    return sentences

sentences_train = read(trainDataSource)
sentences_train_validation = read(trainDataSource) + read(validationDataSource)

embeddingSize = 300
miniFreq = 1

# CBOW (sg=0) on the training set
word2VecModel_1 = word2vec.Word2Vec(sentences=sentences_train, size=embeddingSize, min_count=miniFreq,
                                    window=10, workers=multiprocessing.cpu_count(), sg=0, iter=20)
word2VecModel_1.save('word2VecModel_1')

# CBOW (sg=0) on training + validation
word2VecModel_2 = word2vec.Word2Vec(sentences=sentences_train_validation, size=embeddingSize, min_count=miniFreq,
                                    window=10, workers=multiprocessing.cpu_count(), sg=0, iter=20)
word2VecModel_2.save('word2VecModel_2')

# skip-gram (sg=1) on the training set
word2VecModel_3 = word2vec.Word2Vec(sentences=sentences_train, size=embeddingSize, min_count=miniFreq,
                                    window=10, workers=multiprocessing.cpu_count(), sg=1, iter=20)
word2VecModel_3.save('word2VecModel_3')

# skip-gram (sg=1) on training + validation
word2VecModel_4 = word2vec.Word2Vec(sentences=sentences_train_validation, size=embeddingSize, min_count=miniFreq,
                                    window=10, workers=multiprocessing.cpu_count(), sg=1, iter=20)
word2VecModel_4.save('word2VecModel_4')

import gensim
            size=num_features, min_count=min_word_count,
            window=context, sample=downsampling, hashfxn=myhash)

python 2.x declaration would be
model = word2vec.Word2Vec(bagOfsentences, workers=num_workers,
            size=num_features, min_count=min_word_count,
            window=context, sample=downsampling)
'''

print("Training model...")
model = word2vec.Word2Vec(bagOfsentences, workers=num_workers,
                          size=num_features, min_count=min_word_count,
                          window=context, sample=downsampling)

"""
If you don't plan to train the model any further, calling init_sims
will make the model much more memory-efficient.
If `replace` is set, forget the original vectors and only keep the
normalized ones = saves lots of memory!
"""
model.init_sims(replace=False)

# save the model for later use
# for loading, call Word2Vec.load()
model.save("../../classifier/Word2VectforNLPTraining")
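# --- Added follow-up sketch (not in the original): reload the saved model later, as the comment
# --- above suggests; the query word 'movie' is only a hypothetical example.
loaded_model = word2vec.Word2Vec.load("../../classifier/Word2VectforNLPTraining")
print(loaded_model.wv.most_similar('movie', topn=5))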
def train_word2vec(self):
    sentences = word2vec.Text8Corpus(self.args['all_text_path'])
    model = word2vec.Word2Vec(sentences, size=128, negative=5, min_count=2, window=5)
    model.save(self.args['save_word2vec_model'])