def build(train_x_seg_path, test_y_seg_path, test_seg_path, out_path=None, sentence_path='', w2v_bin_path="w2v.bin", min_count=1): sentences = extract_sentence(train_x_seg_path, test_y_seg_path, test_seg_path) save_sentence(sentences, sentence_path) print('train w2v model...') # train model """ 通过gensim工具完成word2vec的训练,输入格式采用sentences,使用skip-gram,embedding维度256 your code w2v = (one line) """ # print(type(sentences)) # print(sentences) sentence = LineSentence(sentence_path) w2v = Word2Vec(sentence, size = 256, sg = 1, min_count = 4, workers = 4) w2v.wv.save_word2vec_format(w2v_bin_path, binary=True) print("save %s ok." % w2v_bin_path) # test sim = w2v.wv.similarity('技师', '车主') print('技师 vs 车主 similarity score:', sim) # load model model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True) word_dict = {} for word in model.vocab: word_dict[word] = model[word] dump_pkl(word_dict, out_path, overwrite=True)
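# extract_sentence(), save_sentence(), and dump_pkl() are project helpers that every
# variant below relies on but none defines. The sketch below shows plausible minimal
# implementations under the assumption that the input files are already segmented,
# one space-separated sentence per line; the _sketch names are hypothetical and the
# real helpers may differ (e.g. in how columns are separated).
import pickle


def extract_sentence_sketch(*seg_paths):
    # Merge the non-empty lines of every segmented input file into one list of sentences.
    sentences = []
    for path in seg_paths:
        with open(path, 'r', encoding='utf-8') as f:
            sentences.extend(line.strip() for line in f if line.strip())
    return sentences


def save_sentence_sketch(sentences, sentence_path):
    # Write one space-separated sentence per line, the format LineSentence expects.
    with open(sentence_path, 'w', encoding='utf-8') as f:
        for sentence in sentences:
            f.write(sentence + '\n')


def dump_pkl_sketch(obj, out_path, overwrite=True):
    # Pickle an object to disk, optionally refusing to clobber an existing file.
    if out_path and (overwrite or not os.path.exists(out_path)):
        with open(out_path, 'wb') as f:
            pickle.dump(obj, f)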
def build(train_texts_path, train_questions_path, train_answers_path, test_texts_path,
          test_answers_path, out_path=None, sentence_path='', w2v_bin_path="w2v.bin", min_count=1):
    sentences = extract_sentence(train_texts_path, train_questions_path, train_answers_path,
                                 test_texts_path, test_answers_path)
    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # Train word2vec with gensim: sentences as input, skip-gram, 256-dimensional embeddings.
    w2v = Word2Vec(sg=1, sentences=LineSentence(sentence_path), size=256, window=5,
                   min_count=min_count, iter=40)
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)
    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    dump_pkl(word_dict, out_path, overwrite=True)
def build(train_x_seg_path, test_y_seg_path, test_seg_path, out_path=None, sentence_path='',
          w2v_bin_path="w2v.bin", min_count=1):
    sentences = extract_sentence(train_x_seg_path, test_y_seg_path, test_seg_path)
    save_sentence(sentences, sentence_path)
    print(len(sentences))
    print('train w2v model...')
    # Train word2vec with gensim: skip-gram, 256-dimensional embeddings.
    # LineSentence converts the txt file into the required format
    # (already segmented, space-separated).
    w2v = Word2Vec(LineSentence(sentence_path), size=256, sg=1, min_count=min_count)
    # Save the model in binary format.
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)
    # test: check how similar two word vectors are
    sim = w2v.wv.similarity('技师', '车主')
    print('技师 vs 车主 similarity score:', sim)
    # Load the trained word vectors.
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    dump_pkl(word_dict, out_path, overwrite=True)
def build(train_x_seg_path, test_y_seg_path, test_seg_path, out_path=None, sentence_path='',
          w2v_bin_path="w2v.bin", min_count=1):
    sentences = extract_sentence(train_x_seg_path, test_y_seg_path, test_seg_path)
    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # train model
    w2v = Word2Vec(sg=1, sentences=LineSentence(sentence_path), size=256, window=5,
                   min_count=min_count, iter=40)
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)
    # test
    sim = w2v.wv.similarity('技师', '车主')
    print('技师 vs 车主 similarity score:', sim)
    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    dump_pkl(word_dict, out_path, overwrite=True)
def build(train_x_seg_path, test_y_seg_path, test_seg_path, out_path=None, sentence_path='',
          w2v_bin_path="w2v.bin", min_count=100):
    sentences = extract_sentence(train_x_seg_path, test_y_seg_path, test_seg_path)
    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # Train the model; LineSentence streams sentences from the file.
    w2v = Word2Vec(sg=1, sentences=LineSentence(sentence_path), size=256, window=5,
                   min_count=min_count, iter=5)
    # Save the model.
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)
    # Sanity-check the trained word vectors.
    sim = w2v.wv.similarity('技师', '车主')
    print('技师 vs 车主 similarity score:', sim)
    # Load our word-vector model; it was saved in binary format,
    # and the loader lives in KeyedVectors.
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    dump_pkl(word_dict, out_path, overwrite=True)
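# All of these variants target the gensim 3.x API. Under gensim >= 4.0 several names
# changed: size became vector_size, iter became epochs, and the vocabulary moved from
# model.vocab to model.key_to_index. A minimal sketch of the same pipeline against the
# gensim 4 API; build_gensim4 is a hypothetical name, not part of the original code.
def build_gensim4(sentence_path, w2v_bin_path="w2v.bin", min_count=100):
    # Same training call as build() above, with the renamed gensim 4 parameters.
    w2v = Word2Vec(sentences=LineSentence(sentence_path), sg=1, vector_size=256, window=5,
                   min_count=min_count, epochs=5)
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    # key_to_index replaces the old .vocab dict; indexing still returns the vector.
    return {word: model[word] for word in model.key_to_index}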
def build(train_x_seg_path, test_y_seg_path, test_seg_path, out_path=None, sentence_path='',
          w2v_bin_path="w2v.bin", min_count=1):
    sentences = extract_sentence(train_x_seg_path, test_y_seg_path, test_seg_path)
    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # Train word2vec with gensim: skip-gram, 256-dimensional embeddings.
    # LineSentence converts a single txt file into the required format;
    # PathLineSentence does the same for every text file in a directory.
    w2v = Word2Vec(sentences=LineSentence(sentence_path), sg=1, size=256, window=5,
                   min_count=min_count, negative=3, sample=0.001, hs=1, workers=4)
    # Alternative: save/load the full model, or keep only the vectors:
    # w2v.save('word2vec.model')
    # loaded_model = Word2Vec.load('word2vec.model')
    # wv = w2v.wv
    # del w2v
    # wv.save('word_vector')
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)
    # test
    sim = w2v.wv.similarity('技师', '车主')
    print('技师 vs 车主 similarity score:', sim)
    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    dump_pkl(word_dict, out_path, overwrite=True)
def build(train_x_seg_path, test_y_seg_path, test_seg_path, out_path=None, sentence_path='',
          w2v_bin_path="w2v.bin", min_count=1):
    sentences = extract_sentence(train_x_seg_path, test_y_seg_path, test_seg_path)
    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # Train a skip-gram model.
    # min_count: frequency threshold; words occurring at least this often are kept.
    # size: number of units in the NN layer, which also sets the model's capacity.
    # workers: default is 1 (no parallelization); extra workers only take effect
    #   when Cython is installed, otherwise training runs on a single core.
    # Note: the in-memory sentences are passed directly here instead of
    # LineSentence(sentence_path); this assumes extract_sentence() returns token lists.
    w2v = Word2Vec(sentences, size=256, window=5, min_count=min_count, workers=4, sg=1)
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)
    # test
    sim = w2v.wv.similarity('技师', '车主')
    print('技师 vs 车主 similarity score:', sim)
    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    print(type(model))
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    dump_pkl(word_dict, out_path, overwrite=True)
def build(train_x_seg_path, test_y_seg_path, test_seg_path, out_path=None, sentence_path='',
          w2v_bin_path="w2v.bin", min_count=1):
    # Read the three source files, merge their sentences,
    # and split them into tokens on col_sep.
    sentences = extract_sentence(train_x_seg_path, test_y_seg_path, test_seg_path)
    print(sentences[:5])
    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # Train word2vec with gensim: skip-gram, 256-dimensional embeddings.
    # If the model has not been trained yet, train it; otherwise skip
    # training and load it directly.
    if not os.path.exists(w2v_bin_path):
        model = Word2Vec(sentences, size=256, window=3, min_count=min_count, workers=4, sg=1)
        model.wv.save_word2vec_format(w2v_bin_path, binary=True)
        print("save %s ok." % w2v_bin_path)
    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    # test
    print(model['技师'])
    sim = model.similarity(u'技师', u'车主')
    print('技师 vs 车主 similarity score:', sim)  # prints about 0.7745
    # Store the word-vector data.
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    dump_pkl(word_dict, out_path, overwrite=True)
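# Example invocation of the build() variants above, with hypothetical paths;
# the actual file layout is project-specific:
# build('data/train_x_seg.txt', 'data/train_y_seg.txt', 'data/test_seg.txt',
#       out_path='data/word2vec.pkl', sentence_path='data/sentences.txt')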
from time import time


def train():
    expName = 'ptr128_image_{0}_text_{1}'.format(str(return_image), str(return_text))
    model = ptrnet_model(input_dim=input_dim, hiddenStates=128, parallel=True)
    model.load_weights('weights_ptr128_image_True_text_True.h5')
    train_losses = []
    test_losses = []
    print(input_dim)
    min_loss = 1.68
    for ep in range(nb_epoch):
        ep_start = time()
        print("Epoch:", ep + 1)
        _loss = 0.0
        for b, (x, y) in enumerate(generate_batches(batch_size)):
            h = model.fit(x, y, batch_size=4096 * 4, verbose=0, nb_epoch=1)
            _loss += h.history['loss'][0]
        print("\n", _loss / b)
        test_loss = 0.0
        for b, (x, y) in enumerate(generate_batches(batch_size, training=False)):
            test_loss += model.evaluate(x, y, verbose=0, batch_size=4096 * 4)
        test_loss /= b
        test_losses.append(test_loss)
        print('test loss:', test_loss)
        if test_loss < min_loss:
            print('Loss improved from {0} to {1}'.format(min_loss, test_loss))
            min_loss = test_loss
            print('Saving model_%s' % expName)
            model.save('model_%s.h5' % expName)
            model.save_weights('weights_%s.h5' % expName)
        train_losses.append(_loss)
        dump_pkl([train_losses, test_losses], 'stories_losses')
        print(time() - ep_start, "seconds for epoch", ep + 1)
        print("=" * 100)
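# generate_batches() is not defined in this snippet. A minimal sketch of what it might
# look like, assuming the training and test sets are preloaded numpy arrays named
# X_train/Y_train and X_test/Y_test (hypothetical names, not from the original code):
def generate_batches_sketch(batch_size, training=True):
    # Yield (x, y) slices of the chosen split, one batch at a time.
    x, y = (X_train, Y_train) if training else (X_test, Y_test)
    for start in range(0, len(x), batch_size):
        yield x[start:start + batch_size], y[start:start + batch_size]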
def build(train_x_seg_path, test_y_seg_path, test_seg_path, jiebainput_path, jiebaoutput_path,
          out_path=None, sentence_path='', w2v_bin_path="w2v.bin", min_count=1):
    sentences = extract_sentence(train_x_seg_path, test_y_seg_path, test_seg_path)
    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # Train word2vec with gensim: skip-gram, 256-dimensional embeddings.
    fileTrainSeg = write_jiebatxt(jiebainput_path)
    # Save the segmentation result to a file.
    with open(jiebaoutput_path, 'w', encoding='utf-8') as fW:
        for i in range(len(fileTrainSeg)):
            fW.write(fileTrainSeg[i][0])
            fW.write('\n')
    # Train the skip-gram model on the segmented file.
    w2v = Word2Vec(LineSentence(jiebaoutput_path), size=256, sg=1, window=5,
                   min_count=min_count, workers=multiprocessing.cpu_count())
    # Save the trained vectors in binary format.
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)
    # test
    sim = w2v.wv.similarity('技师', '车主')
    print('技师 vs 车主 similarity score:', sim)
    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    dump_pkl(word_dict, out_path, overwrite=True)
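# write_jiebatxt() is not shown in this snippet. A minimal sketch, assuming it reads raw
# text from a file and returns a list of [segmented_line] entries (the [i][0] access above
# suggests a nested list); jieba.cut() is the standard tokenizer call.
import jieba


def write_jiebatxt_sketch(input_path):
    result = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                # Join the tokens with spaces, the format LineSentence expects.
                result.append([' '.join(jieba.cut(line))])
    return result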
import numpy as np
from keras.datasets.imdb import load_data

if False:
    (x_train, y_train), (x_test, y_test) = load_data(path="imdb.npz", skip_top=0, maxlen=None,
                                                     seed=113, start_char=1, oov_char=0,
                                                     index_from=2)
    # Slide a length-10 window over every review to collect all 10-grams.
    n = 10
    n_grams = []
    for x in x_train:
        l = len(x)
        for i in range(0, l - n + 1, 1):
            n_grams.append(x[i:i + n])
    n_grams = np.array(n_grams)
    dump_pkl(n_grams, 'sentences_10_grams')
else:
    sentences = load_pkl('sentences_10_grams')

L = len(sentences)
maxWordIndex = np.max(sentences)
print(L)


def create_data(sentences, K=120):
    x = np.zeros((len(sentences), 10, 1))
    y = np.tile(np.eye(10), (len(sentences), 1, 1))
    x = sentences
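# The nested loop above materializes every 10-gram one list at a time. A hedged
# alternative using numpy's sliding_window_view (available in numpy >= 1.20) produces
# the same windows per review without the inner Python loop; extract_ngrams is a
# hypothetical helper name, and it assumes reviews shorter than n are skipped.
from numpy.lib.stride_tricks import sliding_window_view


def extract_ngrams(reviews, n=10):
    # Each review of length l yields an (l - n + 1, n) array of windows.
    windows = [sliding_window_view(np.asarray(x), n) for x in reviews if len(x) >= n]
    return np.concatenate(windows, axis=0)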