def load_save_model(w2v_bin_path, vocab_path, save_txt_path):
    # Load the model. Note: different save formats need different loaders:
    # skip_gram_model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    skip_gram_model = Word2Vec.load(w2v_bin_path)
    print(skip_gram_model.most_similar("车子"))
    # Build the vocab mapping: word -> word vector
    word_dict = {}
    # Loading every vector from the model at once (building the vocab from the
    # vectors) can exhaust memory when the model is large, so it is disabled:
    # for word in skip_gram_model.wv.vocab:
    #     word_dict[word] = skip_gram_model[word]  # word_dict stores word: vector
    # Build the embedding_matrix
    vocab = Vocab(vocab_path, VOCAB_SIZE)
    for word, index in vocab.word2id.items():
        # To use Tencent's pretrained vectors, point w2v_bin_path at them when
        # loading skip_gram_model; vocab_path must still be your own vocab.txt.
        if word in skip_gram_model.wv.vocab:
            # Fill the embedding layer from the trained vectors
            word_dict[index] = skip_gram_model[word]  # used later as embedding_matrix
        else:
            # Random init in [-0.025, 0.025]; vector dimension is EMBEDDING_DIM (256)
            word_dict[index] = np.random.uniform(-0.025, 0.025, (EMBEDDING_DIM))
    # Pickle the result as a binary file to save space
    dump_pkl(word_dict, save_txt_path, overwrite=True)
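# load_save_model leans on a Vocab class and a dump_pkl helper defined elsewhere
# in the repo. A minimal sketch of what they could look like, assuming one token
# per line in the vocab file (line order = id) and a plain pickle-based dump_pkl;
# these are assumptions, not the repo's actual definitions.
import os
import pickle


def dump_pkl(obj, pkl_path, overwrite=True):
    """Assumed helper: serialize obj to pkl_path with pickle."""
    if os.path.exists(pkl_path) and not overwrite:
        return
    with open(pkl_path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)


class Vocab(object):
    """Assumed vocab wrapper: one token per line; the line number is the id."""

    def __init__(self, vocab_path, max_size):
        self.word2id = {}
        with open(vocab_path, encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i >= max_size:
                    break
                self.word2id[line.strip().split()[0]] = i
        self.id2word = {i: w for w, i in self.word2id.items()}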
def tf_word_feature(self, data_set):
    """
    Get TF feature by word
    :param data_set:
    :return:
    """
    data_set = get_word_segment_data(data_set)
    if self.is_infer:
        self.vectorizer = load_pkl(self.feature_vec_path)
        data_feature = self.vectorizer.transform(data_set)
    else:
        self.vectorizer = CountVectorizer(analyzer='word', encoding='utf-8',
                                          lowercase=True, vocabulary=self.word_vocab)
        data_feature = self.vectorizer.fit_transform(data_set)
    vocab = self.vectorizer.vocabulary_
    logger.info('Vocab size:%d' % len(vocab))
    logger.debug('Vocab list:')
    count = 0
    for k, v in self.vectorizer.vocabulary_.items():
        if count < 10:
            logger.debug("%s %s" % (k, v))
            count += 1
    feature_names = self.vectorizer.get_feature_names()
    logger.info('feature_names:%s' % feature_names[:20])
    logger.info(data_feature.shape)
    if not self.is_infer:
        dump_pkl(self.vectorizer, self.feature_vec_path, overwrite=True)
    return data_feature
def tfidf_word_feature(self, data_set):
    """
    Get TFIDF ngram feature by word
    :param data_set:
    :return:
    """
    data_set = get_word_segment_data(data_set)
    if self.is_infer:
        self.vectorizer = load_pkl(self.feature_vec_path)
        data_feature = self.vectorizer.transform(data_set)
    else:
        self.vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2),
                                          sublinear_tf=True)
        data_feature = self.vectorizer.fit_transform(data_set)
    vocab = self.vectorizer.vocabulary_
    print('Vocab size:', len(vocab))
    print('Vocab list:')
    count = 0
    for k, v in self.vectorizer.vocabulary_.items():
        if count < 10:
            print(k, v)
            count += 1
    print('\nTFIDF term-frequency matrix:')
    print('data_feature shape:', data_feature.shape)
    print(data_feature.toarray())
    if not self.is_infer:  # match the other extractors: only dump after fitting
        dump_pkl(self.vectorizer, self.feature_vec_path, overwrite=True)
    return data_feature
def tfidf_char_feature(self, data_set):
    """
    Get TFIDF feature by char
    :param data_set:
    :return:
    """
    data_set = get_char_segment_data(data_set)
    if self.is_infer:
        self.vectorizer = load_pkl(self.feature_vec_path)
        data_feature = self.vectorizer.transform(data_set)
    else:
        self.vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 2),
                                          sublinear_tf=True)
        data_feature = self.vectorizer.fit_transform(data_set)
    vocab = self.vectorizer.vocabulary_
    logger.info('Vocab size:%d' % len(vocab))
    logger.debug('Vocab list:')
    count = 0
    for k, v in self.vectorizer.vocabulary_.items():
        if count < 10:
            logger.debug("%s %s" % (k, v))
            count += 1
    logger.info(data_feature.shape)
    if not self.is_infer:
        dump_pkl(self.vectorizer, self.feature_vec_path, overwrite=True)
    return data_feature
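# The three extractors above share a fit-then-reload pattern: fit and pickle the
# vectorizer at train time, reload it at infer time so both phases live in the
# same feature space. A toy round trip, assuming the Feature class dispatches on
# feature_type and accepts is_infer (constructor arguments beyond those shown in
# train_classic below are assumptions):
train_texts = ['修 理 变速箱', '更换 刹车片']  # tiny pre-segmented corpus
new_texts = ['检查 变速箱']

# Train phase: fits TfidfVectorizer(analyzer='char', ...) and pickles it.
feature = Feature(data=train_texts, feature_type='tfidf_char',
                  feature_vec_path='output/tfidf_char.pkl')
X_train = feature.get_feature()

# Infer phase: is_infer=True makes tfidf_char_feature reload the pickled
# vectorizer and call transform() instead of fit_transform().
infer_feature = Feature(data=new_texts, feature_type='tfidf_char',
                        feature_vec_path='output/tfidf_char.pkl', is_infer=True)
X_new = infer_feature.get_feature()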
def build(path1, path2, path3, out_path=None, sentence_path='',
          w2v_bin_path="w2v.bin", min_count=1):
    sentences = extract_sentence(path1, path2, path3)
    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # train model
    w2v = Word2Vec(sg=1, sentences=LineSentence(sentence_path), size=256,
                   negative=5, workers=8, iter=40, window=3, min_count=min_count)
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)
    # test
    sim = w2v.wv.similarity('宝马', '车主')
    print('宝马 vs 车主 similarity score:', sim)
    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    dump_pkl(word_dict, out_path, overwrite=True)
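# Every build() variant calls extract_sentence and save_sentence before
# training. A minimal sketch of plausible implementations, assuming the input
# files are pre-segmented and optionally 'label<sep>text' per line; only the
# text column is kept, written one sentence per line for LineSentence. These
# are assumptions, not the repo's actual helpers.
def extract_sentence(*seg_paths, col_sep='\t'):
    """Assumed helper: merge segmented files into a list of sentence strings."""
    sentences = []
    for path in seg_paths:
        with open(path, encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    sentences.append(line.split(col_sep)[-1])
    return sentences


def save_sentence(sentences, sentence_path):
    """Assumed helper: one sentence per line, the format LineSentence expects."""
    with open(sentence_path, 'w', encoding='utf-8') as f:
        for s in sentences:
            f.write('%s\n' % s)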
def build(train_seg_path, test_seg_path, out_path=None, sentence_path='',
          w2v_bin_path="w2v.bin", min_count=1, col_sep='\t'):
    sentences = extract_sentence(train_seg_path, test_seg_path, col_sep=col_sep)
    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # train model
    w2v = Word2Vec(sg=1, sentences=LineSentence(sentence_path),
                   size=256, window=5, min_count=min_count, iter=40)
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)
    # test
    # sim = w2v.wv.similarity('大', '小')
    # print('大 vs 小 similarity score:', sim)
    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    dump_pkl(word_dict, out_path, overwrite=True)
def build(train_seg_x_path, train_seg_target_path, test_seg_x_path, w2v_output,
          sentence_path, w2v_bin_path="model.bin", embedding_size=256,
          min_count=5, col_sep='\t'):
    # sentences = extract_sentence(train_seg_x_path, train_seg_target_path,
    #                              test_seg_x_path, col_sep=col_sep)
    # save_sentence(sentences, sentence_path)
    # print('train w2v model...')
    # # train model
    # model = Word2Vec(sg=1, sentences=LineSentence(sentence_path),
    #                  size=embedding_size, window=5, min_count=min_count, iter=40)
    # model.wv.save_word2vec_format(w2v_bin_path, binary=True)
    # print("save %s ok." % w2v_bin_path)
    # # test
    # sim = model.wv.similarity('奔驰', '宝马')
    # print('奔驰 vs 宝马 similarity score:', sim)
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word2vec_dict = {}
    for word in model.vocab:
        word2vec_dict[word] = model.word_vec(word)
    dump_pkl(word2vec_dict, w2v_output, overwrite=True)
def _show_all_labels(self):
    # split labeled data and unlabeled data
    output = []
    contents = []
    seg_contents = []
    features = []
    labels = []
    for i in self.samples:
        label = i.human_label if i.human_label else i.machine_label
        output.append(label + self.col_sep + str(i.prob))
        seg_contents.append(i.seg_text_word)
        contents.append(i.original_text)
        labels.append(label)
        features.append(i.feature.toarray().tolist()[0])
    # get data feature
    X_train, X_val, y_train, y_val = train_test_split(
        csr_matrix(np.array(features)), labels)
    # fit
    self.model.fit(X_train, y_train)
    # save model
    dump_pkl(self.model, self.model_save_path, overwrite=True)
    eval(self.model, X_val, y_val)
    save(output, ture_labels=None, pred_save_path=self.pred_save_path,
         data_set=contents)
def build(train_x_seg_path, test_y_seg_path, test_seg_path, out_path=None,
          sentence_path='', w2v_bin_path="w2v.bin", min_count=1):
    # sentences = extract_sentence(train_x_seg_path, test_y_seg_path, test_seg_path)
    # save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # train model
    """
    Train word2vec with gensim: input via sentences, skip-gram, embedding dim 256.
    your code
    w2v = (one line)
    """
    # sentences = [line.strip().split() for line in sentences]
    # sentences = LineSentence(sentence_path)  # sentences is a 2-D list
    # w2v = Word2Vec(sentences=sentences, size=256, sg=1, window=5)
    # w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    # print("save %s ok." % w2v_bin_path)
    # test
    # sim = w2v.wv.similarity('技师', '车主')
    # print('技师 vs 车主 similarity score:', sim)
    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    print(model["说"])
    for word in model.vocab:
        word_dict[word] = model[word]
    dump_pkl(word_dict, out_path, overwrite=True)
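# The placeholder above asks for a single training line. The sibling build()
# further down fills it in like this (same gensim 3.x API; the variables are
# the function's own locals), so a matching answer would be:
#
#     w2v = Word2Vec(sg=1, sentences=LineSentence(sentence_path),
#                    size=256, window=5, min_count=min_count, iter=40)
#     w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)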
def build(train_x_seg_path, train_y_seg_path, test_seg_path, out_path=None,
          sentence_path='', w2v_bin_path="w2v.bin", min_count=1):
    sentences = extract_sentence(train_x_seg_path, train_y_seg_path, test_seg_path)
    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # train model
    """
    Train word2vec with gensim: input via sentences, skip-gram, embedding dim 256.
    """
    w2v = Word2Vec(sentences=LineSentence(sentence_path), size=256,
                   min_count=min_count, sg=1, workers=8, iter=50)
    # w2v.wv.save_word2vec_format('{}/datasets/self_word2vec.txt'.format(BASE_DIR),
    #                             binary=False)
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)
    # test
    sim = w2v.wv.similarity('技师', '车主')
    print('技师 vs 车主 similarity score:', sim)
    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    dump_pkl(word_dict, out_path, overwrite=True)
def build(train_x_seg_path, train_y_seg_path, test_seg_path, out_path=None,
          sentence_path='', w2v_bin_path="../data/w2v.bin", min_count=1):
    sentences = extract_sentence(train_x_seg_path, train_y_seg_path, test_seg_path)
    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # train model
    """
    Train word2vec with gensim: input via sentences, skip-gram, embedding dim 256.
    your code
    w2v = (one line)
    """
    w2v = Word2Vec(sg=1, sentences=LineSentence(sentence_path),
                   size=256, window=5, min_count=min_count, iter=40)
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)
    # test
    sim = w2v.wv.similarity('技师', '车主')
    print('技师 vs 车主 similarity score:', sim)
    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    dump_pkl(word_dict, out_path, overwrite=True)
def _train(self, labeled_sample_list, unlabeled_sample_list, batch_id):
    machine_samples_list = []
    # get data feature
    labeled_data_label = [i.human_label if i.human_label else i.machine_label
                          for i in labeled_sample_list]
    labeled_data_feature = [i.feature.toarray().tolist()[0]
                            for i in labeled_sample_list]
    X_train, X_val, y_train, y_val = train_test_split(
        csr_matrix(np.array(labeled_data_feature)), labeled_data_label)
    # fit
    self.model.fit(X_train, y_train)
    # save model
    dump_pkl(self.model, self.model_save_path, overwrite=True)
    eval(self.model, X_val, y_val)
    # nothing left to pseudo-label: check before building the feature matrix
    if not unlabeled_sample_list:
        return machine_samples_list
    # predict the unlabeled data set
    unlabeled_data_feature = [i.feature.toarray().tolist()[0]
                              for i in unlabeled_sample_list]
    pred_result = self.model.predict_proba(
        csr_matrix(np.array(unlabeled_data_feature)))
    pred_label_proba = [(self.id_label[prob.argmax()], prob.max())
                        for prob in pred_result]
    # save middle result
    pred_output = [self.id_label[prob.argmax()] + self.col_sep + str(prob.max())
                   for prob in pred_result]
    pred_save_path = self.pred_save_path[:-4] + '_batch_' + str(batch_id) + '.txt'
    logger.debug("save infer label and prob result to: %s" % pred_save_path)
    unlabeled_data_text = [i.original_text for i in unlabeled_sample_list]
    save(pred_output, ture_labels=None, pred_save_path=pred_save_path,
         data_set=unlabeled_data_text)
    assert len(unlabeled_sample_list) == len(pred_label_proba)
    for unlabeled_sample, label_prob in zip(unlabeled_sample_list,
                                            pred_label_proba):
        idx = unlabeled_sample.id
        self.samples[idx].machine_label = label_prob[0]
        self.samples[idx].prob = label_prob[1]
        machine_samples_list.append(unlabeled_sample)
    return machine_samples_list
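# The (self.id_label[prob.argmax()], prob.max()) pairing is the core of the
# pseudo-labeling step: each predict_proba row is a probability distribution
# over class ids, and argmax/max pull out the machine label and its confidence.
# A toy illustration with made-up numbers:
import numpy as np

id_label = {0: 'positive', 1: 'negative', 2: 'neutral'}
pred_result = np.array([[0.7, 0.2, 0.1],   # each row sums to 1
                        [0.1, 0.1, 0.8]])
pred_label_proba = [(id_label[prob.argmax()], prob.max()) for prob in pred_result]
print(pred_label_proba)  # [('positive', 0.7), ('neutral', 0.8)]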
def build(train_x_seg_path, test_y_seg_path, test_seg_path, out_path=None,
          sentence_path='', w2v_bin_path="w2v.bin", min_count=1):
    sentences = extract_sentence(train_x_seg_path, test_y_seg_path, test_seg_path)
    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # train model
    w2v = Word2Vec(sg=1, sentences=LineSentence(sentence_path),
                   size=256, window=5, min_count=min_count, iter=40)
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)
    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    # test
    model_test(model, '技师', '车主')
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    dump_pkl(word_dict, out_path, overwrite=True)
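# model_test is not defined in this file; given how the other build() variants
# test similarity inline, a plausible wrapper (an assumption, using the gensim
# 3.x KeyedVectors.similarity API):
def model_test(model, word1, word2):
    """Assumed helper: print the cosine similarity of two words."""
    sim = model.similarity(word1, word2)
    print('%s vs %s similarity score: %s' % (word1, word2, sim))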
def build_pos_embedding(path, overwrite=False, pos_vocab_path=None,
                        pos_vocab_start=1, pos_dim=64):
    if os.path.exists(path) and not overwrite:
        print("already has %s and use it." % path)
        return load_pkl(path)
    pos_vocab = load_vocab(pos_vocab_path)
    pos_vocab_count = len(pos_vocab) + pos_vocab_start
    pos_emb = np.random.normal(size=(pos_vocab_count, pos_dim)).astype('float32')
    for i in range(pos_vocab_start):
        pos_emb[i, :] = 0.
    # save
    dump_pkl(pos_emb, path, overwrite=True)
    return pos_emb
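# The zeroing loop reserves rows 0..pos_vocab_start-1 for padding (and any other
# special ids), so a padded position id embeds to the all-zero vector. A quick
# self-contained check with a hypothetical 3-tag vocab:
import numpy as np

pos_vocab_start, pos_dim = 1, 4
pos_emb = np.random.normal(size=(3 + pos_vocab_start, pos_dim)).astype('float32')
for i in range(pos_vocab_start):
    pos_emb[i, :] = 0.
assert not pos_emb[0].any()  # row 0 (padding) is all zeros
assert pos_emb[1].any()      # real tags keep their random init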
def save_w2v(bin_path, pkl_out_path, min_count=100):
    sentences = extract_sentence(QA_TRAIN_CLEAN_X_PATH, QA_TRAIN_CLEAN_Y_PATH,
                                 QA_TEST_CLEAN_X_PATH)
    save_sentence(sentences, QA_SENTENCE_PATH)
    print('train w2v model...')
    # train model
    w2v = Word2Vec(sg=1, sentences=LineSentence(QA_SENTENCE_PATH),
                   size=256, window=5, min_count=min_count, iter=5)
    w2v.wv.save_word2vec_format(bin_path, binary=True)
    print("save w2v model %s ok." % bin_path)
    model = KeyedVectors.load_word2vec_format(bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    dump_pkl(word_dict, pkl_out_path, overwrite=True)
def train_classic(model_type, data_path=None, pr_figure_path=None,
                  model_save_path=None, vectorizer_path=None, col_sep=',',
                  thresholds=0.5, num_classes=2, feature_type='tfidf_char'):
    data_content, data_lbl = data_reader(data_path, col_sep)
    # init feature
    feature = Feature(data=data_content, feature_type=feature_type,
                      feature_vec_path=vectorizer_path)
    # get data feature
    data_feature = feature.get_feature()
    # label
    data_label = feature.label_encoder(data_lbl)
    X_train, X_val, y_train, y_val = train_test_split(
        data_feature, data_label, test_size=0.1, random_state=0)
    model = get_model(model_type)
    # fit
    model.fit(X_train, y_train)
    # save model
    dump_pkl(model, model_save_path, overwrite=True)
    # evaluate
    eval(model, X_val, y_val, thresholds=thresholds, num_classes=num_classes,
         model_type=model_type, pr_figure_path=pr_figure_path)
def build_word_embedding(path, overwrite=False, sentence_w2v_path=None,
                         word_vocab_path=None, word_vocab_start=2, w2v_dim=256):
    if os.path.exists(path) and not overwrite:
        print("already has %s and use it." % path)
        return load_pkl(path)
    word_vocab = load_vocab(word_vocab_path)
    w2v_dict_full = load_pkl(sentence_w2v_path)
    # size the matrix by the vocab (not the w2v dict) so every vocab id fits
    word_vocab_count = len(word_vocab) + word_vocab_start
    word_emb = np.zeros((word_vocab_count, w2v_dim), dtype='float32')
    for word in word_vocab:
        index = word_vocab[word]
        if word in w2v_dict_full:
            word_emb[index, :] = w2v_dict_full[word]
        else:
            random_vec = np.random.uniform(-0.25, 0.25,
                                           size=(w2v_dim,)).astype('float32')
            word_emb[index, :] = random_vec
    # save
    dump_pkl(word_emb, path, overwrite=True)
    return word_emb
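# The returned word_emb matrix is shaped (vocab_size, w2v_dim) and indexed by
# the ids in word_vocab, so it can seed a trainable embedding layer. A sketch
# assuming a Keras downstream model; the framework choice and all paths below
# are assumptions, not taken from this repo:
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Embedding

word_emb = build_word_embedding('output/word_emb.pkl',
                                sentence_w2v_path='output/w2v.pkl',
                                word_vocab_path='output/vocab.txt')
embedding_layer = Embedding(input_dim=word_emb.shape[0],
                            output_dim=word_emb.shape[1],
                            embeddings_initializer=Constant(word_emb),
                            trainable=True)  # fine-tune vectors on the task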
def train_classic(model_type='logistic_regression', data_path='',
                  model_save_path='', feature_vec_path='', col_sep='\t',
                  feature_type='tfidf_word', min_count=1, word_vocab_path='',
                  label_vocab_path='', pr_figure_path=''):
    # load data
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split())
    # word vocab
    word_vocab = build_vocab(word_lst, min_count=min_count, sort=True, lower=True)
    # save word vocab
    write_vocab(word_vocab, word_vocab_path)
    # label
    label_vocab = build_vocab(data_lbl)
    # save label vocab
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    logger.info(label_id)
    data_label = [label_id[i] for i in data_lbl]
    num_classes = len(set(data_label))
    logger.info('num_classes:%d' % num_classes)
    # init feature
    if feature_type in ['doc_vectorize', 'vectorize']:
        logger.info('feature type not supported here, using tfidf_word instead.')
        feature_type = 'tfidf_word'
    feature = Feature(data=data_content, feature_type=feature_type,
                      feature_vec_path=feature_vec_path, word_vocab=word_vocab)
    # get data feature
    data_feature = feature.get_feature()
    X_train, X_val, y_train, y_val = train_test_split(
        data_feature, data_label, test_size=0.1, random_state=0)
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path=model_save_path)
    else:
        model = get_model(model_type)
    # fit
    model.fit(X_train, y_train)
    # save model
    if model_type != 'xgboost_lr':
        dump_pkl(model, model_save_path, overwrite=True)
    # analysis lr model
    if model_type == "logistic_regression" and config.is_debug:
        # show the top features of each category
        weights = model.coef_
        vectorizer = load_pkl(feature_vec_path)
        logger.debug("20 top features of each category:")
        features = dict()
        for idx, weight in enumerate(weights):
            feature_sorted = sorted(zip(vectorizer.get_feature_names(), weight),
                                    key=lambda k: k[1], reverse=True)
            logger.debug("category_" + str(idx) + ":")
            logger.debug(feature_sorted[:20])
            feature_dict = {k[0]: k[1] for k in feature_sorted}
            features[idx] = feature_dict
        dump_pkl(features, 'output/lr_features.pkl', overwrite=True)
    # evaluate
    eval(model, X_val, y_val, num_classes=num_classes,
         pr_figure_path=pr_figure_path)
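# A call that ties the pieces of this train_classic together: build the word and
# label vocabularies, fit TF-IDF word features, train and pickle the classifier,
# then evaluate. All paths below are placeholders:
train_classic(model_type='logistic_regression',
              data_path='data/train.txt',           # 'label<TAB>text' per line
              model_save_path='output/lr_model.pkl',
              feature_vec_path='output/tfidf_word.pkl',
              col_sep='\t',
              feature_type='tfidf_word',
              word_vocab_path='output/word_vocab.txt',
              label_vocab_path='output/label_vocab.txt',
              pr_figure_path='output/pr_curve.png')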