def _create_single_embedding(self, features):
    """
    Learning an embedding from a feature hash table.
    :param features: A hash table with node keys and feature list values.
    :return embedding: Numpy array of embedding.
    """
    docs = create_documents(features)
    model = Doc2Vec(docs,
                    vector_size=self.args.dimensions,
                    window=0,
                    min_count=self.args.min_count,
                    alpha=self.args.alpha,
                    dm=0,
                    negative=self.args.negative_samples,
                    ns_exponent=self.args.exponent,
                    min_alpha=self.args.min_alpha,
                    sample=self.args.down_sampling,
                    workers=self.args.workers,
                    epochs=self.args.epochs)
    # One document vector per node, looked up by the node's string id.
    node_count = self.graph.number_of_nodes()
    vectors = [model.docvecs[str(index)] for index in range(node_count)]
    return np.array(vectors)
def create_embedding(self):
    """
    Fitting an embedding.
    """
    docs = create_documents(self.pooled_features)
    model = Doc2Vec(docs,
                    vector_size=self.args.dimensions,
                    window=0,
                    min_count=self.args.min_count,
                    alpha=self.args.alpha,
                    dm=0,
                    min_alpha=self.args.min_alpha,
                    sample=self.args.down_sampling,
                    workers=self.args.workers,
                    epochs=self.args.epochs)
    # Collect one trained document vector per graph node.
    rows = []
    for node in self.graph.nodes():
        rows.append(model.docvecs[str(node)])
    return np.array(rows)
def process_train(corpus_train_path, corpus_test_path, prf_file, base_model_weight=None, flag=6):
    """Train the CNN-BiLSTM-CRF word-segmentation model and score it on the test corpus.

    :param corpus_train_path: root directory of the training corpus; one sub-directory
        per corpus type, each containing the raw corpus files.
    :param corpus_test_path: root directory of the test corpus, same layout.
    :param prf_file: path the precision/recall/F report is written to.
    :param base_model_weight: unused here; kept for interface compatibility.
    :param flag: 4 selects the 4-tag scheme (B/M/E/S); any other value selects the
        6-tag scheme (B/B2/B3/M/E/S). Label index 0 is reserved for padding.
    :return: the average F score returned by score.prf_score.
    """
    # Collect every corpus file path: <root>/<type_dir>/<file>.
    raw_train_file = [corpus_train_path + os.sep + type_path + os.sep + type_file
                      for type_path in os.listdir(corpus_train_path)
                      for type_file in os.listdir(corpus_train_path + os.sep + type_path)]
    raw_test_file = [corpus_test_path + os.sep + type_path + os.sep + type_file
                     for type_path in os.listdir(corpus_test_path)
                     for type_file in os.listdir(corpus_test_path + os.sep + type_path)]

    if flag == 4:  # 4-tag scheme; label 0 is the padding label
        label_2_index = {'Pad': 0, 'B': 1, 'M': 2, 'E': 3, 'S': 4, 'Unk': 5}
        index_2_label = {0: 'Pad', 1: 'B', 2: 'M', 3: 'E', 4: 'S', 5: 'Unk'}
        utils.process_data(raw_train_file, 'train.data')
        utils.process_data(raw_test_file, 'test.data')
    else:  # 6-tag scheme
        label_2_index = {'Pad': 0, 'B': 1, 'B2': 2, 'B3': 3, 'M': 4, 'E': 5, 'S': 6, 'Unk': 7}
        index_2_label = {0: 'Pad', 1: 'B', 2: 'B2', 3: 'B3', 4: 'M', 5: 'E', 6: 'S', 7: 'Unk'}
        utils.process_dataB(raw_train_file, 'train.data')
        utils.process_dataB(raw_test_file, 'test.data')
    class_label_count = len(label_2_index)

    train_documents = utils.create_documents('train.data')
    test_documents = utils.create_documents('test.data')

    # Build the character lexicon over both corpora so no test character is OOV.
    lexicon, lexicon_reverse = utils.get_lexicon(train_documents + test_documents)
    print(len(lexicon), len(lexicon_reverse))  # lexicon size
    print(len(test_documents))   # number of test sentences
    print(len(train_documents))  # number of training sentences

    # Pre-trained character embeddings (100-dimensional PKU model).
    embedding_model = gensim.models.Word2Vec.load(r'model_embedding_pku_100.m')
    embedding_size = embedding_model.vector_size
    print(embedding_size)
    embedding_weights = utils.create_embedding(embedding_model, embedding_size, lexicon_reverse)
    print(embedding_weights.shape)

    train_data_list, train_label_list, train_index_list = utils.create_matrix(
        train_documents, lexicon, label_2_index)
    test_data_list, test_label_list, test_index_list = utils.create_matrix(
        test_documents, lexicon, label_2_index)
    print(len(train_data_list), len(train_label_list), len(train_index_list))
    print(len(test_data_list), len(test_label_list), len(test_index_list))

    # Pad every sentence to the longest training sentence.
    max_len = max(map(len, train_data_list))
    print('maxlen:', max_len)
    train_data_array, train_label_list_padding = utils.padding_sentences(
        train_data_list, train_label_list, max_len)
    test_data_array, test_label_list_padding = utils.padding_sentences(
        test_data_list, test_label_list, max_len)
    print(train_data_array.shape)
    print(test_data_array.shape)

    # One-hot labels reshaped to (num_sentences, max_len, class_label_count).
    train_label_array = np_utils.to_categorical(train_label_list_padding, class_label_count). \
        reshape((len(train_label_list_padding), len(train_label_list_padding[0]), -1))
    test_label_array = np_utils.to_categorical(test_label_list_padding, class_label_count). \
        reshape((len(test_label_list_padding), len(test_label_list_padding[0]), -1))
    print(train_label_array.shape)
    print(test_label_array.shape)

    # NOTE(review): model_type is not defined in this function and is presumably a
    # module-level global — confirm against the rest of the file.
    model = CNN_Bilstm_Crf(max_len, len(lexicon), class_label_count,
                           embedding_weights, embedding_size, model_type)
    print(model.input_shape)
    print(model.output_shape)
    model.summary()

    # Hold out the last 10% of training sentences for validation.
    train_nums = len(train_data_array)
    split = int(train_nums * 0.9)
    train_array, val_array = train_data_array[:split], train_data_array[split:]
    train_label, val_label = train_label_array[:split], train_label_array[split:]

    checkpointer = ModelCheckpoint(filepath='train_model_pku_100_m6.hdf5', verbose=1,
                                   save_best_only=True, monitor='val_loss', mode='auto')
    hist = model.fit(train_array, train_label, batch_size=256, epochs=4, verbose=1,
                     validation_data=(val_array, val_label), callbacks=[checkpointer])
    # BUG FIX: do NOT call model.save_weights() here — it would overwrite the
    # best-val_loss checkpoint that ModelCheckpoint(save_best_only=True) just wrote
    # with the final-epoch weights, defeating load_weights() below.

    print(hist.history['val_loss'])  # per-epoch validation loss
    best_model_epoch = np.argmin(hist.history['val_loss'])
    print('best_model_epoch:', best_model_epoch)
    print(hist.history)

    # Restore the best checkpoint before predicting.
    model.load_weights('train_model_pku_100_m6.hdf5')
    test_y_pred = model.predict(test_data_array, batch_size=256, verbose=1)
    print("test_y_pred.shape:")
    print(test_y_pred.shape)  # (num_test_sentences, max_len, class_label_count)
    # Predicted label index per character: (num_test_sentences, max_len).
    pred_label = np.argmax(test_y_pred, axis=2)

    # Persist the lexicon and label mapping for later decoding.
    # BUG FIX: close the pickle file handle deterministically.
    with open('lexicon_pku_100_m6.pkl', 'wb') as lexicon_file:
        pickle.dump([lexicon, lexicon_reverse, max_len, index_2_label], lexicon_file)

    K.clear_session()  # free the Keras/TF session state

    # Reconstruct readable text and labels from the padded index arrays.
    real_text_list, pred_text_list, real_label_list, pred_label_list = utils.create_pred_text(
        lexicon_reverse, test_data_array, pred_label, test_label_list_padding,
        test_index_list, class_label_count)
    utils.write_2_file(real_text_list, pred_text_list)

    # Score the prediction against the gold segmentation.
    F = score.prf_score('real_text.txt', 'pred_text.txt', prf_file, model_type,
                        best_model_epoch, class_label_count)
    # BUG FIX: F was computed but never returned, despite the function being
    # documented as returning the average score.
    return F