Esempio n. 1
0
    def _create_single_embedding(self, features):
        """
        Fit a Doc2Vec model on the documents derived from a feature table.

        :param features: A hash table with node keys and feature list values;
            it is handed unchanged to ``create_documents``.
        :return: Numpy array whose i-th row is the document vector stored under
            the tag ``str(i)``, for i in ``range(self.graph.number_of_nodes())``.
        """
        corpus = create_documents(features)

        # ``window`` and ``dm`` are fixed here; every other hyper-parameter is
        # read from ``self.args``. Per the gensim documentation, ``dm=0``
        # selects the distributed bag-of-words (PV-DBOW) training mode.
        settings = self.args
        doc2vec_options = dict(
            vector_size=settings.dimensions,
            window=0,
            min_count=settings.min_count,
            alpha=settings.alpha,
            dm=0,
            negative=settings.negative_samples,
            ns_exponent=settings.exponent,
            min_alpha=settings.min_alpha,
            sample=settings.down_sampling,
            workers=settings.workers,
            epochs=settings.epochs,
        )
        fitted = Doc2Vec(corpus, **doc2vec_options)

        # Nodes are addressed by consecutive integer ids; the matching
        # document tags are the string form of those ids.
        rows = []
        for node_id in range(self.graph.number_of_nodes()):
            rows.append(fitted.docvecs[str(node_id)])
        return np.array(rows)
Esempio n. 2
0
    def create_embedding(self):
        """
        Fit a Doc2Vec model on ``self.pooled_features`` and collect node vectors.

        :return: Numpy array with one row per node yielded by
            ``self.graph.nodes()``, each row being the document vector stored
            under the tag ``str(node)``.
        """
        corpus = create_documents(self.pooled_features)

        # ``window`` and ``dm`` are fixed here; the remaining hyper-parameters
        # are read from ``self.args``. Per the gensim documentation, ``dm=0``
        # selects the distributed bag-of-words (PV-DBOW) training mode.
        settings = self.args
        doc2vec_options = dict(
            vector_size=settings.dimensions,
            window=0,
            min_count=settings.min_count,
            alpha=settings.alpha,
            dm=0,
            min_alpha=settings.min_alpha,
            sample=settings.down_sampling,
            workers=settings.workers,
            epochs=settings.epochs,
        )
        fitted = Doc2Vec(corpus, **doc2vec_options)

        # Rows follow the iteration order of ``self.graph.nodes()``.
        rows = []
        for node in self.graph.nodes():
            rows.append(fitted.docvecs[str(node)])
        return np.array(rows)
Esempio n. 3
0
def _list_corpus_files(corpus_root):
    """Return the paths of all entries found two levels below ``corpus_root``.

    Every entry of ``corpus_root`` is itself listed with ``os.listdir``, so
    each of them must be a directory.
    """
    return [
        os.path.join(corpus_root, type_dir, file_name)
        for type_dir in os.listdir(corpus_root)
        for file_name in os.listdir(os.path.join(corpus_root, type_dir))
    ]


def process_train(corpus_train_path, corpus_test_path, prf_file, base_model_weight=None, flag=6):
    """Train the CNN/BiLSTM/CRF tagger, predict on the test corpus and score it.

    Steps: list the corpus files, convert them to ``train.data`` / ``test.data``
    through ``utils``, build the lexicon and the pretrained embedding matrix,
    pad and one-hot encode the data, fit the model with a 90/10
    train/validation split, reload the best checkpoint, predict on the test
    data and hand the result to ``score.prf_score``.

    :param corpus_train_path: Directory whose sub-directories hold the training files.
    :param corpus_test_path: Directory whose sub-directories hold the test files.
    :param prf_file: Forwarded unchanged to ``score.prf_score``.
    :param base_model_weight: Accepted for interface compatibility; not used here.
    :param flag: ``4`` selects the 4-tag label scheme (``utils.process_data``);
        any other value selects the 6-tag scheme (``utils.process_dataB``).

    Files written directly by this function: the checkpoint
    ``train_model_pku_100_m6.hdf5`` (via the Keras callback) and
    ``lexicon_pku_100_m6.pkl``.

    NOTE(review): ``model_type`` is neither a parameter nor a local of this
    function; it is resolved from an enclosing scope. Confirm it is defined at
    module level.
    """
    weights_path = 'train_model_pku_100_m6.hdf5'
    lexicon_path = 'lexicon_pku_100_m6.pkl'

    # Training and test corpora: <corpus>/<type_dir>/<file>.
    raw_train_file = _list_corpus_files(corpus_train_path)
    raw_test_file = _list_corpus_files(corpus_test_path)

    if flag == 4:
        # 4-tag scheme; label index 0 is reserved for padding.
        label_2_index = {'Pad': 0, 'B': 1, 'M': 2, 'E': 3, 'S': 4, 'Unk': 5}
        utils.process_data(raw_train_file, 'train.data')
        utils.process_data(raw_test_file, 'test.data')
    else:
        # 6-tag scheme; label index 0 is reserved for padding.
        label_2_index = {'Pad': 0, 'B': 1, 'B2': 2, 'B3': 3, 'M': 4, 'E': 5, 'S': 6, 'Unk': 7}
        utils.process_dataB(raw_train_file, 'train.data')
        utils.process_dataB(raw_test_file, 'test.data')
    # Inverse mapping, derived so the two tables cannot drift apart.
    index_2_label = {index: label for label, index in label_2_index.items()}

    class_label_count = len(label_2_index)

    train_documents = utils.create_documents('train.data')
    test_documents = utils.create_documents('test.data')
    # Build the lexicon over the training and test documents together.
    lexicon, lexicon_reverse = utils.get_lexicon(train_documents + test_documents)
    # Lexicon sizes.
    print(len(lexicon), len(lexicon_reverse))

    print(len(test_documents))   # number of test documents
    print(len(train_documents))  # number of training documents

    # Pretrained embeddings. Alternatives tried previously:
    # model_embedding_bakeoff2005-50.m, model_embedding_bakeoff2005.m,
    # model_embedding_bakeoff2005-150.m.
    embedding_model = gensim.models.Word2Vec.load(r'model_embedding_pku_100.m')
    embedding_size = embedding_model.vector_size
    print(embedding_size)

    # Embedding matrix initialised from the pretrained vectors.
    embedding_weights = utils.create_embedding(embedding_model, embedding_size, lexicon_reverse)
    print(embedding_weights.shape)

    train_data_list, train_label_list, train_index_list = utils.create_matrix(
        train_documents, lexicon, label_2_index)
    test_data_list, test_label_list, test_index_list = utils.create_matrix(
        test_documents, lexicon, label_2_index)

    print(len(train_data_list), len(train_label_list), len(train_index_list))
    print(len(test_data_list), len(test_label_list), len(test_index_list))

    # The padded length is taken from the longest training sequence only.
    max_len = max(map(len, train_data_list))
    print('maxlen:', max_len)

    # Pad to a fixed length (presumably 2-D arrays afterwards -- see the
    # shape prints below).
    train_data_array, train_label_list_padding = utils.padding_sentences(
        train_data_list, train_label_list, max_len)
    test_data_array, test_label_list_padding = utils.padding_sentences(
        test_data_list, test_label_list, max_len)

    print(train_data_array.shape)
    print(test_data_array.shape)

    # One-hot encode the labels and reshape to
    # (number of sequences, sequence length, -1).
    train_label_array = np_utils.to_categorical(train_label_list_padding, class_label_count).reshape(
        (len(train_label_list_padding), len(train_label_list_padding[0]), -1))
    test_label_array = np_utils.to_categorical(test_label_list_padding, class_label_count).reshape(
        (len(test_label_list_padding), len(test_label_list_padding[0]), -1))
    print(train_label_array.shape)
    print(test_label_array.shape)

    # Model.
    model = CNN_Bilstm_Crf(max_len, len(lexicon), class_label_count,
                           embedding_weights, embedding_size, model_type)
    print(model.input_shape)
    print(model.output_shape)
    model.summary()

    # First 90% of the rows are used for training, the rest for validation.
    split_at = int(len(train_data_array) * 0.9)
    train_array, val_array = train_data_array[:split_at], train_data_array[split_at:]
    train_label, val_label = train_label_array[:split_at], train_label_array[split_at:]

    # Keep only the weights of the epoch with the lowest validation loss.
    # The file is deliberately NOT overwritten with the last-epoch weights
    # after fit(): doing so would discard the best checkpoint and make the
    # reload below a no-op, while ``best_model_epoch`` is still reported to
    # the scorer.
    checkpointer = ModelCheckpoint(filepath=weights_path, verbose=1,
                                   save_best_only=True, save_weights_only=True,
                                   monitor='val_loss', mode='auto')

    hist = model.fit(train_array, train_label, batch_size=256, epochs=4, verbose=1,
                     validation_data=(val_array, val_label), callbacks=[checkpointer])

    print(hist.history['val_loss'])  # validation loss per epoch
    best_model_epoch = np.argmin(hist.history['val_loss'])  # 0-based index
    print('best_model_epoch:', best_model_epoch)

    print(hist.history)

    # Evaluate with the best checkpoint rather than the last-epoch weights.
    model.load_weights(weights_path)
    test_y_pred = model.predict(test_data_array, batch_size=256, verbose=1)
    print("test_y_pred.shape:")
    print(test_y_pred.shape)
    # Most likely label index per position (argmax over the last of three axes).
    pred_label = np.argmax(test_y_pred, axis=2)

    # Persist what is needed to decode predictions later; the context manager
    # guarantees the file is flushed and closed.
    with open(lexicon_path, 'wb') as lexicon_file:
        pickle.dump([lexicon, lexicon_reverse, max_len, index_2_label], lexicon_file)

    K.clear_session()  # release the backend session state

    # Turn indices back into text for the reference and predicted labellings.
    real_text_list, pred_text_list, real_label_list, pred_label_list = utils.create_pred_text(
        lexicon_reverse, test_data_array, pred_label, test_label_list_padding,
        test_index_list, class_label_count)
    # Write both texts to disk (presumably 'real_text.txt' / 'pred_text.txt',
    # which are read by the scorer below -- confirm in utils.write_2_file).
    utils.write_2_file(real_text_list, pred_text_list)
    # Score; the original comment states that prf_score returns an average value.
    F = score.prf_score('real_text.txt', 'pred_text.txt', prf_file, model_type,
                        best_model_epoch, class_label_count)