Example #1
def test():
    config = CONFIG()
    print('Loading word2id ===========================')
    word2id = load_word2id(config.word2id_path)
    config.vocab_size = len(word2id)
    print('Loading test corpus =========================')
    x, y = load_corpus(config.test_path, word2id, max_sen_len=config.max_sen_len)
    # x, y = x[:10], y[:10]
    model = TextCNN(config)
    with tf.Session() as sess:
        init_op = tf.global_variables_initializer()
        sess.run(init_op)
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(config.save_dir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)

        yhat = model.predict(sess, x)

    cat, cat2id = cat_to_id()
    y_cls = np.argmax(y, 1)
    # Evaluation
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_cls, yhat, target_names=cat))
    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_cls, yhat)
    print(cm)
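The helper load_word2id used throughout these examples is project code that the snippet does not include. A minimal sketch of what it might look like, assuming the vocabulary file stores one word and its integer id per line (the file format is an assumption, not something the snippet confirms):

def load_word2id(path):
    # Hypothetical implementation; the real file format in the project may differ.
    word2id = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) == 2:
                word2id[parts[0]] = int(parts[1])
    return word2id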
Example #2
def train():
    config = CONFIG()
    print('Loading word2id ===========================')
    word2id = load_word2id(config.word2id_file)
    config.vocab_size = len(word2id)
    print('Loading word2vec ==========================')
    word2vec = load_corpus_word2vec(config.corpus_w2v_file)
    print('Loading train corpus ========================')
    train = load_corpus(config.train_file,
                        word2id,
                        max_sen_len=config.max_sen_len)
    x_tr = train[:-1]
    y_tr = train[-1]
    print('Loading test corpus ==========================')
    test = load_corpus(config.test_file,
                       word2id,
                       max_sen_len=config.max_sen_len)
    x_te = test[:-1]
    y_te = test[-1]
    print('Training model ===============================')
    # pass the configured instance (config.vocab_size was set above), not the class
    lstm = LSTM(config, embeddings=word2vec)
    with tf.Session() as sess:
        init_op = tf.global_variables_initializer()
        sess.run(init_op)
        lstm.fit(sess, x_tr, y_tr, x_te, y_te, config.save_dir,
                 config.print_per_batch)
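load_corpus is likewise project code that the snippet does not include. A sketch of what such a loader could do, assuming one tab-separated label/text pair per line and the padding scheme of Example #6; the file layout and the _PAD_ token are assumptions:

import numpy as np
import jieba

def load_corpus(path, word2id, max_sen_len=50):
    # Hypothetical loader: tokenize, map tokens to ids, pad/truncate, return (x, y).
    xs, ys = [], []
    with open(path, encoding='utf-8') as f:
        for line in f:
            label, text = line.rstrip('\n').split('\t', 1)
            ids = [word2id.get(w, 0) for w in jieba.cut(text)][:max_sen_len]
            ids += [word2id.get('_PAD_', 0)] * (max_sen_len - len(ids))
            xs.append(ids)
            ys.append(int(label))
    return np.asarray(xs), np.asarray(ys)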
Example #3

def main():

    # Build unigram and bigram dictionaries on the training set
    word2id = load_word2id(length=VOCAB_SIZE)

    # Prepare data loaders for the deep learning models
    train_loader_dl = DataLoader(
        dataset=DianPingDataSet("train"),
        batch_size=64,
        collate_fn=partial(collate_fn_dl, word2id, SENT_MAX_LEN)
    )
    test_loader_dl = DataLoader(
        dataset=DianPingDataSet("test"),
        batch_size=64,
        collate_fn=partial(collate_fn_dl, word2id, SENT_MAX_LEN)
    )
    vocab_size = len(word2id)
    print("Vocab Size:", vocab_size)
    print("加载词向量....")
    try:
        embedding = load_embeddings(word2id)
    except FileNotFoundError:
        embedding = None

    # Train and evaluate the deep learning models (CNN, LSTM)
    print("Training the BiLSTM model...")
    lstm_model = DeepModel(vocab_size, embedding, method="lstm")
    lstm_model.train_and_eval(train_loader_dl, test_loader_dl)

    print("在CNN模型上训练...")
    cnn_model = DeepModel(vocab_size, embedding, method="cnn")
    cnn_model.train_and_eval(train_loader_dl, test_loader_dl)
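The collate_fn_dl bound with partial(collate_fn_dl, word2id, SENT_MAX_LEN) is not part of the snippet. A sketch under the assumption that each dataset item is a (text, label) pair and that padding mirrors Example #6; the item layout and the padding index are assumptions:

import torch
import jieba

def collate_fn_dl(word2id, max_len, batch):
    # Hypothetical collate function: DataLoader calls it with a list of samples,
    # while partial() has already bound word2id and max_len.
    xs, ys = [], []
    for text, label in batch:
        ids = [word2id.get(w, 0) for w in jieba.cut(text)][:max_len]
        ids += [0] * (max_len - len(ids))  # assumes id 0 is the padding index
        xs.append(ids)
        ys.append(int(label))
    return torch.tensor(xs, dtype=torch.long), torch.tensor(ys, dtype=torch.long)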
Example #4
def train():
    config = CONFIG()
    print('Loading word2id ===========================')
    word2id = load_word2id(config.word2id_path)
    print('Loading word2vec ==========================')
    word2vec = load_corpus_word2vec(config.corpus_word2vec_path)
    print('Loading train corpus ========================')
    x_tr, y_tr = load_corpus(config.train_path, word2id, max_sen_len=config.max_sen_len)
    print('Loading dev corpus ==========================')
    x_val, y_val = load_corpus(config.dev_path, word2id, max_sen_len=config.max_sen_len)
    print('Training model ===============================')
    # pass the configured instance created above, not the class
    tc = TextCNN(config, embeddings=word2vec)
    with tf.Session() as sess:
        init_op = tf.global_variables_initializer()
        sess.run(init_op)
        tc.fit(sess, x_tr, y_tr, x_val, y_val, config.save_dir, config.print_per_batch)
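load_corpus_word2vec supplies the pretrained embedding matrix handed to TextCNN. A minimal sketch, assuming the file stores one whitespace-separated vector per line in word-id order (the format is an assumption):

import numpy as np

def load_corpus_word2vec(path):
    # Hypothetical reader: one embedding vector per line, row i matching word id i.
    vectors = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            vectors.append([float(v) for v in line.split()])
    return np.asarray(vectors, dtype=np.float32)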
Example #5
def main():
    """在训练集上构建一元词典和二元词典"""
    word2id = load_word2id(length=VOCAB_SIZE)
    """prepare dataset"""
    train_loader = DataLoader(
        dataset=DPDataSet('train'),
        batch_size=batch_size,
        collate_fn=partial(collate_fn_dl, word2id, SENT_MAX_LEN),
        drop_last=True,
        pin_memory=True,
        # num_workers=4,
        shuffle=True)
    test_loader = DataLoader(dataset=DPDataSet("test"),
                             batch_size=batch_size,
                             collate_fn=partial(collate_fn_dl, word2id,
                                                SENT_MAX_LEN),
                             pin_memory=True,
                             drop_last=True,
                             shuffle=True)
    vocab_size = len(word2id)
    print("Vocab Size:", vocab_size)
    print("加载词向量.....")
    try:
        embedding = load_embeddings(word2id)
    except FileNotFoundError:
        embedding = None
    print("测试BiLSTM:")
    lstm_model = DeepModel(vocab_size, embedding, method="lstm")
    lstm_model.train_and_evel(train_loader, test_loader)

    print("测试CNN:")
    cnn_model = DeepModel(vocab_size, embedding, method="cnn")
    cnn_model.train_and_evel(train_loader, test_loader)

    print("测试selfAttention:")
    att_model = DeepModel(vocab_size, embedding, method="self_att")
    att_model.train_and_evel(train_loader, test_loader)

    print("测试LSTM_Attention:")
    lstm_att_model = DeepModel(vocab_size, embedding, method="lstm_att")
    lstm_att_model.train_and_evel(train_loader, test_loader)
    print("测试RCNN:")
    RCNN_model = DeepModel(vocab_size, embedding, method="rcnn")
    RCNN_model.train_and_evel(train_loader, test_loader)
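The try/except around load_embeddings in Examples #3 and #5 relies on a FileNotFoundError being raised when no pretrained vectors are on disk. A sketch under the assumption that vectors are stored in word2vec text format; the default path, dimension, and random initialization are assumptions:

import numpy as np

def load_embeddings(word2id, path='pretrained_vectors.txt', dim=300):
    # Hypothetical: build a (vocab_size, dim) matrix, copying pretrained vectors
    # for known words; open() raises FileNotFoundError if the file is absent.
    matrix = np.random.uniform(-0.1, 0.1, (len(word2id), dim)).astype(np.float32)
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split()
            if parts and parts[0] in word2id and len(parts) == dim + 1:
                matrix[word2id[parts[0]]] = np.asarray(parts[1:], dtype=np.float32)
    return matrix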
Example #6
def sent_to_id(inputs):
    """
    Tokenize the input sentences, then map each token to its id in word_to_id.
    """
    sentences = []
    cut_sents = [jb.cut(w) for w in inputs]
    config = CONFIG()
    word2id = load_word2id(config.word2id_path)

    for cut_sent in cut_sents:
        sentence = [word2id.get(w, 0) for w in cut_sent]
        sentence = sentence[:config.max_sen_len]
        if len(sentence) < config.max_sen_len:
            sentence += [word2id['_PAD_']] * (config.max_sen_len - len(sentence))

        sentences.append(sentence)

    return np.asarray(sentences)
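A short usage example for sent_to_id, reusing the prediction flow of Example #1; the sample sentences are placeholders, and the session/restore boilerplate is assumed to match that example:

# Hypothetical usage: encode two raw review sentences and predict their classes.
raw_sentences = ['这家餐厅的菜很好吃', '服务太差了']
x = sent_to_id(raw_sentences)          # shape: (2, config.max_sen_len)
# yhat = model.predict(sess, x)        # as in Example #1, inside a tf.Session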
Example #7
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('device', device)

glove_vocab, glove_embeddings = get_glove_embeddings(P.EMBEDDING_DIM)

if not does_word2id_exist(P):
    word2id = Word2Id()
    train_data = get_single_dataset(
        "data/experiment_data/bidaf/{}_short/{}-v1.1.json".format(
            P.MERGE_TYPE, "train"), word2id, P.BATCH_SIZE, True,
        P.MIN_OCCURENCE, True, glove_vocab, False)
    save_word2id(P, word2id)

else:
    word2id = load_word2id(P)
    train_data = get_single_dataset(
        "data/experiment_data/bidaf/{}_short/{}-v1.1.json".format(
            P.MERGE_TYPE, "train"), word2id, P.BATCH_SIZE, False,
        P.MIN_OCCURENCE, True, glove_vocab, False)

vocab_size = len(word2id.id2w)
embeddings_matrix = get_embeddings_matrix(glove_embeddings, word2id,
                                          vocab_size, P.EMBEDDING_DIM)

# %% Model
model = Model(word2id, P.HIDDEN_DIM, P.EMBEDDING_DIM, embeddings_matrix,
              P.USE_BILINEAR).to(device)

saliency_loss_fn = nn.MSELoss()
decoder_loss_fn = nn.NLLLoss()
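Example #7 stops after defining the model and the two loss functions. A hedged sketch of how one training step might combine them, assuming the model returns a saliency score and decoder log-probabilities and that the two losses are simply summed; the model's output layout, the batch fields, and the optimizer choice are all assumptions:

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

def train_step(batch):
    # Hypothetical step; the real Model interface is not shown in the snippet.
    optimizer.zero_grad()
    saliency_pred, decoder_logprobs = model(batch['input'])
    loss = (saliency_loss_fn(saliency_pred, batch['saliency_target'])
            + decoder_loss_fn(decoder_logprobs.view(-1, vocab_size),
                              batch['decoder_target'].view(-1)))
    loss.backward()
    optimizer.step()
    return loss.item()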
Example #8
class CONFIG:
    update_w2v = True
    n_class = 8
    max_sen_len = 50
    embedding_dim = 50
    batch_size = 160
    output_channels = 20
    n_hidden = 256
    n_epoch = 5
    learning_rate = 0.01
    drop_keep_prob = 0.4
    num_filters = 256
    kernel_size = 3


config = CONFIG()
word2id = load_word2id('./data/word_to_id.txt')
print('Loading word2vec ==========================')
word2vec = load_corpus_word2vec('./data/corpus_word2vec.txt')
print('Loading train corpus ========================')
train = load_corpus('./data/train/', word2id, max_sen_len=config.max_sen_len)
print('Loading dev corpus ==========================')
dev = load_corpus('./data/dev/', word2id, max_sen_len=config.max_sen_len)
print('Loading test corpus =========================')
test = load_corpus('./data/test/', word2id, max_sen_len=config.max_sen_len)

x_tr, y_tr = train
x_val, y_val = dev

config = CONFIG()
tc = TextCNN(config=config, embeddings=word2vec)
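Example #8 ends after constructing the TextCNN. To continue, one would open a session and call fit as in Example #4; this assumes CONFIG also defines save_dir and print_per_batch, which the excerpt does not show:

# Hypothetical continuation, mirroring Example #4.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    tc.fit(sess, x_tr, y_tr, x_val, y_val, config.save_dir, config.print_per_batch)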