Example 1
def raw_data_to_model(file_path,
                      tokenizer,
                      word2id,
                      tag2id,
                      batch_size,
                      contain_y=True):
    sample_list_, tag_list_ = construct_data(file_path)
    sample_list_, tag_list_ = sort_sequence(sample_list_, tag_list_)
    x, y, lengths = [], [], []
    for i in range(0, len(sample_list_), batch_size):  # convert each batch to ids using its own max length
        # seq_len_ = max(map(lambda xx: len(xx), sample_list_[i:i+batch_size]))
        seq_len_ = len(sample_list_[i])
        x_, lengths_ = content_to_id(sample_list_[i:i + batch_size],
                                     line_sep=None,
                                     tokenizer=tokenizer,
                                     seq_len=seq_len_,
                                     vocab_dic=word2id,
                                     with_real_seq_len=True)
        if contain_y:
            y_ = content_to_id(tag_list_[i:i + batch_size],
                               line_sep=None,
                               tokenizer=tokenizer,
                               seq_len=seq_len_,
                               vocab_dic=tag2id)
            y.extend(y_.tolist())
        x.extend(x_.tolist())
        lengths.extend(lengths_.tolist())

    if contain_y:
        return np.array(x), np.array(y), np.array(lengths)
    else:
        return np.array(x), np.array(lengths)
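The core trick in raw_data_to_model is length bucketing: sort_sequence is assumed to order samples by length (longest first), so each batch only needs to be padded to the length of its own first element instead of a global maximum. A minimal standalone sketch of that idea on toy data, independent of the helpers above:

samples = [["a"], ["b", "c", "d"], ["e", "f"]]
samples = sorted(samples, key=len, reverse=True)   # stand-in for sort_sequence: longest first
batch_size = 2
for i in range(0, len(samples), batch_size):
    batch = samples[i:i + batch_size]
    seq_len = len(batch[0])                        # first element is the longest in this batch
    padded = [s + ["<pad>"] * (seq_len - len(s)) for s in batch]
    print(seq_len, padded)
# 3 [['b', 'c', 'd'], ['e', 'f', '<pad>']]
# 1 [['a']]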
Example 2
def construct_data(path, vocab_dic, tokenizer, seq_len, line_sep):
    # cls_index = vocab_dic.get("[CLS]")  # prepend [CLS] at the first position
    x, y, lengths = content_to_id(path,
                                  tokenizer=tokenizer,
                                  seq_len=seq_len,
                                  vocab_dic=vocab_dic,
                                  line_sep=line_sep,
                                  with_real_seq_len=True)
    # x = np.insert(x, 0, cls_index, axis=1)
    # mask = np.array([make_sequence_mask(real_len=i + 1, seq_len=seq_len + 1) for i in lengths])  # one extra position for the cls_index
    mask = (x > 0).astype(int)
    print(f"x sample number is {len(x)}, label sample number is {len(y)}")
    return x, y, mask
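The mask above relies on the convention that id 0 is reserved for padding, so (x > 0) marks real token positions. A tiny self-contained numpy sketch of that convention (toy ids, assuming 0 = padding):

import numpy as np

x = np.array([[5, 2, 9, 0, 0],
              [7, 3, 0, 0, 0]])     # token ids, 0 used for padding
mask = (x > 0).astype(int)          # 1 for real tokens, 0 for padding positions
print(mask)
# [[1 1 1 0 0]
#  [1 1 0 0 0]]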
Example 3
def main_entry(save_dir):
    # seed_everything(987, use_np=True, use_cpu=True, use_gpu=False)

    # [0]. Convert the raw data into sentence + label format
    train_pri_path = "./data/tnews/train.json"
    train_path = "./data/tnews/train_trans.txt"

    valid_pri_path = "./data/tnews/dev.json"
    valid_path = "./data/tnews/dev_trans.txt"

    test_pri_path = "./data/tnews/test.json"
    test_path = "./data/tnews/test_trans.txt"

    label_path = "./data/tnews/labels.json"
    label_dic = get_label_map(label_path)
    transform_data(train_pri_path, label_dic, train_path)
    transform_data(valid_pri_path, label_dic, valid_path)
    transform_data(test_pri_path, label_dic, test_path)

    # [1]. Build the vocabulary dictionary
    # [1.1]. No existing vocabulary: build one from the given file and save it
    vocab_file_path = train_path
    save_path = os.path.join(save_dir, "train_vocab.pkl")
    tokenizer = "char"
    line_sep = "\t"

    vocab_dic = build_vocab_by_raw_file(vocab_file_path,
                                        line_sep=line_sep,
                                        tokenizer=tokenizer,
                                        word_dic_save_path=save_path)
    # [1.2]. Vocabulary exists: build it from the given vocabulary file
    # [1.3]. Vocabulary exists: load it manually from the pickle file (see the commented sketch below)
    # [1.4]. Vocabulary exists: update it with new data
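    # A minimal sketch for [1.3] (assumption: build_vocab_by_raw_file pickles vocab_dic as a plain dict to save_path):
    # import pickle
    # with open(save_path, "rb") as f_vocab:
    #     vocab_dic = pickle.load(f_vocab)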

    # [2]. Convert the text to ids
    # train_path = "./data/THUCNews/train.txt"
    # valid_path = "./data/THUCNews/dev.txt"
    # test_path = "./data/THUCNews/test.txt"
    seq_len = 100

    train_x, train_y = content_to_id(train_path,
                                     tokenizer=tokenizer,
                                     seq_len=seq_len,
                                     vocab_dic=vocab_dic,
                                     line_sep=line_sep)
    print(
        f"train_x sample number is {len(train_x)}, label sample number is {len(train_y)}"
    )

    valid_x, valid_y = content_to_id(valid_path,
                                     tokenizer=tokenizer,
                                     seq_len=seq_len,
                                     vocab_dic=vocab_dic,
                                     line_sep=line_sep)
    print(
        f"valid_x sample number is {len(valid_x)}, label sample number is {len(valid_y)}"
    )

    # test_x, test_y = content_to_id(test_path, tokenizer=tokenizer, seq_len=seq_len,
    #                                vocab_dic=vocab_dic, line_sep=line_sep)
    # print(f"content sample number is {len(test_x)}, label sample number is {len(test_y)}")
    # [3]. Split the data into three parts (train, validation and test), either randomly or by label ratio
    # If the data was already split in step [2], this part can be skipped
    # train_ind, valid_ind, test_ind = split_data_with_index(indexes=len(content), split_ratios=(0.7, 0.1, 0.2))
    # train_x, train_y = np.array(content)[train_ind], np.array(label)[train_ind]
    # valid_x, valid_y = np.array(content)[valid_ind], np.array(label)[valid_ind]
    # test_x, test_y = content[test_ind], label[test_ind]
    # Steps [2] and [3] may also be swapped: first read and preprocess the data (e.g. with pandas),
    # then split it with some strategy into two or three parts and only then run [2]; both workflows deserve their own example

    # [4]. Data strategies, e.g. per-class over-sampling or under-sampling;
    # at this point the data is already in numpy format
    # for i in np.unique(train_y):
    #     print(f"label {i} number is {sum(train_y == i)}")
    # sample_ind = sample_data_by_label(train_y, sampler={"1": 10, "2": 20})
    # train_x, train_y = train_x[sample_ind], train_y[sample_ind]
    # for i in np.unique(train_y):
    #     print(f"label {i} number is {sum(train_y == i)}")

    # [5]. Build the iterators
    # train_iter = self_iterator(batch_data=(train_x, train_y, ), batch_size=4, )
    # valid_iter = self_iterator(batch_data=(valid_x, valid_y, ), batch_size=4)
    # test_iter = self_iterator(batch_data=(test_x, test_y), batch_size=4)
    batch_size = 128
    small_sample_test = False
    small_sample_num = 10000
    if small_sample_test:
        train_x, train_y = train_x[:small_sample_num], train_y[:small_sample_num]

    train_iter = torch_iterator(batch_data=(train_x, train_y), batch_size=batch_size)
    valid_iter = torch_iterator(batch_data=(valid_x, valid_y), batch_size=batch_size)
    # test_iter = torch_iterator(batch_data=(test_x, test_y), batch_size=batch_size)

    # [6]. Initialize the model
    seed_everything(1024, use_np=True, use_cpu=True, use_gpu=True)

    # model = TextRNN(vocab_size=len(vocab_dic), embedding_dim=8, hidden_size=20,
    #                 num_layers=2, num_classes=10, dropout=0.5)
    model = TextCNN(num_filters=128,
                    filter_sizes=(2, 3, 4),
                    num_classes=len(label_dic),
                    vocab_size=len(vocab_dic),
                    embedding_dim=300,
                    dropout=0.5)
    init_network(model)
    print(model)

    # [7]. Model training
    num_epochs = 6
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    lr = 1e-3
    model_save_path = os.path.join(
        save_dir, "text_cnn_model.pt")  # "./data/THUCNews/text_cnn_model.pt"
    print("now the device is ", device)

    loss = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    clf = SelfModel(model=model)
    t1 = datetime.now()
    clf.train(train_iter,
              num_epochs,
              loss=loss,
              optimizer=optimizer,
              valid_iter=valid_iter,
              early_stopping_batch=100,
              batch_check_frequency=2,
              print_every_batch=10,
              model_save_path=model_save_path,
              device=device)
    t2 = datetime.now()
    print(f"train cost {(t2-t1).seconds} seconds")

    # Epoch Num [6/6], Batch num [395/417]: train loss is 0.5875816802612598 valid loss is 1.1788143689119364
    # Epoch Num [6/6], Batch num [415/417]: train loss is 0.5919032108297737 valid loss is 1.1893426436412184
    # train cost 2202 seconds

    # [8]. Model prediction
    # pred = clf.predict(data=train_iter, do_func=lambda x: x[0])

    # [9]. Check the results
    def get_max_prob_index(pred):
        return torch.max(pred, 1)[1]

    # pred = torch.nn.functional.softmax(pred, dim=1).cpu().numpy()

    y_score, y_true = evaluate(clf.model,
                               train_iter,
                               y_score_processor=get_max_prob_index)
    train_acc = accuracy_score(y_true, y_score)
    y_score, y_true = evaluate(clf.model,
                               valid_iter,
                               y_score_processor=get_max_prob_index)
    valid_acc = accuracy_score(y_true, y_score)
    # y_score, y_true = evaluate(clf.model, test_iter, y_score_processor=get_max_prob_index)
    # test_acc = accuracy_score(y_true, y_score)
    print(f"train accuracy is {train_acc}, valid accuracy is {valid_acc}.")
    # train accuracy is 0.8219827586206897, valid accuracy is 0.6129.

    # [10]. Predict on the test set, build the cluemark submission format and submit it online to check the score
    inverse_label_dic = {}
    for key, val in label_dic.items():
        inverse_label_dic[val["label_index"]] = {
            "label": key,
            "label_desc": val["label_desc"]
        }

    f_out = open("./data/tnews/tnews_predict.json", "w", encoding="utf-8")

    with open(test_path, "r") as f:
        line_num = 0
        for line in f:
            line_json = {"id": line_num}
            line = line.strip("\n")
            line_ids = content_to_id([line],
                                     tokenizer=tokenizer,
                                     seq_len=seq_len,
                                     vocab_dic=vocab_dic)
            line_pred = clf.model(
                torch.LongTensor(line_ids).to(device))  # class scores for this sample
            line_pred_ind = torch.max(line_pred, 1)[1].item()  # index of the class with the highest score
            line_json.update(inverse_label_dic[line_pred_ind])  # build the online submission format
            f_out.write(
                f"{json.dumps(line_json, ensure_ascii=False)}\n")  # write to the output file
            line_num += 1
    f_out.close()
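For reference, the argmax-plus-inverse-lookup step used in [10], isolated as a self-contained sketch; the label_dic layout (label_index / label_desc) follows how the code above indexes it, while the logits and label entries here are made up:

import torch

label_dic = {"news_story": {"label_index": 0, "label_desc": "story"},
             "news_tech": {"label_index": 1, "label_desc": "technology"}}
inverse_label_dic = {v["label_index"]: {"label": k, "label_desc": v["label_desc"]}
                     for k, v in label_dic.items()}

logits = torch.tensor([[0.2, 1.5]])          # fake model output for one sample
pred_ind = torch.max(logits, 1)[1].item()    # index of the class with the highest score
print(inverse_label_dic[pred_ind])           # {'label': 'news_tech', 'label_desc': 'technology'}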
Example 4
def main_entry():
    save_dir = "./data/cluener"
    vocab_file_path = "./data/cluener/train.json"
    tokenizer = lambda x: x  # the input is already a list, i.e. already tokenized

    # 1. Build the dictionaries
    sample_list, tag_list = construct_data(vocab_file_path)   # 1 bad line: entities nested inside entities
    ## 1.1 Build the word2id dictionary
    # word_save_path = os.path.join(save_dir, "train_word_vocab.pkl")
    word2id = build_vocab_by_raw_file(sample_list, line_sep=None, tokenizer=tokenizer)

    ## 1.2 Build the tag2id dictionary
    # tag_save_path = os.path.join(save_dir, "train_tag_crf_vocab.pkl")
    tag2id = build_vocab_by_raw_file(tag_list, line_sep=None, tokenizer=tokenizer)
    tag2id[START_TAG] = len(tag2id)
    tag2id[END_TAG] = len(tag2id)

    # 2. Build the train, validation and test data
    #    build the three splits and convert them to ids
    train_path = "./data/cluener/train.json"
    valid_path = "./data/cluener/dev.json"
    test_path = "./data/cluener/test.json"
    batch_size = 128

    train_x, train_y, train_lengths = raw_data_to_model(train_path, tokenizer, word2id, tag2id, batch_size)
    print(f"train_x sample number is {len(train_x)}, label sample number is {len(train_y)}")

    valid_x, valid_y, valid_lengths = raw_data_to_model(valid_path, tokenizer, word2id, tag2id, batch_size)
    print(f"valid_x sample number is {len(valid_x)}, label sample number is {len(valid_y)}")

    # test_x, test_y, test_lengths = raw_data_to_model(test_path, tokenizer, word2id, tag2id, batch_size)
    # print(f"test_x sample number is {len(test_x)}, label sample number is {len(test_y)}")

    # 3. Wrap the data in iterators
    # batch_size = 128
    # small_sample_test = False
    # small_sample_num = 10000
    # if small_sample_test:
    #     train_x, train_lengths, train_y = train_x[:small_sample_num], train_lengths[:small_sample_num], train_y[:small_sample_num]

    train_iter = torch_iterator(batch_data=(train_x, train_lengths, train_y,), batch_size=batch_size)
    valid_iter = torch_iterator(batch_data=(valid_x, valid_lengths, valid_y,), batch_size=batch_size)
    # test_iter = torch_iterator(batch_data=(test_x, test_lengths, test_y), batch_size=batch_size)

    # 4. Initialize the model
    seed_everything(1024, use_np=True, use_cpu=True, use_gpu=True)

    model = BiLSTM_CRF(vocab_size=len(word2id), emb_size=50, hidden_size=32, num_tags=len(tag2id),
                       start_idx=tag2id[START_TAG], stop_idx=tag2id[END_TAG])
    init_network(model)
    print(model)

    # 5. Model training
    num_epochs = 15
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    lr = 1e-3
    model_save_path = os.path.join(save_dir, "bilstm_crf_model.pt")
    print("now the device is ", device)

    loss = model.crf.loss
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    clf = SelfModel(model=model)
    t1 = datetime.now()
    clf.train(train_iter, num_epochs, loss=loss, optimizer=optimizer, valid_iter=valid_iter,
              early_stopping_batch=30, batch_check_frequency=2,
              print_every_batch=3, model_save_path=model_save_path, device=device)
    t2 = datetime.now()
    print(f"train cost {(t2-t1).seconds} seconds")
    # Epoch Num [15/15], Batch num [84/84]: train loss is 0.24247267132713682 valid loss is 13.60905595259233
    # train cost 1064 seconds

    # 6. Model evaluation
    ## 6.1 Decoding: CRF decoding is needed here, so instead of the built-in evaluate we run the model ourselves and decode the scores
    decode = model.crf.viterbi_decode
    id2tag_dic = {id_: tag for tag, id_ in tag2id.items()}
    #  y_score, y_true = evaluate(clf.model, train_iter, y_score_processor=get_max_prob_index)
    y_score, y_true = [], []
    for sent, leng, y_true_ in valid_iter:
        y_true_ = y_true_.cpu()
        crf_score = clf.model(sent.to(device), leng.to(device))
        y_score_tag = decode(crf_score.cpu(), sent.gt(0).cpu())[1]

        lengs = leng.cpu().numpy()
        for i in range(len(lengs)):  # iterate over the samples in the batch
            y_score.append(id2tag(y_score_tag[i][:lengs[i]], id2tag_dic))
            y_true.append(id2tag(y_true_[i][:lengs[i]].numpy(), id2tag_dic))

    ## 6.2 Evaluation metrics
    metrics = evaluate_all_sentence(y_true, y_score)
    print(metrics)
    # 3072 2909 1944: 3072 gold entities in total, the model predicted 2909, of which 1944 are correct
    # (0.6328125, 0.6682708834651083, 0.6500585186423675): recall, precision and f1 respectively
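    # i.e. recall = 1944 / 3072 ≈ 0.6328, precision = 1944 / 2909 ≈ 0.6683,
    # f1 = 2 * precision * recall / (precision + recall) ≈ 0.6501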

    # 7. Prediction
    # predict on the test set, format the output as cluemark submissions and submit online to check test-set performance
    with open(test_path, "r") as f:
        y_score = []
        for line in f:
            line = line.strip("\n")
            line_text = json.loads(line)["text"]
            sent, leng = content_to_id([list(line_text)], tokenizer=tokenizer, line_sep=None,
                                       seq_len=len(list(line_text)), vocab_dic=word2id, with_real_seq_len=True)
            crf_score = clf.model(torch.LongTensor(sent).to(device), torch.LongTensor(leng).to(device))
            y_score_tag = decode(crf_score.cpu(), torch.LongTensor(sent).gt(0).cpu())[1]

            y_score.append(id2tag(y_score_tag[0][:leng[0]], id2tag_dic))

    def __submit_format(indexs, sent):
        ret = {}
        for start_idx, end_idx in indexs:
            ner_name = sent[start_idx: end_idx + 1]
            if ner_name in ret:
                ret[ner_name].append([start_idx, end_idx])
            else:
                ret[ner_name] = [[start_idx, end_idx]]
        return ret

    def submit(write_path, test_path):
        with open(test_path, "r", encoding='utf-8') as f:
            test_sample = f.readlines()

        with open(write_path, "w", encoding="utf-8") as f:
            line_num = 0
            for i in range(len(y_score)):
                label = {}
                write_line = {"id": line_num}
                tag_entity = parse_entity_from_sequence(y_score[i])
                line_text = json.loads(test_sample[i])["text"]
                for tag in tag_entity:
                    label[tag] = __submit_format(tag_entity[tag], line_text)
                write_line["label"] = label
                f.write(json.dumps(write_line, ensure_ascii=False) + "\n")
                line_num += 1

    submit("./data/cluener/cluener_predict.json", test_path)
Example 5
def main_entry(save_dir):
    # seed_everything(987, use_np=True, use_cpu=True, use_gpu=False)
    # [1]. Build the vocabulary dictionary
    # [1.1]. No existing vocabulary: build one from the given file and save it
    vocab_file_path = "./data/THUCNews/train.txt"
    save_path = os.path.join(save_dir, "train_vocab.pkl")
    tokenizer = "char"
    line_sep = "\t"

    vocab_dic = build_vocab_by_raw_file(vocab_file_path,
                                        line_sep=line_sep,
                                        tokenizer=tokenizer,
                                        word_dic_save_path=save_path)
    # [1.2]. Vocabulary exists: build it from the given vocabulary file
    # [1.3]. Vocabulary exists: load it manually from the pickle file
    # [1.4]. Vocabulary exists: update it with new data

    # [2]. Convert the text to ids
    train_path = "./data/THUCNews/train.txt"
    valid_path = "./data/THUCNews/dev.txt"
    test_path = "./data/THUCNews/test.txt"
    seq_len = 32

    train_x, train_y = content_to_id(train_path,
                                     tokenizer=tokenizer,
                                     seq_len=seq_len,
                                     vocab_dic=vocab_dic,
                                     line_sep=line_sep)
    print(
        f"train_x sample number is {len(train_x)}, label sample number is {len(train_y)}"
    )

    valid_x, valid_y = content_to_id(valid_path,
                                     tokenizer=tokenizer,
                                     seq_len=seq_len,
                                     vocab_dic=vocab_dic,
                                     line_sep=line_sep)
    print(
        f"valid_x sample number is {len(valid_x)}, label sample number is {len(valid_y)}"
    )

    test_x, test_y = content_to_id(test_path,
                                   tokenizer=tokenizer,
                                   seq_len=seq_len,
                                   vocab_dic=vocab_dic,
                                   line_sep=line_sep)
    print(
        f"content sample number is {len(test_x)}, label sample number is {len(test_y)}"
    )
    # [3]. Split the data into three parts (train, validation and test), either randomly or by label ratio
    # If the data was already split in step [2], this part can be skipped
    # train_ind, valid_ind, test_ind = split_data_with_index(indexes=len(content), split_ratios=(0.7, 0.1, 0.2))
    # train_x, train_y = np.array(content)[train_ind], np.array(label)[train_ind]
    # valid_x, valid_y = np.array(content)[valid_ind], np.array(label)[valid_ind]
    # test_x, test_y = content[test_ind], label[test_ind]
    # Steps [2] and [3] may also be swapped: first read and preprocess the data (e.g. with pandas),
    # then split it with some strategy into two or three parts and only then run [2]; both workflows deserve their own example

    # [4]. Data strategies, e.g. per-class over-sampling or under-sampling;
    # at this point the data is already in numpy format
    # for i in np.unique(train_y):
    #     print(f"label {i} number is {sum(train_y == i)}")
    # sample_ind = sample_data_by_label(train_y, sampler={"1": 10, "2": 20})
    # train_x, train_y = train_x[sample_ind], train_y[sample_ind]
    # for i in np.unique(train_y):
    #     print(f"label {i} number is {sum(train_y == i)}")

    # [5]. Build the iterators
    # train_iter = self_iterator(batch_data=(train_x, train_y, ), batch_size=4, )
    # valid_iter = self_iterator(batch_data=(valid_x, valid_y, ), batch_size=4)
    # test_iter = self_iterator(batch_data=(test_x, test_y), batch_size=4)
    batch_size = 128
    small_sample_test = True
    small_sample_num = 1000
    if small_sample_test:
        train_x, train_y = train_x[:small_sample_num], train_y[:small_sample_num]

    train_iter = torch_iterator(batch_data=(train_x, train_y), batch_size=batch_size)
    valid_iter = torch_iterator(batch_data=(valid_x, valid_y), batch_size=batch_size)
    test_iter = torch_iterator(batch_data=(test_x, test_y),
                               batch_size=batch_size)

    # [6]. Initialize the model
    seed_everything(1024, use_np=True, use_cpu=True, use_gpu=True)

    # model = TextRNN(vocab_size=len(vocab_dic), embedding_dim=8, hidden_size=20,
    #                 num_layers=2, num_classes=10, dropout=0.5)
    model = TextCNN(num_filters=128,
                    filter_sizes=(2, 3, 4),
                    num_classes=10,
                    vocab_size=len(vocab_dic),
                    embedding_dim=300,
                    dropout=0.5)
    init_network(model)
    print(model)

    # [7]. Model training
    num_epochs = 20
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    lr = 1e-3
    model_save_path = os.path.join(
        save_dir, "text_cnn_model.pt")  # "./data/THUCNews/text_cnn_model.pt"
    print("now the device is ", device)

    loss = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    clf = SelfModel(model=model)
    t1 = datetime.now()
    clf.train(train_iter,
              num_epochs,
              loss=loss,
              optimizer=optimizer,
              valid_iter=valid_iter,
              early_stopping_batch=100,
              batch_check_frequency=2,
              print_every_batch=10,
              model_save_path=model_save_path,
              device=device)
    t2 = datetime.now()
    print(f"train cost {(t2-t1).seconds} seconds")

    # [8]. Model prediction
    # pred = clf.predict(data=train_iter, do_func=lambda x: x[0])

    # [9]. Check the results
    def get_max_prob_index(pred):
        return torch.max(pred, 1)[1]

    # pred = torch.nn.functional.softmax(pred, dim=1).cpu().numpy()

    y_score, y_true = evaluate(clf.model,
                               train_iter,
                               y_score_processor=get_max_prob_index)
    train_acc = accuracy_score(y_true, y_score)
    y_score, y_true = evaluate(clf.model,
                               valid_iter,
                               y_score_processor=get_max_prob_index)
    valid_acc = accuracy_score(y_true, y_score)
    y_score, y_true = evaluate(clf.model,
                               test_iter,
                               y_score_processor=get_max_prob_index)
    test_acc = accuracy_score(y_true, y_score)
    print(
        f"train accuracy is {train_acc}, valid accuracy is {valid_acc}, test accuracy is {test_acc}."
    )