Code Example #1
def load_data(opt):
    if args.raw_data:
        test = load_origin_data(collection_test, args, single_answer=False)
    else:
        if args.tokenizer == 'ltp':
            tokenizer = LtpTokenizer('ltp_data')
        if args.tokenizer == 'jieba':
            tokenizer = JiebaTokenizer()
        if args.tokenizer == 'jieba_origin':
            tokenizer = JiebaOriginTokenizer()
        test = load_data_tokenize(collection_test,
                                  tokenizer,
                                  args,
                                  single_answer=False)

    log.info("[test data length:{}]".format(len(test)))

    with open('data/meta.msgpack', 'rb') as f:
        meta = msgpack.load(f, encoding='utf8')
    opt['pos_size'] = len(meta['vocab_tag'])
    opt['ner_size'] = len(meta['vocab_ent'])
    vocab = meta['vocab']
    vocab_tag = meta['vocab_tag']
    vocab_ent = meta['vocab_ent']

    embedding = meta['embedding']
    embedding = torch.Tensor(embedding)

    question_tokens = list(test.query_words)
    context_tokens = list(test.words)
    context_tags = list(test.postags)
    context_ents = list(test.netags)

    opt['pretrained_words'] = True
    opt['vocab_size'] = embedding.size(0)
    opt['embedding_dim'] = embedding.size(1)
    # Re-randomize the embedding of the UNK token (index 1, matching unk_id below)
    embedding[1] = torch.normal(mean=torch.zeros(opt['embedding_dim']),
                                std=1.)

    question_ids = token2id(question_tokens, vocab, unk_id=1)
    context_ids = token2id(context_tokens, vocab, unk_id=1)
    context_features = get_context_features(question_tokens, context_tokens)

    context_tag_ids = token2id(context_tags, vocab_tag)
    context_ent_ids = token2id(context_ents, vocab_ent)

    test_batches = list(
        zip(
            context_ids,
            context_features,
            context_tag_ids,
            context_ent_ids,
            question_ids,
            context_tokens,
        ))

    test_y = test['answers'].tolist()[:len(test)]
    return test_batches, test_y, embedding, opt, test
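
token2id itself is never defined on this page. The sketch below is one plausible implementation consistent with the call sites in these examples (a flat token list in Code Example #2, batches of token lists elsewhere, an optional unk_id, and a vocab that may be either a token-to-index dict or an ordered token list); the real implementation in each project may differ.

def token2id(tokens, vocab, unk_id=None):
    """Map tokens (a flat list or a batch of token lists) to integer ids."""
    # Accept either a ready-made token->index dict or an ordered token list.
    w2id = vocab if isinstance(vocab, dict) else {w: i for i, w in enumerate(vocab)}

    def lookup(tok):
        # Fall back to unk_id when one is given; otherwise fail on unknown tokens.
        return w2id.get(tok, unk_id) if unk_id is not None else w2id[tok]

    if tokens and isinstance(tokens[0], (list, tuple)):
        return [[lookup(tok) for tok in seq] for seq in tokens]
    return [lookup(tok) for tok in tokens]
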
Code Example #2
def sim_test():

    samples = generate_dataset(100)

    word2id, id2word = build_vocab(samples)
    vocab_size = len(word2id)
    inputs = []
    targets = []

    rnn = RNN(input_dim=vocab_size, output_dim=vocab_size, hidden_dim=256)

    # Sanity check: run the untrained RNN on the first sample before training
    test_input = text2token(samples[0])[:-1]
    test_target = text2token(samples[0])[1:]
    print("Test Input:", test_input)
    print("Test Target:", test_target)
    inputs = one_hot_seq(token2id(test_input, word2id), vocab_size)
    outputs, hidden_states = rnn.forward(inputs)
    test_output = [id2word[np.argmax(out)] for out in outputs]
    print("Test Output:", test_output)

    for epoch in range(150):
        losses = []
        for sample in samples:
            ids = token2id(text2token(sample), word2id)
            inputs = one_hot_seq(ids[:-1], vocab_size)
            targets = one_hot_seq(ids[1:], vocab_size)
            #print(inputs[0].shape)

            outputs, hidden_states = rnn.forward(inputs)
            #print(rnn.grads['d_W_x'])
            rnn.zero_grad()
            loss = rnn.backward(outputs, targets)

            rnn.update_params(lr=2e-4)
            losses.append(loss)
            #print(loss)
        print(np.array(losses).mean())

    print("Test Input:", test_input)
    print("Test Target:", test_target)
    inputs = one_hot_seq(token2id(test_input, word2id), vocab_size)
    outputs, hidden_states = rnn.forward(inputs)
    test_output = [id2word[np.argmax(out)] for out in outputs]
    print("Test Output:", test_output)
Code Example #3
File: run.py  Project: z-Runmin/convseg_pytorch
def evaluate(model, dev_data, batch_size, max_sent_length, device):
    was_training = model.training
    model.eval()

    vocab_tag = model.vocab_tag
    tok2idx, idx2tok = vocab_tag['token_to_index'], vocab_tag['index_to_token']
    tag2idx, idx2tag = vocab_tag['tag_to_index'], vocab_tag['index_to_tag']

    data_loader = DataLoader(dev_data, batch_size=batch_size)
    val_loss = 0
    cum_cnt = 0
    with torch.no_grad():
        for iter_idx, data in enumerate(data_loader):
            chars_batch, tags_batch = get_feature(data)
            chars_batch, chars_mask = pad_sents(chars_batch,
                                                pad_token,
                                                max_len=max_sent_length)
            tags_batch, tags_mask = pad_sents(tags_batch,
                                              pad_token,
                                              max_len=max_sent_length)

            input_chars = torch.tensor(token2id(chars_batch,
                                                tok2idx,
                                                unk_id=tok2idx['<UNK>']),
                                       device=device)
            target_tags = torch.tensor(token2id(tags_batch, tag2idx),
                                       device=device)
            tags_mask = torch.tensor(tags_mask,
                                     dtype=torch.uint8,
                                     device=device)

            #            output_emission = conv_seg(input_chars.to(device))
            #            loss = -crf_model(output_emission.transpose(0,1), target_tags.transpose(0,1).to(device), tags_mask.transpose(0,1).to(device))
            loss = -model(input_chars, target_tags, tags_mask)
            val_loss += loss
            cum_cnt = cum_cnt + input_chars.shape[0]
    val_loss = val_loss / cum_cnt

    if was_training:
        model.train()

    return val_loss
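
pad_sents is used here (and in the other convseg_pytorch examples) to bring every sentence in a batch to the same length and to produce a parallel mask. Its code is not shown; a sketch consistent with the calls above, with the truncation behaviour as an assumption, is:

def pad_sents(sents, pad_token, max_len=None):
    """Pad (and truncate) a batch of token lists; return (padded, mask)."""
    target_len = max_len if max_len is not None else max(len(s) for s in sents)
    padded, masks = [], []
    for sent in sents:
        sent = list(sent)[:target_len]
        pad_len = target_len - len(sent)
        padded.append(sent + [pad_token] * pad_len)
        masks.append([1] * len(sent) + [0] * pad_len)  # 1 = real token, 0 = padding
    return padded, masks
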
Code Example #4
def test(model_path, data_path, split, batch_size, max_sent_length, cuda):
    device = torch.device("cuda:0" if cuda else "cpu")
    dataset = SIGHAN(split=split, root_path=data_path)
    model = CharWordSeg.load(model_path)
    model = model.to(device)
    model.eval()

    vocab_tag = model.vocab_tag
    tok2idx, idx2tok = vocab_tag['token_to_index'], vocab_tag['index_to_token']
    tag2idx, idx2tag = vocab_tag['tag_to_index'], vocab_tag['index_to_tag']

    data_loader = DataLoader(dataset, batch_size=batch_size)
    val_loss = 0
    cum_cnt = 0
    with torch.no_grad():
        for iter_idx, data in enumerate(data_loader):
            chars_batch, tags_batch = get_feature(data)
            chars_batch, chars_mask = pad_sents(chars_batch,
                                                pad_token,
                                                max_len=max_sent_length)
            tags_batch, tags_mask = pad_sents(tags_batch,
                                              pad_token,
                                              max_len=max_sent_length)

            input_chars = torch.tensor(token2id(chars_batch,
                                                tok2idx,
                                                unk_id=tok2idx['<UNK>']),
                                       device=device)
            target_tags = torch.tensor(token2id(tags_batch, tag2idx),
                                       device=device)
            tags_mask = torch.tensor(tags_mask,
                                     dtype=torch.uint8,
                                     device=device)

            loss = -model(input_chars, target_tags, tags_mask)
            val_loss += loss
            cum_cnt = cum_cnt + input_chars.shape[0]
    val_loss = val_loss / cum_cnt
    return val_loss
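
For reference, test() can be driven directly once a checkpoint exists; all paths and hyperparameters in the call below are illustrative placeholders, not values taken from the project.

if __name__ == "__main__":
    # Placeholder paths and sizes; adjust to your own checkpoint and data layout.
    avg_loss = test(model_path="checkpoints/model_best",
                    data_path="data/sighan",
                    split="dev",
                    batch_size=32,
                    max_sent_length=100,
                    cuda=torch.cuda.is_available())
    print(avg_loss)
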
Code Example #5
def real_test():
    if not os.path.exists('train.csv'):
        prepare_data()

    data = pd.read_csv('train.csv')
    texts = data['text'].tolist()
    labels = data['target'].tolist()

    word2id, id2word = build_vocab(texts)
    vocab_size = len(word2id)

    ids_list = []
    for text in texts:
        ids_list.append(token2id(text2token(text), word2id))

    num_class = len(set(labels))
    rnn = RNN(input_dim=vocab_size, output_dim=num_class, hidden_dim=256)

    accs = []
    ids_list = ids_list[:100]
    labels = labels[:100]
    for epoch in range(10):
        losses = []
        for inputs, label in zip(ids_list, labels):
            inputs = one_hot_seq(inputs, vocab_size)
            label = one_hot_seq([label], num_class)[0]
            outputs, hidden_states = rnn.forward(inputs)
            #print(rnn.grads['d_W_x'])
            pred = outputs[-1]
            rnn.zero_grad()
            loss = rnn.backward(pred, label)
            accs.append(np.argmax(label) == np.argmax(pred))
            rnn.update_params(lr=2e-4)
            losses.append(loss)
            #print(loss)
        print("Epoch", epoch, "Loss:",
              np.array(losses).mean(), "Acc:",
              np.array(accs).mean())
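
Both sim_test() and real_test() call a word-level build_vocab that is not shown here. Reusing the text2token sketch above, a minimal version consistent with word2id, id2word = build_vocab(texts) would be the following (that no special tokens are added is an assumption):

def build_vocab(texts):
    """Build word->id and id->word maps from an iterable of raw texts."""
    word2id, id2word = {}, {}
    for text in texts:
        for word in text2token(text):
            if word not in word2id:
                idx = len(word2id)
                word2id[word] = idx
                id2word[idx] = word
    return word2id, id2word
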
Code Example #6
def predict(model_path, data_path, split, output_path, batch_size,
            max_sent_length, cuda):
    device = torch.device("cuda:0" if cuda else "cpu")
    dataset = SIGHAN(split=split, root_path=data_path)
    output = []

    model = CharWordSeg.load(model_path)
    model = model.to(device)
    model.eval()

    vocab_tag = model.vocab_tag
    tok2idx, idx2tok = vocab_tag['token_to_index'], vocab_tag['index_to_token']
    tag2idx, idx2tag = vocab_tag['tag_to_index'], vocab_tag['index_to_tag']

    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    with torch.no_grad():
        for iter_idx, data in enumerate(data_loader):
            chars = get_feature(data, stage='predict')
            chars_batch, chars_mask = pad_sents(chars,
                                                pad_token,
                                                max_len=max_sent_length)

            input_chars = torch.tensor(token2id(chars_batch,
                                                tok2idx,
                                                unk_id=tok2idx['<UNK>']),
                                       device=device)
            chars_mask = torch.tensor(chars_mask,
                                      dtype=torch.uint8,
                                      device=device)

            pred_tags = id2token(model.decode(input_chars, chars_mask),
                                 idx2tag)
            output.extend(create_output(chars, pred_tags))

    with codecs.open(output_path, 'w', 'utf8') as f:
        for sent in output:
            print(sent, file=f)
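
The two decoding helpers in predict() are also not reproduced on this page. The sketches below assume that model.decode returns a batch of tag-id sequences and that the output format is one space-segmented sentence per line, with words closed on the E and S tags of the B/M/E/S scheme; both points are assumptions about the convseg conventions.

def id2token(id_seqs, idx2tok):
    """Map a batch of id sequences back to token (tag) sequences."""
    return [[idx2tok[i] for i in seq] for seq in id_seqs]

def create_output(chars_batch, tags_batch):
    """Join characters into space-separated words according to B/M/E/S tags."""
    sentences = []
    for chars, tags in zip(chars_batch, tags_batch):
        words, current = [], ""
        for ch, tag in zip(chars, tags):
            current += ch
            if tag in ("E", "S"):  # a word ends on E (multi-char) or S (single char)
                words.append(current)
                current = ""
        if current:  # flush a trailing unfinished word, if any
            words.append(current)
        sentences.append(" ".join(words))
    return sentences
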
Code Example #7
File: prepro.py  Project: MRKINKI/nlp_project
    context_tokens = list(train.words) + list(dev.words)

    # Load the vocabulary from the word-vector file
    wv_vocab = load_wv_vocab(os.path.join('embedding', args.wv_file))
    log.info('wv_vocab loaded. vocab_size: {}'.format(len(wv_vocab)))

    # Build the vocabulary for the training and dev sets
    vocab, counter = build_vocab(question_tokens, context_tokens, wv_vocab)

    # Load the word vectors from the word-vector file
    embedding = build_embedding(os.path.join('embedding', args.wv_file), vocab,
                                args.embedding_dim)
    log.info('got embedding matrix.')

    # Map words to indices
    question_ids = token2id(question_tokens, vocab, unk_id=1)
    context_ids = token2id(context_tokens, vocab, unk_id=1)

    # Extract features for the words in each context
    context_features = get_context_features(question_tokens, context_tokens)

    # POS tag and named-entity tag categories present in the dataset
    vocab_tag = get_vocab(context_tags)
    vocab_ent = get_vocab(context_ents)

    log.info('Found {} POS tags: {}'.format(len(vocab_tag), vocab_tag))
    log.info('Found {} entity tags: {}'.format(len(vocab_ent), vocab_ent))

    # Map POS tags and entity tags to indices
    context_tag_ids = token2id(context_tags, vocab_tag)
    context_ent_ids = token2id(context_ents, vocab_ent)
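
Two helpers in this preprocessing excerpt, load_wv_vocab and get_vocab, are not shown. Hypothetical versions, assuming a text-format embedding file with one "word v1 v2 ..." entry per line and tag vocabularies built as tag-to-index dicts, could be:

def load_wv_vocab(path):
    """Collect the set of words that have a pretrained vector."""
    vocab = set()
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            if parts and parts[0]:
                vocab.add(parts[0])
    return vocab

def get_vocab(tag_seqs):
    """Map every distinct tag in a batch of tag sequences to an integer id."""
    tags = sorted({tag for seq in tag_seqs for tag in seq})
    return {tag: i for i, tag in enumerate(tags)}
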
Code Example #8
File: run.py  Project: z-Runmin/convseg_pytorch
def train(split, data_path, save_to, batch_size, max_sent_length,
          char_embed_size, num_hidden_layer, channel_size, kernel_size,
          learning_rate, max_epoch, log_every, patience, max_num_trial,
          lr_decay, last_model_path, cuda):
    if not Path(save_to).exists():
        Path(save_to).mkdir()

    device = torch.device("cuda:0" if cuda else "cpu")
    dataset = SIGHAN(split=split, root_path=data_path)
    val_dataset = SIGHAN(split='dev', root_path=data_path)

    # build vocab
    train_chars, train_tags = get_feature(dataset.data)
    vocab = build_vocab(chain.from_iterable(train_chars))
    tok2idx, idx2tok = add_id(vocab)
    tags = build_vocab(["B", "M", "E", "S"], add_unk=False, add_pad=True)
    tag2idx, idx2tag = add_id(tags)

    # build model
    # conv_seg = CharWordSeg(len(tags), len(vocab), char_embed_size, num_hidden_layer, channel_size, kernel_size).to(device)
    # crf_model = CRF(num_tags=5).to(device)
    # conv_seg.train()
    # crf_model.train()
    # optimizer = torch.optim.Adam(list(conv_seg.parameters())+list(crf_model.parameters()), lr=learning_rate)
    vocab_tag = {
        "token_to_index": tok2idx,
        "index_to_token": idx2tok,
        "tag_to_index": tag2idx,
        "index_to_tag": idx2tag
    }

    data_loader = DataLoader(dataset, batch_size=batch_size)
    model = CharWordSeg(vocab_tag, char_embed_size, num_hidden_layer,
                        channel_size, kernel_size).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if last_model_path is not None:
        # load model
        logging.info(f'load model from  {last_model_path}')
        params = torch.load(last_model_path,
                            map_location=lambda storage, loc: storage)
        model.load_state_dict(params['state_dict'])
        logging.info('restore parameters of the optimizers')
        optimizer.load_state_dict(torch.load(last_model_path + '.optim'))

    model.train()

    epoch = 0
    cur_trial = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    logging.info("begin training!")
    while True:
        epoch += 1
        train_loss = 0
        cum_cnt = 0
        for iter_idx, data in enumerate(data_loader):
            optimizer.zero_grad()
            chars_batch, tags_batch = get_feature(data)
            chars_batch, chars_mask = pad_sents(chars_batch,
                                                pad_token,
                                                max_len=max_sent_length)
            tags_batch, tags_mask = pad_sents(tags_batch,
                                              pad_token,
                                              max_len=max_sent_length)

            input_chars = torch.tensor(token2id(chars_batch,
                                                tok2idx,
                                                unk_id=tok2idx['<UNK>']),
                                       device=device)
            target_tags = torch.tensor(token2id(tags_batch, tag2idx),
                                       device=device)
            tags_mask = torch.tensor(tags_mask,
                                     dtype=torch.uint8,
                                     device=device)

            #            output_emission = conv_seg(input_chars.to(device))
            #            loss = -crf_model(output_emission.transpose(0,1), target_tags.transpose(0,1).to(device), tags_mask.transpose(0,1).to(device))
            loss = -model(input_chars, target_tags, tags_mask)
            # use .item() so the running total does not keep the autograd graph alive
            train_loss += loss.item()
            cum_cnt = cum_cnt + input_chars.shape[0]
            loss.backward()
            optimizer.step()

        train_loss = train_loss / cum_cnt
        val_loss = evaluate(model, val_dataset, batch_size, max_sent_length,
                            device)

        logging.info(
            f'epoch {epoch}\t train_loss: {train_loss}\t val_loss:{val_loss}\t  speed:{time.time()-train_time:.2f}s/epoch\t time elapsed {time.time()-begin_time:.2f}s'
        )
        train_time = time.time()

        is_better = len(
            hist_valid_scores) == 0 or val_loss < min(hist_valid_scores)
        hist_valid_scores.append(val_loss)

        if epoch % log_every == 0:
            model.save(f"{save_to}/model_step_{epoch}")
            torch.save(optimizer.state_dict(),
                       f"{save_to}/model_step_{epoch}.optim")
        if is_better:
            cur_patience = 0
            model_save_path = f"{save_to}/model_best"
            print(f'save currently the best model to [{model_save_path}]')
            model.save(model_save_path)
            # also save the optimizers' state
            torch.save(optimizer.state_dict(), model_save_path + '.optim')
        elif cur_patience < patience:
            cur_patience += 1
            print('hit patience %d' % cur_patience)

            if cur_patience == patience:
                cur_trial += 1
                print(f'hit #{cur_trial} trial')
                if cur_trial == max_num_trial:
                    print('early stop!')
                    exit(0)

                # decay lr, and restore from previously best checkpoint
                lr = optimizer.param_groups[0]['lr'] * lr_decay
                logging.info(
                    f'load previously best model and decay learning rate to {lr}'
                )

                # load model
                params = torch.load(model_save_path,
                                    map_location=lambda storage, loc: storage)
                model.load_state_dict(params['state_dict'])
                model = model.to(device)

                logging.info('restore parameters of the optimizers')
                optimizer.load_state_dict(
                    torch.load(model_save_path + '.optim'))

                # set new lr
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr

                # reset patience
                cur_patience = 0

        if epoch == max_epoch:
            print('reached maximum number of epochs!')
            exit(0)
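
train() builds its vocabularies with a character-level build_vocab and add_id pair; this build_vocab belongs to convseg_pytorch and is a different helper from the word-level one sketched after Code Example #5. The sketches below match the lookups used above (tok2idx['<UNK>'], the B/M/E/S tag set, the add_unk/add_pad flags), but the exact special-token names, defaults, and ordering are assumptions.

def build_vocab(tokens, add_unk=True, add_pad=True):
    """Collect the distinct tokens, optionally prepending <PAD> and <UNK>."""
    specials = []
    if add_pad:
        specials.append('<PAD>')
    if add_unk:
        specials.append('<UNK>')
    return specials + sorted(set(tokens))

def add_id(vocab):
    """Build token->index and index->token maps from an ordered vocab."""
    tok2idx = {tok: idx for idx, tok in enumerate(vocab)}
    idx2tok = {idx: tok for idx, tok in enumerate(vocab)}
    return tok2idx, idx2tok
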