Example 1
def test():
    import torch
    from fastNLP import DataSetIter, DataSet
    from fastNLP.core.utils import _move_dict_value_to_device, _get_model_device
    from my_bert_match import addWordPiece, addSeqlen, addWords, processItem, processNum
    # 0 for no match, 1 for match
    testset = DataSet({"raw_words": ["5::five"]})
    testset.apply(addWords, new_field_name="p_words")
    testset.apply(addWordPiece, new_field_name="t_words")
    testset.apply(processItem, new_field_name="word_pieces")
    testset.apply(processNum, new_field_name="word_nums")
    testset.apply(addSeqlen, new_field_name="seq_len")
    testset.field_arrays["word_pieces"].is_input = True
    testset.field_arrays["seq_len"].is_input = True
    testset.field_arrays["word_nums"].is_input = True
    # print(testset)
    from fastNLP.io import ModelLoader
    loader = ModelLoader()
    if torch.cuda.is_available():
        model = loader.load_pytorch_model(
            "../models/bert_model_max_triple.pkl")
    else:
        model = torch.load("../models/bert_model_max_triple.pkl",
                           map_location="cpu")

    model.eval()
    test_batch = DataSetIter(batch_size=1, dataset=testset, sampler=None)
    outputs = []
    for batch_x, batch_y in test_batch:
        _move_dict_value_to_device(batch_x,
                                   batch_y,
                                   device=_get_model_device(model))
        outputs.append(
            model.forward(batch_x["word_pieces"], batch_x["word_nums"],
                          batch_x["seq_len"])['pred'])
    outputs = torch.cat(outputs)
    outputs = torch.nn.functional.softmax(outputs, dim=1)
    return outputs
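
A hedged call sketch for the snippet above; judging from the other examples, the model outputs three classes, with class 2 apparently the exact-match class:

probs = test()                                 # tensor of shape (1, 3) after softmax
match_class = int(probs.argmax(dim=1).item())  # 0 = no match; 1, 2 = increasingly strong match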
Example 2
    def test_case3(self):
        # Test device=None: the model's device should be left unchanged
        model = Model()
        device = _get_model_device(model)
        model = _move_model_to_device(model, None)
        assert device == _get_model_device(model), "The device should not change."
        if torch.cuda.is_available():
            model.cuda()
            device = _get_model_device(model)
            model = _move_model_to_device(model, None)
            assert device == _get_model_device(model), "The device should not change."

            model = nn.DataParallel(model, device_ids=[0])
            _move_model_to_device(model, None)
            with self.assertRaises(Exception):
                _move_model_to_device(model, 'cpu')
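
The test above pins down the contract of `_move_model_to_device`: `device=None` is a no-op, and a model already wrapped in `nn.DataParallel` refuses to be moved. A minimal sketch of ordinary use, under those assumptions:

model = _move_model_to_device(model, device=0 if torch.cuda.is_available() else 'cpu')
# device may be an int, str, or torch.device; None leaves the model where it is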
Example 3
    def predict(self, data: DataSet, seq_len_field_name=None):
        r"""用已经训练好的模型进行inference.

        :param fastNLP.DataSet data: 待预测的数据集
        :param str seq_len_field_name: 表示序列长度信息的field名字
        :return: dict dict里面的内容为模型预测的结果
        """
        if not isinstance(data, DataSet):
            raise ValueError("Only Dataset class is allowed, not {}.".format(
                type(data)))
        if seq_len_field_name is not None and seq_len_field_name not in data.field_arrays:
            raise ValueError("Field name {} not found in DataSet {}.".format(
                seq_len_field_name, data))

        prev_training = self.network.training
        self.network.eval()
        network_device = _get_model_device(self.network)
        batch_output = defaultdict(list)
        data_iterator = DataSetIter(data,
                                    batch_size=self.batch_size,
                                    sampler=SequentialSampler(),
                                    as_numpy=False)

        if hasattr(self.network, "predict"):
            predict_func = self.network.predict
        else:
            predict_func = self.network.forward

        with torch.no_grad():
            for batch_x, _ in data_iterator:
                _move_dict_value_to_device(batch_x, _, device=network_device)
                refined_batch_x = _build_args(predict_func, **batch_x)
                prediction = predict_func(**refined_batch_x)

                if seq_len_field_name is not None:
                    seq_lens = batch_x[seq_len_field_name].tolist()

                for key, value in prediction.items():
                    value = value.cpu().numpy()
                    if len(value.shape) == 1 or (len(value.shape) == 2
                                                 and value.shape[1] == 1):
                        batch_output[key].extend(value.tolist())
                    else:
                        if seq_len_field_name is not None:
                            tmp_batch = []
                            for idx, seq_len in enumerate(seq_lens):
                                tmp_batch.append(value[idx, :seq_len])
                            batch_output[key].extend(tmp_batch)
                        else:
                            batch_output[key].append(value)

        self.network.train(prev_training)
        return batch_output
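
A hedged usage sketch for this method, assuming it lives on fastNLP's `Predictor` class (constructor details may differ across versions):

from fastNLP.core.predictor import Predictor
predictor = Predictor(trained_model)   # wraps a trained network
results = predictor.predict(testset, seq_len_field_name='seq_len')
# results maps each output name (e.g. 'pred') to a list of per-instance values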
Example 4
def produceCandidateTripleSlow(raw_phrase, Candidate_phrases, model,
                               Candidate_hpos_sub, threshold):
    """
    使用BERT判断Candidate_phrases中哪个与raw_phrase语义最接近;基于最大值方式;适用于单个处理
    """
    from fastNLP.core.utils import _move_dict_value_to_device
    from fastNLP.core.utils import _get_model_device
    from fastNLP import DataSet
    from fastNLP import DataSetIter
    from my_bert_match import addWordPiece, addSeqlen, addWords, processItem, processNum
    p_Candidate_phrases = [
        raw_phrase + "::" + item for item in Candidate_phrases
    ]
    Candidate_dataset = DataSet({"raw_words": p_Candidate_phrases})
    Candidate_dataset.apply(addWords, new_field_name="p_words")
    Candidate_dataset.apply(addWordPiece, new_field_name="t_words")
    Candidate_dataset.apply(processItem, new_field_name="word_pieces")
    Candidate_dataset.apply(processNum, new_field_name="word_nums")
    Candidate_dataset.apply(addSeqlen, new_field_name="seq_len")
    Candidate_dataset.field_arrays["word_pieces"].is_input = True
    Candidate_dataset.field_arrays["seq_len"].is_input = True
    Candidate_dataset.field_arrays["word_nums"].is_input = True
    test_batch = DataSetIter(batch_size=10,
                             dataset=Candidate_dataset,
                             sampler=None)

    outputs = []
    for batch_x, batch_y in test_batch:
        _move_dict_value_to_device(batch_x,
                                   batch_y,
                                   device=_get_model_device(model))
        outputs.append(
            model.forward(batch_x["word_pieces"], batch_x["word_nums"],
                          batch_x["seq_len"])['pred'])
    outputs = torch.cat(outputs)
    outputs = torch.nn.functional.softmax(outputs,
                                          dim=1).cpu().detach().numpy()

    results_2 = np.array([item[2] for item in outputs])
    results_1 = np.array([item[1] for item in outputs])

    # If an exact match is already found here, return it directly
    if max(results_2) >= threshold:
        return Candidate_hpos_sub[int(
            np.argmax(results_2))], max(results_2), "2"

    if max(results_1) >= threshold:
        return Candidate_hpos_sub[int(
            np.argmax(results_1))], max(results_1), "1"

    return "None", None, "0"
Example 5
    def predict(self, data: DataSet, seq_len_field_name=None):
        r"""
        """
        if not isinstance(data, DataSet):
            raise ValueError(
                "Only Dataset class is allowed, not {}.".format(type(data)))
        if seq_len_field_name is not None and seq_len_field_name not in data.field_arrays:
            raise ValueError("Field name {} not found in DataSet {}.".format(
                seq_len_field_name, data))

        self.network.eval()  # self.network.module for multi-GPU
        network_device = _get_model_device(self.network)
        batch_output = defaultdict(list)
        data_iterator = DataSetIter(
            data, batch_size=self.batch_size, sampler=SequentialSampler(), as_numpy=False)

        # nn.DataParallel wraps the model, so `predict` may live on `.module`
        try:
            predict_func = self.network.predict
        except AttributeError:  # covers torch's ModuleAttributeError subclass
            predict_func = self.network.module.predict

        with torch.no_grad():
            for batch_x, _ in tqdm(data_iterator, total=len(data_iterator)):
                _move_dict_value_to_device(batch_x, _, device=network_device)
                refined_batch_x = _build_args(predict_func, **batch_x)
                prediction = predict_func(**refined_batch_x)
                if seq_len_field_name is not None:
                    seq_lens = batch_x[seq_len_field_name].tolist()

                for key, value in prediction.items():
                    value = value.cpu().numpy()
                    if len(value.shape) == 1 or (
                            len(value.shape) == 2 and value.shape[1] == 1):
                        batch_output[key].extend(value.tolist())
                    else:
                        if seq_len_field_name is not None:
                            tmp_batch = []
                            for idx, seq_len in enumerate(seq_lens):
                                tmp_batch.append(value[idx, :seq_len])
                            batch_output[key].extend(tmp_batch)
                        else:
                            batch_output[key].append(value)
        return batch_output
Example 6
def _save_model(model, model_name, save_dir, only_param=False):
    """ 存储不含有显卡信息的 state_dict 或 model
    :param model:
    :param model_name:
    :param save_dir: 保存的 directory
    :param only_param:
    :return:
    """
    model_path = os.path.join(save_dir, model_name)
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir, exist_ok=True)
    if isinstance(model, nn.DataParallel):
        model = model.module
    if only_param:
        state_dict = model.state_dict()
        for key in state_dict:
            state_dict[key] = state_dict[key].cpu()
        torch.save(state_dict, model_path)
    else:
        _model_device = _get_model_device(model)
        model.cpu()
        torch.save(model, model_path)
        model.to(_model_device)
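
A short hedged usage sketch; the directory and file name are illustrative:

_save_model(model, "bert_model.pkl", "../models", only_param=True)  # CPU-only state_dict
model.load_state_dict(torch.load("../models/bert_model.pkl"))       # restore later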
Example 7
def produceCandidateTriple(Candidate_hpos_sub_total, model, hpo_tree,
                           threshold):
    """
    使用BERT判断Candidate_phrases中哪个与raw_phrase语义最接近;基于最大值方式
    :param Candidate_hpos_sub_total: 输出的短语及候选HPO嵌套列表
    :param model:
    :param hpo_tree:
    :param threshold: 用作该模型输出阈值
    :return:
    """
    from fastNLP.core.utils import _move_dict_value_to_device
    from fastNLP.core.utils import _get_model_device
    from fastNLP import DataSet
    from fastNLP import DataSetIter
    from my_bert_match import addWordPiece, addSeqlen, addWords, processItem, processNum
    p_Candidate_phrases = []
    phrase_nums_per_hpo = []
    Candidate_hpos = []
    for raw_phrase, Candidate_phrase, Candidate_hpos_sub in Candidate_hpos_sub_total:
        p_Candidate_phrases.extend(
            [raw_phrase + "::" + item for item in Candidate_phrase])
        phrase_nums_per_hpo.append(len(Candidate_phrase))
        Candidate_hpos.append(Candidate_hpos_sub)
    Candidate_dataset = DataSet({"raw_words": p_Candidate_phrases})
    Candidate_dataset.apply(addWords, new_field_name="p_words")
    Candidate_dataset.apply(addWordPiece, new_field_name="t_words")
    Candidate_dataset.apply(processItem, new_field_name="word_pieces")
    Candidate_dataset.apply(processNum, new_field_name="word_nums")
    Candidate_dataset.apply(addSeqlen, new_field_name="seq_len")
    Candidate_dataset.field_arrays["word_pieces"].is_input = True
    Candidate_dataset.field_arrays["seq_len"].is_input = True
    Candidate_dataset.field_arrays["word_nums"].is_input = True
    test_batch = DataSetIter(batch_size=128,
                             dataset=Candidate_dataset,
                             sampler=None)

    outputs = []
    for batch_x, batch_y in test_batch:
        _move_dict_value_to_device(batch_x,
                                   batch_y,
                                   device=_get_model_device(model))
        outputs.append(
            model.forward(batch_x["word_pieces"], batch_x["word_nums"],
                          batch_x["seq_len"])['pred'])
    outputs = torch.cat(outputs)
    outputs = torch.nn.functional.softmax(outputs,
                                          dim=1).cpu().detach().numpy()
    # print(outputs.size)
    results_2 = np.array([item[2] for item in outputs])
    results_1 = np.array([item[1] for item in outputs])

    # group the results back by phrase
    count = 0
    index = 0
    ans = []
    for group_num in phrase_nums_per_hpo:
        g_results_2 = results_2[index:index + group_num]
        g_results_1 = results_1[index:index + group_num]
        Candidate_hpos_sub = Candidate_hpos[count]
        index += group_num
        count += 1
        # If an exact match is already found here, emit it directly
        if max(g_results_2) >= threshold:
            ans.append([
                Candidate_hpos_sub[int(np.argmax(g_results_2))],
                max(g_results_2), "2"
            ])
            continue
        if max(g_results_1) >= threshold:
            ans.append([
                Candidate_hpos_sub[int(np.argmax(g_results_1))],
                max(g_results_1), "1"
            ])
            continue
        ans.append(["None", None, "0"])
    return ans
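
An illustrative input and call for this batched variant; all phrases and HPO ids are made up:

Candidate_hpos_sub_total = [
    ("small head", ["Microcephaly", "Macrocephaly"], ["HP:0000252", "HP:0000256"]),
    ("seizure", ["Seizure"], ["HP:0001250"]),
]
ans = produceCandidateTriple(Candidate_hpos_sub_total, model, hpo_tree, threshold=0.85)
# ans[i] is [best_hpo_id, best_score, match_level] for the i-th phrase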
Example 8
def _beam_search_generate(decoder: Decoder,
                          tokens=None,
                          past=None,
                          max_length=20,
                          num_beams=4,
                          temperature=1.0,
                          top_k=50,
                          top_p=1.0,
                          bos_token_id=None,
                          eos_token_id=None,
                          do_sample=True,
                          repetition_penalty=1.0,
                          length_penalty=None,
                          pad_token_id=0) -> torch.LongTensor:
    # run beam search
    device = _get_model_device(decoder)
    if tokens is None:
        if bos_token_id is None:
            raise RuntimeError(
                "You have to specify either `tokens` or `bos_token_id`.")
        if past is None:
            raise RuntimeError(
                "You have to specify either `past` or `tokens`.")
        batch_size = past.num_samples()
        if batch_size is None:
            raise RuntimeError(
                "Cannot infer the number of samples from `past`.")
        tokens = torch.full([batch_size, 1],
                            fill_value=bos_token_id,
                            dtype=torch.long).to(device)
    batch_size = tokens.size(0)
    if past is not None:
        assert past.num_samples(
        ) == batch_size, "The number of samples in `tokens` and `past` should match."

    # (an earlier variant decoded a long prefix token-by-token with decoder.decode_one)
    scores, past = decoder.decode(tokens, past)  # the full sentence must be passed in here
    vocab_size = scores.size(1)
    assert vocab_size >= num_beams, "num_beams should not exceed the vocabulary size."

    if do_sample:
        probs = F.softmax(scores, dim=-1) + 1e-12
        next_tokens = torch.multinomial(
            probs, num_samples=num_beams)  # (batch_size, num_beams)
        logits = probs.log()
        next_scores = logits.gather(
            dim=1, index=next_tokens)  # (batch_size, num_beams)
    else:
        scores = F.log_softmax(scores, dim=-1)  # (batch_size, vocab_size)
        # yields (batch_size, num_beams) scores and (batch_size, num_beams) tokens
        next_scores, next_tokens = torch.topk(scores,
                                              num_beams,
                                              dim=1,
                                              largest=True,
                                              sorted=True)

    indices = torch.arange(batch_size, dtype=torch.long).to(device)
    indices = indices.repeat_interleave(num_beams)
    decoder.reorder_past(indices, past)

    tokens = tokens.index_select(
        dim=0, index=indices)  # batch_size * num_beams x length
    # record the generated tokens, shape (batch_size', cur_len)
    token_ids = torch.cat([tokens, next_tokens.view(-1, 1)], dim=-1)
    dones = [False] * batch_size
    tokens = next_tokens.view(-1, 1)

    beam_scores = next_scores.view(-1)  # batch_size * num_beams

    # length of the tokens generated so far
    cur_len = token_ids.size(1)

    hypos = [
        BeamHypotheses(num_beams,
                       max_length,
                       length_penalty,
                       early_stopping=False) for _ in range(batch_size)
    ]
    # 0,num_beams, 2*num_beams, ...
    batch_inds_with_numbeams_interval = (torch.arange(batch_size) *
                                         num_beams).view(-1, 1).to(token_ids)

    while cur_len < max_length:
        scores, past = decoder.decode(tokens, past)  # (batch_size * num_beams, vocab_size)
        if repetition_penalty != 1.0:
            token_scores = scores.gather(dim=1, index=token_ids)
            lt_zero_mask = token_scores.lt(0).float()
            ge_zero_mask = lt_zero_mask.eq(0).float()
            token_scores = lt_zero_mask * repetition_penalty * token_scores + ge_zero_mask / repetition_penalty * token_scores
            scores.scatter_(dim=1, index=token_ids, src=token_scores)

        if do_sample:
            if temperature > 0 and temperature != 1:
                scores = scores / temperature

            # recall one extra token to guard against eos
            scores = top_k_top_p_filtering(scores,
                                           top_k,
                                           top_p,
                                           min_tokens_to_keep=num_beams + 1)
            # add 1e-12 to avoid https://github.com/pytorch/pytorch/pull/27523
            probs = F.softmax(scores, dim=-1) + 1e-12

            # guarantee at least one value that is not eos
            _tokens = torch.multinomial(
                probs, num_samples=num_beams + 1)  # batch_size' x (num_beams+1)

            logits = probs.log()
            # avoid every pick coming from one beam, and account for eos being selected
            _scores = logits.gather(
                dim=1, index=_tokens)  # batch_size' x (num_beams+1)
            _scores = _scores + beam_scores[:, None]  # batch_size' x (num_beams+1)
            # then keep the top 2*num_beams among these
            _scores = _scores.view(batch_size, num_beams * (num_beams + 1))
            next_scores, ids = _scores.topk(2 * num_beams,
                                            dim=1,
                                            largest=True,
                                            sorted=True)
            _tokens = _tokens.view(batch_size, num_beams * (num_beams + 1))
            next_tokens = _tokens.gather(
                dim=1, index=ids)  # (batch_size, 2*num_beams)
            from_which_beam = ids // (num_beams + 1)  # (batch_size, 2*num_beams)
        else:
            scores = F.log_softmax(scores, dim=-1)  # (batch_size * num_beams, vocab_size)
            _scores = scores + beam_scores[:, None]  # (batch_size * num_beams, vocab_size)
            _scores = _scores.view(batch_size, -1)  # (batch_size, num_beams*vocab_size)
            next_scores, ids = torch.topk(_scores,
                                          2 * num_beams,
                                          dim=1,
                                          largest=True,
                                          sorted=True)
            from_which_beam = ids // vocab_size  # (batch_size, 2*num_beams)
            next_tokens = ids % vocab_size  # (batch_size, 2*num_beams)

        # Next, assemble the results for the following step
        # by deciding which candidates to keep.
        next_scores, sorted_inds = next_scores.sort(dim=-1, descending=True)
        next_tokens = next_tokens.gather(dim=1, index=sorted_inds)
        from_which_beam = from_which_beam.gather(dim=1, index=sorted_inds)

        not_eos_mask = next_tokens.ne(eos_token_id)  # 1 where the token is not eos
        keep_mask = not_eos_mask.cumsum(dim=1).le(num_beams)  # 1 where the token is kept
        keep_mask = not_eos_mask.__and__(keep_mask)  # 1 where search continues next step

        _next_tokens = next_tokens.masked_select(keep_mask).view(-1, 1)
        _from_which_beam = from_which_beam.masked_select(keep_mask).view(
            batch_size, num_beams)  # which beam each kept token came from
        _next_scores = next_scores.masked_select(keep_mask).view(
            batch_size, num_beams)
        beam_scores = _next_scores.view(-1)

        # update the past state and reassemble token_ids
        reorder_inds = (batch_inds_with_numbeams_interval +
                        _from_which_beam).view(-1)  # flatten to 1-D
        decoder.reorder_past(reorder_inds, past)

        flag = True
        if cur_len + 1 == max_length:
            eos_batch_idx = torch.arange(batch_size).to(
                next_tokens).repeat_interleave(repeats=num_beams, dim=0)
            eos_beam_ind = torch.arange(num_beams).to(token_ids).repeat(
                batch_size)  # beam indices
            eos_beam_idx = from_which_beam[:, :num_beams].reshape(
                -1)  # which beam each hypothesis came from
        else:
            # finish sequences within the first num_beams that hit eos; 1 marks a finished position
            effective_eos_mask = next_tokens[:, :num_beams].eq(
                eos_token_id)  # batch_size x num_beams
            if effective_eos_mask.sum().gt(0):
                eos_batch_idx, eos_beam_ind = effective_eos_mask.nonzero(
                    as_tuple=True)
                # from_which_beam is (batch_size, 2*num_beams), hence the factor num_beams * 2
                eos_beam_idx = eos_batch_idx * num_beams * 2 + eos_beam_ind
                eos_beam_idx = from_which_beam.view(-1)[
                    eos_beam_idx]  # recover which beam the eos actually came from
            else:
                flag = False
        if flag:
            for batch_idx, beam_ind, beam_idx in zip(eos_batch_idx.tolist(),
                                                     eos_beam_ind.tolist(),
                                                     eos_beam_idx.tolist()):
                if not dones[batch_idx]:
                    score = next_scores[batch_idx, beam_ind].item()
                    hypos[batch_idx].add(
                        token_ids[batch_idx * num_beams +
                                  beam_idx, :cur_len].clone(), score)

        # reorganize token_ids to match the reordered beams
        tokens = _next_tokens
        token_ids = torch.cat(
            [token_ids.index_select(index=reorder_inds, dim=0), tokens],
            dim=-1)

        for batch_idx in range(batch_size):
            dones[batch_idx] = dones[batch_idx] or hypos[batch_idx].is_done(
                next_scores[batch_idx, 0].item())

        cur_len += 1

        if all(dones):
            break

    # select the best hypotheses
    tgt_len = token_ids.new(batch_size)
    best = []

    for i, hypotheses in enumerate(hypos):
        best_hyp = max(hypotheses.hyp, key=lambda x: x[0])[1]
        tgt_len[i] = len(best_hyp) + 1  # +1 for the <EOS> symbol
        best.append(best_hyp)

    # generate target batch
    decoded = token_ids.new(batch_size,
                            tgt_len.max().item()).fill_(pad_token_id)
    for i, hypo in enumerate(best):
        decoded[i, :tgt_len[i] - 1] = hypo
        if eos_token_id is not None:
            decoded[i, tgt_len[i] - 1] = eos_token_id

    return decoded
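
A hedged invocation sketch: `decoder` must implement `decode(tokens, past)` and `reorder_past(indices, past)`, and `bos_id`/`eos_id` stand in for real vocabulary indices:

tokens = torch.full([2, 1], fill_value=bos_id, dtype=torch.long)  # two samples, BOS only
decoded = _beam_search_generate(decoder, tokens=tokens, past=past,
                                max_length=30, num_beams=4,
                                do_sample=False, eos_token_id=eos_id)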
Example 9
def _no_beam_search_generate(decoder: Decoder,
                             tokens=None,
                             past=None,
                             max_length=20,
                             temperature=1.0,
                             top_k=50,
                             top_p=1.0,
                             bos_token_id=None,
                             eos_token_id=None,
                             do_sample=True,
                             repetition_penalty=1.0,
                             length_penalty=1.0,
                             pad_token_id=0):
    device = _get_model_device(decoder)
    if tokens is None:
        if bos_token_id is None:
            raise RuntimeError(
                "You have to specify either `tokens` or `bos_token_id`.")
        if past is None:
            raise RuntimeError(
                "You have to specify either `past` or `tokens`.")
        batch_size = past.num_samples()
        if batch_size is None:
            raise RuntimeError(
                "Cannot infer the number of samples from `past`.")
        tokens = torch.full([batch_size, 1],
                            fill_value=bos_token_id,
                            dtype=torch.long).to(device)
    batch_size = tokens.size(0)
    if past is not None:
        assert past.num_samples(
        ) == batch_size, "The number of samples in `tokens` and `past` should match."

    if eos_token_id is None:
        _eos_token_id = float('nan')
    else:
        _eos_token_id = eos_token_id

    scores, past = decoder.decode(tokens, past)

    token_ids = tokens.clone()
    cur_len = token_ids.size(1)
    dones = token_ids.new_zeros(batch_size).eq(1)

    while cur_len < max_length:
        scores, past = decoder.decode(tokens, past)  # batch_size x vocab_size, Past

        if repetition_penalty != 1.0:
            token_scores = scores.gather(dim=1, index=token_ids)
            lt_zero_mask = token_scores.lt(0).float()
            ge_zero_mask = lt_zero_mask.eq(0).float()
            token_scores = lt_zero_mask * repetition_penalty * token_scores + ge_zero_mask / repetition_penalty * token_scores
            scores.scatter_(dim=1, index=token_ids, src=token_scores)

        if eos_token_id is not None and length_penalty != 1.0:
            token_scores = scores / cur_len**length_penalty  # batch_size x vocab_size
            eos_mask = scores.new_ones(scores.size(1))
            eos_mask[eos_token_id] = 0
            eos_mask = eos_mask.unsqueeze(0).eq(1)
            scores = scores.masked_scatter(
                eos_mask, token_scores)  # i.e., every score except eos is scaled up/down

        if do_sample:
            if temperature > 0 and temperature != 1:
                scores = scores / temperature

            scores = top_k_top_p_filtering(scores,
                                           top_k,
                                           top_p,
                                           min_tokens_to_keep=2)
            # add 1e-12 to avoid https://github.com/pytorch/pytorch/pull/27523
            probs = F.softmax(scores, dim=-1) + 1e-12

            # guarantee at least one value that is not eos
            next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)  # batch_size
        else:
            next_tokens = torch.argmax(scores, dim=-1)  # batch_size

        next_tokens = next_tokens.masked_fill(
            dones, pad_token_id)  # pad samples whose search has already finished
        tokens = next_tokens.unsqueeze(1)

        token_ids = torch.cat([token_ids, tokens],
                              dim=-1)  # batch_size x max_len

        end_mask = next_tokens.eq(_eos_token_id)
        dones = dones.__or__(end_mask)
        cur_len += 1

        if dones.min() == 1:
            break

    if eos_token_id is not None:
        if cur_len == max_length:
            # if EOS never appeared within max_length, force the last token to eos
            token_ids[:, -1].masked_fill_(~dones, eos_token_id)

    return token_ids
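
A matching sketch for the sampling path, under the same assumptions about `decoder`, `past`, `bos_id`, and `eos_id` as above:

tokens = torch.full([2, 1], fill_value=bos_id, dtype=torch.long)
out = _no_beam_search_generate(decoder, tokens=tokens, past=past,
                               max_length=30, do_sample=True,
                               top_k=50, top_p=0.9, temperature=0.7,
                               eos_token_id=eos_id)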
Example 10
def predict(instance):
    x_batch = torch.LongTensor([instance['words']])
    x_batch = x_batch.to(device=_get_model_device(model))
    pred = model.predict(x_batch)
    pred = vocab_target.to_word(int(pred['pred']))
    return pred
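
A hedged usage sketch, assuming module-level `model` and `vocab_target` and an instance whose `words` field holds word indices (the values are illustrative):

instance = {'words': [4, 12, 7]}  # indices produced by a fastNLP Vocabulary
print(predict(instance))          # prints the predicted label as a word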
Example 11
def train():
    # assumes module-level `data_set_loader`, `embed`, and the add*/process* helpers
    import torch
    from fastNLP.core.utils import (_move_model_to_device,
                                    _move_dict_value_to_device,
                                    _get_model_device)
    n_epochs = 10
    train_set = data_set_loader._load('../models/all4bert_new_triple.txt')
    train_set, tmp_set = train_set.split(0.2)
    val_set, test_set = tmp_set.split(0.5)
    data_bundle = [train_set, val_set, test_set]

    for dataset in data_bundle:
        dataset.apply(addWords, new_field_name="p_words")
        dataset.apply(addWordPiece, new_field_name="t_words")
        dataset.apply(processItem, new_field_name="word_pieces")
        dataset.apply(processNum, new_field_name="word_nums")
        dataset.apply(addSeqlen, new_field_name="seq_len")
        dataset.apply(processTarget, new_field_name="target")

    for dataset in data_bundle:
        dataset.field_arrays["word_pieces"].is_input = True
        dataset.field_arrays["seq_len"].is_input = True
        dataset.field_arrays["word_nums"].is_input = True
        dataset.field_arrays["target"].is_target = True

    print("In total " + str(len(data_bundle)) + " datasets:")
    print("Trainset has " + str(len(train_set)) + " instances.")
    print("Validateset has " + str(len(val_set)) + " instances.")
    print("Testset has " + str(len(test_set)) + " instances.")
    train_set.print_field_meta()
    # print(train_set)
    from fastNLP.models.Mybert import BertForSentenceMatching
    from fastNLP import AccuracyMetric, DataSetIter

    from fastNLP.core.utils import _pseudo_tqdm as tqdm
    # note: the second argument is the number of classes
    model = BertForSentenceMatching(embed, 3)
    if torch.cuda.is_available():
        model = _move_model_to_device(model, device=0)
    # print(model)
    train_batch = DataSetIter(batch_size=16, dataset=train_set, sampler=None)
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
    Lossfunc = torch.nn.CrossEntropyLoss()
    with tqdm(total=n_epochs,
              postfix='loss:{0:<6.5f}',
              leave=False,
              dynamic_ncols=True) as pbar:
        print_every = 10
        for epoch in range(1, n_epochs + 1):
            pbar.set_description_str(
                desc="Epoch {}/{}".format(epoch, n_epochs))
            avg_loss = 0
            step = 0
            for batch_x, batch_y in train_batch:
                step += 1
                _move_dict_value_to_device(batch_x,
                                           batch_y,
                                           device=_get_model_device(model))
                optimizer.zero_grad()
                output = model.forward(batch_x["word_pieces"],
                                       batch_x["word_nums"],
                                       batch_x["seq_len"])
                loss = Lossfunc(output['pred'], batch_y['target'])
                loss.backward()
                optimizer.step()
                avg_loss += loss.item()
                if step % print_every == 0:
                    avg_loss = float(avg_loss) / print_every
                    print_output = "[epoch: {:>3} step: {:>4}] train loss: {:>4.6}".format(
                        epoch, step, avg_loss)
                    pbar.update(print_every)
                    pbar.set_postfix_str(print_output)
                    avg_loss = 0
            metric = AccuracyMetric()
            val_batch = DataSetIter(batch_size=8,
                                    dataset=val_set,
                                    sampler=None)
            for batch_x, batch_y in val_batch:
                _move_dict_value_to_device(batch_x,
                                           batch_y,
                                           device=_get_model_device(model))
                output = model.predict(batch_x["word_pieces"],
                                       batch_x["word_nums"],
                                       batch_x["seq_len"])
                metric(output, batch_y)
            eval_result = metric.get_metric()
            print("ACC on Validate Set:", eval_result)
            from fastNLP.io import ModelSaver
            saver = ModelSaver("../models/bert_model_max_triple.pkl")
            saver.save_pytorch(model, param_only=False)
        pbar.close()
    metric = AccuracyMetric()
    test_batch = DataSetIter(batch_size=8, dataset=test_set, sampler=None)
    for batch_x, batch_y in test_batch:
        _move_dict_value_to_device(batch_x,
                                   batch_y,
                                   device=_get_model_device(model))
        output = model.predict(batch_x["word_pieces"], batch_x["word_nums"],
                               batch_x["seq_len"])
        metric(output, batch_y)
    eval_result = metric.get_metric()
    print("ACC on Test Set:", eval_result)
    from fastNLP.io import ModelSaver
    saver = ModelSaver("../models/bert_model_max_triple.pkl")
    saver.save_pytorch(model, param_only=False)