Example #1
def paragraphs2batch(paragraphs: List[str], tokenizer: BertTokenizerFast) -> \
        Tuple[List[List[str]], Dict]:
    """
    Convert a list of paragraphs to a batch. This essentially does these things:

    1. Tokenize paragraphs.
    2. Pad all input_ids tensors and truncate any that exceed the length limit.
    3. Generate the correct attention_masks.

    :param paragraphs: List of paragraphs.
    :param tokenizer: The BERT tokenizer.
    :return: Tokenized paragraphs and the batch that can be used as model inputs.
    """
    all_tokenized = []
    input_ids = []
    attention_mask = []

    for p in paragraphs:
        tokenized = tokenizer.tokenize(p)
        all_tokenized.append(tokenized)
        one_hot = tokenizer.convert_tokens_to_ids(tokenized)

        input_ids.append(torch.tensor(one_hot, dtype=torch.long))
        attention_mask.append(torch.ones_like(input_ids[-1],
                                              dtype=torch.float))

    # max_input is assumed to be a module-level cap on sequence length
    # (e.g. 512 for BERT).
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    input_ids = input_ids[:, :max_input]

    attention_mask = pad_sequence(attention_mask,
                                  batch_first=True,
                                  padding_value=0.)
    attention_mask = attention_mask[:, :max_input]

    return all_tokenized, {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
    }
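A minimal usage sketch (assuming torch, pad_sequence from torch.nn.utils.rnn, and the module-level max_input constant the function relies on; the model name is only an example):

import torch
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizerFast

max_input = 512  # assumed module-level limit used by paragraphs2batch

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
tokens, batch = paragraphs2batch(
    ["A short paragraph.", "Another, slightly longer paragraph."], tokenizer)
print(batch['input_ids'].shape, batch['attention_mask'].shape)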
Example #2
def _collate_xtokens(xtoken_df: pd.DataFrame, xtokenizer: BertTokenizerFast, pad) -> pd.DataFrame:
    sent_groups = xtoken_df.groupby(xtoken_df.sent_id)
    num_sentences = len(sent_groups)
    max_sent_len = max([len(sent_df) for sent_id, sent_df in sent_groups])
    data_rows = []
    tq = tqdm(total=num_sentences, desc="Sentence")
    for sent_id, sent_df in sent_groups:
        sent_index = list(sent_df.sent_id)
        sent_token_index = list(sent_df.token_id)
        sent_tokens = list(sent_df.token)
        sent_xtokens = list(sent_df.xtoken)
        sent_xtoken_ids = xtokenizer.convert_tokens_to_ids(sent_xtokens)
        pad_len = max_sent_len - len(sent_index)
        sent_index.extend(sent_index[-1:] * pad_len)
        sent_tokens.extend([pad] * pad_len)
        sent_token_index.extend([-1] * pad_len)
        sent_xtokens.extend([xtokenizer.pad_token] * pad_len)
        sent_xtoken_ids.extend([xtokenizer.pad_token_id] * pad_len)
        data_rows.extend(list(row)
                         for row in zip(sent_index, sent_token_index, sent_tokens, sent_xtokens, sent_xtoken_ids))
        tq.update(1)
    tq.close()
    return pd.DataFrame(data_rows, columns=['sent_idx', 'token_idx', 'token', 'xtoken', 'xtoken_id'])
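A hedged usage sketch with a toy xtoken_df (the column names follow what the function reads; '<pad>' is just an example surface-form padding string):

import pandas as pd
from transformers import BertTokenizerFast

xtokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
xtoken_df = pd.DataFrame({
    'sent_id':  [1, 1, 1, 2, 2],
    'token_id': [1, 2, 3, 1, 2],
    'token':    ['hello', 'world', '!', 'hi', '!'],
    'xtoken':   ['hello', 'world', '!', 'hi', '!'],
})
collated = _collate_xtokens(xtoken_df, xtokenizer, pad='<pad>')
print(collated)  # six rows: both sentences padded to the longest (3 tokens)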
Example #3
def main():
    args = set_args()
    logger = create_logger(args)
    # use the GPU when the user requests it and a GPU is available
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    device = 'cuda' if args.cuda else 'cpu'
    logger.info('using device:{}'.format(device))
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    tokenizer = BertTokenizerFast(vocab_file=args.vocab_path,
                                  sep_token="[SEP]",
                                  pad_token="[PAD]",
                                  cls_token="[CLS]")
    # tokenizer = BertTokenizer(vocab_file=args.voca_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_path)
    model = model.to(device)
    model.eval()
    if args.save_samples_path:
        if not os.path.exists(args.save_samples_path):
            os.makedirs(args.save_samples_path)
        samples_file = open(args.save_samples_path + '/samples.txt',
                            'a',
                            encoding='utf8')
        samples_file.write("聊天记录{}:\n".format(datetime.now()))
    # 存储聊天记录,每个utterance以token的id的形式进行存储
    history = []
    print('开始和chatbot聊天,输入CTRL + Z以退出')

    while True:
        try:
            text = input("user:"******"你好"
            if args.save_samples_path:
                samples_file.write("user:{}\n".format(text))
            text_ids = tokenizer.encode(text, add_special_tokens=False)
            history.append(text_ids)
            input_ids = [tokenizer.cls_token_id]  # each input starts with [CLS]

            for history_id, history_utr in enumerate(
                    history[-args.max_history_len:]):
                input_ids.extend(history_utr)
                input_ids.append(tokenizer.sep_token_id)
            input_ids = torch.tensor(input_ids).long().to(device)
            input_ids = input_ids.unsqueeze(0)
            response = []  # the response generated from the given context
            # generate at most max_len tokens
            for _ in range(args.max_len):
                outputs = model(input_ids=input_ids)
                logits = outputs.logits
                next_token_logits = logits[0, -1, :]
                # Apply a repetition penalty to every token already generated,
                # lowering its probability of being sampled again.
                for token_id in set(response):
                    next_token_logits[token_id] /= args.repetition_penalty
                next_token_logits = next_token_logits / args.temperature
                # Set the [UNK] logit to -inf so the model can never predict [UNK].
                next_token_logits[tokenizer.convert_tokens_to_ids(
                    '[UNK]')] = -float('Inf')
                filtered_logits = top_k_top_p_filtering(next_token_logits,
                                                        top_k=args.topk,
                                                        top_p=args.topp)
                # torch.multinomial draws num_samples elements from the candidates
                # without replacement; higher weights are more likely to be drawn.
                # It returns the indices of the sampled elements.
                next_token = torch.multinomial(F.softmax(filtered_logits,
                                                         dim=-1),
                                               num_samples=1)
                if next_token == tokenizer.sep_token_id:  # [SEP] marks the end of the response
                    break
                response.append(next_token.item())
                input_ids = torch.cat((input_ids, next_token.unsqueeze(0)),
                                      dim=1)
                # his_text = tokenizer.convert_ids_to_tokens(curr_input_tensor.tolist())
                # print("his_text:{}".format(his_text))
            history.append(response)
            text = tokenizer.convert_ids_to_tokens(response)
            print("chatbot:" + "".join(text))
            if args.save_samples_path:
                samples_file.write("chatbot:{}\n".format("".join(text)))
        except KeyboardInterrupt:
            if args.save_samples_path:
                samples_file.close()
            break
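Example #3 calls top_k_top_p_filtering without showing it. A minimal sketch of the widely used top-k / nucleus-sampling filter is given below (adapted from the standard community implementation); it illustrates the technique and is not necessarily the exact helper used by that repository.

import torch
import torch.nn.functional as F

def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    # Works on a 1-D logits tensor, matching next_token_logits above.
    assert logits.dim() == 1
    top_k = min(top_k, logits.size(-1))
    if top_k > 0:
        # Drop every token whose logit is below the k-th largest logit.
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value
    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        # Mask tokens once the cumulative probability exceeds top_p, shifting
        # right so the first token above the threshold is still kept.
        sorted_indices_to_remove = cumulative_probs > top_p
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
    return logits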
Example #4
def squad_features(
        context: str, question: str, answer: Union[str, None],
        start_char_pos: Union[int, None],
        tokenizer: BertTokenizerFast) -> Tuple[List[int], List[int], int, int]:
    """ Squad feature extractor
    Implement the feature extractor from a Squad sample for your model
    Return values should follow [CLS + question + SEP + context + SEP] form.
    In addition, because start_char_pos is based on character index, you should convert it to proper token index.
    Check the test cases to know the functionality in detail.

    Note: input_ids and token_type_ids follow the transformers library documentation
    https://huggingface.co/transformers/glossary.html

    Arguments:
    context -- Context string
    question -- Question string
    answer -- Answer string. If the answer is None, return None for start_token_pos and end_token_pos
    start_char_pos -- Character index which the answer starts from in the context.
                      If the answer is None, this argument is also None.
    tokenizer -- Tokenizer to encode text strings.
                 Explanation: https://huggingface.co/transformers/model_doc/bert.html#berttokenizerfast

    Returns:
    input_ids -- Input ids
    token_type_ids -- Token type ids 
    start_token_pos -- Token index at which the answer starts in the input_ids list.
                       None if no answer is given.
    end_token_pos -- Token index at which the answer ends in the input_ids list
                     (inclusive: the token at this index belongs to the answer).
                     None if no answer is given.
    """
    ### YOUR CODE HERE (~18 lines)
    input_ids: List[int] = None
    token_type_ids: List[int] = None
    start_token_pos: int = None
    end_token_pos: int = None

    token_question = tokenizer.tokenize(question)
    #token_context = tokenizer.tokenize(context)

    tokens = ["[CLS]"] + token_question + ["[SEP]"]

    token_type_ids = [0] * len(tokens)

    #set_trace()

    # Answer available
    if start_char_pos is not None:

        token_answer = tokenizer.tokenize(answer)
        back_context_ = tokenizer.tokenize(context[start_char_pos:])

        # If the character right after the answer is not whitespace, the answer's
        # last token may have merged with the following text during tokenization.
        if not _is_whitespace(context[start_char_pos + len(answer)]):
            if back_context_[len(token_answer) - 1] != token_answer[-1]:
                back_context = tokenizer.tokenize(context[start_char_pos +
                                                          len(answer):])
                back_context[0] = "##" + back_context[0]
            else:
                back_context = back_context_[len(token_answer):]
        else:
            back_context = back_context_[len(token_answer):]

        if start_char_pos == 0:
            front_context = []
            token_answer = tokenizer.tokenize(answer)

        else:
            if _is_whitespace(context[start_char_pos - 1]):
                front_context = tokenizer.tokenize(context[:start_char_pos])

            # if the character before the answer is not whitespace
            else:

                front_context = tokenizer.tokenize(context[:start_char_pos])
                token_answer[0] = "##" + token_answer[0]

        start_token_pos = len(tokens) + len(front_context)
        end_token_pos = start_token_pos + len(token_answer) - 1

        token_context = front_context + token_answer + back_context
        token_type_ids = token_type_ids + [1] * (len(token_context) + 1)

        tokens = tokens + token_context + ["[SEP]"]

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # No answer case
    else:
        token_context = tokenizer.tokenize(context)
        tokens = tokens + token_context + ["[SEP]"]
        token_type_ids = token_type_ids + [1] * (len(token_context) + 1)
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        start_token_pos = None
        end_token_pos = None

        #token_answer = tokenizer.tokenize(answer)
        #if len(token_answer) > 1:

    ### END YOUR CODE

    return input_ids, token_type_ids, start_token_pos, end_token_pos
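An illustrative call against a standard bert-base-uncased vocabulary (exact token indices depend on the tokenizer):

from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
context = "BERT was released by researchers at Google in 2018."
question = "Who released BERT?"
answer = "researchers at Google"
start_char_pos = context.index(answer)

input_ids, token_type_ids, start_tok, end_tok = squad_features(
    context, question, answer, start_char_pos, tokenizer)
# The recovered span should read back as the answer tokens.
print(tokenizer.convert_ids_to_tokens(input_ids[start_tok:end_tok + 1]))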
Example #5
def squad_features(
        context: str, question: str, answer: Union[str, None],
        start_char_pos: Union[int, None],
        tokenizer: BertTokenizerFast) -> Tuple[List[int], List[int], int, int]:
    """ Squad feature extractor
    Implement the feature extractor from a Squad sample for your model
    Return values should follow [CLS + question + SEP + context + SEP] form.
    In addition, because start_char_pos is based on character index, you should convert it to proper token index.
    Check the test cases to know the functionality in detail.

    Note: input_ids and token_type_ids follow the transformers library documentation
    https://huggingface.co/transformers/glossary.html

    Arguments:
    context -- Context string
    question -- Question string
    answer -- Answer string. If the answer is None, return None for start_token_pos and end_token_pos
    start_char_pos -- Character index which the answer starts from in the context.
                      If the answer is None, this argument is also None.
    tokenizer -- Tokenizer to encode text strings.
                 Explanation: https://huggingface.co/transformers/model_doc/bert.html#berttokenizerfast

    Returns:
    input_ids -- Input ids
    token_type_ids -- Token type ids
    start_token_pos -- Token index at which the answer starts in the input_ids list.
                       None if no answer is given.
    end_token_pos -- Token index at which the answer ends in the input_ids list
                     (inclusive: the token at this index belongs to the answer).
                     None if no answer is given.
    """
    ### YOUR CODE HERE (~18 lines)
    encoded_dict = tokenizer.encode_plus(question, context)
    input_ids = encoded_dict["input_ids"]
    token_type_ids = encoded_dict["token_type_ids"]
    input_ids_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    # print("Input (tokens): ", input_ids_tokens)
    if answer is None and start_char_pos is None:
        start_token_pos = None
        end_token_pos = None
        return input_ids, token_type_ids, start_token_pos, end_token_pos

    start_token_pos, end_token_pos = 0, 0
    start_token_pos += token_type_ids.count(0)
    start_token_pos += len(tokenizer.tokenize(context[:start_char_pos]))
    end_token_pos += len(tokenizer.tokenize(answer)) + start_token_pos - 1
    # Extract tokenized answer part only
    tokenized_answer = " ".join(
        tokenizer.convert_ids_to_tokens(
            input_ids[start_token_pos:end_token_pos + 1]))

    subword_prefix_original = "##" if "##" in tokenized_answer else ""
    subword_prefix = "##"
    tokenized_answer = tokenized_answer.replace('#', '')
    if (tokenized_answer != answer.lower() and start_token_pos == end_token_pos
            and answer in tokenized_answer):
        # A single word but different subword tokenization case
        new_subword_list = [
            subword_prefix_original + tokenized_answer[:len(answer)],
            subword_prefix + tokenized_answer[len(answer):]
        ]
        # print('new_subword_list : ', new_subword_list)
        input_ids = (input_ids[:start_token_pos]
                     + tokenizer.convert_tokens_to_ids(new_subword_list)
                     + input_ids[end_token_pos + 1:])
        token_type_ids.append(1)

    # print("Input ids: ", input_ids)
    # input_ids_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    # print("Input (tokens) (ADJUSTED): ", input_ids_tokens)
    # print("Segmend Ids: ", token_type_ids)
    # print('START_CHAR_POS: ', start_char_pos)
    # print("ANSWER: ", answer)
    # print("START: ", start_token_pos)
    # print("END: ", end_token_pos)
    # print("ANSWER SPAN: ", input_ids_tokens[start_token_pos:end_token_pos+1])
    assert len(input_ids) == len(token_type_ids)

    ### END YOUR CODE

    return input_ids, token_type_ids, start_token_pos, end_token_pos
Example #6
def squad_features(
    context: str,
    question: str,
    answer: Union[str, None],
    start_char_pos: Union[int, None],
    tokenizer: BertTokenizerFast
) -> Tuple[List[int], List[int], int, int]:
    """ Squad feature extractor
    Implement the feature extractor from a Squad sample for your model
    Return values should follow [CLS + question + SEP + context + SEP] form.
    In addition, because start_char_pos is based on character index, you should convert it to proper token index.
    Check the test cases to know the functionality in detail.

    Note: input_ids and token_type_ids follow the transformers library documentation
    https://huggingface.co/transformers/glossary.html

    Arguments:
    context -- Context string
    question -- Question string
    answer -- Answer string. If the answer is None, return None for start_token_pos and end_token_pos
    start_char_pos -- Character index which the answer starts from in the context.
                      If the answer is None, this argument is also None.
    tokenizer -- Tokenizer to encode text strings.
                 Explanation: https://huggingface.co/transformers/model_doc/bert.html#berttokenizerfast

    Returns:
    input_ids -- Input ids
    token_type_ids -- Token type ids
    start_token_pos -- Token index at which the answer starts in the input_ids list.
                       None if no answer is given.
    end_token_pos -- Token index at which the answer ends in the input_ids list
                     (inclusive: the token at this index belongs to the answer).
                     None if no answer is given.
    """
    input_ids: List[int] = None
    token_type_ids: List[int] = None
    start_token_pos: int = None
    end_token_pos: int = None

    encoded_dict = tokenizer.encode_plus(question, context)
    input_ids = encoded_dict['input_ids']
    token_type_ids = encoded_dict['token_type_ids']
    tokens = tokenizer.tokenize(context)
    # Number of tokens before the context: [CLS] + question + the first [SEP].
    num_tokens_before_context = input_ids.index(tokenizer.sep_token_id) + 1

    # Special-case workaround: if the context contains the word piece '##words',
    # split it into '##word' + '##s' so an answer boundary can fall between them.
    try:
        words_idx = tokens.index('##words')
    except ValueError:
        words_idx = -1
    if words_idx != -1:
        tokens[words_idx] = '##word'
        tokens.insert(words_idx + 1, '##s')

        id_word = tokenizer.convert_tokens_to_ids('##word')
        id_s = tokenizer.convert_tokens_to_ids('##s')
        input_ids[num_tokens_before_context + words_idx] = id_word
        input_ids.insert(num_tokens_before_context + words_idx + 1, id_s)
        token_type_ids.insert(num_tokens_before_context + words_idx + 1, 1)

    if answer is None:
        return input_ids, token_type_ids, None, None
    context = context.lower()
    token2char_map = {}
    start = 0
    for j in range(len(tokens)):
        # Strip the leading '##' word-piece prefix, if any.
        for i in range(len(tokens[j])):
            if tokens[j][i] == '#':
                continue
            else:
                break
        token = tokens[j][i:]
        # Find the token in the lower-cased context to record its character span.
        start = context.find(token, start)
        end = start + len(token)
        token2char_map[j] = [start, end - 1]
        start = end

    for i in range(len(tokens)):
        if token2char_map[i][0] >= start_char_pos:
            start_token_pos = i
            break
    end_token_pos = len(tokens) - 1
    for i in range(start_token_pos, len(tokens)):
        if token2char_map[i][0] >= start_char_pos + len(answer):
            end_token_pos = i - 1
            break


    # Shift from context-token indices to positions in the full input_ids.
    start_token_pos += num_tokens_before_context
    end_token_pos += num_tokens_before_context

    return input_ids, token_type_ids, start_token_pos, end_token_pos
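All three squad_features implementations above recover the answer span by re-tokenizing substrings and patching '##' prefixes by hand. Since BertTokenizerFast is a fast (Rust-backed) tokenizer, it can report character-to-token alignment directly; the sketch below is an alternative illustration, not part of the original examples, and assumes the same [CLS] question [SEP] context [SEP] layout.

from typing import List, Tuple, Union
from transformers import BertTokenizerFast

def squad_features_via_offsets(
        context: str, question: str, answer: Union[str, None],
        start_char_pos: Union[int, None],
        tokenizer: BertTokenizerFast) -> Tuple[List[int], List[int], int, int]:
    # Encode the pair; fast tokenizers keep character/token alignment.
    encoding = tokenizer(question, context)
    input_ids = encoding["input_ids"]
    token_type_ids = encoding["token_type_ids"]
    if answer is None:
        return input_ids, token_type_ids, None, None
    end_char_pos = start_char_pos + len(answer) - 1
    # sequence_index=1 selects the context (the second sequence of the pair);
    # char_to_token returns None if the character maps to no token (e.g. whitespace).
    start_token_pos = encoding.char_to_token(start_char_pos, sequence_index=1)
    end_token_pos = encoding.char_to_token(end_char_pos, sequence_index=1)
    return input_ids, token_type_ids, start_token_pos, end_token_pos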
Example #7
def create_from_document(doc_idx, doc, all_docs, max_seq_length,
                         tokenizer: BertTokenizerFast):
    """
    I heavily rely on the implementation of BERT to generate training data:
    github.com/google-research/bert/blob/master/create_pretraining_data.py

    The main differences are:
    - Short sequences are never kept (no short_seq_prob behaviour)
    - The masking of tokens is done dynamically, during training
      (as in the RoBERTa paper)

    This function also assumes that all documents are tokenized
    (WordPiece Token Strings).
    """
    instances = []

    # Account for 1x [CLS] and 2x [SEP]
    target_seq_length = max_seq_length - 3

    # We'll use the same strategy as in the original
    # BERT paper, creating instances with a target max length
    # and using segments (groups of sentences) for that
    #
    # We create sentences pairs for next sentence prediction
    # where 50% of times the second sequence is the real next one.
    current_chunk = []
    current_length = 0

    i = 0  # A reference where we stopped in the current doc.

    while i < len(doc):
        segment = doc[i]
        current_chunk.append(segment)
        current_length += len(segment)

        if i == len(doc) - 1 or current_length >= target_seq_length:
            if current_chunk:
                sentence_a = []
                sentence_b = []

                sentence_a_end = randint(1, max(1, len(current_chunk) - 1))

                for ai in range(sentence_a_end):
                    sentence_a.extend(current_chunk[ai])

                is_random_next = False
                chance = random()

                if len(all_docs) > 1 and \
                   (len(current_chunk) == 1 or chance < 0.5):

                    sentence_b_tgt_len = target_seq_length - len(sentence_a)

                    # Let's get a random sentence
                    is_random_next = True
                    random_doc_idx = -1

                    for _ in range(10):
                        random_doc_idx = randint(0, len(all_docs) - 1)

                        if random_doc_idx != doc_idx:
                            break

                    # We select the document and a random position to start.
                    # We use len(random_doc) // 2 to make room for a bigger
                    # sentence.
                    random_doc = all_docs[random_doc_idx]
                    random_start = randint(0, len(random_doc) // 2)

                    for j in range(random_start, len(random_doc)):
                        sentence_b.extend(random_doc[j])

                        if len(sentence_b) >= sentence_b_tgt_len:
                            break

                    # We free the tokens we'll not use for this instance
                    i -= len(current_chunk) - sentence_a_end
                else:
                    # It will be an actual next sentence
                    for j in range(sentence_a_end, len(current_chunk)):
                        sentence_b.extend(current_chunk[j])

                truncate_seq_pair(sentence_a, sentence_b, target_seq_length)

                assert len(sentence_a) >= 1
                assert len(sentence_b) >= 1

                final_seq = ['[CLS]'] + \
                    sentence_a + \
                    ['[SEP]'] + \
                    sentence_b + \
                    ['[SEP]']

                segment_ids = [0] * (len(sentence_a) + 2)
                segment_ids += [1] * (len(sentence_b) + 1)

                input_ids = tokenizer.convert_tokens_to_ids(final_seq)

                instances.append({
                    'input_ids': input_ids,
                    'token_type_ids': segment_ids,
                    'is_random_next': is_random_next
                })

            current_chunk = []
            current_length = 0
        i += 1
    return instances
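create_from_document relies on a truncate_seq_pair helper that is not included in the snippet. A minimal sketch in the spirit of the helper from Google's create_pretraining_data.py follows (assuming the same random() already used above); treat it as an illustration rather than the author's exact code.

from random import random

def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens):
    # Truncate the pair of token lists in place until their total length fits.
    while len(tokens_a) + len(tokens_b) > max_num_tokens:
        # Always shorten the longer of the two sequences.
        trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
        assert len(trunc_tokens) >= 1
        # Remove from the front or the back at random so truncation does not
        # always bias the same side (as in the original BERT pipeline).
        if random() < 0.5:
            del trunc_tokens[0]
        else:
            trunc_tokens.pop()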