class MonoBERT_Numericalise_Transform():
    def __init__(self,
                 vocab_txt_file="saved_models/monoBERT/vocab.txt",
                 **kwargs):
        self.numericalizer = BertTokenizer(vocab_txt_file)

    def __call__(self, samples):
        '''
        samples: List[dict], e.g. [{'query': "text and more", 'doc': "doc text", ...}]
        returns: the same list, with 'input_ids' and 'type_ids' added to each dict,
                 e.g. [{'input_ids': [34, 2, 8, ...], 'type_ids': [0, 0, 1, 1], ...}]
        '''
        for sample_obj in samples:
            query_text = sample_obj['query']
            query_ids = [self.numericalizer.cls_token_id
                         ] + self.numericalizer.encode(
                             query_text, add_special_tokens=False)[:62] + [
                                 self.numericalizer.sep_token_id
                             ]
            query_token_type_ids = [0] * len(query_ids)

            doc_text = sample_obj['doc']
            doc_ids = self.numericalizer.encode(
                doc_text, add_special_tokens=False)[:445] + [
                    self.numericalizer.sep_token_id
                ]
            doc_token_type_ids = [1] * len(doc_ids)

            sample_obj["input_ids"] = query_ids + doc_ids
            sample_obj["type_ids"] = query_token_type_ids + doc_token_type_ids
        return samples
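A minimal usage sketch for the transform above, assuming the default vocab path exists; the query and document strings are made up. The transform mutates each dict in place and returns the same list:

numericalise = MonoBERT_Numericalise_Transform()
samples = [{'query': "what is bert", 'doc': "BERT is a bidirectional transformer encoder."}]
samples = numericalise(samples)
print(samples[0]['input_ids'][:5], samples[0]['type_ids'][:5])  # truncated ids and segment ids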
Example #2
def _preprocess(
    data: List[Dict],
    slot_meta: List[str],
    gate_list: Dict[str, int],
    tokenizer: BertTokenizer,
) -> List[Dict]:
    """원 데이터에 Feature 추가

    Args:
        data (List[Dict]): 데이터 원본
        slot_meta (List[str]): Slot 전체 목록 (해당 슬롯 순서로 Feature 생성)
        gate_list (Dict[str, int]): 정의된 Gate 목록
        tokenizer (BertTokenizer): 텍스트를 변환할 Tokenizer

    Returns:
        List[Dict]: 변경된 데이터 (주의: 데이터 원본을 변경시키고 원본 참조를 그대로 반환)
    """
    none_value = tokenizer.encode(
        'none', add_special_tokens=False) + [tokenizer.sep_token_id]
    empty_slot_values = {slot: none_value for slot in slot_meta}
    empty_gates = {slot: gate_list['none'] for slot in slot_meta}

    for example in tqdm(data):
        dialogue: List[Dict] = example['dialogue']
        user_count = 0
        for turn in dialogue:
            turn['text_idx'] = tokenizer.encode(turn['text'])

            slot_values = deepcopy(empty_slot_values)
            gates = deepcopy(empty_gates)

            if turn['role'] == 'user':
                user_count = user_count + 1
                for state_raw in turn['state']:
                    state_raw: List[str] = state_raw.split('-')
                    slot = state_raw[0] + '-' + state_raw[1]
                    value = state_raw[2]

                    slot_values[slot] = tokenizer.encode(
                        value,
                        add_special_tokens=False) + [tokenizer.sep_token_id]
                    gates[slot] = gate_list.get(value, gate_list['ptr'])

                turn['slot'] = slot_values
                turn['gate'] = gates

        example['count'] = user_count

    return data
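A hedged sketch of how this preprocessing step might be called; the slot names, gate ids, and checkpoint below are placeholders rather than values from the original project:

tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")   # placeholder checkpoint
slot_meta = ["restaurant-pricerange", "restaurant-area"]                    # placeholder slots
gate_list = {"none": 0, "dontcare": 1, "yes": 2, "no": 3, "ptr": 4}         # placeholder gates
data = [{
    "dialogue": [
        {"role": "sys", "text": "how can I help?"},
        {"role": "user", "text": "a cheap place please",
         "state": ["restaurant-pricerange-cheap"]},
    ],
}]
data = _preprocess(data, slot_meta, gate_list, tokenizer)
print(data[0]["count"], data[0]["dialogue"][1]["gate"])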
Example #3
 def row_to_tensor(tokenizer: BertTokenizer, row: pd.Series) -> Tuple[torch.LongTensor, torch.LongTensor]:
     tokens = tokenizer.encode(row["sent"], add_special_tokens=True)
     if len(tokens) > 120:
         tokens = tokens[:119] + [tokens[-1]]
     x = torch.LongTensor(tokens)
     y = torch.FloatTensor(row[["CAG", "NAG", "OAG"]])
     return x, y
Example #4
def _tokenize_bert_sentence(text: str, tokenizer: BertTokenizer) -> Tuple:
    """
    Given a sentence and a BertTokenizer, tokenizes the text, maps to
    BERT vocab indices, and makes the segment IDs for the tokens before
    returning the tensors on GPU (add flag for GPU disable soon).

    :param text: The sentence being tokenized. A single sentence as str.
    :param tokenizer: The BertTokenizer object instantiated.
    :return token_tensor, segments_tensor: The tensors containing the
                    segment IDs and the tokens themselves. On GPU.
    """
    # Tokenize and map straight to vocabulary indices; encode() already
    # returns ids and adds the [CLS]/[SEP] special tokens.
    indexed_tokens = tokenizer.encode(text, add_special_tokens=True)

    # Mark each of the tokens as belonging to sentence "1" (single
    #  sentence).
    segments_ids = [1] * len(indexed_tokens)

    # Convert inputs to PyTorch tensors, place on GPU.
    token_tensor = torch.tensor([indexed_tokens]).to('cuda:0')
    segments_tensor = torch.tensor([segments_ids]).to('cuda:0')

    return token_tensor, segments_tensor
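A hedged usage sketch; the checkpoint name is illustrative, and a CUDA device must be available because the helper hard-codes 'cuda:0':

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")   # illustrative checkpoint
token_tensor, segments_tensor = _tokenize_bert_sentence("The cat sat on the mat.", tokenizer)
print(token_tensor.shape, segments_tensor.shape)   # both (1, sequence_length), on cuda:0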
Example #5
    def __init__(self,
                 file_path,
                 tokenizer: BertTokenizer,
                 max_length=512,
                 device=None):
        news_type = []
        news_content = []
        news_atten_mask = []
        seq_typ_ids = []
        with open(file_path, mode='r', encoding='utf8') as f:
            for line in tqdm(f.readlines()):
                line = line.strip()
                line = line.split('\t')

                news_type.append(news_type2id_dict[line[0]])
                token_ids = tokenizer.encode(ILLEGAL_CHARACTERS_RE.sub(
                    r'', line[1]),
                                             max_length=max_length,
                                             pad_to_max_length=True)
                news_content.append(token_ids)
                news_atten_mask.append(get_atten_mask(token_ids))
                seq_typ_ids.append(
                    tokenizer.create_token_type_ids_from_sequences(
                        token_ids_0=token_ids[1:-1]))

        self.label = torch.from_numpy(np.array(news_type)).unsqueeze(1).long()
        self.token_ids = torch.from_numpy(np.array(news_content)).long()
        self.seq_type_ids = torch.from_numpy(np.array(seq_typ_ids)).long()
        self.atten_masks = torch.from_numpy(np.array(news_atten_mask)).long()
        if device is not None:
            self.label = self.label.to(device)
            self.token_ids = self.token_ids.to(device)
            self.seq_type_ids = self.seq_type_ids.to(device)
            self.atten_masks = self.atten_masks.to(device)
Example #6
class MLMModel:

    def __init__(self):
        self.model: BertForMaskedLM = BertForMaskedLM.from_pretrained(
            pretrained_model_name_or_path='Foodbert/foodbert/data/mlm_output/checkpoint-final')
        with open('Foodbert/foodbert/data/used_ingredients.json', 'r') as f:
            used_ingredients = json.load(f)
        self.tokenizer = BertTokenizer(vocab_file='Foodbert/foodbert/data/bert-base-cased-vocab.txt', do_lower_case=False,
                                       max_len=128, never_split=used_ingredients)

        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.model.to(self.device)

    def predict_substitutes(self, sentence, ingredient_name, with_masking=True):
        search_id = self.tokenizer.mask_token_id if with_masking else \
            self.tokenizer.convert_tokens_to_ids([ingredient_name])[0]
        sentence = sentence.replace('!', ' !').replace('?', ' ?').replace('.', ' .').replace(':', ' :').replace(',', ' ,')
        sentence = ' ' + sentence + ' '

        all_ordered_substitutes = []

        masked_sentence = sentence.replace(f' {ingredient_name} ', ' [MASK] ')
        input_ids = torch.tensor(self.tokenizer.encode(masked_sentence, add_special_tokens=True)).unsqueeze(0).to(device=self.device)
        prediction_scores = self.model(input_ids, masked_lm_labels=input_ids)[1][0]
        ingredient_scores = prediction_scores[input_ids[0] == search_id]

        for i in range(len(ingredient_scores)):
            ingredient_score = ingredient_scores[i]
            softmax_scores = ingredient_score.softmax(dim=0)
            indices = torch.sort(ingredient_score, descending=True).indices
            ordered_substitutes = self.tokenizer.convert_ids_to_tokens(indices)
            softmax_scores = softmax_scores[indices].tolist()
            all_ordered_substitutes.append((ordered_substitutes, softmax_scores))

        return all_ordered_substitutes
Example #7
def find_answer(tokenizer: BertTokenizer,
                answer_model: BertForQuestionAnswering, query: str,
                text: str) -> str:
    with torch.no_grad():
        start, end = answer_model(**tokenizer.encode_plus(
            query, text, max_length=256, truncation=True, return_tensors="pt"))
    start_pos = torch.argmax(start).item()
    end_pos = torch.argmax(end).item()
    if start_pos >= end_pos:
        start = torch.softmax(start, dim=1)
        end = torch.softmax(end, dim=1)
        k = -2
        start_args = torch.argsort(start).tolist()[0]
        end_args = torch.argsort(end).tolist()[0]
        calc_score = lambda start_pos, end_pos: start[0][start_pos] * end[0][
            end_pos]
        s_score, e_score = 0, 0
        s_pos, e_pos = start_pos, end_pos
        while s_score == 0 or e_score == 0:
            s_pos = start_args[k]
            e_pos = end_args[k]
            s_score = 0 if s_pos > end_pos else calc_score(s_pos, end_pos)
            e_score = 0 if e_pos < start_pos else calc_score(start_pos, e_pos)
            k -= 1
        if s_score > e_score:
            start_pos = s_pos
        else:
            end_pos = e_pos
    return tokenizer.decode(tokenizer.encode(query, text)[start_pos:end_pos])
Example #8
def find_paragraph(tokenizer: BertTokenizer,
                   model: BertForNextSentencePrediction,
                   question: str,
                   context: str,
                   max_len=256,
                   batch_size=16):
    q_len = len(tokenizer.tokenize(question))
    context_tokens = tokenizer.tokenize(context)
    part_len = max_len - q_len - 3
    parts = []
    n = 0
    while n < len(context_tokens):
        parts += [context_tokens[n:n + part_len]]
        n += part_len // 2
    results = []
    all_parts = parts[:]
    while len(parts) > 0:
        batch = tokenizer.batch_encode_plus(list(
            zip([question] * batch_size, parts[:batch_size])),
                                            max_length=max_len,
                                            truncation=True,
                                            pad_to_max_length=True,
                                            return_tensors="pt").to("cuda")
        with torch.no_grad():
            output = model(**batch)[0]
        results += [a - b for a, b in output.cpu().tolist()]
        parts = parts[batch_size:]
    return np.array(results), [
        tokenizer.decode(tokenizer.encode(part), skip_special_tokens=True)
        for part in all_parts
    ]
Example #9
    def explain_handle(self, model_wraper, text, target=1):  # pylint: disable=too-many-locals,unused-argument,arguments-differ
        """Captum explanations handler.

        Args:
            model_wraper: unused; a fresh AGNewsmodelWrapper is built from self.model
            text (str): the raw input text to explain (the body currently reads self.text)
            target (int): target class index for the attributions
        Returns:
            list : A list with one dictionary holding the explanations response.
        """
        model_wrapper = AGNewsmodelWrapper(self.model)
        tokenizer = BertTokenizer(self.vocab_file)
        model_wrapper.eval()
        model_wrapper.zero_grad()
        input_ids = torch.tensor(
            [tokenizer.encode(self.text, add_special_tokens=True)])
        input_embedding_test = model_wrapper.model.bert_model.embeddings(
            input_ids)
        preds = model_wrapper(input_embedding_test)
        out = np.argmax(preds.cpu().detach(), axis=1)
        out = out.item()
        ig_1 = IntegratedGradients(model_wrapper)
        attributions, delta = ig_1.attribute(  # pylint: disable=no-member
            input_embedding_test,
            n_steps=500,
            return_convergence_delta=True,
            target=1,
        )
        tokens = tokenizer.convert_ids_to_tokens(input_ids[0].numpy().tolist())
        feature_imp_dict = {}
        feature_imp_dict["words"] = tokens
        attributions_sum = self.summarize_attributions(attributions)
        feature_imp_dict["importances"] = attributions_sum.tolist()
        feature_imp_dict["delta"] = delta[0].tolist()
        return [feature_imp_dict]
Example #10
def read_data(filename: str, tokenizer: BertTokenizer,
              args: TrainingArguments) -> List:
    with open(filename, encoding="utf-8") as f:
        data = f.readlines()
    items = [
        item for item in json.loads(data[0])['data']
        if 'russian' in item['paragraphs'][0]['qas'][0]['id']
    ]
    ds = []
    for item in items:
        paragraph = item['paragraphs'][0]
        context = paragraph['context']
        qas = paragraph['qas'][0]
        question = qas['question']
        answer = qas['answers'][0]
        answer_start = answer['answer_start']
        answer_text = answer['text']
        ids = tokenizer.encode(question, context[:answer_start])
        start = len(ids) - 1
        end = start + len(tokenizer.tokenize(answer_text))
        if end < args.block_size:
            ds += [{
                "question": question,
                "context": context,
                "start": start,
                "end": end
            }]
    return ds
Example #11
def predict(inp: str,
            model: BertForMaskedLM,
            tokenizer: BertTokenizer,
            k: int = 3) -> List[str]:
    """
    Predict the top-k substitutes for an input text containing a single MASK token.
    :param inp: the input text
    :param model: a masked language model
    :param tokenizer: the tokenizer corresponding to the model
    :param k: the number of predictions
    :return: the list of top-k substitutes for the MASK token
    """
    kwargs = {
        'add_prefix_space': True
    } if isinstance(tokenizer, GPT2Tokenizer) else {}
    input_ids = tokenizer.encode(inp, add_special_tokens=True, **kwargs)
    mask_idx = input_ids.index(tokenizer.mask_token_id)
    input_ids = torch.tensor([input_ids])

    with torch.no_grad():
        (predictions, ) = model(input_ids)

    predicted_tokens = []
    _, predicted_indices = torch.topk(predictions[0, mask_idx], k)

    for predicted_index in predicted_indices:
        predicted_token = tokenizer.convert_ids_to_tokens(
            [predicted_index.item()])[0]
        predicted_tokens.append(predicted_token)
    return predicted_tokens
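A hedged usage sketch for the masked-token prediction helper; the checkpoint is illustrative, the input must contain exactly one mask token, and the tuple unpacking inside predict() assumes the older transformers behaviour where the model call returns a plain tuple (hence return_dict=False below):

from transformers import BertForMaskedLM, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")    # illustrative checkpoint
model = BertForMaskedLM.from_pretrained("bert-base-uncased", return_dict=False)
model.eval()
print(predict("The capital of France is [MASK].", model, tokenizer, k=3))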
Example #12
def chat(folder_bert, voc, testing=False):
    tf.random.set_seed(1)
    tokenizer = BertTokenizer(vocab_file=folder_bert + voc)
    if testing:
        tokens = tokenizer.tokenize("jeg tror det skal regne")
        print(tokens)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        print(ids)
        print("Vocab size:", len(tokenizer.vocab))

    config = BertConfig.from_json_file(folder_bert + "/config.json")
    model = BertLMHeadModel.from_pretrained(folder_bert, config=config)
    while (1):
        text = input(">>User: ")
        # encode the user input and sample a reply from the LM head model
        # (the generation parameters here are illustrative)
        input_ids = torch.tensor([tokenizer.encode(text, add_special_tokens=True)])
        sample_output = model.generate(input_ids, max_length=128, do_sample=True)
        print("Bot: {}".format(
            tokenizer.decode(sample_output[:, input_ids.shape[-1]:][0],
                             skip_special_tokens=True)))
Example #13
def get_question_type(tokenizer: BertTokenizer,
                      question_model: BertForSequenceClassification,
                      question: str) -> str:
    input_ids = tokenizer.encode(question, return_tensors="pt")
    with torch.no_grad():
        out = question_model(input_ids)[0]
    a, b = out.tolist()[0]
    return "YESNO" if b > a else "SPAN"
Example #14
def main():
    args = set_interact_args()
    logger = create_logger(args)
    # use the GPU when the user requests it and a GPU is available
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    # args.cuda = False
    device = 'cuda' if args.cuda else 'cpu'
    logger.info('using device:{}'.format(device))
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    tokenizer = BertTokenizer(vocab_file=args.voca_path)
    model = GPT2LMHeadModel.from_pretrained(args.dialogue_model_path)
    model.to(device)
    model.eval()

    print('***********************Summary model start************************')

    while True:
        try:

            text = input()
            for i in range(5):
                if len(text): text = text[:1000]
                input_ids = [tokenizer.cls_token_id]  # every input starts with [CLS]
                input_ids.extend(tokenizer.encode(text))
                input_ids.append(tokenizer.sep_token_id)
                curr_input_tensor = torch.tensor(input_ids).long().to(device)

                generated = []
                # generate at most max_len tokens
                for _ in range(args.max_len):
                    outputs = model(input_ids=curr_input_tensor)
                    next_token_logits = outputs[0][-1, :]
                    # apply a repetition penalty to every token already in generated, lowering its probability
                    for id in set(generated):
                        next_token_logits[id] /= args.repetition_penalty
                    next_token_logits = next_token_logits / args.temperature
                    # set the probability of [UNK] to -inf, so the model can never predict the [UNK] token
                    next_token_logits[tokenizer.convert_tokens_to_ids(
                        '[UNK]')] = -float('Inf')
                    filtered_logits = top_k_top_p_filtering(next_token_logits,
                                                            top_k=args.topk,
                                                            top_p=args.topp)
                    # torch.multinomial samples num_samples elements from the candidates without replacement, weighted by probability, and returns their indices
                    next_token = torch.multinomial(F.softmax(filtered_logits,
                                                             dim=-1),
                                                   num_samples=1)
                    if next_token == tokenizer.sep_token_id:  # a [SEP] token means the response is finished
                        break
                    generated.append(next_token.item())
                    curr_input_tensor = torch.cat(
                        (curr_input_tensor, next_token), dim=0)

                text = tokenizer.convert_ids_to_tokens(generated)
                print("summary:" + "".join(text))

        except KeyboardInterrupt:
            break
Example #15
def find_answer(tokenizer: BertTokenizer, model: BertForQuestionAnswering,
                context: str, question: str):
    input_data = tokenizer.encode_plus(question, context, return_tensors="pt")
    with torch.no_grad():
        out = model(**input_data)
    start, end = out[0], out[1]
    start = torch.argmax(start).item()
    end = torch.argmax(end).item()
    return tokenizer.decode(tokenizer.encode(question, context)[start:end])
Example #16
def get_decoder_input(tokenizer: BertTokenizer, input_str: list,
                      config: ModelConfig):
    decoder_input = torch.tensor(
        tokenizer.encode(input_str,
                         add_special_tokens=False,
                         max_length=config.max_seq_len,
                         return_tensors='pt',
                         truncation=True))
    return decoder_input
Example #17
def load_data(file: str, pickle_file: str, tokenizer: BertTokenizer = None):
    if not tokenizer:
        tokenizer = BertTokenizer.from_pretrained(model_name)
    if os.path.exists(pickle_file):
        return pickle.load(open(pickle_file, 'rb'))
    with open(file, 'r', encoding='utf-8') as f:
        all_results, all_labels = [], []
        for line in tqdm(f.readlines()):
            # 1. load the raw record
            data = json.loads(line)
            text: str = data['text']

            if len(text) > 512:
                text = text[:512]

            # 2. tokenize the text, producing token_ids, attention mask and token_type ids
            inputs: BatchEncoding = tokenizer(text, return_token_type_ids=True, return_tensors='tf')

            # 3. build the labels
            sequence_length = tf.shape(inputs['input_ids']).numpy()[-1]
            labels = ['O'] * sequence_length
            input_ids = inputs['input_ids']

            # 4. linearly search the token sequence for each entity
            global_index = 0

            # sort all entities by their start offset
            for entity_obj in sorted(data['entities'], key=lambda x: x['start']):

                entity: str = entity_obj['entity']
                # 4.1 encode the entity text into token ids, then search for it across the sequence

                tokenized_output: tf.Tensor = tokenizer.encode(entity, add_special_tokens=False, return_tensors='tf')
                entity_token_length: int = tf.shape(tokenized_output).numpy()[-1]

                # 4.2 global linear search
                for search_index in range(global_index, sequence_length - entity_token_length):

                    # if the entity's ids are found at this position
                    if tf.reduce_all(
                            tokenized_output == input_ids[:, search_index: search_index + entity_token_length]
                    ).numpy():
                        labels[search_index] = "B-" + entity_obj['type']
                        for index in range(search_index + 1, search_index + entity_token_length):
                            labels[index] = "I-" + entity_obj['type']
                        # start the next search right after the end of this match
                        global_index = search_index + entity_token_length
                        break
            inputs['labels'] = labels
            all_labels.append(labels)
            all_results.append(inputs)
        pickle.dump(all_results, open(pickle_file, 'wb'))

        # convert the labels into their final tensor form
        labels = tf.ragged.constant(all_labels)
        return all_results, labels
Example #18
class SimpleBertEmbeddings(WordEmbeddings):
    tokenizer: BertTokenizer
    model: BertModel
    special_tokens = []

    def __init__(self, bert_model_path: str):
        self.tokenizer = BertTokenizer(vocab_file=bert_model_path +
                                       '/vocab.txt')
        config = BertConfig.from_pretrained(bert_model_path + '/config.json',
                                            output_hidden_states=True)
        self.model = BertModel.from_pretrained(bert_model_path, config=config)
        self.model.eval()

    def convert(self, text: str) -> Dict[Word, List[float]]:
        print("[bert embeddings] analyze text:", text)
        lower_text = text.lower().replace("й",
                                          "и").replace("ё",
                                                       "е").replace("́", "")
        token_ids = self.tokenizer.encode(lower_text)

        encoded_layers = self.model(input_ids=torch.tensor([token_ids]))
        hidden_layers = encoded_layers[2][1:]
        token_embeddings = torch.stack(hidden_layers, dim=0)
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        token_embeddings = token_embeddings.permute(1, 0, 2)
        result: Dict[Word, List[float]] = {}
        text_pos = 0
        prev = None
        for i, token_vec in enumerate(token_embeddings):
            # todo: try only -12 layer: https://github.com/hanxiao/bert-as-service#q-so-which-layer-and-which-pooling-strategy-is-the-best
            # combine last 4 layers (best F1 score)
            cat_vec = torch.cat(
                (token_vec[-1], token_vec[-2], token_vec[-3], token_vec[-4]),
                dim=0)
            if token_ids[i] in self.tokenizer.all_special_ids:
                continue
            token: str = self.tokenizer.convert_ids_to_tokens(token_ids[i])
            if token.startswith("##") and prev is not None:
                clear_token = token.replace("##", "")
                word = Word(prev.text + clear_token, prev.start,
                            prev.end + len(clear_token))
                result.update(
                    {word: np.add(result[prev], cat_vec.tolist()).tolist()})
                del result[prev]
                prev = word
                continue
            start = lower_text.find(token, text_pos)
            if start == -1:
                continue
            end = start + len(token)
            word = Word(token, start, end)
            text_pos = end
            prev = word
            result.update({word: cat_vec.tolist()})
        return result
Example #19
def make_predictions(
        sentence_array: np.array, model: BertForSequenceClassification,
        tokenizer: BertTokenizer, device: torch.device,
        hyperparameter_dict: dict) -> typing.Tuple[np.array, np.array]:
    """
    Make predictions on DataFrame containing sentences with given model

    :param model: Torch model
    :param tokenizer: BERT-base tokenizer
    :param device: Torch device
    :param max_length: Max length of input sequence (for padding)
    :param hyperparameter_dict: Dictionary of model hyperparameters
    :return: NumPy array of label predictions
    """
    # Prepare data
    encoded_sentences = []

    for sentence in sentence_array:
        enc_sent_as_list = tokenizer.encode(sentence, add_special_tokens=True)
        encoded_sentences.append(enc_sent_as_list)

    input_array, input_attention_mask_array = _create_sentence_input_arrays(
        encoded_sentences, hyperparameter_dict['max_length'])

    input_tensor = torch.tensor(input_array)
    input_attention_mask_tensor = torch.tensor(input_attention_mask_array)

    input_dataset = TensorDataset(input_tensor, input_attention_mask_tensor)

    input_data_loader = DataLoader(
        input_dataset, batch_size=hyperparameter_dict['batch_size'])

    # Run model

    model.eval()

    logit_list = []

    for batch in input_data_loader:

        batch_input_ids = batch[0].to(device)
        batch_attention_mask = batch[1].to(device)

        with torch.no_grad():
            outputs = model(input_ids=batch_input_ids,
                            token_type_ids=None,
                            attention_mask=batch_attention_mask)

        logits = outputs[0]
        logit_list.append(logits)

    logits_tensor = torch.cat(logit_list, dim=0)
    prob_tensor = torch.softmax(logits_tensor, dim=1)

    return np.array(logits_tensor.cpu()), np.array(prob_tensor.cpu())
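A hedged call sketch; the checkpoint and hyperparameter values are illustrative, and the helper still relies on a _create_sentence_input_arrays function defined elsewhere in the source module:

import numpy as np
import torch
from transformers import BertForSequenceClassification, BertTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")            # illustrative checkpoint
model = BertForSequenceClassification.from_pretrained("bert-base-uncased").to(device)
sentences = np.array(["great movie, loved it", "terrible plot and acting"])
logits, probs = make_predictions(sentences, model, tokenizer, device,
                                 {"max_length": 64, "batch_size": 2})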
Example #20
def get_encoder_input(tokenizer: BertTokenizer, input_str: list,
                      config: ModelConfig):
    encoder_input = torch.tensor(
        tokenizer.encode(input_str,
                         add_special_tokens=False,
                         pad_to_max_length=True,
                         max_length=config.max_seq_len,
                         return_tensors='pt',
                         truncation=True))
    encoder_input_mask = encoder_input != 0
    return encoder_input, encoder_input_mask
Example #21
    def row_to_tensor(
            tokenizer: BertTokenizer,
            row: pd.Series) -> Tuple[torch.LongTensor, torch.LongTensor]:

        tokens = tokenizer.encode(row["text"], add_special_tokens=True)
        if len(tokens) > config.MAX_LEN:
            tokens = tokens[:config.MAX_LEN - 1] + [tokens[-1]]
        x = torch.LongTensor(tokens)
        y = torch.FloatTensor(
            row[["Depression", "Alcohol", "Suicide", "Drugs"]])
        return x, y
Example #22
def generate_samples(
    model: PreTrainedModel,
    tokenizer: BertTokenizer,
    prompt_text: str,
    max_length=args['max_length'],
    temperature=args['temperature'],
    top_k=args['k'],
    top_p=args['p'],
    repetition_penalty=args['repetition_penalty'],
    num_return_sequences=args['num_return_sequences'],
    stop_token=args['stop']
    ):

    encoded_prompt=tokenizer.encode(prompt_text, add_special_tokens=True, return_tensors='pt')
    encoded_prompt=encoded_prompt.to(model.device)
    input_ids=encoded_prompt if encoded_prompt.shape[-1]>0 else None

    output_sequences = model.generate(
        input_ids=input_ids,
        max_length=max_length,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        num_return_sequences=num_return_sequences,
    )

    if len(output_sequences.shape) > 2:
        output_sequences.squeeze_()

    generated_sequences = []

    for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
        print("=== GENERATED SEQUENCE {} ===".format(generated_sequence_idx + 1))
        generated_sequence = generated_sequence.tolist()

        # Decode text
        text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)

        # Remove all text after the stop token
        text = text[: text.find(stop_token) if stop_token else None]

        # Add the prompt at the beginning of the sequence. Remove the excess text that was used for pre-processing
        total_sequence = (
            prompt_text + text[len(tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True)) :]
        )

        generated_sequences.append(total_sequence)
        print(total_sequence)

    return generated_sequences
Example #23
def bert_collate_fn(tokenizer: BertTokenizer, max_len: int,
                    pairs: Iterable[Tuple[str, int]]):
    pairs = [(text.split()[:max_len], label) for text, label in pairs]
    texts, labels = zip(*pairs)
    labels = torch.LongTensor(labels)
    # +1 for [CLS] token
    text_lens = torch.LongTensor([len(text) + 1 for text in texts])
    max_len = text_lens.max().item()
    ids = torch.ones(len(texts), max_len).long() * tokenizer.pad_token_id
    for i, text in enumerate(texts):
        ids[i][:len(text) + 1] = torch.LongTensor(
            tokenizer.encode(text, add_special_tokens=True)[:-1])
    return ids, text_lens, labels
Example #24
    def __init__(self,
                 file_path,
                 ent_path,
                 tokenizer: BertTokenizer,
                 batch_size=32,
                 max_len=80):
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.num_examples_per_record = 10  # number of negative examples generated per record
        # load the entities
        with open(ent_path, "r", encoding="utf8") as fr:
            self.ents = json.load(fr)
        # turn the entity info into strings
        self.ent_id2str = None
        self.ent_to_string()
        self.all_ent_id = list(self.ents.keys())
        # training data
        self.titles = []
        with open(file_path, "r", encoding="utf8") as fr:
            for line in fr:
                ss = line.strip().split("\t")
                self.titles.append(ss)
        # convert the titles and training data into token ids
        for i in self.titles:
            i.append(tokenizer.encode(i[0]))
        # [CLS] is not needed because these ids are concatenated after another sequence
        self.ent_id2token_id = {
            k: tokenizer.encode(v[0:self.max_len])[1:]
            for k, v in self.ent_id2str.items()
        }

        self.pos_input_ids, self.pos_attention_mask, self.pos_token_type_ids = None, None, None
        self.neg_input_ids, self.neg_attention_mask, self.neg_token_type_ids = None, None, None
        self.idx = None
        self.start, self.end = -1, -1
        # number of steps per epoch
        self.steps = len(
            self.titles) * self.num_examples_per_record // self.batch_size
Example #25
def predict_fn(input_data, model):
    vocab_path = '/opt/ml/model/vocab.txt'
    tokenizer = BertTokenizer(vocab_path, do_lower_case=True)

    question, context = input_data['question'], input_data['context']

    input_ids = tokenizer.encode(question, context)
    token_type_ids = [
        0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))
    ]
    start_scores, end_scores = model(torch.tensor([input_ids]),
                                     token_type_ids=torch.tensor(
                                         [token_type_ids]))
    all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    answer = ' '.join(
        all_tokens[torch.argmax(start_scores):torch.argmax(end_scores) + 1])
    return answer
Example #26
def prediction(sentence):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    config = BertConfig.from_json_file('model/bert_config.json')
    model = BERTClassifierModel(config).to(device)

    model.load_state_dict(torch.load('model/model_bert.bin'))
    tokenizer = BertTokenizer('model/vocab.txt')

    input_id = torch.tensor(
        tokenizer.encode(sentence, add_special_tokens=True, truncation=True, max_length=256, pad_to_max_length=True))
    attention_mask = torch.tensor([float(i > 0) for i in input_id])

    input_id = input_id.unsqueeze(0).to(device)
    attention_mask = attention_mask.unsqueeze(0).to(device)

    result = model((input_id, attention_mask)).detach()

    return result
Example #27
def turn2example(
    tokenizer: BertTokenizer,
    domain: str,
    slot: str,
    value: str,
    context_ids: List[int],
    triple_labels: Optional[Set[tuple]] = None,
    belief_state: Optional[List[Tuple[str, str, str]]] = None,
    dial_id: str = None,
    turn_id: int = None,
) -> tuple:
    """Convert turn data to example based on ontology.

    Args:
        tokenizer: BertTokenizer, see https://huggingface.co/transformers/model_doc/bert.html#berttokenizer
        domain: domain of current example
        slot: slot of current example
        value: value of current example
        context_ids: context token's id in bert vocab
        triple_labels: set of (domain, slot, value)
        belief_state: list of (domain, slot, value)
        dial_id: current dialogue id
        turn_id: current turn id

    Returns:
        example, (input_ids, token_type_ids, domain, slot, value, ...)
    """
    candidate = domain + "-" + slot + " = " + value
    candidate_ids = tokenizer.encode(candidate, add_special_tokens=False)
    input_ids = (
        [tokenizer.cls_token_id]
        + context_ids
        + [tokenizer.sep_token_id]
        + candidate_ids
        + [tokenizer.sep_token_id]
    )
    token_type_ids = [0] + [0] * len(context_ids) + [0] + [1] * len(candidate_ids) + [1]
    example = (input_ids, token_type_ids, domain, slot, value)
    if dial_id is not None:
        label = int((domain, slot, value) in triple_labels)
        example += (belief_state, label, dial_id, str(turn_id))
    return example
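A hedged sketch of building a single example with the helper above; the domain, slot, value, and context string are made up and the checkpoint is illustrative:

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")   # illustrative checkpoint
context_ids = tokenizer.encode("i need a cheap hotel in the centre", add_special_tokens=False)
example = turn2example(tokenizer, "hotel", "pricerange", "cheap", context_ids,
                       triple_labels={("hotel", "pricerange", "cheap")},
                       belief_state=[], dial_id="dial-0", turn_id=1)
input_ids, token_type_ids, label = example[0], example[1], example[6]   # label == 1 here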
Example #28
 def row_to_tensor(
     tokenizer: BertTokenizer, row: pd.Series
 ) -> Tuple[torch.LongTensor, torch.LongTensor]:
     tokens = tokenizer.encode(row["comment_text"], add_special_tokens=True)
     if len(tokens) > 120:
         tokens = tokens[:119] + [tokens[-1]]
     x = torch.LongTensor(tokens)
     y = torch.FloatTensor(
         row[
             [
                 "toxic",
                 "severe_toxic",
                 "obscene",
                 "threat",
                 "insult",
                 "identity_hate",
             ]
         ]
     )
     return x, y
Example #29
def word_embedding_tf_2d(tokenizer: BertTokenizer, model: TFBertModel, text: str) -> np.ndarray:
    """
    get the vector representation for the sentence
    :param tokenizer:
    :param model:
    :param text:
    :return:
    """
    # tokenize and get the input ids for the tokens
    input_ids = tf.constant(tokenizer.encode(text))[None, :]
    # get the embedding vectors
    outputs = model(input_ids)

    hidden_states = outputs[0]

    # calculate the mean on axis 0 to reduce the vector to 1D
    output_vector = np.mean(hidden_states[0], axis=0)
    # print(output_vector.shape)

    return output_vector
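A hedged usage sketch for the TensorFlow embedding helper above; the checkpoint name is illustrative:

from transformers import BertTokenizer, TFBertModel

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")   # illustrative checkpoint
model = TFBertModel.from_pretrained("bert-base-uncased")
vector = word_embedding_tf_2d(tokenizer, model, "a short example sentence")
print(vector.shape)   # (hidden_size,), e.g. (768,)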
Example #30
 def build_bert_input(text_data, max_text_length=150):
     tokens, segments, input_masks = [], [], []
     tokenizer = BertTokenizer(
         vocab_file=os.path.join(bert_path, "vocab.txt")
     )  # initialize the tokenizer
     for text in text_data:
         indexed_tokens = tokenizer.encode(text)  # list of token indices
         if len(indexed_tokens) > max_text_length:
             indexed_tokens = (
                 indexed_tokens[: max_text_length // 2]
                 + indexed_tokens[-max_text_length // 2 :]
             )
         tokens.append(indexed_tokens)
         segments.append([0] * len(indexed_tokens))
         input_masks.append([1] * len(indexed_tokens))
     for j in range(len(tokens)):
         padding = [0] * (max_text_length - len(tokens[j]))
         tokens[j] += padding
         segments[j] += padding
         input_masks[j] += padding
     return tokens, segments, input_masks