Example #1
def collate_fn(batch_data):
    # Note: constructing the tokenizer on every batch is wasteful; in real
    # code, hoist it out of the collate function.
    tokenizer = BertTokenizer('./data/bert/nezha-base-www/vocab.txt')
    # +2 accounts for the [CLS] and [SEP] tokens added around each sequence.
    max_len = max([len(x[0]) for x in batch_data]) + 2
    input_ids, token_type_ids, attention_mask, labels = [], [], [], []
    for text, label in batch_data:
        # `padding`/`is_split_into_words` replace the deprecated
        # `pad_to_max_length`/`is_pretokenized` arguments.
        inputs = tokenizer.encode_plus(text=text,
                                       max_length=max_len,
                                       padding='max_length',
                                       is_split_into_words=True,
                                       return_token_type_ids=True,
                                       return_attention_mask=True,
                                       truncation=True)
        label = tokenizer.encode_plus(text=label,
                                      max_length=max_len,
                                      padding='max_length',
                                      is_split_into_words=True,
                                      return_token_type_ids=False,
                                      return_attention_mask=False,
                                      truncation=True)
        input_ids.append(inputs['input_ids'])
        token_type_ids.append(inputs['token_type_ids'])
        attention_mask.append(inputs['attention_mask'])
        labels.append(label['input_ids'])
    input_ids = torch.tensor(input_ids).long()
    token_type_ids = torch.tensor(token_type_ids).long()
    attention_mask = torch.tensor(attention_mask).float()
    labels = torch.tensor(labels).long()
    return input_ids, token_type_ids, attention_mask, labels
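A minimal usage sketch for wiring a collate function like this into a DataLoader. The toy dataset and batch size are illustrative assumptions (and the vocab path above must exist), not part of the original example:

from torch.utils.data import DataLoader

# Hypothetical dataset: (token_list, label_token_list) pairs, matching the
# pre-tokenized inputs collate_fn expects.
pairs = [(list("今天天气很好"), list("好")),
         (list("我不喜欢下雨"), list("坏"))]

loader = DataLoader(pairs, batch_size=2, collate_fn=collate_fn)
for input_ids, token_type_ids, attention_mask, labels in loader:
    print(input_ids.shape, attention_mask.shape, labels.shape)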
Example #2
def _iter_row(df, inputs: dict, task: str, tokenizer: BertTokenizer, train: bool,
              train_val_split_ratio: float, label2id: Optional[dict] = None) -> Tuple[dict, bool]:
    targets = []
    for _, row in tqdm(df.iterrows(), total=len(df), desc=f'Preprocess {task}'):
        text_a = row[1]
        if task == 'ocnli':
            target_idx = 3
            text_b = row[2]
            output_ids = tokenizer.encode_plus(text_a, text_b, add_special_tokens=True,
                                               return_token_type_ids=True, return_attention_mask=True)
        else:
            target_idx = 2
            output_ids = tokenizer.encode_plus(text_a, add_special_tokens=True,
                                               return_token_type_ids=True, return_attention_mask=True)
        inputs['input_ids'].append(output_ids['input_ids'])
        inputs['token_type_ids'].append(output_ids['token_type_ids'])
        inputs['attention_mask'].append(output_ids['attention_mask'])

        if train_val_split_ratio is not None:
            targets.append(row[target_idx])
        else:
            # No gold labels available: use an arbitrary known label as a
            # placeholder so the shapes still line up.
            targets.append(list(label2id.keys())[0])
    targets_series = pd.Series(targets)
    if label2id is None:
        train = True
        targets, label2id = convert_label_to_id(targets_series)
    else:
        targets, = convert_label_to_id(targets_series, label2id)  # trailing comma unpacks the 1-tuple
    inputs['targets'] = targets
    return label2id, train
Example #3
def convert_input_example(example: InputExample, tokenizer: BertTokenizer,
                          max_seq_len):
    set_type = example.set_type
    text = example.text
    label = example.label

    # Encode without padding here; padding is done manually below. The
    # original passed both `pad_to_max_length=True` and `padding=True`,
    # which conflict and made the manual padding dead code.
    encode_dict = tokenizer.encode_plus(text=text,
                                        max_length=max_seq_len,
                                        return_token_type_ids=True,
                                        return_attention_mask=True,
                                        truncation=True)

    out_len = len(encode_dict['input_ids'])
    pad_len = max_seq_len - out_len

    # BERT's [PAD] token id is 0, so padding with literal zeros is safe here.
    token_ids = encode_dict['input_ids'] + [0] * pad_len
    attention_masks = encode_dict['attention_mask'] + [0] * pad_len
    token_type_ids = encode_dict['token_type_ids'] + [0] * pad_len

    feature = BertFeature(
        # bert inputs
        token_ids=token_ids,
        attention_masks=attention_masks,
        token_type_ids=token_type_ids,
    )

    return feature
Example #4
    def trans_title_ent_to_bert_ipt(title: str, ent_info,
                                    tokenizer: BertTokenizer,
                                    attr2max_len: Dict, used_attrs: List[str]):
        """

        :param title:
        :param ent_info:
        :param tokenizer:
        :param attr2max_len:
        :param used_attrs:
        :return:Dict[attr_name:{"input_ids":..}]
        """
        ipt = {}
        for attr_name in used_attrs:
            attr_value = ent_info[attr_name]
            ##########################################################################################################
            if not DataUtil.is_null(ent_info["company"]):
                attr_value = ent_info["company"] + "。" + attr_value
            if not DataUtil.is_null(ent_info["place"]):
                attr_value = ent_info["place"] + "。" + attr_value
            #########################################################################################################
            ipt[attr_name] = tokenizer.encode_plus(
                title,
                attr_value,
                max_length=attr2max_len[attr_name],
                padding="max_length",
                truncation=True,
                return_tensors="pt")
        return ipt
Example #5
def find_answer(tokenizer: BertTokenizer,
                answer_model: BertForQuestionAnswering, query: str,
                text: str) -> str:
    with torch.no_grad():
        start, end = answer_model(**tokenizer.encode_plus(
            query, text, max_length=256, truncation=True, return_tensors="pt"))
    start_pos = torch.argmax(start).item()
    end_pos = torch.argmax(end).item()
    if start_pos >= end_pos:
        # Invalid span: fall back to the next-best start/end candidates.
        start = torch.softmax(start, dim=1)
        end = torch.softmax(end, dim=1)
        k = -2  # argsort is ascending, so index -2 is the second-highest score
        start_args = torch.argsort(start).tolist()[0]
        end_args = torch.argsort(end).tolist()[0]
        calc_score = lambda start_pos, end_pos: start[0][start_pos] * end[0][
            end_pos]
        s_score, e_score = 0, 0
        s_pos, e_pos = start_pos, end_pos
        # Walk down the ranked positions until one side yields a valid span.
        while s_score == 0 or e_score == 0:
            s_pos = start_args[k]
            e_pos = end_args[k]
            s_score = 0 if s_pos > end_pos else calc_score(s_pos, end_pos)
            e_score = 0 if e_pos < start_pos else calc_score(start_pos, e_pos)
            k -= 1
        if s_score > e_score:
            start_pos = s_pos
        else:
            end_pos = e_pos
    return tokenizer.decode(tokenizer.encode(query, text)[start_pos:end_pos])
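A usage sketch: the checkpoint name is an assumption, and the tuple unpacking `start, end = answer_model(...)` assumes an older transformers version (in recent versions pass return_dict=False, as below, or read output.start_logits / output.end_logits):

from transformers import BertTokenizer, BertForQuestionAnswering

name = "bert-large-uncased-whole-word-masking-finetuned-squad"  # assumed checkpoint
tokenizer = BertTokenizer.from_pretrained(name)
answer_model = BertForQuestionAnswering.from_pretrained(name, return_dict=False)
answer_model.eval()

print(find_answer(tokenizer, answer_model,
                  "Who wrote Hamlet?",
                  "Hamlet is a tragedy written by William Shakespeare."))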
Example #6
    def preprocess(self, data):
        """
        Receives text in form of json and converts it into an encoding for the inference stage

        :param data: Input to be passed through the layers for prediction

        :return: output - preprocessed encoding
        """

        text = data[0].get("data")
        if text is None:
            text = data[0].get("body")

        text = text.decode("utf-8")

        tokenizer = BertTokenizer(
            self.VOCAB_FILE)  # .from_pretrained("bert-base-cased")
        encoding = tokenizer.encode_plus(
            text,
            max_length=32,
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            return_token_type_ids=False,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt",  # Return PyTorch tensors
            truncation=True,
        )

        return encoding
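A sketch of the request payload this handler expects; the list-of-dicts shape with a "data"/"body" field follows the code above, while the handler instance itself (of the class this method belongs to) is an assumption:

# `handler` is a hypothetical instance of the handler class above.
payload = [{"data": "Stocks rallied after the earnings report.".encode("utf-8")}]
encoding = handler.preprocess(payload)
print(encoding["input_ids"].shape)  # torch.Size([1, 32])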
Example #7
def read_data(config: dict, tokenizer: BertTokenizer, debug=False) -> dict:
    train_file_path = os.path.join(
        '../tcdata/nlp_round2_data/pretrain_data.tsv')
    test_file_path = os.path.join(
        '../tcdata/nlp_round1_data/gaiic_track3_round1_testB_20210317.tsv')
    train_df = pd.read_csv(train_file_path, header=None, sep='\t')
    test_df = pd.read_csv(test_file_path, header=None, sep='\t')
    if debug:
        train_df = train_df.head(1000)
        test_df = test_df.head(1000)

    data_df = {'train': train_df, 'test': test_df}
    processed_data = {}

    for data_type, df in data_df.items():
        inputs = defaultdict(list)
        for i, row in tqdm(df.iterrows(),
                           desc=f'Preprocessing {data_type} data',
                           total=len(df)):
            label = 0 if data_type == 'test' else row[2]
            sentence_a, sentence_b = row[0], row[1]
            inputs_dict = tokenizer.encode_plus(sentence_a,
                                                sentence_b,
                                                add_special_tokens=True,
                                                return_token_type_ids=True,
                                                return_attention_mask=True)
            inputs['input_ids'].append(inputs_dict['input_ids'])
            inputs['token_type_ids'].append(inputs_dict['token_type_ids'])
            inputs['attention_mask'].append(inputs_dict['attention_mask'])
            inputs['labels'].append(label)

        processed_data[data_type] = inputs

    return processed_data
Example #8
def preprocess_for_finbert(data, vocab_file, max_length=MAX_SEQ_LENGTH):
    tokenizer = BertTokenizer(vocab_file=vocab_file, do_lower_case=True)

    input_ids = []
    token_type_ids = []
    attention_masks = []

    for sent in data:
        encoded_sent = tokenizer.encode_plus(text=sent,
                                             add_special_tokens=True,
                                             max_length=max_length,
                                             truncation=True,
                                             padding='max_length',
                                             return_token_type_ids=True,
                                             return_attention_mask=True)

        input_ids.append(encoded_sent.get('input_ids'))
        token_type_ids.append(encoded_sent.get('token_type_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    token_type_ids = torch.tensor(token_type_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, token_type_ids, attention_masks
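Since the function already returns stacked tensors, batching for training is one TensorDataset away; a minimal sketch (the vocab path, sentences, and batch size are assumptions):

from torch.utils.data import TensorDataset, DataLoader

sentences = ["Profits rose 10% year over year.",
             "The company issued a profit warning."]
input_ids, token_type_ids, attention_masks = preprocess_for_finbert(
    sentences, vocab_file="vocab.txt")  # hypothetical vocab path
dataset = TensorDataset(input_ids, token_type_ids, attention_masks)
loader = DataLoader(dataset, batch_size=32)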
Example #9
    def get_single_loader_from_raw_text(src_lines, trg_lines,
                                        tokenizer: BertTokenizer, batch_size):

        tensor_datasets = {'train': [], 'valid': []}
        input_ids = []
        token_type_ids = []
        attention_mask = []
        label_list = []

        assert len(src_lines) == len(trg_lines)

        for ndx, row in enumerate(src_lines):
            if ndx % 100 == 0:
                logger.info('%d/%d processed.' % (ndx, len(src_lines)))

            src = src_lines[ndx]
            trg = trg_lines[ndx]

            output = tokenizer.encode_plus(src,
                                           max_length=512,
                                           truncation=True,
                                           padding='max_length')
            input_ids.append(output['input_ids'])
            # token_type_ids.append(output['token_type_ids'])
            attention_mask.append(output['attention_mask'])

            labels_encode = tokenizer.encode_plus(trg,
                                                  max_length=512,
                                                  truncation=True,
                                                  padding='max_length')
            label_list.append(labels_encode['input_ids'])

        data_set = {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': label_list
        }
        train_max_num = int(len(data_set['input_ids']))

        for name in ['input_ids', 'attention_mask', 'labels']:
            tensor_datasets['train'].append(
                torch.LongTensor(data_set[name][0:train_max_num]))

        train_data_set = TensorDataset(*tensor_datasets['train'])
        train_data_loader = DataLoader(train_data_set,
                                       batch_size=batch_size,
                                       shuffle=True)

        return train_data_loader
Example #10
def convert_attribution_example(ex_idx, example: AttributionExample,
                                max_seq_len, tokenizer: BertTokenizer,
                                polarity2id, tense2id):
    """
    convert attribution example to attribution feature
    """
    set_type = example.set_type
    raw_text = example.text
    raw_label = example.label
    trigger = example.trigger

    tokens = fine_grade_tokenize(raw_text, tokenizer)

    trigger_loc = (trigger[1] + 1, trigger[1] + len(trigger[0]))

    labels = [tense2id[raw_label[0]], polarity2id[raw_label[1]]]

    encode_dict = tokenizer.encode_plus(text=tokens,
                                        max_length=max_seq_len,
                                        padding='max_length',
                                        is_split_into_words=True,
                                        truncation=True,
                                        return_token_type_ids=True,
                                        return_attention_mask=True)

    token_ids = encode_dict['input_ids']
    attention_masks = encode_dict['attention_mask']
    token_type_ids = encode_dict['token_type_ids']

    window_size = 20

    # take a window of 20 tokens on each side of the trigger as its context
    pooling_masks_range = range(
        max(1, trigger_loc[0] - window_size),
        min(min(1 + len(raw_text), max_seq_len - 1),
            trigger_loc[1] + window_size))

    pooling_masks = [0] * max_seq_len
    for i in pooling_masks_range:
        pooling_masks[i] = 1
    for i in range(trigger_loc[0], trigger_loc[1] + 1):
        pooling_masks[i] = 0

    if ex_idx < 3 and set_type == 'train':
        logger.info(f"*** {set_type}_example-{ex_idx} ***")
        logger.info(f'text: {" ".join(tokens)}')
        logger.info(f"token_ids: {token_ids}")
        logger.info(f"attention_masks: {attention_masks}")
        logger.info(f"token_type_ids: {token_type_ids}")
        logger.info(f'trigger loc: {trigger_loc}')
        logger.info(f'labels: {labels}')

    feature = AttributionFeature(token_ids=token_ids,
                                 attention_masks=attention_masks,
                                 token_type_ids=token_type_ids,
                                 trigger_loc=trigger_loc,
                                 pooling_masks=pooling_masks,
                                 labels=labels)

    return feature
Example #11
def find_answer(tokenizer: BertTokenizer, model: BertForQuestionAnswering,
                context: str, question: str):
    input_data = tokenizer.encode_plus(question, context, return_tensors="pt")
    with torch.no_grad():
        out = model(**input_data)
    start, end = out[0], out[1]
    start = torch.argmax(start).item()
    end = torch.argmax(end).item()
    return tokenizer.decode(tokenizer.encode(question, context)[start:end])
Example #12
def preprocess(tokenizer: BertTokenizer, x: Dict) -> Dict:

    choices_features = []

    for key, option in x["options"].items():
        text_a = x["stem"]
        text_b = option
        inputs = tokenizer.encode_plus(
            text_a,
            text_b,
            add_special_tokens=True,
            max_length=MAX_LEN,
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs[
            "token_type_ids"]
        attention_mask = [1] * len(input_ids)

        pad_token_id = tokenizer.pad_token_id
        padding_length = MAX_LEN - len(input_ids)
        input_ids = input_ids + ([pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_id] * padding_length)

        assert len(
            input_ids) == MAX_LEN, "Error with input length {} vs {}".format(
                len(input_ids), MAX_LEN)
        assert len(attention_mask
                   ) == MAX_LEN, "Error with input length {} vs {}".format(
                       len(attention_mask), MAX_LEN)
        assert len(token_type_ids
                   ) == MAX_LEN, "Error with input length {} vs {}".format(
                       len(token_type_ids), MAX_LEN)

        choices_features.append({
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
        })

    label = label_map.get(x["answer_key"], -1)
    label = torch.tensor(label).long()

    return {
        "id":
        x["id"],
        "label":
        label,
        "input_ids":
        torch.tensor([cf["input_ids"] for cf in choices_features]),
        "attention_mask":
        torch.tensor([cf["attention_mask"] for cf in choices_features]),
        "token_type_ids":
        torch.tensor([cf["token_type_ids"] for cf in choices_features]),
    }
Example #13
def find_yesno_answer(tokenizer: BertTokenizer,
                      question_model: BertForSequenceClassification,
                      question: str, text: str) -> str:
    input_ids = tokenizer.encode_plus(question, text, return_tensors="pt")
    with torch.no_grad():
        out = question_model(**input_ids)[0]
    no, yes, none = torch.softmax(out, dim=1).tolist()[0]
    if none > 0.5:
        return "не знаю"  # "don't know"
    elif no > yes:
        return "нет"  # "no"
    else:
        return "да"  # "yes"
Example #14
def tokenize(tokenizer: BertTokenizer, sentence: str, max_len: int = 50):
    # For single sequences:
    #  tokens:   [CLS] I am a boy . [SEP]
    #  type_ids:   0   0 0  0 0   0   0

    # Padding to max_len is what makes the length asserts below hold
    # (padding='max_length' replaces the deprecated pad_to_max_length).
    tokens = tokenizer.encode_plus(sentence,
                                   max_length=max_len,
                                   truncation=True,
                                   padding='max_length')
    assert len(tokens['input_ids']) == max_len
    assert len(tokens['token_type_ids']) == max_len
    assert len(tokens['attention_mask']) == max_len

    return tokens
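A usage sketch (the checkpoint name is an assumption); the asserts pass only because the call pads every sequence out to max_len:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # assumed checkpoint
tokens = tokenize(tokenizer, "I am a boy.", max_len=50)
print(len(tokens["input_ids"]))  # 50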
Example #15
    def _read_data(self, pretrain_csv_path, tokenizer: BertTokenizer) -> dict:
        pretrain_df = pd.read_csv(pretrain_csv_path, header=None, sep='\t')
        inputs = defaultdict(list)
        for i, row in tqdm(pretrain_df.iterrows(),
                           desc='',
                           total=len(pretrain_df)):
            sentence = row[0].strip()
            sentence = re.sub(r"[%s]+" % punctuation, '[SEP]', sentence)
            inputs_dict = tokenizer.encode_plus(sentence,
                                                add_special_tokens=True,
                                                return_token_type_ids=True,
                                                return_attention_mask=True)
            inputs['input_ids'].append(inputs_dict['input_ids'])
            inputs['token_type_ids'].append(inputs_dict['token_type_ids'])
            inputs['attention_mask'].append(inputs_dict['attention_mask'])
        return inputs
Example #16
def preprocess(tokenizer: BertTokenizer, x: Dict) -> Dict:

    choices_features = []

    option: str
    for option in x["options"]:
        text_a = x["article"]
        if x["question"].find("_") != -1:
            text_b = x["question"].replace("_", option)
        else:
            text_b = x["question"] + " " + option

        inputs = tokenizer.encode_plus(
                text_a,
                text_b,
                add_special_tokens=True,
                max_length=MAX_LEN
                )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
        attention_mask = [1] * len(input_ids)

        pad_token_id = tokenizer.pad_token_id
        padding_length = MAX_LEN - len(input_ids)
        input_ids = input_ids + ([pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_id] * padding_length)

        assert len(input_ids) == MAX_LEN, "Error with input length {} vs {}".format(len(input_ids), MAX_LEN)
        assert len(attention_mask) == MAX_LEN, "Error with input length {} vs {}".format(len(attention_mask), MAX_LEN)
        assert len(token_type_ids) == MAX_LEN, "Error with input length {} vs {}".format(len(token_type_ids), MAX_LEN)

        choices_features.append({
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
            })

    labels = label_map.get(x["answer"], -1)
    label = torch.tensor(labels).long()

    return {
            "id": x["id"],
            "label": label,
            "input_ids": torch.tensor([cf["input_ids"] for cf in choices_features]),
            "attention_mask": torch.tensor([cf["attention_mask"] for cf in choices_features]),
            "token_type_ids": torch.tensor([cf["token_type_ids"] for cf in choices_features]),
            }
Example #17
def read_data(train_file_path, tokenizer: BertTokenizer) -> dict:
    # Use a context manager so the file handle is closed after reading.
    with open(train_file_path, 'r', encoding='utf8') as f:
        train_data = f.readlines()

    inputs = defaultdict(list)
    for row in tqdm(train_data,
                    desc='Preprocessing train data',
                    total=len(train_data)):
        sentence = row.strip()
        inputs_dict = tokenizer.encode_plus(sentence,
                                            add_special_tokens=True,
                                            return_token_type_ids=True,
                                            return_attention_mask=True)
        inputs['input_ids'].append(inputs_dict['input_ids'])
        inputs['token_type_ids'].append(inputs_dict['token_type_ids'])
        inputs['attention_mask'].append(inputs_dict['attention_mask'])

    return inputs
Example #18
def preprocess(tokenizer: BertTokenizer, x: Dict) -> Dict:
    # Given two sentences, x["string1"] and x["string2"], this function returns BERT ready inputs.
    inputs = tokenizer.encode_plus(
        x["string1"],
        x["string2"],
        add_special_tokens=True,
        max_length=MAX_LEN,
    )

    # `input_ids` is the id-based representation of the input string;
    # `token_type_ids` tells the model which tokens belong to "string1" and which to "string2".
    input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
    attention_mask = [1] * len(input_ids)

    # BERT requires sequences in the same batch to have same length, so let's pad!
    padding_length = MAX_LEN - len(input_ids)

    pad_id = tokenizer.pad_token_id
    input_ids = input_ids + ([pad_id] * padding_length)
    attention_mask = attention_mask + ([0] * padding_length)
    token_type_ids = token_type_ids + ([pad_id] * padding_length)

    # Super simple validation.
    assert len(
        input_ids) == MAX_LEN, "Error with input length {} vs {}".format(
            len(input_ids), MAX_LEN)
    assert len(
        attention_mask) == MAX_LEN, "Error with input length {} vs {}".format(
            len(attention_mask), MAX_LEN)
    assert len(
        token_type_ids) == MAX_LEN, "Error with input length {} vs {}".format(
            len(token_type_ids), MAX_LEN)

    # Convert them into PyTorch format.
    label = torch.tensor(int(x["quality"])).long()
    input_ids = torch.tensor(input_ids)
    attention_mask = torch.tensor(attention_mask)
    token_type_ids = torch.tensor(token_type_ids)

    # DONE!
    return {
        "label": label,
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "token_type_ids": token_type_ids
    }
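A usage sketch for this preprocess function; MAX_LEN (assumed to be a module-level constant of 64), the tokenizer checkpoint, and the sample values are illustrative assumptions:

from transformers import BertTokenizer

MAX_LEN = 64
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # assumed checkpoint
sample = {"string1": "A man is playing a guitar.",
          "string2": "Someone plays an instrument.",
          "quality": "1"}
out = preprocess(tokenizer, sample)
print(out["input_ids"].shape, out["label"])  # torch.Size([64]) tensor(1)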
Example #19
    def explain_handle(self, model_wraper, text, target=1):
        """Captum explanations handler
        Args:
            data_preprocess (Torch Tensor):
            Preprocessed data to be used for captum
            raw_data (list): The unprocessed data to get target from the request
        Returns:
            dict : A dictionary response with the explanations response.
        """
        vis_data_records_base = []
        model_wrapper = AGNewsmodelWrapper(self.model)
        tokenizer = BertTokenizer(self.VOCAB_FILE)
        model_wrapper.eval()
        model_wrapper.zero_grad()
        encoding = tokenizer.encode_plus(self.text,
                                         return_attention_mask=True,
                                         return_tensors="pt",
                                         add_special_tokens=False)
        input_ids = encoding["input_ids"]
        attention_mask = encoding["attention_mask"]
        input_ids = input_ids.to(self.device)
        attention_mask = attention_mask.to(self.device)
        input_embedding_test = model_wrapper.model.bert_model.embeddings(
            input_ids)
        preds = model_wrapper(input_embedding_test, attention_mask)
        out = np.argmax(preds.cpu().detach(), axis=1)
        out = out.item()
        ig_1 = IntegratedGradients(model_wrapper)
        attributions, delta = ig_1.attribute(  # pylint: disable=no-member
            input_embedding_test,
            n_steps=500,
            return_convergence_delta=True,
            target=1,
        )
        tokens = tokenizer.convert_ids_to_tokens(
            input_ids[0].cpu().numpy().tolist())
        feature_imp_dict = {}
        feature_imp_dict["words"] = tokens
        attributions_sum = self.summarize_attributions(attributions)
        feature_imp_dict["importances"] = attributions_sum.tolist()
        feature_imp_dict["delta"] = delta[0].tolist()
        self.add_attributions_to_visualizer(attributions, tokens,
                                            self.score_func(preds), out, 2, 1,
                                            delta, vis_data_records_base)
        return [feature_imp_dict]
Example #20
def read_data(train_file_path, tokenizer: BertTokenizer) -> dict:
    # train_df = pd.read_csv(train_file_path, header=None, sep='\t')
    train_df = pd.read_csv(train_file_path, header=None)

    inputs = defaultdict(list)
    for i, row in tqdm(train_df.iterrows(),
                       desc=f'Preprocessing train data',
                       total=len(train_df)):
        # sentence_a, sentence_b = row[0], row[1]
        sentence = row[0]
        inputs_dict = tokenizer.encode_plus(sentence,
                                            add_special_tokens=True,
                                            return_token_type_ids=True,
                                            return_attention_mask=True)
        inputs['input_ids'].append(inputs_dict['input_ids'])
        inputs['token_type_ids'].append(inputs_dict['token_type_ids'])
        inputs['attention_mask'].append(inputs_dict['attention_mask'])

    return inputs
Example #21
def read_data(train_file_path, tokenizer: BertTokenizer, debug=False) -> dict:
    train_data = pd.read_csv(train_file_path, header=None, sep='\t')
    if debug:
        train_data = train_data.head(1000)
    data_dict = defaultdict(list)
    for i, row in tqdm(train_data.iterrows(),
                       desc=f'Preprocessing train data',
                       total=train_data.shape[0]):
        text_a, text_b = row[0], row[1]
        inputs_dict = tokenizer.encode_plus(text_a,
                                            text_b,
                                            add_special_tokens=True,
                                            return_token_type_ids=True,
                                            return_attention_mask=True)
        data_dict['input_ids'].append(inputs_dict['input_ids'])
        data_dict['token_type_ids'].append(inputs_dict['token_type_ids'])
        data_dict['attention_mask'].append(inputs_dict['attention_mask'])

    return data_dict
Example #22
class DS(Dataset):
    def __init__(self, lines, vocab_path="vocab/vocab.txt", max_length=1024):
        self.data = lines
        self.tok = BertTokenizer(vocab_file=vocab_path)
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        line = self.data[index]
        line = self.tok.encode_plus(
            line,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        return line
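A sketch of driving this Dataset with a DataLoader. Because __getitem__ uses return_tensors="pt", each item already carries a batch dimension of 1, which the stacked batch keeps (the sample lines are illustrative, and the vocab path above must exist):

from torch.utils.data import DataLoader

lines = ["第一条样例文本。", "第二条样例文本。"]
ds = DS(lines, vocab_path="vocab/vocab.txt", max_length=32)
loader = DataLoader(ds, batch_size=2)
batch = next(iter(loader))
print(batch["input_ids"].shape)  # torch.Size([2, 1, 32]); squeeze dim 1 before the model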
Example #23
class BertVector:
    def __init__(self):
        self.tokenizer = BertTokenizer(vocab_file)
        self.model = car_aq_model()

    def encode(self, sentence, max_sentence_len):
        bert_input = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=max_sentence_len,
            padding='max_length',
            # attention_mask is returned by default, so the flag below
            # is not needed:
            # return_attention_mask=True
        )
        input_ids = tf.convert_to_tensor([bert_input['input_ids']])
        token_type_ids = tf.convert_to_tensor([bert_input['token_type_ids']])
        attention_mask = tf.convert_to_tensor([bert_input['attention_mask']])
        outputs = self.model(input_ids, token_type_ids, attention_mask)

        return outputs.numpy()
Example #24
def get_features(text: str, tokenizer: BertTokenizer,
                 max_length: int) -> Dict[str, np.ndarray]:
    text = text.lower()
    inputs = tokenizer.encode_plus(text,
                                   "",
                                   add_special_tokens=True,
                                   max_length=max_length)
    input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
    attention_mask = [1] * len(input_ids)

    padding_length = max_length - len(input_ids)
    input_ids = input_ids + ([0] * padding_length)
    attention_mask = attention_mask + ([0] * padding_length)
    token_type_ids = token_type_ids + ([0] * padding_length)

    return {
        "input_ids": np.array(input_ids),
        "token_type_ids": np.array(token_type_ids),
        "attention_mask": np.array(attention_mask),
    }
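Because the features come back as plain NumPy arrays, batching is just stacking; a minimal sketch (the tokenizer instance, texts, and max_length are assumptions):

import numpy as np

texts = ["first example text", "second example text"]
feats = [get_features(t, tokenizer, max_length=128) for t in texts]
batch = {k: np.stack([f[k] for f in feats]) for k in feats[0]}
print(batch["input_ids"].shape)  # (2, 128)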
Example #25
    def encode(self, tokenizer: BertTokenizer) -> Tuple[np.ndarray, ...]:
        """
        Encode this example as BERT input

        Args:
            tokenizer: BERT tokenizer to use for encoding

        Returns:
            Tuple of encoded BERT inputs: label, input_ids, attention_mask, token_type_ids
        """
        inputs = tokenizer.encode_plus(
            self.question, self.answer, add_special_tokens=True,
            max_length=MAX_SEQUENCE_LENGTH, truncation=True, padding='max_length'
        )

        label = np.array(self.label)
        input_ids = np.array(inputs.input_ids)
        attention_mask = np.array(inputs.attention_mask)
        token_type_ids = np.array(inputs.token_type_ids)

        return label, input_ids, attention_mask, token_type_ids
Example #26
def convert_ocnli_example(ex_idx, example: OcnliExample, max_seq_len,
                          tokenizer: BertTokenizer):
    """
    convert trigger examples to trigger features
    """
    set_type = example.set_type
    text_a = example.text_a
    text_b = example.text_b
    raw_label = example.label

    tokens_a = fine_grade_tokenize(text_a, tokenizer)
    tokens_b = fine_grade_tokenize(text_b, tokenizer)

    labels = raw_label

    encode_dict = tokenizer.encode_plus(text=tokens_a,
                                        text_pair=tokens_b,
                                        max_length=max_seq_len,
                                        padding='max_length',
                                        is_split_into_words=True,
                                        truncation=True,
                                        return_token_type_ids=True,
                                        return_attention_mask=True)

    token_ids = encode_dict['input_ids']
    attention_masks = encode_dict['attention_mask']
    token_type_ids = encode_dict['token_type_ids']

    if ex_idx < 3 and set_type == 'train':
        logger.info(f"*** {set_type}_example-{ex_idx} ***")
        logger.info(f'text_a: {" ".join(tokens_a)}')
        logger.info(f"token_ids: {token_ids}")
        logger.info(f"attention_masks: {attention_masks}")
        logger.info(f"token_type_ids: {token_type_ids}")

    feature = OcnliFeature(token_ids=token_ids,
                           attention_masks=attention_masks,
                           token_type_ids=token_type_ids,
                           labels=labels)

    return feature
Example #27
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--device",
                        default="0",
                        type=str,
                        required=False,
                        help="生成设备")
    parser.add_argument("--length",
                        default=-1,
                        type=int,
                        required=False,
                        help="生成长度")
    parser.add_argument("--batch_size",
                        default=1,
                        type=int,
                        required=False,
                        help="生成的batch size")
    parser.add_argument("--nsamples",
                        default=10,
                        type=int,
                        required=False,
                        help="生成几个样本")
    parser.add_argument("--temperature",
                        default=1,
                        type=float,
                        required=False,
                        help="生成温度")
    parser.add_argument("--topk",
                        default=8,
                        type=int,
                        required=False,
                        help="最高几选一")
    parser.add_argument("--topp",
                        default=0,
                        type=float,
                        required=False,
                        help="最高积累概率")
    parser.add_argument(
        "--model_config",
        default="config/model_config.json",
        type=str,
        required=False,
        help="模型参数",
    )
    parser.add_argument(
        "--tokenizer_path",
        default="vocab/vocab.txt",
        type=str,
        required=False,
        help="词表路径",
    )
    parser.add_argument(
        "--model_path",
        default="model/epoch=0-step=99.ckpt",
        type=str,
        required=False,
        help="模型路径",
    )
    parser.add_argument("--prefix",
                        default="我",
                        type=str,
                        required=False,
                        help="生成文章的开头")
    parser.add_argument("--no_wordpiece",
                        action="store_true",
                        help="不做word piece切词")
    parser.add_argument("--segment", action="store_true", help="中文以词为单位")
    parser.add_argument("--fast_pattern",
                        action="store_true",
                        help="采用更加快的方式生成文本")
    parser.add_argument("--save_samples", action="store_true", help="保存产生的样本")
    parser.add_argument("--save_samples_path",
                        default=".",
                        type=str,
                        required=False,
                        help="保存样本的路径")
    parser.add_argument("--repetition_penalty",
                        default=1.0,
                        type=float,
                        required=False)

    args = parser.parse_args()
    print("args:\n" + args.__repr__())

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # 此处设置程序使用哪些显卡
    length = args.length
    batch_size = args.batch_size
    nsamples = args.nsamples
    temperature = args.temperature
    topk = args.topk
    topp = args.topp
    repetition_penalty = args.repetition_penalty

    device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenizer = BertTokenizer(vocab_file=args.tokenizer_path)
    model_config = GPT2Config.from_json_file(args.model_config)
    model = GPT2LMHeadModel(config=model_config)
    state_dict = {
        key[6:]: value
        for key, value in torch.load(args.model_path, map_location="cpu")
        ["state_dict"].items()
    }
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()

    for i in range(nsamples):  # generate the requested number of samples
        raw_text = args.prefix
        encoded = tokenizer.encode_plus(raw_text)["input_ids"]
        out = sample_sequence(
            model,
            encoded,
            length=512,
            n_ctx=1024,
            tokenizer=tokenizer,
            temperature=temperature,
            top_k=topk,
            top_p=topp,
            repitition_penalty=repetition_penalty,
            device=device,
        )
        print(tokenizer.decode(out))
Example #28
max_len = 0
for sent in sentences:
    input_ids = tokenizer.encode(sent, add_special_tokens=True)
    max_len = max(max_len, len(input_ids))
print('Max sentence length: ', max_len)

MAX_LEN = 64

# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids = []
attention_masks = []
for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
                    sent,
                    add_special_tokens = True,
                    max_length = 64,
                    truncation = True,
                    padding='max_length',
                    return_token_type_ids=False,
                    return_attention_mask = True,
                    return_tensors = 'pt',
                   )
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)
Example #29
def preprocess(tokenizer: BertTokenizer, x: Dict) -> Dict:

    choices_features = []

    option: str
    for option in x["options"]:
        text_a = x["article"]
        if x["question"].find("_") != -1:
            text_b = x["question"].replace("_", option)
        else:
            text_b = x["question"] + " " + option

        # 1) tokenize a raw text,
        # 2) replace tokens with corresponding ids,
        # 3) insert special tokens for BERT.
        # Use BertTokenizer to encode (tokenize / indexize) two sentences.
        inputs = tokenizer.encode_plus(text_a,
                                       text_b,
                                       add_special_tokens=True,
                                       max_length=MAX_LEN)

        # Output of `tokenizer.encode_plus` is a dictionary.
        input_ids, token_type_ids = inputs["input_ids"], inputs[
            "token_type_ids"]

        # For BERT, we need `attention_mask` along with `input_ids` as input.
        attention_mask = [1] * len(input_ids)

        # Pad sequences.
        pad_token_id = tokenizer.pad_token_id
        padding_length = MAX_LEN - len(input_ids)

        input_ids = input_ids + ([pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_id] * padding_length)

        assert len(
            input_ids) == MAX_LEN, "Error with input length {} vs {}".format(
                len(input_ids), MAX_LEN)
        assert len(attention_mask
                   ) == MAX_LEN, "Error with input length {} vs {}".format(
                       len(attention_mask), MAX_LEN)
        assert len(token_type_ids
                   ) == MAX_LEN, "Error with input length {} vs {}".format(
                       len(token_type_ids), MAX_LEN)

        choices_features.append({
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
        })

    labels = label_map.get(x["answer"], -1)

    # Just a python list to `torch.tensor`
    label = torch.tensor(labels).long()

    # What we return will be one instance of the batch that `LightningModule.train_step` receives.
    return {
        "id":
        x["id"],
        "label":
        label,
        "input_ids":
        torch.tensor([cf["input_ids"] for cf in choices_features]),
        "attention_mask":
        torch.tensor([cf["attention_mask"] for cf in choices_features]),
        "token_type_ids":
        torch.tensor([cf["token_type_ids"] for cf in choices_features]),
    }
Example #30
def preprocess(tokenizer: BertTokenizer, x: Dict) -> Dict:
    # `x` contains one sample from the lineflow dataset; the fields this
    # function reads are "string1", "string2", and "quality". Example:
    # {
    #    "string1": "A man is playing a guitar.",
    #    "string2": "A man plays an instrument.",
    #    "quality": "1"
    # }

    # Use BertTokenizer to encode (tokenize / indexize) two sentences.
    inputs = tokenizer.encode_plus(
        x["string1"],
        x["string2"],
        add_special_tokens=True,
        max_length=MAX_LEN,
    )

    # Output of `tokenizer.encode_plus` is a dictionary.
    input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
    # For BERT, we need `attention_mask` along with `input_ids` as input.
    attention_mask = [1] * len(input_ids)
    # We are going to pad sequences.
    padding_length = MAX_LEN - len(input_ids)
    pad_id = tokenizer.pad_token_id
    input_ids = input_ids + ([pad_id] * padding_length)
    attention_mask = attention_mask + ([0] * padding_length)
    token_type_ids = token_type_ids + ([pad_id] * padding_length)

    input_ids, masked_lm_positions, masked_lm_ids = create_masked_lm_predictions(
        input_ids, masked_lm_prob, max_predictions_per_seq, tokenizer, rng)

    masked_lm_weights = [1.0] * len(masked_lm_ids)

    padding_length = max_predictions_per_seq - len(masked_lm_positions)
    masked_lm_positions = masked_lm_positions + ([0] * padding_length)
    masked_lm_ids = masked_lm_ids + ([pad_id] * padding_length)
    masked_lm_weights = masked_lm_weights + ([0.0] * padding_length)

    assert len(
        input_ids) == MAX_LEN, "Error with input length {} vs {}".format(
            len(input_ids), MAX_LEN)
    assert len(
        attention_mask) == MAX_LEN, "Error with input length {} vs {}".format(
            len(attention_mask), MAX_LEN)
    assert len(
        token_type_ids) == MAX_LEN, "Error with input length {} vs {}".format(
            len(token_type_ids), MAX_LEN)
    assert len(
        masked_lm_positions
    ) == max_predictions_per_seq, "Error with input length {} vs {}".format(
        len(masked_lm_positions), max_predictions_per_seq)
    assert len(
        masked_lm_ids
    ) == max_predictions_per_seq, "Error with input length {} vs {}".format(
        len(masked_lm_ids), max_predictions_per_seq)

    # Just a python list to `torch.tensor`
    label = torch.tensor(int(x["quality"])).long()
    input_ids = torch.tensor(input_ids)
    attention_mask = torch.tensor(attention_mask)
    token_type_ids = torch.tensor(token_type_ids)
    masked_lm_positions = torch.tensor(masked_lm_positions)
    masked_lm_ids = torch.tensor(masked_lm_ids)
    masked_lm_weights = torch.tensor(masked_lm_weights)

    # What we return will be one instance of the batch that `LightningModule.train_step` receives.
    return {
        "label": label,
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "token_type_ids": token_type_ids,
        "masked_lm_weights": masked_lm_weights,
        "masked_lm_positions": masked_lm_positions,
        "masked_lm_ids": masked_lm_ids
    }
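This example leans on a module-level create_masked_lm_predictions helper that is not shown. Below is a minimal sketch of what such a helper could look like, following the standard BERT 80/10/10 masking recipe; the exact implementation is an assumption (rng is expected to be a random.Random instance):

def create_masked_lm_predictions(input_ids, masked_lm_prob,
                                 max_predictions_per_seq, tokenizer, rng):
    # Candidate positions: every non-special, non-padding token.
    special = {tokenizer.cls_token_id, tokenizer.sep_token_id,
               tokenizer.pad_token_id}
    candidates = [i for i, t in enumerate(input_ids) if t not in special]
    rng.shuffle(candidates)
    num_to_mask = min(max_predictions_per_seq,
                      max(1, int(round(len(candidates) * masked_lm_prob))))
    positions = sorted(candidates[:num_to_mask])
    labels = [input_ids[p] for p in positions]
    output_ids = list(input_ids)
    for p in positions:
        r = rng.random()
        if r < 0.8:    # 80%: replace with [MASK]
            output_ids[p] = tokenizer.mask_token_id
        elif r < 0.9:  # 10%: replace with a random vocabulary token
            output_ids[p] = rng.randint(0, tokenizer.vocab_size - 1)
        # remaining 10%: keep the original token
    return output_ids, positions, labels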