Example #1
    def __init__(self, config, *args, **kwargs):

        tokenizer_config = config.tokenizer_config
        self._tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_config.type, **tokenizer_config.params
        )

        self._max_seq_length = config.max_seq_length
        self._probability = getattr(config, "mask_probability", 0.15)
Example #2
 def __init__(self):
     model = AutoModelForQuestionAnswering.from_pretrained("/model/model")
     tokenizer = AutoTokenizer.from_pretrained("/model/tokenizer")
     self.default_response = "Perhaps the answer is 42."
     self.predictor = pipeline("question-answering",
                               model=model,
                               tokenizer=tokenizer)
     with open("/mounts/bert_context/paragraph.txt") as f:
         self.context = f.read()
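
A hedged sketch of how this handler might serve a request (the method name predict and the fallback logic are assumptions; only the attributes set in the __init__ above come from the snippet):

    def predict(self, question: str) -> str:
        # Run the question-answering pipeline against the pre-loaded context paragraph.
        result = self.predictor(question=question, context=self.context)
        # Fall back to the canned response if the pipeline returns no usable answer.
        return result.get("answer") or self.default_response
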
Example #3
    def __init__(self, pretrained_model: str):
        self._pretrained_model = pretrained_model

        self._init_kwargs = {}
        self._kwargs = {}

        if pretrained_model.startswith('roberta-'):
            self._kwargs['add_prefix_space'] = True

        self._tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model, **self._init_kwargs)
Example #4
    def load(cls,
             model_name: str,
             cache_tokenizer: bool = True) -> AutoTokenizer:
        if model_name in cls._cache:
            return PretrainedAutoTokenizer._cache[model_name]

        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if cache_tokenizer:
            cls._cache[model_name] = tokenizer

        return tokenizer
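
A minimal sketch of the surrounding class and a call site for the caching loader above (the class name PretrainedAutoTokenizer is taken from the snippet; the @classmethod decorator and the _cache initializer are assumptions, since the snippet starts mid-class):

    from transformers import AutoTokenizer

    class PretrainedAutoTokenizer:
        _cache = {}

        @classmethod
        def load(cls, model_name, cache_tokenizer=True):
            # Reuse a previously loaded tokenizer instead of reading it from disk again.
            if model_name in cls._cache:
                return cls._cache[model_name]
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            if cache_tokenizer:
                cls._cache[model_name] = tokenizer
            return tokenizer

    # The second call returns the cached object created by the first call.
    assert PretrainedAutoTokenizer.load("bert-base-uncased") is PretrainedAutoTokenizer.load("bert-base-uncased")
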
Example #5
 def test_as_array_produces_token_sequence_roberta_sentence_pair(self):
     tokenizer = AutoTokenizer.from_pretrained("roberta-base")
     allennlp_tokenizer = PretrainedTransformerTokenizer("roberta-base")
     indexer = PretrainedTransformerIndexer(model_name="roberta-base")
     default_format = "<s> AllenNLP is great! </s> </s> Really it is! </s>"
     tokens = tokenizer.tokenize(default_format)
     expected_ids = tokenizer.convert_tokens_to_ids(tokens)
     allennlp_tokens = allennlp_tokenizer.tokenize_sentence_pair(
         "AllenNLP is great!", "Really it is!")
     vocab = Vocabulary()
     indexed = indexer.tokens_to_indices(allennlp_tokens, vocab, "key")
     assert indexed["key"] == expected_ids
Example #6
 def __init__(self,
              model_name: str,
              namespace: str = "tags",
              token_min_padding_length: int = 0) -> None:
     super().__init__(token_min_padding_length)
     self._namespace = namespace
     self._tokenizer = AutoTokenizer.from_pretrained(model_name)
     self._padding_value = self._tokenizer.convert_tokens_to_ids(
         [self._tokenizer.pad_token])[0]
     logger.info(
         f"Using token indexer padding value of {self._padding_value}")
     self._added_to_vocabulary = False
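
For reference, Hugging Face tokenizers also expose this padding id directly as pad_token_id, so the lookup above can be cross-checked with a short sketch (not part of the snippet):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    # Both expressions resolve the [PAD] token to the same vocabulary id.
    assert tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0] == tokenizer.pad_token_id
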
Example #7
 def test_as_array_produces_token_sequence_bert_cased_sentence_pair(self):
     tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
     allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-cased")
     indexer = PretrainedTransformerIndexer(model_name="bert-base-cased")
     default_format = "[CLS] AllenNLP is great! [SEP] Really it is! [SEP]"
     tokens = tokenizer.tokenize(default_format)
     expected_ids = tokenizer.convert_tokens_to_ids(tokens)
     allennlp_tokens = allennlp_tokenizer.tokenize_sentence_pair(
         "AllenNLP is great!", "Really it is!")
     vocab = Vocabulary()
     indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
     assert indexed["token_ids"] == expected_ids
Example #8
    def __init__(self,
                 data_path,
                 img_path,
                 tsv_path,
                 tr_name,
                 max_seq_len,
                 max_seq_len_title=None,
                 train=True):
        super().__init__()
        # self.name = data_path
        # self.splits = splits.split(",")
        self.path = data_path
        self.img_path = img_path
        self.train = train
        self.tokenizer = AutoTokenizer.from_pretrained(tr_name)
        self.max_seq_len = max_seq_len
        self.max_seq_len_title = max_seq_len_title
        # Loading datasets to data
        self.raw_data = []
        # for split in self.splits:
        #     path = os.path.join("data/", f"{split}.jsonl")
        #     self.raw_data.extend(
        #             [json.loads(jline) for jline in open(path, "r").read().split('\n')]
        #     )
        # print("Load %d data from split(s) %s." % (len(self.raw_data), self.name))
        # self.raw_data  = [json.loads(jline) for jline in open(self.path,"r").read().split('\n')]
        self.raw_data = [json.loads(jline) for jline in open(self.path, "r")]

        # List to dict (for evaluation and others)
        self.id2datum = {datum["id"]: datum for datum in self.raw_data}

        # Loading detection features to img_data
        img_data = []

        # path = "data/HM_img.tsv"
        img_data.extend(load_obj_tsv(tsv_path, self.id2datum.keys()))

        # Convert img list to dict
        self.imgid2img = {}
        for img_datum in img_data:
            # Adding int here to convert 0625 to 625
            self.imgid2img[int(img_datum['img_id'])] = img_datum

        # Only keep the data with loaded image features
        self.data = []
        for datum in self.raw_data:
            # In HM the Img Id field is simply "id"
            if datum['id'] in self.imgid2img:
                self.data.append(datum)

        print("Use %d data in torch dataset" % (len(self.data)))
        print()
Example #9
 def test_as_array_produces_token_sequence_roberta(self):
     tokenizer = AutoTokenizer.from_pretrained("roberta-base")
     allennlp_tokenizer = PretrainedTransformerTokenizer("roberta-base")
     indexer = PretrainedTransformerIndexer(model_name="roberta-base")
     string_specials = "<s> AllenNLP is great </s>"
     string_no_specials = "AllenNLP is great"
     tokens = tokenizer.tokenize(string_specials)
     expected_ids = tokenizer.convert_tokens_to_ids(tokens)
     # tokens tokenized with our pretrained tokenizer have indices in them
     allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
     vocab = Vocabulary()
     indexed = indexer.tokens_to_indices(allennlp_tokens, vocab, "key")
     assert indexed["key"] == expected_ids
Example #10
 def __init__(
     self,
     model_name: str,
     add_special_tokens: bool = True,
     max_length: int = None,
     stride: int = 0,
     truncation_strategy: str = "longest_first",
 ) -> None:
     self._tokenizer = AutoTokenizer.from_pretrained(model_name)
     self._add_special_tokens = add_special_tokens
     self._max_length = max_length
     self._stride = stride
     self._truncation_strategy = truncation_strategy
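
The stored options mirror arguments of the Hugging Face encoding API; a rough stand-alone sketch of the underlying call (the model name is illustrative, and the assumption that max_length, stride and truncation_strategy map to encode_plus arguments of the same names reflects the transformers 2.x-era API this snippet appears to target):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    # add_special_tokens controls whether [CLS]/[SEP] (or <s>/</s>) are inserted around the text.
    ids = tokenizer.encode("AllenNLP is great", add_special_tokens=True)
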
Example #11
 def test_as_array_produces_token_sequence_bert_cased(self):
     tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
     allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-cased")
     indexer = PretrainedTransformerIndexer(model_name="bert-base-cased")
     string_specials = "[CLS] AllenNLP is great [SEP]"
     string_no_specials = "AllenNLP is great"
     tokens = tokenizer.tokenize(string_specials)
     expected_ids = tokenizer.convert_tokens_to_ids(tokens)
     # tokens tokenized with our pretrained tokenizer have indices in them
     allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
     vocab = Vocabulary()
     indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
     assert indexed["token_ids"] == expected_ids
Example #12
 def check_vocab_size(model_name: str):
     namespace = "tags"
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
     indexer = PretrainedTransformerIndexer(model_name=model_name,
                                            namespace=namespace)
     allennlp_tokens = allennlp_tokenizer.tokenize("AllenNLP is great!")
     vocab = Vocabulary()
     # here we copy entire transformers vocab
     indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
     del indexed
     assert vocab.get_vocab_size(
         namespace=namespace) == tokenizer.vocab_size
Example #13
 def test_transformers_vocabs_added_correctly(self):
     namespace, model_name = "tags", "roberta-base"
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
     indexer = PretrainedTransformerIndexer(model_name=model_name,
                                            namespace=namespace)
     allennlp_tokens = allennlp_tokenizer.tokenize("AllenNLP is great!")
     vocab = Vocabulary()
     # here we copy entire transformers vocab
     indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
     del indexed
     assert vocab.get_token_to_index_vocabulary(
         namespace=namespace) == tokenizer.encoder
Example #14
    def __init__(self, model_name: str, max_length: int = None) -> None:
        super().__init__()
        self.transformer_model = AutoModel.from_pretrained(model_name)
        self._max_length = max_length
        # I'm not sure if this works for all models; open an issue on github if you find a case
        # where it doesn't work.
        self.output_dim = self.transformer_model.config.hidden_size

        tokenizer = AutoTokenizer.from_pretrained(model_name)
        (
            self._num_added_start_tokens,
            self._num_added_end_tokens,
        ) = PretrainedTransformerIndexer.determine_num_special_tokens_added(tokenizer)
        self._num_added_tokens = self._num_added_start_tokens + self._num_added_end_tokens
Example #15
    def __init__(self, config, *args, **kwargs):
        # https://huggingface.co/transformers/model_doc/xlmroberta.html
        # roberta is with different tokenization of above default (bert)
        tokenizer_config = config.tokenizer_config
        self._tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_config.type, **tokenizer_config.params)

        self._CLS_TOKEN = self._tokenizer.bos_token  # <s>
        self._SEP_TOKEN = self._tokenizer.sep_token  # </s>
        self._MASK_TOKEN = self._tokenizer.mask_token  # <mask>
        self._PAD_TOKEN_ID = self._tokenizer.pad_token_id  # 1

        self._max_seq_length = config.max_seq_length
        self._probability = getattr(config, "mask_probability", 0.15)
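
A quick way to inspect the special tokens referenced in the comments above (a sketch; the checkpoint name xlm-roberta-base is an assumption based on the docstring link in the snippet):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("xlm-roberta-base")
    # Expected per the comments above: <s> </s> <mask> 1
    print(tok.bos_token, tok.sep_token, tok.mask_token, tok.pad_token_id)
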
Example #16
    def load_datasets(self):
        model_name_or_path = "monologg/koelectra-base-discriminator"
        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

        utters, label_map, answers = self.utters, self.label_map, self.answers

        def gen_datasets(split=0.1):
            train_dataset, test_dataset = [], []
            train_label, test_label = [], []

            for l, u in utters.items():
                assert len(u) > 1, "# of utterances per label should be > 1"
                labels = [l] * len(u)
                ix = max(1, int(len(u) * split))
                test_dataset.extend(u[:ix])
                test_label.extend(labels[:ix])
                train_dataset.extend(u[ix:])
                train_label.extend(labels[ix:])
            return train_dataset, train_label, test_dataset, test_label

        train_dataset, train_label, test_dataset, test_label = gen_datasets()

        train_dataset = tokenizer(train_dataset,
                                  return_tensors='pt',
                                  padding='max_length',
                                  truncation=True)
        test_dataset = tokenizer(test_dataset,
                                 return_tensors='pt',
                                 padding='max_length',
                                 truncation=True)

        train_dataset['labels'] = train_label
        test_dataset['labels'] = test_label

        keys = list(train_dataset.keys())
        train_dataset = [
            dict(zip(keys, v)) for v in zip(*train_dataset.values())
        ]
        test_dataset = [
            dict(zip(keys, v)) for v in zip(*test_dataset.values())
        ]

        train_dataset = WellnessDataset(train_dataset)
        test_dataset = WellnessDataset(test_dataset)

        return train_dataset, test_dataset
Example #17
    def compute_metrics(pred):
        labels_ids = pred.label_ids
        pred_ids = pred.predictions
        tokenizer = AutoTokenizer.from_pretrained(args.tr)
        # Replace the -100 ids used to mask the loss before decoding; they are not valid vocabulary ids.
        labels_ids[labels_ids == -100] = tokenizer.pad_token_id
        label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
        pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)

        rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

        return {
            "rouge2_precision": round(rouge_output.precision, 4),
            "rouge2_recall": round(rouge_output.recall, 4),
            "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
        }
Example #18
    def __init__(
        self,
        model_name: str,
        add_special_tokens: bool = True,
        max_length: Optional[int] = None,
        stride: int = 0,
        truncation_strategy: str = "longest_first",
        calculate_character_offsets: bool = False,
        tokenizer_kwargs: Optional[Dict[str, Any]] = None,
    ) -> None:
        if tokenizer_kwargs is None:
            tokenizer_kwargs = {}
        else:
            tokenizer_kwargs = tokenizer_kwargs.copy()
        if "use_fast" in tokenizer_kwargs:
            if tokenizer_kwargs["use_fast"]:
                logger.warning(
                    "Fast huggingface tokenizers are known to break in certain scenarios."
                )
        else:
            tokenizer_kwargs["use_fast"] = False
        # As of transformers==2.8.0, fast tokenizers are broken.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                       **tokenizer_kwargs)

        # Huggingface tokenizers have different ways of remembering whether they lowercase or not. Detecting it
        # this way seems like the least brittle way to do it.
        tokenized = self.tokenizer.tokenize(
            "A")  # Use a single character that won't be cut into word pieces.
        detokenized = " ".join(tokenized)
        self._tokenizer_lowercases = "a" in detokenized

        self._add_special_tokens = add_special_tokens
        self._max_length = max_length
        self._stride = stride
        self._truncation_strategy = truncation_strategy
        self._calculate_character_offsets = calculate_character_offsets

        (
            self.num_added_start_tokens,
            self.num_added_middle_tokens,
            self.num_added_end_tokens,
        ) = self._determine_num_special_tokens_added()
Example #19
def run_inference(ckpt_path=None):
    if ckpt_path is None:
        ckpt_path = "lightning_logs/version_0/checkpoints/epoch=24.ckpt"

    import pprint
    import random
    parser = ArgumentParser()
    parser = Trainer.add_argparse_args(parser)
    parser = WellnessClassifier.add_model_specific_args(parser)
    args = parser.parse_args()

    model = WellnessClassifier(args, 359)
    ckpt = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
    model.load_state_dict(ckpt['state_dict'])

    model_name_or_path = "monologg/koelectra-base-discriminator"
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    utters, label_map, answers = prep.load()
    label_map = {v: k for k, v in label_map.items()}

    samples = [
        "벽에 머리를 부딪히는 느낌이야", "허리가 아파서 움직임이 어렵네ㅠㅠ", "집중력도 떨어지고 기분이 좋지 않아",
        "나는 화가 통제가 안돼!", "히잉?", "나 자해 할거야", "팔다리가 너무 저려", "방에만 있고 싶어",
        "스트레스 너무 많이 받아서 잠이 안와", "난바부야 기억을 하나두 못하겠어", "다이어트 하고싶은데 맘처럼 안되네",
        "요즘은 이상한 생각이 많이 들어", "부정적인 생각이 많이 드네", "사고 휴유증이 있는걸까", "체력이 떨어져서 문제야",
        "으악! 꽥!", "요즘 비둘기 무서워", "감정이 왔다갔다해요.", "화가 많이 날때는 감정 조절이 안되어여",
        "요즘 잠이 안와요", "입맛도 통 없구", "기분이 우울해서 큰일이야", "나는 아무것도 잘한게 없는걸?",
        "모든걸 내 마음대로 하고 싶을 때 있잖아", "무엇이 불안한지 잘 모르겠어"
    ]
    model.eval()
    inputs_ = tokenizer(samples,
                        return_tensors='pt',
                        padding='max_length',
                        truncation=True)
    ixs = torch.argmax(model(**inputs_)[0], dim=-1).tolist()
    res = [random.choice(answers.get(ix, ['None'])) for ix in ixs]
    labels = [label_map[ix] for ix in ixs]

    for s, l, r in zip(samples, labels, res):
        print(f"{l}|{s}=>{r}")
Example #20
    def test_long_sequence_splitting(self):
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        allennlp_tokenizer = PretrainedTransformerTokenizer(
            "bert-base-uncased")
        indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased",
                                               max_length=4)
        string_specials = "[CLS] AllenNLP is great [SEP]"
        string_no_specials = "AllenNLP is great"
        tokens = tokenizer.tokenize(string_specials)
        expected_ids = tokenizer.convert_tokens_to_ids(tokens)
        assert len(
            expected_ids) == 7  # just to make sure it's what we're expecting
        cls_id, sep_id = expected_ids[0], expected_ids[-1]
        expected_ids = (expected_ids[:3] + [sep_id, cls_id] +
                        expected_ids[3:5] + [sep_id, cls_id] +
                        expected_ids[5:])

        allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
        vocab = Vocabulary()
        indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
        assert indexed["token_ids"] == expected_ids
        assert indexed["segment_concat_mask"] == [1] * len(expected_ids)
        assert indexed["mask"] == [1] * 7  # original length
Example #21
    def __init__(self,
                 model_name: str,
                 namespace: str = "tags",
                 max_length: int = None,
                 **kwargs) -> None:
        super().__init__(**kwargs)
        self._namespace = namespace
        self._tokenizer = AutoTokenizer.from_pretrained(model_name)
        self._added_to_vocabulary = False

        (
            self._num_added_start_tokens,
            self._num_added_end_tokens,
        ) = self.__class__.determine_num_special_tokens_added(self._tokenizer)

        self._max_length = max_length
        if self._max_length is not None:
            self._effective_max_length = (  # we need to take into account special tokens
                self._max_length - self._tokenizer.num_added_tokens())
            if self._effective_max_length <= 0:
                raise ValueError(
                    "max_length needs to be greater than the number of special tokens inserted."
                )
Example #22
    def __init__(
        self,
        model_name: str,
        add_special_tokens: bool = True,
        max_length: int = None,
        stride: int = 0,
        truncation_strategy: str = "longest_first",
        calculate_character_offsets: bool = False,
    ) -> None:
        self._tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Huggingface tokenizers have different ways of remembering whether they lowercase or not. Detecting it
        # this way seems like the least brittle way to do it.
        tokenized = self._tokenizer.tokenize(
            "FOO"
        )  # Use a short word that's unlikely to be cut into word pieces.
        detokenized = " ".join(tokenized)
        self._tokenizer_lowercases = "foo" in detokenized

        self._add_special_tokens = add_special_tokens
        self._max_length = max_length
        self._stride = stride
        self._truncation_strategy = truncation_strategy
        self._calculate_character_offsets = calculate_character_offsets
Example #23
 def __init__(self, model_name):
     self.tokenizer = AutoTokenizer.from_pretrained(model_name)
     self.model = BertModel.from_pretrained(model_name).eval()
     self.model.cuda()
Example #24
    def __init__(self,
                 model_name: str,
                 do_lowercase: bool,
                 namespace: str = "tags",
                 token_min_padding_length: int = 0,
                 max_pieces: int = 512,
                 use_starting_offsets: bool = True,
                 truncate_long_sequences: bool = True,
                 start_sub_words: List[str] = None,
                 end_sub_words: List[str] = None,
                 separator_sub_word: str = "[SEP]",
                 never_lowercase: List[str] = None) -> None:
        super().__init__(token_min_padding_length)
        if model_name.endswith("-cased") and do_lowercase:
            logger.warning("Your pretrained model appears to be cased, "
                           "but your indexers is lowercasing tokens.")
        elif model_name.endswith("-uncased") and not do_lowercase:
            logger.warning("Your pretrained model appears to be uncased, "
                           "but your indexers is not lowercasing tokens.")
        self._model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name, do_lower_case=do_lowercase)
        self._namespace = namespace
        self._added_to_vocabulary = False
        self._padding_value = self.tokenizer.convert_tokens_to_ids(
            [self.tokenizer.pad_token])[0]
        logger.info(
            f"Using token indexer padding value of {self._padding_value}")

        self._never_lowercase = never_lowercase or []
        self._use_starting_offsets = use_starting_offsets
        self._max_pieces = max_pieces
        self._truncate_long_sequences = truncate_long_sequences
        self._do_lowercase = do_lowercase

        if start_sub_words:
            self._start_sub_words = start_sub_words
        else:
            if 'roberta' in model_name:
                self._start_sub_words = ['<s>']
            elif 'bert' in model_name:
                self._start_sub_words = ['[CLS]']
            elif 'xlm' in model_name:
                self._start_sub_words = ['</s>']
            elif 'gpt' in model_name or 'transfo' in model_name:
                self._start_sub_words = []
            else:
                raise ValueError("strange input")

        if end_sub_words:
            self._end_sub_words = end_sub_words
        else:
            if 'roberta' in model_name or 'xlm' in model_name:
                self._end_sub_words = ['</s>']
            elif 'bert' in model_name:
                self._end_sub_words = ['[SEP]']
            elif 'gpt' in model_name or 'transfo' in model_name:
                self._end_sub_words = []
            else:
                raise ValueError("strange input")
        # When padding sub_words, use the padding_value above; when padding the other fields, use this value
        self._other_padding_value = 0

        self._start_sub_word_ids = [
            self.tokenizer.convert_tokens_to_ids(sub_word)
            for token in (self._start_sub_words or [])
            for sub_word in self.tokenizer.tokenize(token)
        ]

        self._end_sub_word_ids = [
            self.tokenizer.convert_tokens_to_ids(sub_word)
            for token in (self._end_sub_words or [])
            for sub_word in self.tokenizer.tokenize(token)
        ]

        self._separator_ids = [
            self.tokenizer.convert_tokens_to_ids(sub_word)
            for sub_word in self.tokenizer.tokenize(separator_sub_word)
        ]
Example #25
def single_format_to_bert(params):
    # length_limit = 510
    read_json_file, wt_pt_file, oracle_mode, oracle_sent_num, bert_model_name, min_src_ntokens, max_src_ntokens, min_nsents, max_nsents, length_limit = params
    # print(read_json_file)
    # TODO keep file exist check
    from transformers.tokenization_auto import AutoTokenizer
    # tokenizer = plm_tokenizers[bert_model_name]
    tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
    if os.path.exists(wt_pt_file):
        logger.info('Ignore %s' % wt_pt_file)
        return
    print("working on {}".format(wt_pt_file))
    bert_data = MSBertData(min_src_ntokens, max_src_ntokens, min_nsents, max_nsents, tokenizer, bert_model_name)

    logger.info('Processing %s' % read_json_file)
    jobs = json.load(open(read_json_file))
    datasets = []
    for d in jobs:
        disco_dep = d['disco_dep']

        # disco_graph_links = d['disco_graph_links']
        disco_links = d['disco_link']  #####

        tgt_list_str = d['tgt_list_str']
        tgt_tok_list_list_str = d['tgt_tok_list_list_str']

        span = d['disco_span']
        sent, doc_id, coref = d['sent'], d['doc_id'], d['coref']

        # First of all, assemble data and  LENGTH truncation
        budget = 0
        disco_bag = []
        sent_bag = []
        original_disco_txt_list_of_str = []
        for idx in range(len(sent)):

            this_sent = sent[idx]
            this_disco = span[idx]
            this_tokens = this_sent['tokens']
            this_tokens = [clean(x.lower()) for x in this_tokens]
            this_coref = this_sent['corefs']
            original_word_len = len(this_tokens)

            tmp_disco_bag = []
            for disc in this_disco:

                # tree_node = next(gen)
                start, end = disc
                disc_piece = DiscourseUnit(len(disco_bag) + len(tmp_disco_bag), idx, rel_start=start, rel_end=end)

                # disc_piece.add_dep_info(tree_node)
                disc_piece.add_dep(disco_dep)
                for jdx in range(start, end + 1):
                    _toks = this_tokens[jdx]

                    disc_piece.add_word(_toks,tokenizer)

                    # look at word jdx, see if any coref mentions applied.
                    _cor = this_coref[jdx]
                    if _cor != []:
                        disc_piece.add_mention(jdx)  # add the original index of the word in the sentence
                        for _c in _cor:
                            disc_piece.add_coref(_c)
                    # finish loading coref
                tmp_disco_bag.append(disc_piece)
                budget += disc_piece.get_bert_wp_length()
            budget += 2
            if budget > length_limit:
                break
            else:
                disco_bag += tmp_disco_bag
                original_disco_txt_list_of_str += [x.get_readable_words_as_list() for x in tmp_disco_bag]
                s = SentUnit(idx, this_tokens, [x.bert_word_pieces for x in tmp_disco_bag], tmp_disco_bag)
                sent_bag.append(s)

        effective_disco_number = len(disco_bag)
        # clean disco_graph_links
        disco_graph_links = [(tup[0] - 1, tup[1] - 1, tup[2]) for tup in disco_links if
                             (tup[0] <= effective_disco_number and tup[1] <= effective_disco_number)]

        disc_oracle_ids, disc_spans, disc_coref = bert_data.preprocess_disc(disco_bag, tgt_tok_list_list_str)

        src_tok_index, sent_oracle_labels, segments_ids, \
        cls_ids, original_sent_txt_list_of_str, tgt_txt = bert_data.preprocess_sent(sent_bag,bert_model_name,
                                                                                    summary=tgt_tok_list_list_str)
        # TO have: src_subtoken_idxs [for bert encoder], labels[sent level and discourse level],
        # segments_ids[for bert encoder],
        # cls_ids[for sent level],
        # span indexs [ for discourse level]
        # entity coref linking edge [ sent level and discourse level]
        # discourse linking edge [discourse level only]
        # src_txt, tgt_txt

        # provide two versions, one based on discourse, one without.
        # w. multiple oracle

        # prepare discourse data
        # oracle is computed based on discourse

        # prepare sent data

        b_data_dict = {"src": src_tok_index,
                       "labels": sent_oracle_labels,
                       "segs": segments_ids,
                       'clss': cls_ids,
                       'sent_txt': original_sent_txt_list_of_str,
                       'disco_txt': original_disco_txt_list_of_str,
                       # "tgt_txt": tgt_txt,
                       "tgt_list_str": tgt_list_str,  # unchanged reference summary for computing final score
                       "tgt_tok_list_list_str": tgt_tok_list_list_str,  # for oracle, tokenized
                       'd_labels': disc_oracle_ids,
                       'd_span': disc_spans,
                       'd_coref': disc_coref,
                       'd_graph': disco_graph_links,
                       'disco_dep': disco_dep,
                       'doc_id': doc_id

                       }
        if len(src_tok_index) < 15:
            continue
        datasets.append(b_data_dict)
    logger.info('Saving to %s' % wt_pt_file)
    torch.save(datasets, wt_pt_file)
    datasets = []
    gc.collect()
Example #26
    def _reverse_engineer_special_tokens(
        self,
        token_a: str,
        token_b: str,
        model_name: str,
        tokenizer_kwargs: Optional[Dict[str, Any]],
    ):
        # storing the special tokens
        self.sequence_pair_start_tokens = []
        self.sequence_pair_mid_tokens = []
        self.sequence_pair_end_tokens = []
        # storing token type ids for the sequences
        self.sequence_pair_first_token_type_id = None
        self.sequence_pair_second_token_type_id = None

        # storing the special tokens
        self.single_sequence_start_tokens = []
        self.single_sequence_end_tokens = []
        # storing token type id for the sequence
        self.single_sequence_token_type_id = None

        # Reverse-engineer the tokenizer for two sequences
        tokenizer_with_special_tokens = AutoTokenizer.from_pretrained(
            model_name, add_special_tokens=True, **tokenizer_kwargs)
        dummy_output = tokenizer_with_special_tokens.encode_plus(
            token_a,
            token_b,
            add_special_tokens=True,
            return_token_type_ids=True,
            return_attention_mask=False,
        )
        dummy_a = self.tokenizer.encode(token_a,
                                        add_special_tokens=False,
                                        add_prefix_space=True)[0]
        assert dummy_a in dummy_output["input_ids"]
        dummy_b = self.tokenizer.encode(token_b,
                                        add_special_tokens=False,
                                        add_prefix_space=True)[0]
        assert dummy_b in dummy_output["input_ids"]
        assert dummy_a != dummy_b

        seen_dummy_a = False
        seen_dummy_b = False
        for token_id, token_type_id in zip(dummy_output["input_ids"],
                                           dummy_output["token_type_ids"]):
            if token_id == dummy_a:
                if seen_dummy_a or seen_dummy_b:  # seeing a twice or b before a
                    raise ValueError(
                        "Cannot auto-determine the number of special tokens added."
                    )
                seen_dummy_a = True
                assert (
                    self.sequence_pair_first_token_type_id is None
                    or self.sequence_pair_first_token_type_id == token_type_id
                ), "multiple different token type ids found for the first sequence"
                self.sequence_pair_first_token_type_id = token_type_id
                continue

            if token_id == dummy_b:
                if seen_dummy_b:  # seeing b twice
                    raise ValueError(
                        "Cannot auto-determine the number of special tokens added."
                    )
                seen_dummy_b = True
                assert (
                    self.sequence_pair_second_token_type_id is None
                    or self.sequence_pair_second_token_type_id == token_type_id
                ), "multiple different token type ids found for the second sequence"
                self.sequence_pair_second_token_type_id = token_type_id
                continue

            token = Token(
                tokenizer_with_special_tokens.convert_ids_to_tokens(token_id),
                text_id=token_id,
                type_id=token_type_id,
            )
            if not seen_dummy_a:
                self.sequence_pair_start_tokens.append(token)
            elif not seen_dummy_b:
                self.sequence_pair_mid_tokens.append(token)
            else:
                self.sequence_pair_end_tokens.append(token)

        assert (len(self.sequence_pair_start_tokens) +
                len(self.sequence_pair_mid_tokens) +
                len(self.sequence_pair_end_tokens)
                ) == self.tokenizer.num_special_tokens_to_add(pair=True)

        # Reverse-engineer the tokenizer for one sequence
        dummy_output = tokenizer_with_special_tokens.encode_plus(
            token_a,
            add_special_tokens=True,
            return_token_type_ids=True,
            return_attention_mask=False,
            add_prefix_space=True,
        )

        seen_dummy_a = False
        for token_id, token_type_id in zip(dummy_output["input_ids"],
                                           dummy_output["token_type_ids"]):
            if token_id == dummy_a:
                if seen_dummy_a:
                    raise ValueError(
                        "Cannot auto-determine the number of special tokens added."
                    )
                seen_dummy_a = True
                assert (
                    self.single_sequence_token_type_id is None
                    or self.single_sequence_token_type_id == token_type_id
                ), "multiple different token type ids found for the sequence"
                self.single_sequence_token_type_id = token_type_id
                continue

            token = Token(
                tokenizer_with_special_tokens.convert_ids_to_tokens(token_id),
                text_id=token_id,
                type_id=token_type_id,
            )
            if not seen_dummy_a:
                self.single_sequence_start_tokens.append(token)
            else:
                self.single_sequence_end_tokens.append(token)

        assert (len(self.single_sequence_start_tokens) +
                len(self.single_sequence_end_tokens)
                ) == self.tokenizer.num_special_tokens_to_add(pair=False)
Example #27
def main():
    # model files check and download
    check_and_download_models(WEIGHT_PATH, MODEL_PATH, REMOTE_PATH)

    inputs = {
        "question": args.question,
        "context": args.context,
    }

    logger.info("Question : " + str(args.question))
    logger.info("Context : " + str(args.context))

    # Set defaults values
    handle_impossible_answer = False
    topk = 1
    max_answer_len = 15

    tokenizer = AutoTokenizer.from_pretrained('deepset/roberta-base-squad2')

    # Convert inputs to features
    examples = []

    if True:  # for i, item in enumerate(inputs):
        item = inputs
        logger.debug(item)
        if isinstance(item, dict):
            if any(k not in item for k in ["question", "context"]):
                raise KeyError("You need to provide a dictionary with keys "
                               "{question:..., context:...}")

            example = create_sample(**item)
            examples.append(example)

    features_list = [
        squad_convert_examples_to_features(
            examples=[example],
            tokenizer=tokenizer,
            max_seq_length=384,
            doc_stride=128,
            max_query_length=64,
            padding_strategy=PaddingStrategy.DO_NOT_PAD.value,
            is_training=False,
            tqdm_enabled=False,
        ) for example in examples
    ]

    all_answers = []
    for features, example in zip(features_list, examples):
        model_input_names = tokenizer.model_input_names + ["input_ids"]
        fw_args = {
            k: [feature.__dict__[k] for feature in features]
            for k in model_input_names
        }
        fw_args = {k: np.array(v) for (k, v) in fw_args.items()}

        logger.debug("Input" + str(fw_args))
        logger.debug("Shape" + str(fw_args["input_ids"].shape))

        net = ailia.Net(MODEL_PATH, WEIGHT_PATH, env_id=args.env_id)
        net.set_input_shape(fw_args["input_ids"].shape)
        if args.benchmark:
            logger.info('BENCHMARK mode')
            for i in range(5):
                start = int(round(time.time() * 1000))
                outputs = net.predict(fw_args)
                end = int(round(time.time() * 1000))
                logger.info("\tailia processing time {} ms".format(end -
                                                                   start))
        else:
            outputs = net.predict(fw_args)

        logger.debug("Output" + str(outputs))
        start, end = outputs[0:2]

        min_null_score = 1000000  # large and positive
        answers = []
        for (feature, start_, end_) in zip(features, start, end):
            # Ensure padded tokens & question tokens cannot belong
            # to the set of candidate answers.
            undesired_tokens = np.abs(np.array(feature.p_mask) - 1) & \
                feature.attention_mask

            # Generate mask
            undesired_tokens_mask = undesired_tokens == 0.0

            # Make sure non-context indexes in the tensor cannot contribute
            # to the softmax
            start_ = np.where(undesired_tokens_mask, -10000.0, start_)
            end_ = np.where(undesired_tokens_mask, -10000.0, end_)

            # Normalize logits and spans to retrieve the answer
            start_ = np.exp(
                start_ -
                np.log(np.sum(np.exp(start_), axis=-1, keepdims=True)))
            end_ = np.exp(end_ -
                          np.log(np.sum(np.exp(end_), axis=-1, keepdims=True)))

            if handle_impossible_answer:
                min_null_score = min(min_null_score,
                                     (start_[0] * end_[0]).item())

            # Mask CLS
            start_[0] = end_[0] = 0.0

            starts, ends, scores = decode(start_, end_, topk, max_answer_len)
            char_to_word = np.array(example.char_to_word_offset)

            # Convert the answer (tokens) back to the original text
            t2org = feature.token_to_orig_map
            answers += [{
                "score":
                score.item(),
                "start":
                np.where(char_to_word == t2org[s])[0][0].item(),
                "end":
                np.where(char_to_word == t2org[e])[0][-1].item(),
                "answer":
                " ".join(example.doc_tokens[t2org[s]:t2org[e] + 1]),
            } for s, e, score in zip(starts, ends, scores)]

        if handle_impossible_answer:
            answers.append({
                "score": min_null_score,
                "start": 0,
                "end": 0,
                "answer": ""
            })

        answers = sorted(answers, key=lambda x: x["score"],
                         reverse=True)[:topk]
        all_answers += answers

    logger.info("Answer : " + str(all_answers))
    logger.info('Script finished successfully.')
Example #28
def pipeline(task: str,
             model: Optional = None,
             config: Optional[Union[str, PretrainedConfig]] = None,
             tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
             modelcard: Optional[Union[str, ModelCard]] = None,
             device=torch.device("cpu"),
             **kwargs) -> Pipeline:
    """
    Utility factory method to build a pipeline.
    Pipeline are made of:
        A Tokenizer instance in charge of mapping raw textual input to token
        A Model instance
        Some (optional) post processing for enhancing model's output
    Examples:
        pipeline('sentiment-analysis')
    """
    # Register all the supported task here
    SUPPORTED_TASKS = {
        "sentiment-analysis": {
            "impl": TextClassificationPipelineMod,
            "pt":
            AutoModelForSequenceClassification,  # if is_torch_available() else None,
            "default": {
                "model": {
                    "pt": "distilbert-base-uncased-finetuned-sst-2-english",
                },
                "config": "distilbert-base-uncased-finetuned-sst-2-english",
                "tokenizer": "distilbert-base-uncased",
            },
        },
    }

    # Retrieve the task
    if task not in SUPPORTED_TASKS:
        raise KeyError("Unknown task {}, available tasks are {}".format(
            task, list(SUPPORTED_TASKS.keys())))

    framework = "pt"  #get_framework(model)

    targeted_task = SUPPORTED_TASKS[task]
    task, model_class = targeted_task["impl"], targeted_task[framework]

    # Use default model/config/tokenizer for the task if no model is provided
    if model is None:
        models, config, tokenizer = tuple(targeted_task["default"].values())
        model = models[framework]

    # Try to infer tokenizer from model or config name (if provided as str)
    if tokenizer is None:
        if isinstance(model,
                      str) and model in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
            tokenizer = model
        elif isinstance(config,
                        str) and config in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
            tokenizer = config
        else:
            # Impossible to guess which tokenizer is right here
            raise Exception(
                "Impossible to guess which tokenizer to use. "
                "Please provide a PreTrainedTokenizer class or a path/url/shortcut name to a pretrained tokenizer."
            )

    # Try to infer modelcard from model or config name (if provided as str)
    if modelcard is None:
        # Try to fallback on one of the provided string for model or config (will replace the suffix)
        if isinstance(model, str):
            modelcard = model
        elif isinstance(config, str):
            modelcard = config

    # Instantiate tokenizer if needed
    if isinstance(tokenizer, str):
        tokenizer = AutoTokenizer.from_pretrained(tokenizer)

    # Instantiate config if needed
    if isinstance(config, str):
        config = AutoConfig.from_pretrained(config)

    # Instantiate modelcard if needed
    if isinstance(modelcard, str):
        modelcard = ModelCard.from_pretrained(modelcard)

    # Instantiate model if needed
    if isinstance(model, str):
        # Handle transparent TF/PT model conversion
        model_kwargs = {}
        if framework == "pt" and model.endswith(".h5"):
            model_kwargs["from_tf"] = True
            logger.warning(
                "Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. "
                "Trying to load the model with PyTorch.")

        model = model_class.from_pretrained(model,
                                            config=config,
                                            **model_kwargs)
        model = model.to(device)
    model.device = device
    return task(model=model,
                tokenizer=tokenizer,
                modelcard=modelcard,
                framework=framework,
                **kwargs)
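
A hedged usage sketch for this factory, mirroring the docstring example (assumes the default distilbert SST-2 checkpoint is downloadable):

    import torch

    # Build the default sentiment-analysis pipeline on CPU and score one sentence.
    nlp = pipeline("sentiment-analysis", device=torch.device("cpu"))
    print(nlp("AllenNLP is great!"))  # e.g. [{'label': 'POSITIVE', 'score': ...}]
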
Example #29
 def __init__(self, model_name: str, namespace: str = "tags", **kwargs) -> None:
     super().__init__(**kwargs)
     self._namespace = namespace
     self._tokenizer = AutoTokenizer.from_pretrained(model_name)
     self._added_to_vocabulary = False
Example #30
 def test_determine_num_special_tokens_added(self):
     tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
     assert PretrainedTransformerIndexer.determine_num_special_tokens_added(
         tokenizer) == (1, 1)
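
The (1, 1) expectation corresponds to BERT wrapping a single sequence as [CLS] ... [SEP]; the built-in Hugging Face counter reports the same total of two added tokens (a separate sketch, not part of the test above):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    # One special token at the start ([CLS]) plus one at the end ([SEP]) of a single sequence.
    assert tokenizer.num_special_tokens_to_add(pair=False) == 2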