Code example #1
    def __init__(self,
                 data_dir: str,
                 tokenizer: PreTrainedTokenizer,
                 labels: List[str],
                 model_type: str,
                 max_seq_length: Optional[int] = None,
                 overwrite_cache=False,
                 mode: Split = Split.train,
                 local_rank=-1,
                 to_predict: Optional[str] = None):
        # Load data features from cache or dataset file
        cached_features_file = os.path.join(
            data_dir,
            "cached_{}_{}_{}".format(mode, tokenizer.__class__.__name__,
                                     str(max_seq_length)),
        )

        if not to_predict:
            examples = read_examples_from_file(data_dir, mode)
        else:
            examples = [
                InputExample(i, sentence.split(),
                             [labels[0] for word in sentence.split()])
                for i, sentence in enumerate(to_predict)
            ]

        # Make sure only the first process in distributed training builds the dataset;
        # otherwise several processes could write the cache at once and corrupt it.
        with torch_distributed_zero_first(local_rank):
            # If a cache file of the features exists, load it into PyTorch
            if os.path.exists(cached_features_file) and not overwrite_cache:
                logger.info(
                    f"Loading features from cached file {cached_features_file}"
                )
                self.features = torch.load(cached_features_file)
            else:
                logger.info(
                    f"Converting features from dataset file at {data_dir}")
                self.features = convert_examples_to_features(
                    examples,
                    labels,
                    max_seq_length,
                    tokenizer,
                    cls_token_at_end=bool(model_type in ["xlnet"]),
                    # xlnet has a cls token at the end
                    cls_token=tokenizer.cls_token,
                    cls_token_segment_id=2 if model_type in ["xlnet"] else 0,
                    sep_token=tokenizer.sep_token,
                    sep_token_extra=bool(model_type in ["roberta"]),
                    # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                    pad_on_left=bool(tokenizer.padding_side == "left"),
                    pad_token=tokenizer.pad_token_id,
                    pad_token_segment_id=tokenizer.pad_token_type_id,
                    pad_token_label_id=self.pad_token_label_id,
                )
                if local_rank in [-1, 0]:
                    logger.info(
                        f"Saving features into cached file {cached_features_file}"
                    )
                    torch.save(self.features, cached_features_file)
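Every example on this page wraps its cache logic in torch_distributed_zero_first, the context manager that older transformers releases used to make non-master ranks wait at a barrier while rank 0 (or a single-process run, local_rank == -1) builds the cached features. A minimal sketch of that helper, assuming torch.distributed has already been initialized whenever local_rank != -1:

from contextlib import contextmanager

import torch


@contextmanager
def torch_distributed_zero_first(local_rank: int):
    # Non-master ranks block here until rank 0 has finished the work inside
    # the "with" block (typically building and saving the feature cache).
    if local_rank not in [-1, 0]:
        torch.distributed.barrier()
    yield
    # Rank 0 reaches the barrier last, releasing the waiting ranks, which
    # then load the freshly written cache instead of rebuilding it.
    if local_rank == 0:
        torch.distributed.barrier()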
Code example #2
    def __init__(
        self,
        data_dir: str,
        category_num: int,
        tokenizer: PreTrainedTokenizer,
        task: str,
        max_seq_length: Optional[int] = None,
        overwrite_cache=False,
        mode: Split = Split.train,
        local_rank=-1,
    ):
        processor = processors[task]()
        processor.set_labels(category_num)

        cached_features_file = os.path.join(
            data_dir,
            "cached_{}_{}_{}_{}".format(
                mode.value,
                tokenizer.__class__.__name__,
                str(max_seq_length),
                task,
            ),
        )
        with torch_distributed_zero_first(local_rank):
            # Make sure only the first process in distributed training processes the dataset,
            # and the others will use the cache.

            if os.path.exists(cached_features_file) and not overwrite_cache:
                logger.info(
                    f"Loading features from cached file {cached_features_file}"
                )
                self.features = torch.load(cached_features_file)
            else:
                logger.info(
                    f"Creating features from dataset file at {data_dir}")
                label_list = processor.get_labels()
                if mode == Split.dev:
                    examples = processor.get_dev_examples(data_dir)
                elif mode == Split.test:
                    examples = processor.get_test_examples(data_dir)
                else:
                    examples = processor.get_train_examples(data_dir)
                logger.info("Training examples: %s", len(examples))
                # TODO clean up all this to leverage built-in features of tokenizers
                self.features = convert_examples_to_features(
                    examples,
                    label_list,
                    max_seq_length,
                    tokenizer,
                    pad_on_left=bool(tokenizer.padding_side == "left"),
                    pad_token=tokenizer.pad_token_id,
                    pad_token_segment_id=tokenizer.pad_token_type_id,
                )
                if local_rank in [-1, 0]:
                    logger.info("Saving features into cached file %s",
                                cached_features_file)
                    torch.save(self.features, cached_features_file)
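The cache file name above encodes the split, the tokenizer class, the maximum sequence length and the task, so every combination gets its own file and changing any of them forces a rebuild. A purely illustrative computation of the resulting path (the directory, tokenizer and task values are assumptions):

import os

data_dir = "/data/mrpc"  # hypothetical data directory
cached = os.path.join(
    data_dir,
    "cached_{}_{}_{}_{}".format("train", "BertTokenizer", str(128), "mrpc"),
)
print(cached)  # /data/mrpc/cached_train_BertTokenizer_128_mrpc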
Code example #3
        def __init__(
            self,
            data_dir: str,
            tokenizer: PreTrainedTokenizer,
            labels: List[str],
            model_type: str,
            max_seq_length: Optional[int] = None,
            overwrite_cache=False,
            mode: Split = Split.train,
            local_rank=-1,
        ):
            # Load data features from cache or dataset file
            cached_features_file = os.path.join(
                data_dir,
                "cached_{}_{}_{}".format(mode.value,
                                         tokenizer.__class__.__name__,
                                         str(max_seq_length)),
            )

            with torch_distributed_zero_first(local_rank):
                # Make sure only the first process in distributed training processes the dataset,
                # and the others will use the cache.

                if os.path.exists(
                        cached_features_file) and not overwrite_cache:
                    logger.info(
                        f"Loading features from cached file {cached_features_file}"
                    )
                    self.features = torch.load(cached_features_file)
                else:
                    logger.info(
                        f"Creating features from dataset file at {data_dir}")
                    examples = read_examples_from_file(data_dir, mode)
                    # TODO clean up all this to leverage built-in features of tokenizers
                    self.features = convert_examples_to_features(
                        examples,
                        labels,
                        max_seq_length,
                        tokenizer,
                        cls_token_at_end=bool(model_type in ["xlnet"]),
                        # xlnet has a cls token at the end
                        cls_token=tokenizer.cls_token,
                        cls_token_segment_id=2
                        if model_type in ["xlnet"] else 0,
                        sep_token=tokenizer.sep_token,
                        sep_token_extra=bool(model_type in ["roberta"]),
                        # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                        pad_on_left=bool(tokenizer.padding_side == "left"),
                        pad_token=tokenizer.pad_token_id,
                        pad_token_segment_id=tokenizer.pad_token_type_id,
                        pad_token_label_id=self.pad_token_label_id,
                    )
                    if local_rank in [-1, 0]:
                        logger.info(
                            f"Saving features into cached file {cached_features_file}"
                        )
                        torch.save(self.features, cached_features_file)
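Examples #1 and #3 only show __init__; to be usable with a PyTorch DataLoader the same class also needs __len__ and __getitem__. In the transformers token-classification example these are simple pass-throughs over self.features, roughly:

    def __len__(self):
        return len(self.features)

    def __getitem__(self, i):
        # Each item is an already-converted InputFeatures object; the data
        # collator is responsible for turning a batch of them into tensors.
        return self.features[i]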
Code example #4
    def dataset(self,
                predict=False,
                limit_length: Optional[int] = None,
                evaluate=False,
                local_rank=-1) -> Dataset:
        with torch_distributed_zero_first(local_rank):
            dataset = self._data_store.load_dataset(
                limit_length, evaluate) if not predict else None
            if not dataset:
                features = self.__generate_features(limit_length, evaluate)
                dataset = TaskDataset(features)
                if not predict and local_rank in [-1, 0]:
                    self._data_store.save_dataset(dataset, evaluate)
            return dataset
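Example #4 returns a TaskDataset built from pre-computed features, but the class itself is not shown. A plausible minimal shape, consistent with how it is called here (hypothetical, not taken from the project's source):

from torch.utils.data import Dataset


class TaskDataset(Dataset):
    # Thin wrapper that serves already-converted features by index.
    def __init__(self, features):
        self.features = features

    def __len__(self):
        return len(self.features)

    def __getitem__(self, i):
        return self.features[i]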
Code example #5
    def __init__(
            self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, ctrl_code_tok: int,
            overwrite_cache=False, local_rank=-1,
    ):
        assert os.path.isfile(file_path)

        # Subtract 1 so we can prepend the CTRL code
        block_size = block_size - tokenizer.num_special_tokens_to_add(pair=False) - 1

        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            directory, "cached_lm_{}_{}_{}".format(tokenizer.__class__.__name__, str(block_size), filename, ),
        )

        with torch_distributed_zero_first(local_rank):
            # Make sure only the first process in distributed training processes the dataset,
            # and the others will use the cache.

            if os.path.exists(cached_features_file) and not overwrite_cache:
                start = time.time()
                with open(cached_features_file, "rb") as handle:
                    self.examples = pickle.load(handle)
                logger.info(
                    "Loading features from cached file %s [took %.3f s]",
                    cached_features_file,
                    time.time() - start,
                )

            else:
                logger.info(f"Creating features from dataset file at {directory}")

                self.examples = []
                with open(file_path, encoding="utf-8") as f:
                    text = f.read()

                tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

                for i in trange(0, len(tokenized_text) - block_size + 1, block_size):  # Truncate in block of block_size
                    self.examples.append(
                        tokenizer.build_inputs_with_special_tokens([ctrl_code_tok] + tokenized_text[i: i + block_size])
                    )
                # Note that we are losing the last truncated example here for the sake of simplicity (no padding)
                # If your dataset is small, first you should look for a bigger one :-) and second you
                # can change this behavior by adding (model specific) padding.

                start = time.time()
                with open(cached_features_file, "wb") as handle:
                    pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
                logger.info(
                    "Saving features into cached file %s [took %.3f s]",
                    cached_features_file,
                    time.time() - start,
                )
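The block-size adjustment at the top of example #5 reserves room for the tokenizer's special tokens and for the prepended CTRL code, so each finished example exactly fills the model's context window. A quick worked calculation with assumed numbers:

model_block = 512   # assumed model context length
special = 2         # assumed tokenizer.num_special_tokens_to_add(pair=False)
content = model_block - special - 1   # 509 raw text ids per block
# build_inputs_with_special_tokens([ctrl_code_tok] + 509 content ids) then
# adds the 2 special tokens back, yielding exactly 512 tokens per example.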
Code example #6
    def __init__(self,
                 tokenizer: PreTrainedTokenizer,
                 split: str,
                 file_path: str,
                 block_size: int,
                 overwrite_cache=False,
                 local_rank=-1):
        # NOTE: the block_size argument is ignored; the block size is hard-coded to 4096.
        block_size = 4096
        self.block_size = 4096

        directory = './processed_files'
        cached_features_file = os.path.join(
            directory,
            "cached_lm_{}_{}_{}".format(
                tokenizer.__class__.__name__,
                str(block_size),
                split,
            ),
        )

        with torch_distributed_zero_first(local_rank):
            if os.path.exists(cached_features_file) and not overwrite_cache:
                start = time.time()
                with open(cached_features_file, "rb") as handle:
                    self.examples = pickle.load(handle)
                # Keep only examples whose input and label sequences are exactly block_size long.
                self.examples = [
                    ex for ex in self.examples
                    if len(ex[0]) == self.block_size
                    and len(ex[1]) == self.block_size
                ]
                logger.info(
                    "Loading features from cached file %s [took %.3f s]",
                    cached_features_file,
                    time.time() - start)

            else:
                # NOTE: file_path is not used in this branch; the Multi-News training source is hard-coded.
                input_path = './multinews/train.txt.src'
                logger.info(
                    f"Creating features from dataset file at {input_path}")
                encoder_func = select_words_to_mask_special_tokens_only_multiple_docs
                self.examples = []
                self.masking_samples = []
                corpus = read_in_train_set(input_path)
                ln = []
                for i in range(len(corpus)):
                    sample = corpus[i].strip()
                    articles = sample.split("story_separator_special_tag")
                    ln.append(articles)
                tokenizer.add_tokens(['<doc-s>'], special_tokens=True)
                tokenizer.add_tokens(['</doc-s>'], special_tokens=True)
                stats = []
                while len(stats) < 64 * 25 * 1000:
                    for topic in ln:
                        if len(topic) > 2:
                            s = random.sample(topic, len(topic))
                            examp, st = encoder_func(s, tokenizer, block_size)
                            self.examples.append(examp)
                            stats.append(st)
                # Uncomment for creating data for the random baseline
                # while len(self.examples) < 64*25*1000:
                #     s = random.sample(ln, 10)
                #     curr_false_topic = []
                #     for topic in s:
                #         curr_false_topic.append(random.sample(topic, 1)[0])
                #     examp, st = encoder_func(curr_false_topic, tokenizer, block_size)
                #     self.examples.append(examp)
                #     stats.append(st)
                start = time.time()
                with open(cached_features_file, "wb") as handle:
                    pickle.dump(self.examples,
                                handle,
                                protocol=pickle.HIGHEST_PROTOCOL)
                logger.info(
                    "Saving features into cached file %s [took %.3f s]",
                    cached_features_file,
                    time.time() - start)
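Example #6 registers <doc-s> and </doc-s> as extra special tokens while building the cache. Any model that later consumes these examples must have its embedding matrix resized to cover the enlarged vocabulary; a short sketch (the Longformer checkpoint name is an assumption based on the 4096-token block size):

from transformers import AutoModelForMaskedLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
tokenizer.add_tokens(['<doc-s>'], special_tokens=True)
tokenizer.add_tokens(['</doc-s>'], special_tokens=True)

model = AutoModelForMaskedLM.from_pretrained("allenai/longformer-base-4096")
# Grow the embedding table so the two new token ids have embeddings.
model.resize_token_embeddings(len(tokenizer))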
Code example #7
File: dataset.py  Project: aicanhelp/ai-transformers
    def __init__(self,
                 args: DataArguments,
                 tokenizer: PreTrainedTokenizer,
                 processor: DataProcessor,
                 limit_length: Optional[int] = None,
                 evaluate=False,
                 local_rank=-1):
        self.args = args
        # Load data features from cache or dataset file
        cached_features_file = None
        if not args.predict:
            cached_features_file = os.path.join(
                processor.data_dir(),
                "cached_{}_{}_{}_{}".format(
                    "dev" if evaluate else "train",
                    tokenizer.__class__.__name__,
                    str(args.max_seq_length),
                    args.task_name,
                ),
            )
        with torch_distributed_zero_first(local_rank):
            # Make sure only the first process in distributed training processes the dataset,
            # and the others will use the cache.

            if not args.predict and os.path.exists(
                    cached_features_file) and not args.overwrite_cache:
                start = time.time()
                self.features = torch.load(cached_features_file)
                log.info(
                    "Loading features from cached file %s [took %.3f s]",
                    cached_features_file,
                    time.time() - start)
            else:
                log.info(f"Creating features from dataset file")
                label_list = processor.get_labels()
                if label_list and args.task_name in [
                        "mnli", "mnli-mm"
                ] and tokenizer.__class__ in (
                        RobertaTokenizer,
                        RobertaTokenizerFast,
                        XLMRobertaTokenizer,
                ):
                    # HACK(label indices are swapped in RoBERTa pretrained model)
                    label_list[1], label_list[2] = label_list[2], label_list[1]
                examples = (processor.get_dev_examples()
                            if evaluate else processor.get_train_examples())
                if limit_length is not None:
                    examples = examples[:limit_length]
                self.features = _glue_convert_examples_to_features(
                    examples,
                    tokenizer,
                    processor,
                    max_length=args.max_seq_length,
                    label_list=label_list,
                    output_mode=self.args.model_mode_for_data,
                    progress_bar=args.progress_bar,
                    evaluate=evaluate,
                    num_print=self.args.show_feature_num)
                if not args.predict and local_rank in [-1, 0]:
                    log.info("Saving features into cached file %s",
                             cached_features_file)
                    start = time.time()
                    torch.save(self.features, cached_features_file)
                    # ^ This seems to take a lot of time so I want to investigate why and how we can improve.
                    log.info(
                        "Saved features into cached file %s [took %.3f s]",
                        cached_features_file,
                        time.time() - start)
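The RoBERTa label-swap HACK in example #7 is easier to follow with concrete values: transformers' MNLI processor returns the labels as ["contradiction", "entailment", "neutral"], while the pretrained RoBERTa MNLI checkpoint orders the last two the other way around, so the swap realigns the indices:

label_list = ["contradiction", "entailment", "neutral"]
# HACK: label indices 1 and 2 are swapped in the RoBERTa pretrained model
label_list[1], label_list[2] = label_list[2], label_list[1]
print(label_list)  # ['contradiction', 'neutral', 'entailment']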