def get_data_bert(max_seq_length, batch_sizes):
    """
    Build the train / validation / test data loaders of BERT features.

    Args:
        max_seq_length: (int) Max sequence length, forwarded to
            convert_examples_to_features
        batch_sizes: indexable with the batch sizes of the train, validation
            and test loaders at positions 0, 1 and 2
    Output:
        train_dataloader, val_dataloader, test_dataloader
    """
    # Load the three raw splits, then clean each into an (X, y) pair
    train, val, test = load_data()
    cleaned = [clean_data(split) for split in (train, val, test)]

    # One shared lower-casing tokenizer turns every split into features
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    featurized = [
        convert_examples_to_features(texts, labels, max_seq_length, tokenizer)
        for texts, labels in cleaned
    ]

    # Loader i uses batch_sizes[i] (train, val, test order)
    loaders = [
        get_dataloader(split_features, batch_sizes[position])
        for position, split_features in enumerate(featurized)
    ]
    return tuple(loaders)
コード例 #2
0
    def preprare_eval_examples(self):
        """Load the dev examples and build the sequential evaluation DataLoader.

        Sets self.eval_examples, self.eval_features, self.eval_all_label_ids
        (only for the "classification" and "regression" output modes) and
        self.eval_dataloader, and prints max/min/mean of the input lengths.
        """
        self.eval_examples = self.processor.get_dev_examples(
            self.args.data_dir, self.args.dev_file)

        # Handed to the converter as its last positional argument; the
        # statistics printed below presume the converter fills it with one
        # length per example -- TODO confirm against the converter.
        lengths = []
        extra_kwargs = {}
        if self.processor.is_pair():
            # Sentence-pair tasks delegate truncation to the processor.
            def truncate_seq_pair(tokens_a, tokens_b, max_length):
                return self.processor.truncate_seq_pair(
                    tokens_a, tokens_b, max_length)

            extra_kwargs["truncate_seq_pair"] = truncate_seq_pair

        self.eval_features = convert_examples_to_features(
            self.eval_examples,
            self.label_list,
            self.args.max_seq_length,
            self.tokenizer,
            self.output_mode,
            self.logger,
            lengths,
            **extra_kwargs)

        feats = self.eval_features
        input_ids = torch.tensor([feat.input_ids for feat in feats],
                                 dtype=torch.long)
        input_mask = torch.tensor([feat.input_mask for feat in feats],
                                  dtype=torch.long)
        segment_ids = torch.tensor([feat.segment_ids for feat in feats],
                                   dtype=torch.long)

        length_stats = np.array(lengths)
        print("Eval input_length_arr: max={}, min={}, avg={}".format(
            np.max(length_stats), np.min(length_stats),
            np.mean(length_stats)))

        # Integer labels for classification, float targets for regression.
        if self.output_mode == "classification":
            self.eval_all_label_ids = torch.tensor(
                [feat.label_id for feat in feats], dtype=torch.long)
        elif self.output_mode == "regression":
            self.eval_all_label_ids = torch.tensor(
                [feat.label_id for feat in feats], dtype=torch.float)

        dataset = TensorDataset(input_ids, input_mask, segment_ids,
                                self.eval_all_label_ids)
        # Sequential sampling keeps the evaluation order fixed.
        self.eval_dataloader = DataLoader(
            dataset,
            sampler=SequentialSampler(dataset),
            batch_size=self.args.eval_batch_size)
コード例 #3
0
def load_examples(tokenizer, mode="train"):
    """Build a TensorDataset of SST-2 features for the requested split.

    Args:
        tokenizer: tokenizer forwarded to convert_examples_to_features.
        mode: "train" or "dev"; selects which examples Sst2Processor returns.

    Returns:
        TensorDataset of (input_ids, attention_mask, token_type_ids, labels),
        each an int64 tensor.

    Raises:
        ValueError: if mode is neither "train" nor "dev".
    """
    # Reject unknown modes up front: previously they fell through with
    # examples=None, which was then handed to the feature converter.
    if mode not in ("train", "dev"):
        raise ValueError(
            "Unsupported mode {!r}; expected 'train' or 'dev'.".format(mode))

    sst2 = Sst2Processor()
    # examples: list of InputExample objects
    if mode == "train":
        examples = sst2.get_train_examples()
    else:
        examples = sst2.get_dev_examples()

    # features: list of InputFeatures
    # The maximum length comes from the module-level `args`.
    features = convert_examples_to_features(examples,
                                            tokenizer,
                                            sst2.get_labels(),
                                            max_length=args.max_seq_length)

    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.int64)
    all_attention_mask = torch.tensor([f.attention_mask for f in features],
                                      dtype=torch.int64)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features],
                                      dtype=torch.int64)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.int64)
    return TensorDataset(all_input_ids, all_attention_mask,
                         all_token_type_ids, all_labels)
コード例 #4
0
ファイル: run_pl.py プロジェクト: ksboy/sentiment_analysis
    def prepare_data(self):
        """Called to initialize data: make sure a feature cache exists for train and dev.

        For each split the cached feature file is loaded when it exists and
        overwrite_cache is off; otherwise features are rebuilt from the
        dataset and saved to that file. Also sets self.labels.
        """
        hparams = self.hparams
        processor = ChnSentiCorpProcessor()
        self.labels = processor.get_labels()

        for split in ("train", "dev"):
            cache_path = self._feature_file(split)

            if os.path.exists(cache_path) and not hparams.overwrite_cache:
                logger.info("Loading features from cached file %s", cache_path)
                features = torch.load(cache_path)
                continue

            logger.info("Creating features from dataset file at %s", hparams.data_dir)
            if split == "dev":
                examples = processor.get_dev_examples(hparams.data_dir)
            else:
                examples = processor.get_train_examples(hparams.data_dir)

            features = convert_examples_to_features(
                examples,
                self.tokenizer,
                processor,
                max_length=hparams.max_seq_length,
                label_list=self.labels,
                output_mode=hparams.output_mode,
            )
            logger.info("Saving features into cached file %s", cache_path)
            torch.save(features, cache_path)
def load_and_cache_bert_example(args, tokenizer, type='train'):
    '''
    Load the cached features of one split (or build and cache them) and
    return them as a TensorDataset.
    :param args: read for task_name, max_seq_length, cache_dir, data_dir
        and model_type
    :param tokenizer: forwarded to utils.convert_examples_to_features; also
        supplies cls_token and sep_token
    :param type: 'train', 'dev', or any other value for the test split
    :return: TensorDataset of (input_ids, input_mask, segment_ids, lengths,
        label_ids)
    '''
    task_class = processors[args.task_name]()
    # e.g. normal_bert_cached_train_256_imdb
    cache_name = 'normal_bert_cached_{}_{}_{}'.format(
        type, args.max_seq_length, args.task_name)
    cached_features_file = os.path.join(args.cache_dir, cache_name)

    if not os.path.exists(cached_features_file):
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = task_class.get_labels()

        if type == 'train':
            fetch_examples = task_class.get_train_examples
        elif type == 'dev':
            fetch_examples = task_class.get_dev_examples
        else:
            fetch_examples = task_class.get_test_examples
        examples = fetch_examples(args.data_dir)

        # xlnet puts the cls token at the end, pads on the left and uses
        # its own segment ids for the cls and pad tokens.
        # NOTE(review): non-xlnet models get cls_token_segment_id=1 here --
        # confirm this value is intended.
        is_xlnet = args.model_type in ['xlnet']
        features = utils.convert_examples_to_features(
            examples,
            label_list,
            args.max_seq_length,
            tokenizer,
            cls_token_at_end=bool(is_xlnet),
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=2 if is_xlnet else 1,
            pad_on_left=bool(is_xlnet),
            pad_token_segment_id=4 if is_xlnet else 0)
        logger.info("Saving features into cached file %s",
                    cached_features_file)
        torch.save(features, cached_features_file)
    else:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)

    # Convert to Tensors and build dataset
    def _as_long(attr):
        return torch.tensor([getattr(feat, attr) for feat in features],
                            dtype=torch.long)

    all_input_ids = _as_long('input_ids')
    all_input_mask = _as_long('input_mask')
    all_segment_ids = _as_long('segment_ids')
    all_label_ids = _as_long('label_id')
    all_lengths = _as_long('length')

    return TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                         all_lengths, all_label_ids)
コード例 #6
0
def load_and_cache_examples(task, tokenizer, evaluate=False):
    """Return a TensorDataset of features for `task`, reusing a cache file when allowed.

    Settings come from the module-level `args` mapping: output_mode,
    data_dir, model_name, max_seq_length, reprocess_input_data, model_type.

    Args:
        task: key into `processors`; also part of the cache file name.
        tokenizer: forwarded to convert_examples_to_features; also supplies
            cls_token and sep_token.
        evaluate: use the dev split when true, the train split otherwise.

    Returns:
        TensorDataset of (input_ids, input_mask, segment_ids, label_ids).
        Labels are long for "classification" and float for "regression".
    """
    processor = processors[task]()
    output_mode = args['output_mode']

    split = 'dev' if evaluate else 'train'
    data_dir = args['data_dir']
    cache_name = "cached_{}_{}_{}_{}".format(
        split, args['model_name'], args['max_seq_length'], task)
    cached_features_file = os.path.join(data_dir, cache_name)

    reuse_cache = (os.path.exists(cached_features_file)
                   and not args['reprocess_input_data'])
    if reuse_cache:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s",
                    args['data_dir'])
        label_list = processor.get_labels()
        print("label list", label_list)
        if evaluate:
            examples = processor.get_dev_examples(args['data_dir'])
        else:
            examples = processor.get_train_examples(args['data_dir'])

        max_len = args['max_seq_length']
        # xlnet: cls token at the end, left padding, own segment ids.
        is_xlnet = args['model_type'] in ['xlnet']
        features = convert_examples_to_features(
            examples,
            label_list,
            max_len,
            tokenizer,
            output_mode,
            cls_token_at_end=bool(is_xlnet),
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=2 if is_xlnet else 0,
            pad_on_left=bool(is_xlnet),
            pad_token_segment_id=4 if is_xlnet else 0)

        logger.info("Saving features into cached file %s",
                    cached_features_file)
        torch.save(features, cached_features_file)

    def _column(attr, dtype=torch.long):
        return torch.tensor([getattr(feat, attr) for feat in features],
                            dtype=dtype)

    all_input_ids = _column('input_ids')
    all_input_mask = _column('input_mask')
    all_segment_ids = _column('segment_ids')
    if output_mode == "classification":
        all_label_ids = _column('label_id')
    elif output_mode == "regression":
        all_label_ids = _column('label_id', torch.float)

    return TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                         all_label_ids)
コード例 #7
0
def load_and_cache_examples(config, task, tokenizer, evaluate=False):
    """Build the classification TensorDataset, including entity masks, for one split.

    Features are rebuilt on every call: this function neither reads nor
    writes a cache file.

    Args:
        config: read for local_rank, task_name, data_dir, max_seq_len and
            use_entity_indicator.
        task: not used by the body.
        tokenizer: forwarded to convert_examples_to_features.
        evaluate: use the dev split when true, the train split otherwise.

    Returns:
        TensorDataset of (input_ids, input_mask, segment_ids, label_ids,
        e1_mask, e2_mask).
    """
    output_mode = "classification"
    training = not evaluate

    if config.local_rank not in [-1, 0] and training:
        # During training, ranks other than the first wait here while the
        # first process handles the dataset.
        torch.distributed.barrier()

    processor = data_processors[config.task_name]()

    logger.info("Creating features from dataset file at %s", config.data_dir)
    label_list = processor.get_labels()
    if evaluate:
        examples = processor.get_dev_examples(config.data_dir)
    else:
        examples = processor.get_train_examples(config.data_dir)
    features = convert_examples_to_features(
        examples,
        label_list,
        config.max_seq_len,
        tokenizer,
        output_mode,
        use_entity_indicator=config.use_entity_indicator)

    if config.local_rank == 0 and training:
        # The first process reaches the barrier once its features are built,
        # which lets the waiting ranks continue.
        torch.distributed.barrier()

    # Convert to Tensors and build dataset
    def _column(attr, dtype=torch.long):
        return torch.tensor([getattr(feat, attr) for feat in features],
                            dtype=dtype)

    all_input_ids = _column('input_ids')
    all_input_mask = _column('input_mask')
    all_segment_ids = _column('segment_ids')
    all_e1_mask = _column('e1_mask')  # e1 mask
    all_e2_mask = _column('e2_mask')  # e2 mask
    if output_mode == "classification":
        all_label_ids = _column('label_id')
    elif output_mode == "regression":
        all_label_ids = _column('label_id', torch.float)

    return TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                         all_label_ids, all_e1_mask, all_e2_mask)
コード例 #8
0
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    """Return the XNLI TensorDataset for the train or eval file, caching features.

    Args:
        args: read for cache_dir, eval_file_path, train_file_path,
            max_seq_length and overwrite_cache.
        task: key into `output_modes`; also part of the cache file name.
        tokenizer: forwarded to convert_examples_to_features; also supplies
            the pad token id.
        evaluate: use args.eval_file_path when true, args.train_file_path
            otherwise.

    Returns:
        TensorDataset of (input_ids, attention_mask, token_type_ids, labels).

    Raises:
        ValueError: if the task's output mode is not "classification".
    """
    processor = XNLIProcessor()
    output_mode = output_modes[task]

    # Load data features from cache or dataset file
    cache_dir = args.cache_dir
    source_path = args.eval_file_path if evaluate else args.train_file_path
    cache_name = "cached_{}_beto_{}_{}_es".format(
        os.path.basename(source_path),
        str(args.max_seq_length),
        str(task),
    )
    cached_features_file = os.path.join(cache_dir, cache_name)

    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info(
            "Creating features from dataset file at %s",
            args.eval_file_path if evaluate else args.train_file_path,
        )
        label_list = processor.get_labels()
        if evaluate:
            examples = processor.get_eval_examples(args.eval_file_path)
        else:
            examples = processor.get_train_examples(args.train_file_path)

        pad_token_id = tokenizer.convert_tokens_to_ids(
            [tokenizer.pad_token])[0]
        features = convert_examples_to_features(
            examples,
            tokenizer,
            processor,
            label_list=label_list,
            max_length=args.max_seq_length,
            output_mode=output_mode,
            pad_on_left=False,
            pad_token=pad_token_id,
            pad_token_segment_id=0,
        )
        logger.info("Saving features into cached file %s",
                    cached_features_file)
        torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    def _as_long(attr):
        return torch.tensor([getattr(feat, attr) for feat in features],
                            dtype=torch.long)

    all_input_ids = _as_long('input_ids')
    all_attention_mask = _as_long('attention_mask')
    all_token_type_ids = _as_long('token_type_ids')
    if output_mode != "classification":
        raise ValueError("No other `output_mode` for XNLI.")
    all_labels = _as_long('label')

    return TensorDataset(all_input_ids, all_attention_mask,
                         all_token_type_ids, all_labels)
コード例 #9
0
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    """Return a TensorDataset of features for `task`, loading them from cache when present.

    Args:
        args: read for data_dir, model_name_or_path, max_seq_length,
            model_type and local_rank.
        task: key into `processors` and `output_modes`; also part of the
            cache file name.
        tokenizer: forwarded to convert_examples_to_features; also supplies
            cls_token and sep_token.
        evaluate: use the dev split when true, the train split otherwise.

    Returns:
        TensorDataset of (input_ids, input_mask, segment_ids, label_ids).
        Labels are long for "classification" and "multi_label", float for
        "regression".
    """
    processor = processors[task]()
    output_mode = output_modes[task]

    # Load data features from cache or dataset file
    data_dir = args.data_dir
    split = 'dev' if evaluate else 'train'
    # Last non-empty path component of the model name/path
    model_tag = list(filter(None, args.model_name_or_path.split('/'))).pop()
    cache_name = 'cached_{}_{}_{}_{}'.format(
        split, model_tag, str(args.max_seq_length), str(task))
    cached_features_file = os.path.join(data_dir, cache_name)

    if not os.path.exists(cached_features_file):
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        if evaluate:
            examples = processor.get_dev_examples(args.data_dir)
        else:
            examples = processor.get_train_examples(args.data_dir)

        max_len = args.max_seq_length
        # xlnet: cls token at the end, left padding, own segment ids.
        is_xlnet = args.model_type in ['xlnet']
        features = convert_examples_to_features(
            examples,
            label_list,
            max_len,
            tokenizer,
            output_mode,
            cls_token_at_end=bool(is_xlnet),
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=2 if is_xlnet else 0,
            pad_on_left=bool(is_xlnet),
            pad_token_segment_id=4 if is_xlnet else 0)
        # Only local_rank -1 or 0 writes the cache file.
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(features, cached_features_file)
    else:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)

    # Convert to Tensors and build dataset
    def _column(attr, dtype=torch.long):
        return torch.tensor([getattr(feat, attr) for feat in features],
                            dtype=dtype)

    all_input_ids = _column('input_ids')
    all_input_mask = _column('input_mask')
    all_segment_ids = _column('segment_ids')
    if output_mode == "classification":
        all_label_ids = _column('label_id')
    elif output_mode == "regression":
        all_label_ids = _column('label_id', torch.float)
    elif output_mode == 'multi_label':
        all_label_ids = _column('label_id')

    return TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                         all_label_ids)
コード例 #10
0
def test_beforeafter_mask():
    """Convert the 'beforeafter' examples to features with the 'beforeafter' mask.

    Requests num_examples=1000 from "beforeafter/examples/" and converts
    them with the module-level `tokenizer`.

    Returns:
        (examples, features)
    """
    examples = get_beforeafter_examples("beforeafter/examples/",
                                        num_examples=1000)
    conversion_options = dict(
        tokenizer=tokenizer,
        max_seq_length=100,
        doc_stride=128,
        mask='beforeafter',
    )
    features = convert_examples_to_features(examples=examples,
                                            **conversion_options)
    return examples, features
コード例 #11
0
ファイル: run_csqa.py プロジェクト: prateeksingh0001/HyKAS
def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
    """Build a MyDataset of features for the train, dev or test split.

    Features are rebuilt on every call: this function neither reads nor
    writes a cache file.

    Args:
        args: read for local_rank, data_dir, model_type, max_seq_length,
            max_concepts and max_concept_len.
        task: key into `myprocessors` and `output_modes`.
        tokenizer: forwarded to convert_examples_to_features; also supplies
            the pad, cls and sep token ids.
        evaluate: use the dev split when true (unless `test` is set).
        test: use the test split when true; takes precedence over `evaluate`.

    Returns:
        MyDataset built from the features, the padding settings, the cls and
        sep token ids and the number of labels.
    """
    training = not evaluate
    if args.local_rank not in [-1, 0] and training:
        # Ranks other than the first wait here while the first process
        # handles the dataset.
        torch.distributed.barrier()

    processor = myprocessors[task](args.data_dir)
    output_mode = output_modes[task]

    logger.info("Creating features from dataset file at %s", args.data_dir)
    label_list = processor.get_labels()
    if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
        # HACK(label indices are swapped in RoBERTa pretrained model)
        label_list[1], label_list[2] = label_list[2], label_list[1]

    if test:
        examples = processor.get_test_examples(args.data_dir)
    elif evaluate:
        examples = processor.get_dev_examples(args.data_dir)
    else:
        examples = processor.get_train_examples(args.data_dir)

    def _token_id(token):
        return tokenizer.convert_tokens_to_ids([token])[0]

    # xlnet pads on the left and uses segment id 4 for padding.
    pad_on_left = bool(args.model_type in ['xlnet'])
    pad_token = _token_id(tokenizer.pad_token)
    pad_token_segment_id = 4 if args.model_type in ['xlnet'] else 0
    cls_token = _token_id(tokenizer.cls_token)
    sep_token = _token_id(tokenizer.sep_token)

    features = convert_examples_to_features(
        examples,
        tokenizer,
        label_list=label_list,
        max_length=args.max_seq_length,
        output_mode=output_mode,
        pad_on_left=pad_on_left,
        pad_token=pad_token,
        pad_token_segment_id=pad_token_segment_id,
        max_path=args.max_concepts,
        max_path_len=args.max_concept_len,
        cls_token=cls_token,
        sep_token=sep_token)

    if args.local_rank == 0 and training:
        # The first process reaches the barrier once its features are built,
        # which lets the waiting ranks continue.
        torch.distributed.barrier()

    return MyDataset(features, pad_on_left, pad_token, pad_token_segment_id,
                     cls_token, sep_token, len(label_list))
コード例 #12
0
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    """Return (dataset, raw_texts) for one split, loading features from cache when present.

    Args:
        args: read for local_rank, data_dir, model_name_or_path,
            max_seq_length and model_type.
        task: key into `processors` and `output_modes`; also part of the
            cache file name.
        tokenizer: forwarded to convert_examples_to_features; also supplies
            cls_token, sep_token and the pad token id.
        evaluate: use the dev split when true, the train split otherwise.

    Returns:
        A pair (dataset, raw_texts). dataset is a TensorDataset of
        (input_ids, input_mask, segment_ids, label_ids, extraction mask /
        start ids / end ids, augmentation mask / start ids / end ids).
        raw_texts is the second value returned by the converter, or an empty
        list when the features came from the cache file.
    """
    if args.local_rank not in [-1, 0]:
        # Ranks other than the first wait here while the first process
        # builds the features and writes the cache.
        torch.distributed.barrier()

    processor = processors[task]()
    output_mode = output_modes[task]

    # Load data features from cache or dataset file
    data_dir = args.data_dir
    split = 'dev' if evaluate else 'train'
    model_tag = list(filter(None, args.model_name_or_path.split('/'))).pop()
    cache_name = 'cached_{}_{}_{}_{}'.format(
        split, model_tag, str(args.max_seq_length), str(task))
    cached_features_file = os.path.join(data_dir, cache_name)

    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
        # Only the features are cached, so no raw texts are available here.
        raw_texts = []
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        if evaluate:
            examples = processor.get_dev_examples(args.data_dir)
        else:
            examples = processor.get_train_examples(args.data_dir)

        max_len = args.max_seq_length
        # xlnet: cls token at the end, left padding, own segment ids.
        is_xlnet = args.model_type in ['xlnet']
        features, raw_texts = convert_examples_to_features(
            examples, label_list, max_len, tokenizer, output_mode,
            cls_token_at_end=bool(is_xlnet),
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if is_xlnet else 0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=bool(args.model_type in ['roberta']),
            pad_on_left=bool(is_xlnet),
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if is_xlnet else 0)
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0:
        # The first process is done; let the waiting ranks continue.
        torch.distributed.barrier()

    # Convert to Tensors and build dataset
    def _column(attr, dtype=torch.long):
        return torch.tensor([getattr(feat, attr) for feat in features], dtype=dtype)

    leading = [_column(name) for name in ('input_ids', 'input_mask', 'segment_ids')]
    trailing = [
        _column('extraction_mask', torch.float),
        _column('extraction_start_ids'),
        _column('extraction_end_ids'),
        _column('augmentation_mask', torch.float),
        _column('augmentation_start_ids'),
        _column('augmentation_end_ids'),
    ]

    if output_mode == "classification":
        all_label_ids = _column('label_id')
    elif output_mode == "regression":
        all_label_ids = _column('label_id', torch.float)

    dataset = TensorDataset(*leading, all_label_ids, *trailing)
    return dataset, raw_texts
コード例 #13
0
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    """Convert `task` into features with the given tokenizer.

    The visible body stops after the conversion: the features are bound to
    a local name and nothing is returned. `evaluate` is not used.
    """
    max_len = args.max_seq_length
    # xlnet pads on the left and uses segment id 4 for padding.
    is_xlnet = args.model_type in ["xlnet"]
    pad_token_id = tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0]

    features = convert_examples_to_features(
        task,
        tokenizer,
        max_length=max_len,
        pad_on_left=bool(is_xlnet),
        pad_token=pad_token_id,
        pad_token_segment_id=4 if is_xlnet else 0,
    )
コード例 #14
0
 def to_bert_input(self, dataset_pd):
     """Turn the rows of `dataset_pd` into BERT inputs and labels.

     Each row becomes an InputExample (text_b is None when DATA2_COLUMN is
     falsy); the examples are converted to features, which are then split
     by self.get_features.

     Returns:
         ((input_ids, input_masks, segment_ids), labels)
     """
     def _row_to_example(row):
         first_text = row[self.DATA_COLUMN]
         second_text = row[self.DATA2_COLUMN] if self.DATA2_COLUMN else None
         return InputExample(guid=None,
                             text_a=first_text,
                             text_b=second_text,
                             label=row[self.LABEL_COLUMN])

     examples = dataset_pd.apply(_row_to_example, axis=1)
     features = convert_examples_to_features(examples,
                                             self.label_list,
                                             self.max_seq_length,
                                             self.tokenizer)
     input_ids, input_masks, segment_ids, labels = self.get_features(features)
     return (input_ids, input_masks, segment_ids), labels
コード例 #15
0
def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
    """Return a TensorDataset of features for the train, dev or test split, with caching.

    Args:
        args: read for local_rank, data_dir, model_name_or_path,
            max_seq_length, overwrite_cache and model_type.
        task: key into `processors`; also part of the cache file name.
        tokenizer: forwarded to convert_examples_to_features.
        evaluate: use the dev split when true.
        test: use the test split when true. With neither flag set the train
            split is used.

    Returns:
        TensorDataset of (input_ids, input_mask, segment_ids, label_ids),
        all long tensors.

    Raises:
        ValueError: if both `evaluate` and `test` are set.
    """
    # Validate with a real exception (asserts are stripped under -O), and do
    # it before the barrier or any file access.
    if evaluate and test:
        raise ValueError(
            "`evaluate` and `test` are mutually exclusive; set at most one of them.")

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    processor = processors[task]()
    # Load data features from cache or dataset file
    if evaluate:
        cached_mode = 'dev'
    elif test:
        cached_mode = 'test'
    else:
        cached_mode = 'train'
    cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
        cached_mode,
        # Last non-empty path component of the model name/path
        list(filter(None, args.model_name_or_path.split('/'))).pop(),
        str(args.max_seq_length),
        str(task)))
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        if evaluate:
            examples = processor.get_dev_examples(args.data_dir)
        elif test:
            examples = processor.get_test_examples(args.data_dir)
        else:
            examples = processor.get_train_examples(args.data_dir)
        # NOTE: this message is logged for the dev and test splits as well.
        logger.info("Training number: %s", str(len(examples)))
        features = convert_examples_to_features(
            examples,
            label_list,
            args.max_seq_length,
            tokenizer,
            pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for xlnet
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0
        )
        # Only local_rank -1 or 0 writes the cache file.
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor(select_field(features, 'input_ids'), dtype=torch.long)
    all_input_mask = torch.tensor(select_field(features, 'input_mask'), dtype=torch.long)
    all_segment_ids = torch.tensor(select_field(features, 'segment_ids'), dtype=torch.long)
    all_label_ids = torch.tensor([f.label for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
コード例 #16
0
def get_features(examples):
    """Convert `examples` to test-stage features and pack them into a TensorDataset.

    Uses the module-level `args` for the tokenizer and for
    max_source_length, to which every source_ids / source_mask sequence is
    cut.

    Returns:
        TensorDataset of (source_ids, source_mask), both long tensors.
    """
    features = convert_examples_to_features(examples,
                                            args.tokenizer,
                                            args,
                                            stage="test")

    def _clipped(attr):
        rows = [
            getattr(feat, attr)[:args.max_source_length] for feat in features
        ]
        return torch.tensor(rows, dtype=torch.long)

    all_source_ids = _clipped('source_ids')
    all_source_mask = _clipped('source_mask')
    return TensorDataset(all_source_ids, all_source_mask)
コード例 #17
0
ファイル: app.py プロジェクト: ThilinaRajapakse/debunk-api
def tokenize(sentence):
    """Convert one sentence into a dict of model input tensors.

    Newlines are stripped from ``sentence``, which is then wrapped in a
    single ``InputExample`` with a placeholder label ``'0'`` and converted
    with ``convert_examples_to_features``. The module-level names
    ``max_seq_len``, ``tokenizer``, ``output_mode`` and ``eval_batch_size``
    are read from the enclosing module.

    Args:
        sentence: Text to encode.

    Returns:
        Dict with keys ``'input_ids'``, ``'attention_mask'``,
        ``'token_type_ids'`` and ``'labels'``, each a ``torch.long`` tensor
        taken from the single batch produced for this sentence.
    """
    sentence = sentence.replace('\n', '')
    # One example only; guid 0 and label '0' are placeholders.
    test_examples = [InputExample(0, sentence, None, '0')]
    label_list = ["0", "1"]

    # NOTE(review): the previous code computed these flags by comparing the
    # string literal 'model_type' with 'xlnet' / 'roberta', which is always
    # False. The resulting constant values are written out explicitly here so
    # behaviour is unchanged. If a real model-type setting was intended, wire
    # it in deliberately. pad_on_left was already hard-coded to True.
    test_features = convert_examples_to_features(
        test_examples,
        label_list,
        max_seq_len,
        tokenizer,
        output_mode,
        cls_token_at_end=False,
        cls_token=tokenizer.cls_token,
        cls_token_segment_id=0,
        sep_token=tokenizer.sep_token,
        sep_token_extra=False,
        pad_on_left=True,
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=0)

    all_input_ids = torch.tensor([f.input_ids for f in test_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in test_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in test_features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in test_features],
                                 dtype=torch.long)

    test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                              all_label_ids)

    test_sampler = SequentialSampler(test_data)
    eval_dataloader = DataLoader(test_data,
                                 sampler=test_sampler,
                                 batch_size=eval_batch_size)

    # A single example yields exactly one batch, so take it directly instead
    # of looping and returning whatever the last iteration left behind.
    batch = next(iter(eval_dataloader))
    return {
        'input_ids': batch[0],
        'attention_mask': batch[1],
        'token_type_ids': batch[2],
        'labels': batch[3]
    }
コード例 #18
0
def load_and_cached_examples(args,
                             tokenizer_a,
                             tokenizer_b=None,
                             evaluate=False,
                             output_examples=False,
                             is_double=True):
    """Load cached two-input features, or build and cache them, as a dataset.

    The input file is ``args.predict_file`` when ``evaluate`` is true and
    ``args.train_file`` otherwise. The cache file lives next to it and is
    named from the split (``dev``/``train``), the last non-empty
    ``/``-separated part of ``args.model_type`` and ``args.max_seq_length``.

    The cache is used only when it exists, ``args.overwrite_cache`` is false
    and ``output_examples`` is false. Otherwise examples are read with
    ``read_cross_examples``, converted, and the features are saved to the
    cache file.

    Args:
        args: Namespace providing ``predict_file``, ``train_file``,
            ``model_type``, ``max_seq_length`` and ``overwrite_cache``.
        tokenizer_a: Forwarded to ``convert_examples_to_features``.
        tokenizer_b: Forwarded to ``convert_examples_to_features``.
        evaluate: Selects the prediction file / ``dev`` cache name.
        output_examples: When true, also return the examples and features.
        is_double: Forwarded to ``convert_examples_to_features``.

    Returns:
        The ``TensorDataset`` alone, or ``(dataset, examples, features)``
        when ``output_examples`` is true.
    """
    if evaluate:
        input_file, split_name = args.predict_file, 'dev'
    else:
        input_file, split_name = args.train_file, 'train'

    cache_dir = os.path.dirname(input_file)
    model_tag = [part for part in args.model_type.split('/') if part][-1]
    cached_features_file = os.path.join(
        cache_dir,
        'cached_{}_{}_{}'.format(split_name, model_tag,
                                 str(args.max_seq_length)))

    # Requesting the examples forces a rebuild: they are not stored in the cache.
    use_cache = os.path.exists(cached_features_file) and not (
        args.overwrite_cache or output_examples)

    if not use_cache:
        logger.info("Creating features from dataset file at %s", input_file)
        examples = read_cross_examples(input_file=input_file)
        features = convert_examples_to_features(
            examples=examples,
            tokenizer_a=tokenizer_a,
            tokenizer_b=tokenizer_b,
            max_seq_length=args.max_seq_length,
            is_double=is_double)
        logger.info("Saving features into cached file %s",
                    cached_features_file)
        torch.save(features, cached_features_file)
    else:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)

    def _long_column(attr):
        # Gather one attribute across all features into a long tensor.
        return torch.tensor([getattr(feat, attr) for feat in features],
                            dtype=torch.long)

    # Column order matches the order consumers unpack from the dataset.
    columns = [
        _long_column(attr)
        for attr in ('input_a_id', 'input_b_id', 'input_a_mask',
                     'input_b_mask', 'input_a_length', 'input_b_length',
                     'label')
    ]
    dataset = TensorDataset(*columns)

    if not output_examples:
        return dataset
    return dataset, examples, features
コード例 #19
0
def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode):
    """Load cached token-labelling features, or build and cache them.

    The cache file sits in ``args.data_dir`` and is named from ``mode``, the
    last non-empty ``/``-separated part of ``args.model_name_or_path`` and
    ``args.max_seq_length``. On a cache miss (or when
    ``args.overwrite_cache`` is set) examples are read with
    ``read_examples_from_file`` and converted; only processes whose
    ``args.local_rank`` is -1 or 0 write the cache.

    Args:
        args: Namespace providing ``local_rank``, ``data_dir``,
            ``model_name_or_path``, ``max_seq_length``, ``overwrite_cache``
            and ``model_type``.
        tokenizer: Tokenizer supplying ``cls_token``, ``sep_token``,
            ``pad_token`` and ``convert_tokens_to_ids``.
        labels: Label list forwarded to ``convert_examples_to_features``.
        pad_token_label_id: Label id forwarded for padding positions.
        mode: Split name used for the cache name and passed to
            ``read_examples_from_file``.

    Returns:
        TensorDataset of ``torch.long`` tensors: input ids, input mask,
        segment ids and label ids.
    """
    # The barrier guards used to read a name `evaluate` that is neither a
    # parameter nor a local here: a NameError, or (if a module-level object of
    # that name exists) an always-truthy value that disables the barriers.
    # Derive the flag from `mode` so the barriers apply to the training split.
    # NOTE(review): assumes "train" is the training mode string — confirm
    # against callers.
    evaluate = mode != "train"

    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        "cached_{}_{}_{}".format(
            mode, list(filter(None, args.model_name_or_path.split("/"))).pop(), str(args.max_seq_length)
        ),
    )
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        examples = read_examples_from_file(args.data_dir, mode)
        is_xlnet = args.model_type in ["xlnet"]
        features = convert_examples_to_features(
            examples,
            labels,
            args.max_seq_length,
            tokenizer,
            # xlnet has a cls token at the end
            cls_token_at_end=is_xlnet,
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if is_xlnet else 0,
            sep_token=tokenizer.sep_token,
            # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            sep_token_extra=args.model_type in ["roberta"],
            # pad on the left for xlnet
            pad_on_left=is_xlnet,
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if is_xlnet else 0,
            pad_token_label_id=pad_token_label_id,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
コード例 #20
0
    def prepare_run_examples(self):
        """Build ``self.run_dataloader`` from the processor's dev examples.

        Examples are fetched with ``self.processor.get_dev_examples`` using
        ``self.args.data_dir`` and ``self.args.dev_file`` and stored in
        ``self.run_examples``. They are converted to features (stored in
        ``self.run_features``); when the processor reports a sentence-pair
        task, a ``truncate_seq_pair`` callback delegating to the processor
        is passed along. The features' input ids, input mask and segment ids
        become a ``TensorDataset`` read sequentially in batches of
        ``self.args.eval_batch_size``.
        """
        self.run_examples = self.processor.get_dev_examples(
            self.args.data_dir, self.args.dev_file)

        input_length_arr = []
        extra_kwargs = {}
        if self.processor.is_pair():

            def truncate_seq_pair(tokens_a, tokens_b, max_length):
                # Delegate pair truncation to the task processor.
                return self.processor.truncate_seq_pair(
                    tokens_a, tokens_b, max_length)

            extra_kwargs['truncate_seq_pair'] = truncate_seq_pair

        self.run_features = convert_examples_to_features(
            self.run_examples,
            self.label_list,
            self.args.max_seq_length,
            self.tokenizer,
            self.output_mode,
            self.logger,
            input_length_arr,
            **extra_kwargs)

        def _long_column(attr):
            # Collect one attribute from every feature as a long tensor.
            return torch.tensor(
                [getattr(feat, attr) for feat in self.run_features],
                dtype=torch.long)

        input_ids = _long_column('input_ids')
        input_mask = _long_column('input_mask')
        segment_ids = _long_column('segment_ids')

        # No labels here: the dataset only carries model inputs.
        run_data = TensorDataset(input_ids, input_mask, segment_ids)
        # Run prediction for full data
        self.run_dataloader = DataLoader(
            run_data,
            sampler=SequentialSampler(run_data),
            batch_size=self.args.eval_batch_size)
コード例 #21
0
    def load_and_cache_examples(self,
                                task,
                                tokenizer,
                                sentence,
                                evaluate=False):
        """Wrap a single sentence in a one-example ``TensorDataset``.

        The ``task`` argument is overridden by ``self.args['task_name']``,
        and ``evaluate`` is not read. The sentence becomes one
        ``InputExample`` with guid ``dev-0`` and a placeholder label ``'1'``.
        Nothing is read from or written to a cache in this method.

        Args:
            task: Ignored; replaced by ``self.args['task_name']``.
            tokenizer: Tokenizer supplying ``cls_token`` and ``sep_token``.
            sentence: Text used as ``text_a`` of the example.
            evaluate: Unused.

        Returns:
            TensorDataset of input ids, input mask, segment ids (all
            ``torch.long``) and label ids (``torch.long`` for
            classification, ``torch.float`` for regression).
        """
        task = self.args['task_name']
        processor = processors[task]()
        output_mode = self.args['output_mode']

        set_type = "dev"
        single_example = InputExample(guid="%s-%s" % (set_type, 0),
                                      text_a=sentence,
                                      text_b=None,
                                      label='1')
        examples = [single_example]

        label_list = processor.get_labels()
        features = convert_examples_to_features(
            examples,
            label_list,
            self.args['max_seq_length'],
            tokenizer,
            output_mode,
            # xlnet has a cls token at the end
            cls_token_at_end=bool(self.args['model_type'] in ['xlnet']),
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=(
                2 if self.args['model_type'] in ['xlnet'] else 0),
            # pad on the left for xlnet
            pad_on_left=bool(self.args['model_type'] in ['xlnet']),
            pad_token_segment_id=(
                4 if self.args['model_type'] in ['xlnet'] else 0))

        def _long_column(attr):
            # Collect one attribute from every feature as a long tensor.
            return torch.tensor([getattr(feat, attr) for feat in features],
                                dtype=torch.long)

        input_ids = _long_column('input_ids')
        input_mask = _long_column('input_mask')
        segment_ids = _long_column('segment_ids')

        # Label dtype follows the task type; other modes leave it unset.
        if output_mode == "classification":
            all_label_ids = torch.tensor([feat.label_id for feat in features],
                                         dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([feat.label_id for feat in features],
                                         dtype=torch.float)

        return TensorDataset(input_ids, input_mask, segment_ids,
                             all_label_ids)
コード例 #22
0
def matres_train_examples(tokenizer,
                          lm='roberta',
                          mask_events=False,
                          mask_context=False):
    """Return MATRES training examples, their tensor dataset and features.

    The first element returned by ``matres_examples()`` is converted with
    ``convert_examples_to_features`` using the module constants
    ``MAX_SEQ_LENGTH`` and ``DOC_STRIDE``.

    Args:
        tokenizer: Forwarded to ``convert_examples_to_features``.
        lm: Passed to ``make_tensor_dataset`` as ``model``.
        mask_events: Forwarded to ``convert_examples_to_features``.
        mask_context: Forwarded to ``convert_examples_to_features``.

    Returns:
        Tuple ``(examples, dataset, features)``.
    """
    examples, _ = matres_examples()

    conversion_kwargs = dict(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=MAX_SEQ_LENGTH,
        doc_stride=DOC_STRIDE,
        mask_events=mask_events,
        mask_context=mask_context,
    )
    examples, features = convert_examples_to_features(**conversion_kwargs)

    dataset = make_tensor_dataset(features, model=lm)
    return examples, dataset, features
コード例 #23
0
def load_and_cache_examples(args, tokenizer, processor, mode):
    """Load cached features, or build and cache them, as a TensorDataset.

    The cache file sits in ``args.data_dir`` and is named from ``mode``, the
    last non-empty ``/``-separated part of ``args.model_name_or_path`` and
    ``args.max_seq_length``. On a cache miss (or when
    ``args.overwrite_cache`` is set) the processor supplies training
    examples when ``mode == 'train'`` and dev examples otherwise; only
    processes whose ``args.local_rank`` is -1 or 0 write the cache.

    Args:
        args: Namespace providing ``local_rank``, ``data_dir``,
            ``model_name_or_path``, ``max_seq_length`` and
            ``overwrite_cache``.
        tokenizer: Forwarded to ``convert_examples_to_features``.
        processor: Provides ``get_train_examples`` / ``get_dev_examples`` and
            is also forwarded to ``convert_examples_to_features``.
        mode: ``'train'`` selects training examples; anything else selects
            dev examples.

    Returns:
        TensorDataset of ``torch.long`` tensors: input ids, attention mask,
        token type ids and labels.
    """
    # The barrier guards used to read a name `evaluate` that is neither a
    # parameter nor a local here: a NameError, or (if a module-level object of
    # that name exists) an always-truthy value that disables the barriers.
    # Derive the flag from `mode`, matching the train/dev switch used below.
    evaluate = mode != 'train'

    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training process
        # the dataset, and the others will use the cache
        torch.distributed.barrier()

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        "cached_{}_{}_{}".format(
            mode,
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length)),
    )
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        if mode == 'train':
            examples = processor.get_train_examples(args.data_dir)
        else:
            examples = processor.get_dev_examples(args.data_dir)
        features = convert_examples_to_features(
            examples,
            tokenizer,
            processor,
            args.max_seq_length,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training process
        # the dataset, and the others will use the cache
        torch.distributed.barrier()

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.attention_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.token_type_ids for f in features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_label_ids)
    return dataset
コード例 #24
0
def matres_dev_examples(tokenizer,
                        lm='roberta',
                        mask_events=False,
                        mask_context=False):
    """Return MATRES dev examples and their tensor dataset.

    The second element returned by ``matres_examples()`` is converted with
    ``convert_examples_to_features`` using the module constants
    ``MAX_SEQ_LENGTH`` and ``DOC_STRIDE`` and the id prefix ``"md"``.

    Args:
        tokenizer: Forwarded to ``convert_examples_to_features``.
        lm: Passed to ``make_tensor_dataset`` as ``model``.
        mask_events: Forwarded to ``convert_examples_to_features``.
        mask_context: Forwarded to ``convert_examples_to_features``.

    Returns:
        Tuple ``(examples, dataset)``.
    """
    _, examples = matres_examples()

    conversion_kwargs = dict(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=MAX_SEQ_LENGTH,
        doc_stride=DOC_STRIDE,
        mask_events=mask_events,
        mask_context=mask_context,
        id_prefix="md",
    )
    examples, features = convert_examples_to_features(**conversion_kwargs)

    dataset = make_tensor_dataset(features, model=lm)
    return examples, dataset
コード例 #25
0
def udst(tokenizer,
         lm='roberta',
         split="train",
         example_dir="udst/all_annotations/",
         mask_events=False,
         mask_context=False):
    """Return UDS-T examples, their tensor dataset and features.

    Examples come from ``parse_udst.get_examples`` for the given directory
    and split, and are converted with ``convert_examples_to_features`` using
    the module constants ``MAX_SEQ_LENGTH`` and ``DOC_STRIDE`` with
    ``mask=False``.

    Args:
        tokenizer: Forwarded to ``convert_examples_to_features``.
        lm: Passed to ``make_tensor_dataset`` as ``model``.
        split: Split name passed to ``parse_udst.get_examples``.
        example_dir: Directory passed to ``parse_udst.get_examples``.
        mask_events: Forwarded to ``convert_examples_to_features``.
        mask_context: Forwarded to ``convert_examples_to_features``.

    Returns:
        Tuple ``(examples, dataset, features)``.
    """
    raw_examples = parse_udst.get_examples(example_dir=example_dir,
                                           split=split)

    conversion_kwargs = dict(
        examples=raw_examples,
        tokenizer=tokenizer,
        max_seq_length=MAX_SEQ_LENGTH,
        doc_stride=DOC_STRIDE,
        mask=False,
        mask_events=mask_events,
        mask_context=mask_context,
    )
    examples, features = convert_examples_to_features(**conversion_kwargs)

    dataset = make_tensor_dataset(features, model=lm)
    return examples, dataset, features
コード例 #26
0
def load_and_cache_examples(task, tokenizer, evaluate=False, input_file="train"):
    """Load cached features for ``task``, or build and cache them.

    Settings are read from the module-level ``args`` dict. The cache file
    sits in ``args['data_dir']`` and is named from the split (``dev`` when
    ``evaluate`` is true, else ``train``), ``args['model_name']``,
    ``args['max_seq_length']`` and ``task``. It is reused unless
    ``args['reprocess_input_data']`` is set.

    Args:
        task: Key into ``processors``; also part of the cache file name.
        tokenizer: Tokenizer supplying ``cls_token``, ``sep_token``,
            ``pad_token`` and ``convert_tokens_to_ids``.
        evaluate: Selects dev examples instead of training examples.
        input_file: Passed to the processor's example getter.

    Returns:
        TensorDataset of input ids, input mask, segment ids (all
        ``torch.long``) and label ids (``torch.long`` for classification,
        ``torch.float`` for regression).
    """
    processor = processors[task]()
    output_mode = args['output_mode']

    mode = 'dev' if evaluate else 'train'
    data_dir = args['data_dir']
    cached_features_file = os.path.join(
        data_dir,
        f"cached_{mode}_{args['model_name']}_{args['max_seq_length']}_{task}")

    cache_hit = os.path.exists(cached_features_file) and not args['reprocess_input_data']
    if not cache_hit:
        logger.info("Creating features from dataset file at %s", args['data_dir'])
        label_list = processor.get_labels()
        if evaluate:
            examples = processor.get_dev_examples(args['data_dir'], input_file)
        else:
            examples = processor.get_train_examples(args['data_dir'], input_file)  # teghub

        # NOTE(review): features are only built when this module runs as the
        # main script; otherwise `features` is unbound at the save below.
        # Kept as-is — confirm whether this guard is intentional.
        if __name__ == "__main__":
            is_xlnet = args['model_type'] in ['xlnet']
            features = convert_examples_to_features(
                examples,
                label_list,
                args['max_seq_length'],
                tokenizer,
                output_mode,
                # xlnet has a cls token at the end
                cls_token_at_end=is_xlnet,
                cls_token=tokenizer.cls_token,
                cls_token_segment_id=2 if is_xlnet else 0,
                sep_token=tokenizer.sep_token,
                # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                sep_token_extra=args['model_type'] in ['roberta'],
                # pad on the left for xlnet
                pad_on_left=is_xlnet,
                pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
                pad_token_segment_id=4 if is_xlnet else 0,
            )

        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)
    else:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)

    def _long_column(attr):
        # Collect one attribute from every feature as a long tensor.
        return torch.tensor([getattr(feat, attr) for feat in features], dtype=torch.long)

    input_ids = _long_column('input_ids')
    input_mask = _long_column('input_mask')
    segment_ids = _long_column('segment_ids')

    # Label dtype follows the task type; other modes leave it unset.
    if output_mode == "classification":
        all_label_ids = torch.tensor([feat.label_id for feat in features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([feat.label_id for feat in features], dtype=torch.float)

    return TensorDataset(input_ids, input_mask, segment_ids, all_label_ids)
コード例 #27
0
def distant_test_examples(tokenizer,
                          lm='roberta',
                          train=False,
                          mask=False,
                          mask_events=False):
    """Load pickled distant-supervision test examples as a tensor dataset.

    Examples are unpickled from ``timex/orig/test_exs.pkl`` and converted
    with ``convert_examples_to_features`` using the module constants
    ``MAX_SEQ_LENGTH`` and ``DOC_STRIDE``.

    Args:
        tokenizer: Forwarded to ``convert_examples_to_features``.
        lm: Passed to ``make_tensor_dataset`` as ``model``.
        train: Unused; kept for interface compatibility.
        mask: When truthy, the string ``'distant'`` is passed as ``mask``;
            otherwise the given falsy value is passed through unchanged.
        mask_events: Forwarded to ``convert_examples_to_features``.

    Returns:
        Tuple ``(examples, dataset)``.
    """
    # Use a context manager so the file handle is closed even if unpickling
    # fails; it was previously left open.
    # NOTE(review): pickle.load executes arbitrary code from the file — only
    # safe as long as this path is a trusted, locally produced artifact.
    with open('timex/orig/test_exs.pkl', 'rb') as f:
        exs = pickle.load(f)
    if mask:
        mask = 'distant'
    exs, feats = convert_examples_to_features(examples=exs,
                                              tokenizer=tokenizer,
                                              max_seq_length=MAX_SEQ_LENGTH,
                                              doc_stride=DOC_STRIDE,
                                              mask=mask,
                                              mask_events=mask_events)
    data = make_tensor_dataset(feats, model=lm)
    return exs, data
コード例 #28
0
def udst_majority(tokenizer,
                  lm='roberta',
                  example_dir="udst/all_annotations/",
                  split="dev",
                  mask_events=False,
                  ties=True):
    """Return UDS-T majority examples and their tensor dataset.

    Examples come from ``parse_udst.get_majority_examples`` for the given
    directory, split and ``ties`` setting, and are converted with
    ``convert_examples_to_features`` using the module constants
    ``MAX_SEQ_LENGTH`` and ``DOC_STRIDE`` with ``mask=False``.

    Args:
        tokenizer: Forwarded to ``convert_examples_to_features``.
        lm: Passed to ``make_tensor_dataset`` as ``model``.
        example_dir: Directory passed to the example parser.
        split: Split name passed to the example parser.
        mask_events: Forwarded to ``convert_examples_to_features``.
        ties: Forwarded to ``parse_udst.get_majority_examples``.

    Returns:
        Tuple ``(examples, dataset)``.
    """
    raw_examples = parse_udst.get_majority_examples(
        example_dir=example_dir, split=split, ties=ties)

    conversion_kwargs = dict(
        examples=raw_examples,
        tokenizer=tokenizer,
        max_seq_length=MAX_SEQ_LENGTH,
        doc_stride=DOC_STRIDE,
        mask=False,
        mask_events=mask_events,
    )
    examples, features = convert_examples_to_features(**conversion_kwargs)

    dataset = make_tensor_dataset(features, model=lm)
    return examples, dataset
コード例 #29
0
def matres_test_examples(tokenizer,
                         lm='roberta',
                         mask_events=False,
                         mask_context=False):
    """Return MATRES test examples and their tensor dataset.

    Examples are read by a ``MatresLoader`` from ``timebank/te3-platinum/``
    (documents) and ``timebank/MATRES/`` (relations), then converted with
    ``convert_examples_to_features`` using the module constants
    ``MAX_SEQ_LENGTH`` and ``DOC_STRIDE`` and the id prefix ``"mt"``.

    Args:
        tokenizer: Forwarded to ``convert_examples_to_features``.
        lm: Passed to ``make_tensor_dataset`` as ``model``.
        mask_events: Forwarded to ``convert_examples_to_features``.
        mask_context: Forwarded to ``convert_examples_to_features``.

    Returns:
        Tuple ``(examples, dataset)``.
    """
    raw_examples = MatresLoader().read_test_examples(
        doc_dir="timebank/te3-platinum/", rel_dir="timebank/MATRES/")

    conversion_kwargs = dict(
        examples=raw_examples,
        tokenizer=tokenizer,
        max_seq_length=MAX_SEQ_LENGTH,
        doc_stride=DOC_STRIDE,
        mask_events=mask_events,
        mask_context=mask_context,
        id_prefix="mt",
    )
    examples, features = convert_examples_to_features(**conversion_kwargs)

    dataset = make_tensor_dataset(features, model=lm)
    return examples, dataset
コード例 #30
0
def load_and_cache_test_samples(args, task, tokenizer, evaluate=False):
    """Load cached test features for ``task``, or build and cache them.

    The cache file sits in ``args.data_dir`` and is named from the literal
    split ``test``, the last non-empty ``/``-separated part of
    ``args.model_name_or_path``, ``args.max_seq_length`` and ``task``. An
    existing cache file is always reused. Otherwise the processor's test
    examples are converted with ``predict=True`` and the features are saved.

    Args:
        args: Namespace providing ``data_dir``, ``model_name_or_path``,
            ``max_seq_length`` and ``model_type``.
        task: Key into ``processors`` and ``output_modes``.
        tokenizer: Tokenizer supplying ``pad_token`` and
            ``convert_tokens_to_ids``.
        evaluate: Unused.

    Returns:
        TensorDataset of ``torch.long`` tensors: input ids, attention mask
        and token type ids (no labels).
    """
    processor = processors[task]()
    output_mode = output_modes[task]

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        'cached_{}_{}_{}_{}'.format(
            'test',
            [part for part in args.model_name_or_path.split('/') if part][-1],
            str(args.max_seq_length),
            str(task)))

    if not os.path.exists(cached_features_file):
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        examples = processor.get_test_examples(args.data_dir)
        is_xlnet = args.model_type in ['xlnet']
        pad_token_id = tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0]
        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=args.max_seq_length,
            output_mode=output_mode,
            # pad on the left for xlnet
            pad_on_left=is_xlnet,
            pad_token=pad_token_id,
            pad_token_segment_id=4 if is_xlnet else 0,
            predict=True)
        logger.info("Saving features into cached file %s",
                    cached_features_file)
        torch.save(features, cached_features_file)
    else:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)

    def _long_column(attr):
        # Collect one attribute from every feature as a long tensor.
        return torch.tensor([getattr(feat, attr) for feat in features],
                            dtype=torch.long)

    # Convert to Tensors and build dataset
    columns = [
        _long_column(attr)
        for attr in ('input_ids', 'attention_mask', 'token_type_ids')
    ]
    return TensorDataset(*columns)