Example #1
def load_and_cache_examples(args, task, tokenizer, evaluate=False, TEST=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache

    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    if TEST:
        cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
            'test',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length),
            str(task)))
    else:
        cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length),
            str(task)))
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
            # HACK(label indices are swapped in RoBERTa pretrained model)
            label_list[1], label_list[2] = label_list[2], label_list[1] 
        if TEST:
            examples = processor.get_test_examples(args.data_dir)
        else:
            examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
        features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode,
            cls_token_at_end=bool(args.model_type in ['xlnet']),            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=bool(args.model_type in ['roberta']),           # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
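
The TensorDataset returned by these loaders is normally wrapped in a DataLoader before training or evaluation. A minimal sketch of that step, in the style of the usual run_glue.py setup (the helper name build_dataloader and its arguments are illustrative, not part of the examples on this page):

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

def build_dataloader(dataset, batch_size, evaluate=False):
    # Shuffle during training; keep the original order for evaluation.
    sampler = SequentialSampler(dataset) if evaluate else RandomSampler(dataset)
    return DataLoader(dataset, sampler=sampler, batch_size=batch_size)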
Example #2
    def get_training_dataset(self,
                             train_batch_size,
                             dataset_folder="/home/phillab/glue_data/CoLA"):
        cola_proc = utils_glue.ColaProcessor()
        label_list = cola_proc.get_labels()
        max_seq_length = 400

        examples = cola_proc.get_train_examples(dataset_folder)
        features = utils_glue.convert_examples_to_features(
            examples, label_list, max_seq_length, self.tokenizer,
            "classification")
        all_input_ids = torch.tensor([f.input_ids for f in features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.long)

        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_label_ids)
        dataloader = DataLoader(dataset=dataset,
                                batch_size=train_batch_size,
                                sampler=RandomSampler(dataset),
                                drop_last=True)
        return dataloader
def load_examples(contents, max_seq_length, tokenizer, label_list):
    """
    :param contents:  eg: [('苹果很好用', '苹果')]
    :param max_seq_length:
    :param tokenizer:  初始化后的tokenizer
    :param label_list:
    :return:
    """
    examples = []
    for guid, content in enumerate(contents):
        sentence, aspect = content
        examples.append(InputExample(guid=guid, text_a=sentence,
                                     text_b=aspect))
    features = convert_examples_to_features(examples,
                                            label_list,
                                            max_seq_length,
                                            tokenizer,
                                            output_mode="classification",
                                            cls_token_segment_id=0,
                                            pad_token_segment_id=0)
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids)
    return dataset
def load_and_cache_examples(args, task, tokenizer, evaluate=False):

    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir, 'cached_{}_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length), str(task)))

    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)

    else:

        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()

        examples = processor.get_dev_examples(
            args.data_dir) if evaluate else processor.get_train_examples(
                args.data_dir)

        features = convert_examples_to_features(
            examples,
            label_list,
            args.max_seq_length,
            tokenizer,
            output_mode,
            cls_token_at_end=False,  # BERT-style: the cls token is at the start (xlnet would put it at the end)
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=False,  # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=False,  # BERT-style right padding (xlnet pads on the left)
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=0,
        )

        logger.info("Saving features into cached file %s",
                    cached_features_file)
        torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features],
                                 dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_label_ids)
    return dataset
Example #5
def load_and_cache_examples(args,
                            task,
                            tokenizer,
                            evaluate=False,
                            is_aux=False):
    """
    :param args:
    :param task:  eg : 任务名称
    :param tokenizer:  加载好的tokenizer
    :param evaluate: bool,是否是评估数据集还是训练集 True: 代表加载dev.tsv,
    :param is_aux: bool
    :return:
    """
    if is_aux:
        data_dir = args.aux_data_dir
    else:
        data_dir = args.data_dir
    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    # If the data should be truncated already during preprocessing, the maximum sequence length must be passed in
    processor.max_seq_length = args.max_seq_length
    cached_features_file = os.path.join(
        data_dir, 'cached_{}_{}_{}'.format('dev' if evaluate else 'train',
                                           str(args.max_seq_length),
                                           str(task)))
    if os.path.exists(cached_features_file):
        logger.info("从缓存加载features %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("没有发现缓存features,根据datafile生成features %s", data_dir)
        label_list = processor.get_labels()
        examples = processor.get_dev_examples(
            data_dir) if evaluate else processor.get_train_examples(data_dir)
        features = convert_examples_to_features(examples,
                                                label_list,
                                                args.max_seq_length,
                                                tokenizer,
                                                output_mode,
                                                cls_token_segment_id=0,
                                                pad_token_segment_id=0)
        if args.local_rank in [-1, 0]:
            logger.info("保存feature到缓存文件 %s", cached_features_file)
            torch.save(features, cached_features_file)
    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.float)
    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_label_ids)
    return dataset
Example #6
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    processor = processors[task]()
    output_mode = output_modes[task]
    cached_features_file = os.path.join(
        args.data_dir, 'cached_{}'.format('dev' if evaluate else 'train'))
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        # logger.info('Loading features from cached file {}'.format(cached_features_file))
        features = torch.load(cached_features_file)
    else:
        # logger.info('Creating features from dataset file at {}'.format(args.data_dir))
        label_list = processor.get_labels()
        examples = processor.get_dev_examples(
            args.data_dir) if evaluate else processor.get_train_examples(
                args.data_dir)

        # Augment the training data with pair-reversed copies ("double reverse"): text_a and text_b are swapped; evaluation data is left as-is
        if evaluate:
            examples_reverse = []
        else:
            examples_reverse = [
                InputExample(pid=example.pid,
                             text_a=example.text_b,
                             text_b=example.text_a,
                             aug_a=example.aug_a,
                             aug_b=example.aug_b,
                             label=example.label) for example in examples
            ]
        features = convert_examples_to_features(examples + examples_reverse,
                                                label_list,
                                                args.max_seq_length,
                                                tokenizer,
                                                output_mode,
                                                cls_token_at_end=True,
                                                cls_token=tokenizer.cls_token,
                                                sep_token=tokenizer.sep_token,
                                                cls_token_segment_id=2,
                                                pad_on_left=True,
                                                pad_token_segment_id=4,
                                                augment=args.augment)
        # logger.info('Saving features into cached file {}'.format(cached_features_file))
        torch.save(features, cached_features_file)

    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.float)
    all_pids = torch.tensor([int(f.pid) for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_label_ids, all_pids)
    return dataset
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        "cached_{}_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
            str(task),
        ),
    )
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        examples = (processor.get_dev_examples(args.data_dir) if evaluate else
                    processor.get_train_examples(args.data_dir))
        features = convert_examples_to_features(
            examples,
            label_list,
            args.max_seq_length,
            tokenizer,
            output_mode,
            cls_token_at_end=bool(args.model_type in ["xlnet"]),  # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=2 if args.model_type in ["xlnet"] else 1,
            pad_on_left=bool(
                args.model_type in ["xlnet"]),  # pad on the left for xlnet
            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_label_ids)
    return dataset
def load_predict_dataset(dataframe, args, task, tokenizer, evaluate=False):
    """


    :param dataframe: dataframe containing columns like those specified in train.csv and dev.csv
    :param args:
    :param task:
    :param tokenizer:
    :param evaluate:
    :return: an instance of TensorDataset
    """
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache

    processor = processors[task]()
    output_mode = output_modes[task]

    logger.info("Creating features from dataframe size={0}".format(len(dataframe)))
    label_list = processor.get_labels()

    logger.info('label_list={0}'.format(label_list))

    if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
        # HACK(label indices are swapped in RoBERTa pretrained model)
        label_list[1], label_list[2] = label_list[2], label_list[1]

    examples = processor.get_test_examples(dataframe)

    logger.info('Examples={0}'.format(examples))

    features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode,
        cls_token_at_end=bool(args.model_type in ['xlnet']),            # xlnet has a cls token at the end
        cls_token=tokenizer.cls_token,
        cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
        sep_token=tokenizer.sep_token,
        sep_token_extra=bool(args.model_type in ['roberta']),           # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
        pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
    )

    logger.info('Features={}'.format(features))

    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
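
After load_predict_dataset builds the TensorDataset, inference typically walks over it in order and collects the logits. A minimal sketch, assuming a pytorch-transformers-style model that returns a tuple with the logits first (the function name predict and its arguments are illustrative):

import numpy as np
import torch
from torch.utils.data import DataLoader, SequentialSampler

def predict(model, dataset, batch_size, device):
    dataloader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=batch_size)
    model.eval()
    all_logits = []
    for batch in dataloader:
        # Batch layout matches the TensorDataset built above: ids, mask, segments, labels.
        input_ids, input_mask, segment_ids = (t.to(device) for t in batch[:3])
        with torch.no_grad():
            logits = model(input_ids, attention_mask=input_mask, token_type_ids=segment_ids)[0]
        all_logits.append(logits.detach().cpu().numpy())
    return np.concatenate(all_logits, axis=0)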
Example #9
def load_examples(contents, max_seq_length, tokenizer, label_list,
                  prefix_name):
    """
    :param contents:  eg: [('苹果很好用', '苹果')]  或者 [('苹果很好用', '苹果', '积极')]
    :param max_seq_length:
    :param tokenizer:  初始化后的tokenizer
    :param label_list:
    :return:
    """
    examples = []
    for guid, content in enumerate(contents):
        if len(content) == 2:
            # Only a sentence and an aspect keyword: used for prediction
            sentence, aspect = content
            sentence = prefix_name + sentence
            examples.append(
                InputExample(guid=guid, text_a=sentence, text_b=aspect))
        elif len(content) == 3:
            # Sentence, aspect keyword, and label: typically used for training
            sentence, aspect, label = content
            sentence = prefix_name + sentence
            examples.append(
                InputExample(guid=guid,
                             text_a=sentence,
                             text_b=aspect,
                             label=label))
        elif len(content) == 5:
            # Sentence, aspect, start, end, and label: start and end give the aspect's position; typically used for training
            sentence, aspect, start, end, label = content
            sentence = prefix_name + sentence
            examples.append(
                InputExample(guid=guid,
                             text_a=sentence,
                             text_b=aspect,
                             label=label))
        else:
            print(f"这条数据有问题,过滤掉: {guid}: {content}")
    features = convert_examples_to_features(examples,
                                            label_list,
                                            max_seq_length,
                                            tokenizer,
                                            output_mode="classification",
                                            cls_token_segment_id=0,
                                            pad_token_segment_id=0)
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features],
                                 dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_label_ids)
    return dataset
def load_and_cache_examples_line_list(lines_list,
                                      args,
                                      task,
                                      tokenizer,
                                      evaluate=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache
    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    label_list = processor.get_labels()
    if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
        # HACK(label indices are swapped in RoBERTa pretrained model)
        label_list[1], label_list[2] = label_list[2], label_list[1]
    examples = processor.get_predict_examples(lines_list=lines_list)
    # examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
    features = convert_examples_to_features(
        examples,
        label_list,
        args.max_seq_length,
        tokenizer,
        output_mode,
        cls_token_at_end=bool(args.model_type in ['xlnet']),
        # xlnet has a cls token at the end
        cls_token=tokenizer.cls_token,
        cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
        sep_token=tokenizer.sep_token,
        sep_token_extra=bool(args.model_type in ['roberta']),
        # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
        pad_on_left=bool(
            args.model_type in ['xlnet']),  # pad on the left for xlnet
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
    )

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_label_ids)
    return dataset
Example #11
def load_and_cache_examples(args,
                            task,
                            tokenizer,
                            evaluate=False,
                            is_aux=False):
    if is_aux:
        data_dir = args.aux_data_dir
    else:
        data_dir = args.data_dir
    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        data_dir, 'cached_{}_{}_{}'.format('dev' if evaluate else 'train',
                                           str(args.max_seq_length),
                                           str(task)))
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", data_dir)
        label_list = processor.get_labels()
        examples = processor.get_dev_examples(
            data_dir) if evaluate else processor.get_train_examples(data_dir)
        features = convert_examples_to_features(examples,
                                                label_list,
                                                args.max_seq_length,
                                                tokenizer,
                                                output_mode,
                                                cls_token_segment_id=0,
                                                pad_token_segment_id=0)
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(features, cached_features_file)
    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.float)
    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_label_ids)
    return dataset
Example #12
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    def transform_examples_to_hr(exmpls):
        examples_hr = [
            '[CLS] ' + exp.text_a + ' [SEP] ' + exp.text_b + ' [LABEL] ' +
            exp.label for exp in exmpls
        ]
        return examples_hr

    processor = processors[task]()
    output_mode = output_modes[task]

    examples = None
    tokenized_examples = None

    logger.info("Creating features from dataset file at %s", args.data_dir)
    label_list = processor.get_labels()
    examples = processor.get_dev_examples(
        args.data_dir) if evaluate else processor.get_train_examples(
            args.data_dir)
    features, tokenized_examples = convert_examples_to_features(
        examples,
        label_list,
        args.max_seq_length,
        tokenizer,
        output_mode,
        cls_token_at_end=bool(
            args.model_type in ['xlnet']),  # xlnet has a cls token at the end
        cls_token=tokenizer.cls_token,
        sep_token=tokenizer.sep_token,
        cls_token_segment_id=2 if args.model_type in ['xlnet'] else 1,
        pad_on_left=bool(
            args.model_type in ['xlnet']),  # pad on the left for xlnet
        pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_label_ids)
    return dataset, transform_examples_to_hr(examples)
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache
    
    if args.task_name == "multilabel":
        processor = processors[task](args.data_dir)
    else:
        processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
        'dev' if evaluate else 'train',
        list(filter(None, args.model_name_or_path.split('/'))).pop(),
        str(args.max_seq_length),
        str(task)))
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
        features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode,
            cls_token_at_end=bool(args.model_type in ['xlnet']),            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
            pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.float)
    elif output_mode == "multi-classification":
        all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
def load_examples(processor, list_examples, tokenizer, LABELS=False):
    output_mode = "classification"
    # Load data features from the list of provided examples
    label_list = processor.get_labels()
    examples = processor.get_examples(list_examples, "test", LABELS)
    max_seq_length = 128
    model_type = "bert"
    features = convert_examples_to_features(
        examples,
        label_list,
        max_seq_length,
        tokenizer,
        output_mode,
        cls_token_at_end=bool(
            model_type in ['xlnet']),  # xlnet has a cls token at the end
        cls_token=tokenizer.cls_token,
        cls_token_segment_id=2 if model_type in ['xlnet'] else 0,
        sep_token=tokenizer.sep_token,
        sep_token_extra=bool(
            model_type in ['roberta']
        ),  # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
        pad_on_left=bool(model_type in ['xlnet']),  # pad on the left for xlnet
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=4 if model_type in ['xlnet'] else 0,
    )

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    if LABELS:
        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in features],
                                         dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in features],
                                         dtype=torch.float)

        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_label_ids)
    else:
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids)
    return dataset
Example #15
def load_and_cache_pred_examples(args, task, tokenizer):
    processor = processors[task]()
    output_mode = output_modes[task]

    logger.info("Creating features from dataset file at %s", args.data_dir)
    label_list = processor.get_labels()
    if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
        # HACK(label indices are swapped in RoBERTa pretrained model)
        label_list[1], label_list[2] = label_list[2], label_list[1]
    examples = processor.get_predict_samples(args.data_dir)
    features = convert_examples_to_features(
        examples,
        label_list,
        args.max_seq_length,
        tokenizer,
        output_mode,
        cls_token_at_end=bool(args.model_type in ['xlnet']),
        # xlnet has a cls token at the end
        cls_token=tokenizer.cls_token,
        cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
        sep_token=tokenizer.sep_token,
        sep_token_extra=bool(args.model_type in ['roberta']),
        # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
        pad_on_left=bool(
            args.model_type in ['xlnet']),  # pad on the left for xlnet
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
    )

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_label_ids)
    return dataset
def load_and_cache_examples(args,
                            task,
                            tokenizer,
                            input1,
                            input2,
                            evaluate=False):

    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    label_list = processor.get_labels()

    examples = processor._create_example_for_test(input1, input2)

    features = convert_examples_to_features(
        examples,
        label_list,
        args["max_seq_length"],
        tokenizer,
        output_mode,
        cls_token_at_end=False,
        cls_token=tokenizer.cls_token,
        cls_token_segment_id=0,
        sep_token=tokenizer.sep_token,
        sep_token_extra=False,
        pad_on_left=False,
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=0,
    )

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)

    return all_input_ids, all_input_mask, all_segment_ids
Example #17
def load_and_cache_examples(args, task, tokenizer, set_type='train'):
    processor = processors[task]()
    output_mode = output_modes[task]
    is_multi_choice = (output_mode == 'multi-choice')
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(args.data_dir[task], 'cached_{}_{}_{}_{}'.format(
        set_type,
        list(filter(None, args.model_name_or_path.split('/'))).pop(),
        str(MAX_SEQ_LENGTHS[task]),
        str(task)))
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir[task])
        label_list = processor.get_labels()
        if set_type == 'train':
            examples = processor.get_train_examples(args.data_dir[task])
        elif set_type == 'dev':
            examples = processor.get_dev_examples(args.data_dir[task])
        else:
            examples = processor.get_test_examples(args.data_dir[task])
        features = convert_examples_to_features(examples, label_list, MAX_SEQ_LENGTHS[task], tokenizer, output_mode,
            cls_token_at_end=bool(args.model_type in ['xlnet']),            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
            pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
            do_lower_case=args.do_lower_case,
            is_multi_choice=is_multi_choice)
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    dataset = convert_features_to_tensors(features, output_mode, is_multi_choice=is_multi_choice)

    return dataset
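
Example #17 delegates tensor construction to a convert_features_to_tensors helper that is not shown on this page. A minimal sketch of what the single-choice path of such a helper might look like (the multi-choice branch is omitted; field names follow the other examples here):

import torch
from torch.utils.data import TensorDataset

def convert_features_to_tensors(features, output_mode, is_multi_choice=False):
    # Single-choice layout only; a multi-choice version would add a choice dimension.
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    label_dtype = torch.float if output_mode == "regression" else torch.long
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=label_dtype)
    return TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)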
Example #18
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    processor = processors[task]()
    output_mode = output_modes[task]
    data_dir = args.data_dir

    logger.info("Creating features from dataset file at %s", data_dir)
    label_list = processor.get_labels()
    examples = processor.get_dev_examples(
        data_dir) if evaluate else processor.get_train_examples(data_dir)
    features = convert_examples_to_features(examples,
                                            label_list,
                                            args.max_seq_length,
                                            tokenizer,
                                            output_mode,
                                            cls_token_at_end=False,
                                            cls_token=tokenizer.cls_token,
                                            sep_token=tokenizer.sep_token,
                                            cls_token_segment_id=1,
                                            pad_on_left=False,
                                            pad_token_segment_id=0)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_label_ids)
    return dataset
Example #19
def load_and_cache_examples(args, task, tokenizer, set_type):
    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir, 'cached_{}_{}_{}_{}'.format(
            set_type,
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length), str(task)))
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()

        docs = defaultdict(dict)
        with jsonlines.open(os.path.join(args.data_dir, 'docs.jsonl')) as reader:
            for line in reader:
                docs[line["docid"]] = line["document"]

        if set_type == 'train':
            examples = processor.get_train_examples(args.data_dir, docs)
        elif set_type == 'dev':
            examples = processor.get_dev_examples(args.data_dir, docs)
        else:
            examples = processor.get_test_examples(args.data_dir, docs)
        features = convert_examples_to_features(
            examples,
            label_list,
            args.max_seq_length,
            tokenizer,
            output_mode,
            cls_token_at_end=bool(args.model_type in ['xlnet']),
            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=bool(args.model_type in ['roberta']),
            # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=bool(args.model_type in ['xlnet']),
            # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
        )
        logger.info("Saving features into cached file %s",
                    cached_features_file)
        torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_label_ids)
    return dataset
Example #20
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    def transform_examples_to_hr(exmpls):
        examples_hr = [
            '[CLS] ' + exp.text_a + ' [SEP] ' + exp.text_b + ' [LABEL] ' +
            exp.label for exp in exmpls
        ]
        return examples_hr

    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir, 'cached_{}_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length), str(task)))
    examples = None
    tokenized_examples = None
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s",
                    cached_features_file)

        # Temporary code for writing out the textual error analysis; comment this block out for training!
        label_list = processor.get_labels()
        examples = processor.get_dev_examples(
            args.data_dir) if evaluate else processor.get_train_examples(
                args.data_dir)
        features, tokenized_examples = convert_examples_to_features(
            examples,
            label_list,
            args.max_seq_length,
            tokenizer,
            output_mode,
            cls_token_at_end=bool(args.model_type in ['xlnet']),
            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 1,
            pad_on_left=bool(args.model_type in ['xlnet']),
            # pad on the left for xlnet
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)

        # end tmp
        # features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        examples = processor.get_dev_examples(
            args.data_dir) if evaluate else processor.get_train_examples(
                args.data_dir)
        features, tokenized_examples = convert_examples_to_features(
            examples,
            label_list,
            args.max_seq_length,
            tokenizer,
            output_mode,
            cls_token_at_end=bool(args.model_type in ['xlnet']),  # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 1,
            pad_on_left=bool(
                args.model_type in ['xlnet']),  # pad on the left for xlnet
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_label_ids)
    return dataset, transform_examples_to_hr(examples)
Example #21
def load_and_cache_examples(args,
                            task,
                            tokenizer,
                            evaluate=False,
                            dev_evaluate=False):
    data_dir = task_to_data_dir[task]

    if task.startswith("fever"):
        processor = processors["fever"]()
    elif task in nli_task_names:
        processor = processors["nli"](data_dir)
    elif task in ["mnli"]:
        processor = processors["mnli"](hans=args.hans)
    elif task == "mnli-mm":
        processor = processors["mnli-mm"](hans=args.hans)
    elif task.startswith("HANS"):
        processor = processors["hans"](hans=args.hans)
    else:
        processor = processors[task]()

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        data_dir, 'cached_{}_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length), str(task)))
    print("File is: ", cached_features_file)

    if False:  # caching disabled in this example; original condition: os.path.exists(cached_features_file) and args.use_cached_dataset
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", data_dir)
        label_list = processor.get_labels()
        if dev_evaluate:  # and task in nli_task_names:
            examples = processor.get_validation_dev_examples(data_dir)
        else:
            examples = processor.get_dev_examples(data_dir) if evaluate else\
                processor.get_train_examples(data_dir)


        features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, "classification",
            cls_token_at_end=bool(args.model_type in ['xlnet']),            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
            pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
            rubi=args.rubi or args.hypothesis_only or args.focal_loss or args.poe_loss or args.hans_only,
            rubi_text=args.rubi_text, hans=(args.hans and not evaluate) or args.hans_only,\
            hans_features=args.hans_features)

        logger.info("Saving features into cached file %s",
                    cached_features_file)
        torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features],
                                 dtype=torch.long)

    if (args.hans and not evaluate) or args.hans_only:
        all_h_ids = torch.tensor([f.h_ids for f in features], dtype=torch.long)
        all_h_masks = torch.tensor([f.input_mask_h for f in features],
                                   dtype=torch.long)
        all_p_ids = torch.tensor([f.p_ids for f in features], dtype=torch.long)
        all_p_masks = torch.tensor([f.input_mask_p for f in features],
                                   dtype=torch.long)
        all_have_overlap = torch.tensor([f.have_overlap for f in features],
                                        dtype=torch.float)
        all_overlap_rate = torch.tensor([f.overlap_rate for f in features],
                                        dtype=torch.float)
        all_subsequence = torch.tensor([f.subsequence for f in features],
                                       dtype=torch.float)
        all_constituent = torch.tensor([f.constituent for f in features],
                                       dtype=torch.float)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids,\
                all_h_ids, all_h_masks, all_p_ids, all_p_masks, all_have_overlap, all_overlap_rate,\
                all_subsequence, all_constituent)

    elif args.rubi or args.hypothesis_only or args.focal_loss or args.poe_loss:
        # Hypothesis representations.
        all_h_ids = torch.tensor([f.h_ids for f in features], dtype=torch.long)
        all_h_masks = torch.tensor([f.input_mask_h for f in features],
                                   dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids,\
            all_h_ids, all_h_masks)
    else:
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_label_ids)

    return dataset, processor.get_labels(), processor.num_classes