import json
import os
import sys
from os.path import isfile

import numpy as np
import torch
from torch.utils.data import TensorDataset

# Project-level helpers such as convert_examples_to_features, processors,
# output_modes, logger, model, tokenizer, configs, and classes_list are
# assumed to be defined elsewhere in the surrounding codebase.


def load_dataset(main_file, args, processor, tokenizer, output_mode, data_type=None):
    # Prepare the data loader inputs; tensors are cached on disk next to
    # `main_file` so repeated runs can skip feature extraction.
    id_file = main_file + "_ids_nocls.pt"
    mask_file = main_file + "_mask_nocls.pt"
    label_file = main_file + "_labels.pt"

    # Case 1: all three tensor files already exist on disk.
    file_exist_count = sum([isfile(id_file), isfile(label_file), isfile(mask_file)])
    if 0 < file_exist_count < 3:
        sys.exit("Only part of the data is saved as tensor files. "
                 "Delete those files and try again.")
    elif file_exist_count == 3:
        all_input_ids = torch.load(id_file)
        all_masks = torch.load(mask_file)
        all_label_ids = torch.load(label_file)
        return TensorDataset(all_input_ids, all_masks, all_label_ids)

    # Case 2: build features from the raw examples.
    if data_type == "train":
        features = convert_examples_to_features(
            processor.get_train_examples(args.data_dir),
            processor.get_labels(),
            tokenizer,
            args.max_tokens)
    elif data_type == "test":
        # Note: the dev split serves as the test set here.
        features = convert_examples_to_features(
            processor.get_dev_examples(args.data_dir),
            processor.get_labels(),
            tokenizer,
            args.max_tokens)
    elif data_type == "val":
        features = convert_examples_to_features(
            processor.get_val_examples(args.data_dir),
            processor.get_labels(),
            tokenizer,
            args.max_tokens)
    else:
        sys.exit(f"Invalid data_type: {data_type!r}")

    # Save the features as tensor files, then recurse so the cache branch
    # above builds and returns the TensorDataset.
    torch.save(torch.tensor([f.input_ids for f in features], dtype=torch.long), id_file)
    torch.save(torch.tensor([f.input_mask for f in features], dtype=torch.long), mask_file)
    torch.save(torch.tensor([f.label_id for f in features], dtype=torch.long), label_file)
    return load_dataset(main_file, args, processor, tokenizer, output_mode, data_type=data_type)
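
load_dataset returns a TensorDataset on both paths, so callers can wrap it in a DataLoader directly. A minimal usage sketch, assuming args, processor, tokenizer, and output_mode come from the surrounding training script; the cache prefix "cache/train" and args.train_batch_size are hypothetical:

from torch.utils.data import DataLoader, RandomSampler

train_data = load_dataset("cache/train", args, processor, tokenizer,
                          output_mode, data_type="train")
train_loader = DataLoader(train_data,
                          sampler=RandomSampler(train_data),
                          batch_size=args.train_batch_size)  # hypothetical arg
for input_ids, masks, label_ids in train_loader:
    break  # one batch, just to show the tensor layout
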
def run(data):
    data_loaded = json.loads(data)
    input_sentence = data_loaded["input"]
    input_example = ("PREDICT_0", input_sentence, "None")
    max_seq_len = configs["max_sequence_length"]
    feats = convert_examples_to_features([input_example], classes_list,
                                         max_seq_len, tokenizer)[0]
    feats = [torch.tensor(x).unsqueeze(0) for x in feats]
    model_out = model(input_ids=feats[0],
                      attention_mask=feats[1],
                      token_type_ids=feats[2],
                      class_label_ids=None,
                      input_ids_masked=feats[4])
    logits_softmaxed = torch.nn.functional.softmax(
        model_out[1][0], dim=-1).detach().cpu().numpy()
    return {"country_prediction": str(classes_list[np.argmax(logits_softmaxed)]),
            "province_prediction": "None"}
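
run is written as an inference-service entry point: it takes a JSON string with an "input" key and returns a dict of predictions. A minimal call sketch; the input sentence is purely illustrative:

payload = json.dumps({"input": "Aromas of blackberry and pepper, full-bodied."})
prediction = run(payload)
print(prediction["country_prediction"], prediction["province_prediction"])
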
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir, 'cached_{}_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length), str(task)))
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        examples = processor.get_dev_examples(
            args.data_dir) if evaluate else processor.get_train_examples(
                args.data_dir)
        features = convert_examples_to_features(
            examples,
            label_list,
            args.max_seq_length,
            tokenizer,
            output_mode,
            cls_token_at_end=bool(args.model_type in ['xlnet']),  # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 1,
            pad_on_left=bool(
                args.model_type in ['xlnet']),  # pad on the left for xlnet
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.float)
    else:
        raise ValueError(f"Unsupported output_mode: {output_mode}")

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_label_ids)
    return dataset
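
A typical call pattern for the loader above, assuming the task key ("mrpc" here is only an example) is registered in processors and output_modes:

train_dataset = load_and_cache_examples(args, "mrpc", tokenizer, evaluate=False)
eval_dataset = load_and_cache_examples(args, "mrpc", tokenizer, evaluate=True)
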
def load_and_cache_examples(args, folder, task, tokenizer):
    processor = processors[task]()
    output_mode = output_modes[task]

    cached_features_file = os.path.join(
        folder, 'cached_{}_{}_{}_{}_{}'.format(
            'test',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length), str(task),
            str(args.similarity_threshold)))

    if os.path.exists(cached_features_file):
        features = torch.load(cached_features_file)
    else:
        label_list = processor.get_labels()
        examples = torch.load(folder + '/example')
        features = convert_examples_to_features(
            examples,
            label_list,
            args.max_seq_length,
            tokenizer,
            output_mode,
            cls_token_at_end=bool(args.model_type in ['xlnet']),  # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 1,
            pad_on_left=bool(
                args.model_type in ['xlnet']),  # pad on the left for xlnet
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.float)
    else:
        raise ValueError(f"Unsupported output_mode: {output_mode}")

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_label_ids)
    return dataset
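
Unlike the previous loader, this variant never touches the raw dataset files: it expects the examples to have been serialized to <folder>/example beforehand, e.g. by a perturbation or attack script. A sketch of that contract, with the folder name hypothetical:

examples = processor.get_dev_examples(args.data_dir)
os.makedirs("perturbed_run", exist_ok=True)
torch.save(examples, os.path.join("perturbed_run", "example"))
test_dataset = load_and_cache_examples(args, "perturbed_run", task, tokenizer)
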
def return_logits(in_example):
    input_example = ("PREDICT_0", in_example, "None")
    max_seq_len = configs["max_sequence_length"]
    feats = convert_examples_to_features([input_example], classes_list,
                                         max_seq_len, tokenizer)[0]
    feats = [torch.tensor(x).unsqueeze(0) for x in feats]
    model_out = model(input_ids=feats[0],
                      attention_mask=feats[1],
                      token_type_ids=feats[2],
                      class_label_ids=None,
                      input_ids_masked=feats[4])
    logits_softmaxed = torch.nn.functional.softmax(
        model_out[1][0], dim=-1).detach().cpu().numpy()
    return logits_softmaxed
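
Because return_logits yields a softmax distribution aligned with classes_list, ranking the top predictions takes only a couple of lines. A sketch with an illustrative input:

probs = return_logits("A crisp white with citrus and mineral notes")
top3 = np.argsort(probs)[::-1][:3]
for idx in top3:
    print(classes_list[idx], float(probs[idx]))
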
def load_and_cache_examples_randomized(args,
                                       task,
                                       tokenizer,
                                       random_smooth,
                                       epoch,
                                       evaluate=False):
    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir, 'cached_{}_{}_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length), str(task),
            str(args.similarity_threshold)))

    if os.path.exists(cached_features_file + '_' + str(epoch)):
        print('Randomized dataset: cached features exist')
        logger.info("Loading features from cached file %s",
                    cached_features_file + '_' + str(epoch))
        features = torch.load(cached_features_file + '_' + str(epoch))

    else:
        print('Randomized dataset: cached features do not exist')
        if os.path.exists(cached_features_file + '_example'):
            print('Randomized dataset: cached examples exist')
            examples = torch.load(cached_features_file + '_example')
        else:
            print('Randomized dataset: cached examples do not exist')
            logger.info("Creating features from dataset file at %s",
                        args.data_dir)
            examples = processor.get_dev_examples(
                args.data_dir) if evaluate else processor.get_train_examples(
                    args.data_dir)

            # Save the examples before perturbing them.
            logger.info("Saving examples into cached file %s",
                        cached_features_file + '_example')
            torch.save(examples, cached_features_file + '_example')

        for example in examples:
            if example.text_a:
                example.text_a = str(
                    random_smooth.get_perturbed_batch(
                        np.array([[example.text_a]]))[0][0])
            if example.text_b:
                example.text_b = str(
                    random_smooth.get_perturbed_batch(
                        np.array([[example.text_b]]))[0][0])

        label_list = processor.get_labels()
        features = convert_examples_to_features(
            examples,
            label_list,
            args.max_seq_length,
            tokenizer,
            output_mode,
            cls_token_at_end=bool(args.model_type in ['xlnet']
                                  ),  # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 1,
            pad_on_left=bool(
                args.model_type in ['xlnet']),  # pad on the left for xlnet
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s",
                        cached_features_file + '_' + str(epoch))
            torch.save(features, cached_features_file + '_' + str(epoch))

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.float)
    else:
        raise ValueError(f"Unsupported output_mode: {output_mode}")

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_label_ids)
    return dataset
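
The epoch number is baked into the cache key, so each epoch can train on an independently perturbed copy of the data while still caching it for reruns. A sketch of the intended training loop; args.num_train_epochs is assumed to exist in the surrounding script:

for epoch in range(int(args.num_train_epochs)):
    train_dataset = load_and_cache_examples_randomized(
        args, task, tokenizer, random_smooth, epoch, evaluate=False)
    # ... wrap train_dataset in a DataLoader and run one epoch of training ...
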