tokenizer,
		label_list=label_list,
		max_length=args.max_seq_length,
		output_mode=output_mode
	)

	# Convert to Tensors and build dataset
	all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
	all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
	all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
	if output_mode == "classification":
		all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
	elif output_mode == "regression":
		all_labels = torch.tensor([f.label for f in features], dtype=torch.float)

	dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
	#dataset = TensorDataset(all_input_ids[0:data_size], all_attention_mask[0:data_size], all_token_type_ids[0:data_size], all_labels[0:data_size])
	return dataset, all_labels


def main():
	"""Entry point: obtain the run arguments and configure root logging.

	NOTE(review): only the argument parsing and logging setup are visible
	in this snippet; ``args`` is not used by any statement shown here, so
	the remainder of the function is presumably missing -- confirm against
	the full script.
	"""
	# parse_args() is defined elsewhere in the file; its return value is
	# kept in ``args``.
	args = parse_args()

	# Setup logging: timestamped records at INFO level and above.
	logging.basicConfig(
		format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
		datefmt="%m/%d/%Y %H:%M:%S",
		level=logging.INFO,
	)
	
	# multi-gpu training (should be after apex fp16 initialization)
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    """Load (or build and cache) the features for ``task`` as a ``TensorDataset``.

    Features are read from a cache file inside ``args.data_dir`` when that
    file exists and ``args.overwrite_cache`` is false.  Otherwise they are
    built with ``convert_examples_to_features`` from the processor's dev
    examples (``evaluate=True``) or train examples (``evaluate=False``) and
    written back to the cache by the process whose ``args.local_rank`` is
    -1 or 0.

    Args:
        args: Parsed run arguments.  Reads ``local_rank``, ``data_dir``,
            ``model_name_or_path``, ``max_seq_length``, ``overwrite_cache``,
            ``model_type`` and ``sst5``.
        task: Key into the module-level ``processors`` / ``output_modes``
            tables.
        tokenizer: Tokenizer handed to ``convert_examples_to_features``.
            When features are rebuilt and ``args.model_type == 'gpt2'`` its
            ``pad_token`` attribute is overwritten with ``'[PAD]'``.
        evaluate: Select the dev split instead of the train split.

    Returns:
        ``TensorDataset(input_ids, attention_mask, token_type_ids, labels)``.

    Raises:
        ValueError: If the task's output mode is neither
            ``"classification"`` nor ``"regression"``.
    """
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes
        # the dataset; the others wait here and then use the cache.
        torch.distributed.barrier()

    processor = processors[task]()
    output_mode = output_modes[task]

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        "cached_{}_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
            str(task),
        ),
    )
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        # NOTE: torch.load unpickles the file; only load caches written by
        # this script from a trusted data directory.
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        if task in ["mnli", "mnli-mm"] and args.model_type in ["roberta", "xlmroberta"]:
            # HACK(label indices are swapped in RoBERTa pretrained model)
            label_list[1], label_list[2] = label_list[2], label_list[1]
        examples = (
            processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
        )

        if args.model_type == 'gpt2':  # setting pad token for GPT-2
            tokenizer.pad_token = '[PAD]'

        if args.sst5:
            # Replaces whatever label set the processor reported.
            label_list = ['0', '1', '2', '3', '4']

        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=args.max_seq_length,
            output_mode=output_mode,
            pad_on_left=bool(args.model_type in ["xlnet"]),  # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # Rank 0 has finished writing the cache: release the waiting ranks.
        torch.distributed.barrier()

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
    else:
        # Previously this fell through and crashed below with an unbound
        # ``all_labels``; fail with a message that names the real problem.
        raise ValueError(
            "Unsupported output mode {!r} for task {!r}".format(output_mode, task)
        )

    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset
# Example #3
# 0
def _last_marker_position(input_ids, first_id, second_id, scan_limit, default):
    """Return the last index ``i < scan_limit`` at which ``first_id`` is
    immediately followed by ``second_id`` in ``input_ids``, or ``default``
    when the pair never occurs."""
    position = default
    # Stop one short of the end of the sequence so that ``input_ids[i + 1]``
    # is always a valid index.
    for i in range(min(scan_limit, len(input_ids) - 1)):
        if input_ids[i] == first_id and input_ids[i + 1] == second_id:
            position = i
    return position


def load_and_cache_examples(args,
                            task,
                            tokenizer,
                            desc_tokenizer,
                            evaluate=False):
    """Build the dataset of sentence, entity-distance and description features.

    Three feature sets are loaded from cache files inside ``args.data_dir``
    (or rebuilt with ``convert_examples_to_features`` and cached by the
    process whose ``args.local_rank`` is -1 or 0): the sentence features for
    ``task`` and one set of description features for each of drug 1 and
    drug 2, produced by ``processors['desc']``.

    For every sentence the position of the token pair ``drug`` + ``##1``
    (and ``drug`` + ``##2``) is located; when the pair occurs more than once
    the last occurrence wins, and ``args.max_seq_length - 1`` is used when it
    is absent.  The relative-distance tensors hold
    ``max_seq_length + j - position`` for each token index ``j``.

    Args:
        args: Parsed run arguments.  Reads ``local_rank``, ``data_dir``,
            ``model_name_or_path``, ``max_seq_length``,
            ``desc_max_seq_length``, ``overwrite_cache`` and ``model_type``.
        task: Key into the module-level ``processors`` / ``output_modes``
            tables.
        tokenizer: Tokenizer for the sentences; its ``vocab`` must contain
            ``'drug'``, ``'##1'`` and ``'##2'``.
        desc_tokenizer: Tokenizer for the drug descriptions.
        evaluate: Select the dev split instead of the train split.

    Returns:
        ``TensorDataset`` whose tensors are, in order: input ids, attention
        mask, token type ids, relative distance to entity 1, relative
        distance to entity 2, description-1 input ids / attention mask /
        token type ids, description-2 input ids / attention mask / token
        type ids, the running example index, and the labels.

    Raises:
        ValueError: If the task's output mode is neither
            ``"classification"`` nor ``"regression"``.
    """
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes
        # the dataset; the others wait here and then use the cache.
        torch.distributed.barrier()

    processor = processors[task]()
    output_mode = output_modes[task]
    split_name = 'dev' if evaluate else 'train'
    model_name = list(filter(None, args.model_name_or_path.split('/'))).pop()
    pad_on_left = bool(args.model_type in ['xlnet'])  # pad on the left for xlnet
    pad_token_segment_id = 4 if args.model_type in ['xlnet'] else 0

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir, 'cached_{}_{}_{}_{}'.format(
            split_name, model_name, str(args.max_seq_length), str(task)))
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        # NOTE: torch.load unpickles the file; only load caches written by
        # this script from a trusted data directory.
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
            # HACK(label indices are swapped in RoBERTa pretrained model)
            label_list[1], label_list[2] = label_list[2], label_list[1]
        if evaluate:
            examples = processor.get_dev_examples(args.data_dir)
        else:
            examples = processor.get_train_examples(args.data_dir)
        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=args.max_seq_length,
            output_mode=output_mode,
            pad_on_left=pad_on_left,
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=pad_token_segment_id,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(features, cached_features_file)

    # Drug descriptions: one feature set per drug mentioned in the sentence.
    desc_max_seq_length = args.desc_max_seq_length
    desc_processor = processors['desc']()
    all_desc_features = []
    for drug_indx in (1, 2):
        cached_desc_features_file = os.path.join(
            args.data_dir, 'cached_desc{}_{}_{}_{}_{}'.format(
                drug_indx, split_name, model_name, str(desc_max_seq_length),
                str(task)))
        if os.path.exists(
                cached_desc_features_file) and not args.overwrite_cache:
            logger.info(
                "Loading description of drug%s features from cached file %s",
                drug_indx, cached_desc_features_file)
            desc_features = torch.load(cached_desc_features_file)
        else:
            logger.info(
                "Creating description of drug%s features from dataset file at %s",
                drug_indx, args.data_dir)
            label_list = desc_processor.get_labels()
            if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
                # HACK(label indices are swapped in RoBERTa pretrained model)
                label_list[1], label_list[2] = label_list[2], label_list[1]
            if evaluate:
                desc_examples = desc_processor.get_dev_examples(
                    args.data_dir, drug_indx)
            else:
                desc_examples = desc_processor.get_train_examples(
                    args.data_dir, drug_indx)
            desc_features = convert_examples_to_features(
                desc_examples,
                desc_tokenizer,
                label_list=label_list,
                max_length=desc_max_seq_length,
                output_mode=output_mode,
                pad_on_left=pad_on_left,
                # The pad id must come from the tokenizer that encodes the
                # descriptions, not from the sentence tokenizer.
                pad_token=desc_tokenizer.convert_tokens_to_ids(
                    [desc_tokenizer.pad_token])[0],
                pad_token_segment_id=pad_token_segment_id,
            )
            if args.local_rank in [-1, 0]:
                logger.info(
                    "Saving description of drug%s features into cached file %s",
                    drug_indx, cached_desc_features_file)
                torch.save(desc_features, cached_desc_features_file)
        all_desc_features.append(desc_features)

    if args.local_rank == 0 and not evaluate:
        # Rank 0 has finished writing the caches: release the waiting ranks.
        torch.distributed.barrier()

    # Get position index of the two entity markers in every sentence.
    drug_id = tokenizer.vocab['drug']
    one_id = tokenizer.vocab['##1']
    two_id = tokenizer.vocab['##2']

    max_len = args.max_seq_length
    input_id_rows = [f.input_ids for f in features]
    all_entity1_pos = [
        _last_marker_position(row, drug_id, one_id, max_len, max_len - 1)
        for row in input_id_rows
    ]
    all_entity2_pos = [
        _last_marker_position(row, drug_id, two_id, max_len, max_len - 1)
        for row in input_id_rows
    ]

    # Offsetting by max_seq_length keeps every relative distance positive.
    range_list = list(range(max_len, 2 * max_len))
    all_relative_dist1 = torch.tensor([[x - e1 for x in range_list]
                                       for e1 in all_entity1_pos],
                                      dtype=torch.long)
    all_relative_dist2 = torch.tensor([[x - e2 for x in range_list]
                                       for e2 in all_entity2_pos],
                                      dtype=torch.long)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor(input_id_rows, dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features],
                                      dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features],
                                      dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features],
                                  dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features],
                                  dtype=torch.float)
    else:
        # Previously this fell through and crashed below with an unbound
        # ``all_labels``; fail with a message that names the real problem.
        raise ValueError(
            "Unsupported output mode {!r} for task {!r}".format(
                output_mode, task))

    desc1_features, desc2_features = all_desc_features
    all_desc1_ii = torch.tensor([f.input_ids for f in desc1_features],
                                dtype=torch.long)
    all_desc1_am = torch.tensor([f.attention_mask for f in desc1_features],
                                dtype=torch.long)
    all_desc1_tti = torch.tensor([f.token_type_ids for f in desc1_features],
                                 dtype=torch.long)
    all_desc2_ii = torch.tensor([f.input_ids for f in desc2_features],
                                dtype=torch.long)
    all_desc2_am = torch.tensor([f.attention_mask for f in desc2_features],
                                dtype=torch.long)
    all_desc2_tti = torch.tensor([f.token_type_ids for f in desc2_features],
                                 dtype=torch.long)

    # Fingerprint: the running index of each example.
    fingerprint_indices = torch.arange(len(features), dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_attention_mask,
                            all_token_type_ids, all_relative_dist1,
                            all_relative_dist2, all_desc1_ii, all_desc1_am,
                            all_desc1_tti, all_desc2_ii, all_desc2_am,
                            all_desc2_tti, fingerprint_indices, all_labels)
    return dataset
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--meta_path",
        default=None,
        type=str,
        required=False,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_test",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--predict_eval",
                        action='store_true',
                        help="Whether to predict eval set.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training",
        action='store_true',
        help="Rul evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--eval_steps", default=-1, type=int, help="")
    parser.add_argument("--lstm_hidden_size", default=300, type=int, help="")
    parser.add_argument("--lstm_layers", default=2, type=int, help="")
    parser.add_argument("--lstm_dropout", default=0.5, type=float, help="")

    parser.add_argument("--train_steps", default=-1, type=int, help="")
    parser.add_argument("--report_steps", default=-1, type=int, help="")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--split_num", default=3, type=int, help="text split")
    parser.add_argument('--logging_steps',
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps',
                        type=int,
                        default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action='store_true',
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="For distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="For distant debugging.")
    parser.add_argument("--freeze",
                        default=0,
                        type=int,
                        required=False,
                        help="freeze bert.")
    parser.add_argument("--not_do_eval_steps",
                        default=0.35,
                        type=float,
                        help="not_do_eval_steps.")
    args = parser.parse_args()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    try:
        os.makedirs(args.output_dir)
    except:
        pass

    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path,
                                              do_lower_case=args.do_lower_case)

    config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=3)

    # Prepare model
    model = BertForSequenceClassification_last2embedding_cls.from_pretrained(
        args.model_name_or_path, args, config=config)

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    if args.do_train:

        # Prepare data loader

        train_examples = read_examples(os.path.join(args.data_dir,
                                                    'train.csv'),
                                       is_training=True)
        train_features = convert_examples_to_features(train_examples,
                                                      tokenizer,
                                                      args.max_seq_length,
                                                      args.split_num, True)
        all_input_ids = torch.tensor(select_field(train_features, 'input_ids'),
                                     dtype=torch.long)
        all_input_mask = torch.tensor(select_field(train_features,
                                                   'input_mask'),
                                      dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(train_features,
                                                    'segment_ids'),
                                       dtype=torch.long)
        all_label = torch.tensor([f.label for f in train_features],
                                 dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size //
                                      args.gradient_accumulation_steps)

        num_train_optimization_steps = args.train_steps

        # Prepare optimizer

        param_optimizer = list(model.named_parameters())

        # hack to remove pooler, which is not used
        # thus it produce None grad that break apex
        param_optimizer = [n for n in param_optimizer]

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            args.weight_decay
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=args.warmup_steps,
                                         t_total=args.train_steps //
                                         args.gradient_accumulation_steps)

        global_step = 0

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        best_acc = 0
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        bar = tqdm(range(num_train_optimization_steps),
                   total=num_train_optimization_steps)
        train_dataloader = cycle(train_dataloader)

        # 先做一个eval
        for file in ['dev.csv']:
            inference_labels = []
            gold_labels = []
            inference_logits = []
            eval_examples = read_examples(os.path.join(args.data_dir, file),
                                          is_training=True)
            eval_features = convert_examples_to_features(
                eval_examples, tokenizer, args.max_seq_length, args.split_num,
                False)
            all_input_ids = torch.tensor(select_field(eval_features,
                                                      'input_ids'),
                                         dtype=torch.long)
            all_input_mask = torch.tensor(select_field(eval_features,
                                                       'input_mask'),
                                          dtype=torch.long)
            all_segment_ids = torch.tensor(select_field(
                eval_features, 'segment_ids'),
                                           dtype=torch.long)
            all_label = torch.tensor([f.label for f in eval_features],
                                     dtype=torch.long)

            eval_data = TensorDataset(all_input_ids, all_input_mask,
                                      all_segment_ids, all_label)

            logger.info("***** Running evaluation *****")
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)

            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data,
                                         sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)

            model.eval()
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    tmp_eval_loss, logits = model(input_ids=input_ids,
                                                  token_type_ids=segment_ids,
                                                  attention_mask=input_mask,
                                                  labels=label_ids)
                    # logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask)

                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                inference_labels.append(np.argmax(logits, axis=1))
                gold_labels.append(label_ids)
                inference_logits.append(logits)
                eval_loss += tmp_eval_loss.mean().item()
                nb_eval_examples += input_ids.size(0)
                nb_eval_steps += 1

            gold_labels = np.concatenate(gold_labels, 0)
            inference_logits = np.concatenate(inference_logits, 0)
            model.train()
            eval_loss = eval_loss / nb_eval_steps
            eval_accuracy = accuracy(inference_logits, gold_labels)

            result = {
                'eval_loss': eval_loss,
                'eval_F1': eval_accuracy,
                'global_step': global_step
            }

            output_eval_file = os.path.join(args.output_dir,
                                            "eval_results.txt")
            with open(output_eval_file, "a") as writer:
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
                writer.write('*' * 80)
                writer.write('\n')
            if eval_accuracy > best_acc and 'dev' in file:
                print("=" * 80)
                print("Best F1", eval_accuracy)
                print("Saving Model......")
                best_acc = eval_accuracy
                # Save a trained model
                model_to_save = model.module if hasattr(
                    model, 'module') else model  # Only save the model it-self
                output_model_file = os.path.join(args.output_dir,
                                                 "pytorch_model.bin")
                torch.save(model_to_save.state_dict(), output_model_file)
                print("=" * 80)
            else:
                print("=" * 80)

        model.train()

        for step in bar:
            batch = next(train_dataloader)
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss, _ = model(input_ids=input_ids,
                            token_type_ids=segment_ids,
                            attention_mask=input_mask,
                            labels=label_ids)
            nb_tr_examples += input_ids.size(0)
            del input_ids, input_mask, segment_ids, label_ids
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.fp16 and args.loss_scale != 1.0:
                loss = loss * args.loss_scale
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            tr_loss += loss.item()
            train_loss = round(
                tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1),
                4)
            bar.set_description("loss {}".format(train_loss))

            nb_tr_steps += 1

            if args.fp16:
                optimizer.backward(loss)
            else:

                loss.backward()

            if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    # modify learning rate with special warm up BERT uses
                    # if args.fp16 is False, BertAdam is used that handles this automatically
                    lr_this_step = args.learning_rate * warmup_linear.get_lr(
                        global_step, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1

            if (step + 1) % (args.eval_steps *
                             args.gradient_accumulation_steps) == 0:
                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                logger.info("***** Report result *****")
                logger.info("  %s = %s", 'global_step', str(global_step))
                logger.info("  %s = %s", 'train loss', str(train_loss))

            if args.do_eval and step > num_train_optimization_steps * args.not_do_eval_steps and (
                    step + 1) % (args.eval_steps *
                                 args.gradient_accumulation_steps) == 0:
                for file in ['dev.csv']:
                    inference_labels = []
                    gold_labels = []
                    inference_logits = []
                    eval_examples = read_examples(os.path.join(
                        args.data_dir, file),
                                                  is_training=True)
                    eval_features = convert_examples_to_features(
                        eval_examples, tokenizer, args.max_seq_length,
                        args.split_num, False)
                    all_input_ids = torch.tensor(select_field(
                        eval_features, 'input_ids'),
                                                 dtype=torch.long)
                    all_input_mask = torch.tensor(select_field(
                        eval_features, 'input_mask'),
                                                  dtype=torch.long)
                    all_segment_ids = torch.tensor(select_field(
                        eval_features, 'segment_ids'),
                                                   dtype=torch.long)
                    all_label = torch.tensor([f.label for f in eval_features],
                                             dtype=torch.long)

                    eval_data = TensorDataset(all_input_ids, all_input_mask,
                                              all_segment_ids, all_label)

                    logger.info("***** Running evaluation *****")
                    logger.info("  Num examples = %d", len(eval_examples))
                    logger.info("  Batch size = %d", args.eval_batch_size)

                    # Run prediction for full data
                    eval_sampler = SequentialSampler(eval_data)
                    eval_dataloader = DataLoader(
                        eval_data,
                        sampler=eval_sampler,
                        batch_size=args.eval_batch_size)

                    model.eval()
                    eval_loss, eval_accuracy = 0, 0
                    nb_eval_steps, nb_eval_examples = 0, 0
                    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                        input_ids = input_ids.to(device)
                        input_mask = input_mask.to(device)
                        segment_ids = segment_ids.to(device)
                        label_ids = label_ids.to(device)

                        with torch.no_grad():
                            tmp_eval_loss, logits = model(
                                input_ids=input_ids,
                                token_type_ids=segment_ids,
                                attention_mask=input_mask,
                                labels=label_ids)
                            # logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask)

                        logits = logits.detach().cpu().numpy()
                        label_ids = label_ids.to('cpu').numpy()
                        inference_labels.append(np.argmax(logits, axis=1))
                        gold_labels.append(label_ids)
                        inference_logits.append(logits)
                        eval_loss += tmp_eval_loss.mean().item()
                        nb_eval_examples += input_ids.size(0)
                        nb_eval_steps += 1

                    gold_labels = np.concatenate(gold_labels, 0)
                    inference_logits = np.concatenate(inference_logits, 0)
                    model.train()
                    eval_loss = eval_loss / nb_eval_steps
                    eval_accuracy = accuracy(inference_logits, gold_labels)

                    result = {
                        'eval_loss': eval_loss,
                        'eval_F1': eval_accuracy,
                        'global_step': global_step,
                        'loss': train_loss
                    }

                    output_eval_file = os.path.join(args.output_dir,
                                                    "eval_results.txt")
                    with open(output_eval_file, "a") as writer:
                        for key in sorted(result.keys()):
                            logger.info("  %s = %s", key, str(result[key]))
                            writer.write("%s = %s\n" % (key, str(result[key])))
                        writer.write('*' * 80)
                        writer.write('\n')
                    if eval_accuracy > best_acc and 'dev' in file:
                        print("=" * 80)
                        print("Best F1", eval_accuracy)
                        print("Saving Model......")
                        best_acc = eval_accuracy
                        # Save a trained model
                        model_to_save = model.module if hasattr(
                            model,
                            'module') else model  # Only save the model it-self
                        output_model_file = os.path.join(
                            args.output_dir, "pytorch_model.bin")
                        torch.save(model_to_save.state_dict(),
                                   output_model_file)
                        print("=" * 80)
                    else:
                        print("=" * 80)
    if args.do_test:
        del model
        gc.collect()
        args.do_train = False
        model = BertForSequenceClassification_last2embedding_cls.from_pretrained(
            os.path.join(args.output_dir, "pytorch_model.bin"),
            args,
            config=config)
        if args.fp16:
            model.half()
        model.to(device)
        if args.local_rank != -1:
            try:
                from apex.parallel import DistributedDataParallel as DDP
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            model = DDP(model)
        elif args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        for file, flag in [('dev.csv', 'dev'), ('test.csv', 'test')]:
            inference_labels = []
            gold_labels = []
            eval_examples = read_examples(os.path.join(args.data_dir, file),
                                          is_training=False)
            eval_features = convert_examples_to_features(
                eval_examples, tokenizer, args.max_seq_length, args.split_num,
                False)
            all_input_ids = torch.tensor(select_field(eval_features,
                                                      'input_ids'),
                                         dtype=torch.long)
            all_input_mask = torch.tensor(select_field(eval_features,
                                                       'input_mask'),
                                          dtype=torch.long)
            all_segment_ids = torch.tensor(select_field(
                eval_features, 'segment_ids'),
                                           dtype=torch.long)
            all_label = torch.tensor([f.label for f in eval_features],
                                     dtype=torch.long)

            eval_data = TensorDataset(all_input_ids, all_input_mask,
                                      all_segment_ids, all_label)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data,
                                         sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)

            model.eval()
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    logits = model(
                        input_ids=input_ids,
                        token_type_ids=segment_ids,
                        attention_mask=input_mask).detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                inference_labels.append(logits)
                gold_labels.append(label_ids)
            gold_labels = np.concatenate(gold_labels, 0)
            logits = np.concatenate(inference_labels, 0)
            print(flag, accuracy(logits, gold_labels))
            if flag == 'test':
                df = pd.read_csv(os.path.join(args.data_dir, file))
                df['label_0'] = logits[:, 0]
                df['label_1'] = logits[:, 1]
                df['label_2'] = logits[:, 2]
                df[['id', 'label_0', 'label_1',
                    'label_2']].to_csv(os.path.join(args.output_dir,
                                                    "sub.csv"),
                                       index=False)
            if flag == 'dev':
                df = pd.read_csv(os.path.join(args.data_dir, file))
                df['label_0'] = logits[:, 0]
                df['label_1'] = logits[:, 1]
                df['label_2'] = logits[:, 2]
                df[['id', 'label_0', 'label_1',
                    'label_2']].to_csv(os.path.join(args.output_dir,
                                                    "sub_dev.csv"),
                                       index=False)

    if args.predict_eval:
        del model
        gc.collect()
        args.do_train = False
        model = BertForSequenceClassification_last2embedding_cls.from_pretrained(
            os.path.join(args.output_dir, "pytorch_model.bin"),
            args,
            config=config)
        if args.fp16:
            model.half()
        model.to(device)
        if args.local_rank != -1:
            try:
                from apex.parallel import DistributedDataParallel as DDP
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            model = DDP(model)
        elif args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        for file, flag in [('dev.csv', 'dev')]:
            inference_labels = []
            gold_labels = []
            eval_examples = read_examples(os.path.join(args.data_dir, file),
                                          is_training=False)
            eval_features = convert_examples_to_features(
                eval_examples, tokenizer, args.max_seq_length, args.split_num,
                False)
            all_input_ids = torch.tensor(select_field(eval_features,
                                                      'input_ids'),
                                         dtype=torch.long)
            all_input_mask = torch.tensor(select_field(eval_features,
                                                       'input_mask'),
                                          dtype=torch.long)
            all_segment_ids = torch.tensor(select_field(
                eval_features, 'segment_ids'),
                                           dtype=torch.long)
            all_label = torch.tensor([f.label for f in eval_features],
                                     dtype=torch.long)

            eval_data = TensorDataset(all_input_ids, all_input_mask,
                                      all_segment_ids, all_label)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data,
                                         sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)

            model.eval()
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    logits = model(
                        input_ids=input_ids,
                        token_type_ids=segment_ids,
                        attention_mask=input_mask).detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                inference_labels.append(logits)
                gold_labels.append(label_ids)
            gold_labels = np.concatenate(gold_labels, 0)
            logits = np.concatenate(inference_labels, 0)
            print(flag, accuracy(logits, gold_labels))
            if flag == 'dev':
                df = pd.read_csv(os.path.join(args.data_dir, file))
                df['label_0'] = logits[:, 0]
                df['label_1'] = logits[:, 1]
                df['label_2'] = logits[:, 2]
                df[['id', 'label_0', 'label_1',
                    'label_2']].to_csv(os.path.join(args.output_dir,
                                                    "sub_dev.csv"),
                                       index=False)
def main():
    """Rank documents per query with a BERT-style model and write the rankings to JSON.

    Steps performed:
      1. Parse command-line arguments.
      2. Select the device, initialise the distributed backend when
         ``--local_rank`` is set, and seed ``random``, NumPy and torch.
      3. Build the tokenizer/model pair selected by ``--model_type`` and load
         the weights stored in ``<model_path>/<model_file>``.
      4. For every query returned by ``processor.make_eval_set``, run the
         model over that query's examples and collect the logits.
      5. Order document ids by column 1 of ``softmax(logits)`` (descending)
         and dump ``{query: [doc_id, ...]}`` to a JSON file in ``--output_dir``.

    Raises:
        ValueError: if ``--model_type`` is not one of the supported values.

    NOTE(review): the original also tried to build a ``results_logit`` ranking,
    but that dict was never filled, so reading ``results_logit[query]`` raised
    ``KeyError`` on the first query. It was never written to disk either, so
    the dead code has been removed rather than guessed at.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--dataset_name",
                        default="top300_kl",
                        type=str,
                        required=True,
                        help="The name of dataset to inference (without extention ex) top300_kl)")
    parser.add_argument("--model_type",
                        default="baseline_tfidf",
                        type=str,
                        required=True,
                        help="baseline, baseline_tfidf, ir-v0, ir-v1")
    parser.add_argument("--model_path",
                        default=None,
                        type=str,
                        required=True,
                        help="path to model dir")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="save_path")

    ## Other parameters
    parser.add_argument("--bert_model",
                        default="bert-base-multilingual-cased",
                        type=str,
                        help="Default: bert-base-multilingual-cased"
                             "Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--model_file",
                        default="pytorch_model.bin",
                        type=str,
                        help="The file of model (.bin), default is pytorhc_model.bin,\n"
                             "특정 파일이 필요시 이름 설정 필요")
    parser.add_argument("--max_seq_length",
                        default=384,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    processor = IRProcessor()
    label_list = processor.get_labels()

    # Join the directory and file name properly; the original concatenated the
    # two strings, which only worked when model_path ended with a separator.
    config_path = os.path.join(args.model_path, "bert_config.json")
    weights_path = os.path.join(args.model_path, args.model_file)

    print("model:", args.model_type)
    if args.model_type == "baseline":  # finetuned baseline on IR
        tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=False)
        model = BertForPreTraining(BertConfig(config_path))
    elif args.model_type == "baseline_tfidf":
        tokenizer = BertTFIDFTokenizer.from_pretrained(args.bert_model, do_lower_case=False, do_tf_idf=True)
        model = modeling.BertTFIDFForPreTraining(modeling.BertConfig(config_path))
    elif args.model_type == "ir-v0":  # *-head model
        tokenizer = BertTFIDFTokenizer.from_pretrained(args.bert_model, do_lower_case=False, do_tf_idf=True)
        model = modeling_ir.BertForIRForPreTraining(modeling_ir.BertForIRConfig(config_path))
    elif args.model_type == "ir-v1":  # *-head model
        tokenizer = BertTFIDFTokenizer.from_pretrained(args.bert_model, do_lower_case=False, do_tf_idf=True)
        model = modeling_ir_2.BertForIRForPreTraining(modeling_ir_2.BertForIRConfig(config_path))
    else:
        # Previously an unknown value fell through and crashed later with an
        # unbound `model`; fail early with an actionable message instead.
        raise ValueError(
            "Unknown --model_type {!r}; expected one of: "
            "baseline, baseline_tfidf, ir-v0, ir-v1".format(args.model_type))

    # Load onto the CPU first so checkpoints saved on a GPU can still be read
    # on a CPU-only machine; the model is moved to `device` right below.
    model.load_state_dict(torch.load(weights_path, map_location="cpu"))

    if args.fp16:
        model.half()
    model.to(device)
    model.eval()

    tfidf_dict = pickle_load(os.path.join(args.data_dir, args.dataset_name + '_tfidf.pkl'))

    results_softmax = dict()

    eval_set, documents, queries = processor.make_eval_set(args.data_dir, args.dataset_name)
    logger.info("***** Running evaluation *****")
    logger.info("  Batch size = %d", args.eval_batch_size)
    for q_num, query in tqdm(enumerate(queries), total=len(queries), desc="Evaluating"):
        logger.info("Current Query Num : %s", q_num)
        eval_examples = processor._create_examples(eval_set, query, documents)

        # One array of logits per batch; concatenated once after the loop
        # instead of re-allocating with np.append on every batch.
        batch_logits = []

        if args.model_type == "baseline":  # baseline or baseline_finetuned
            eval_features = convert_examples_to_features_for_vanilla(
                eval_examples, label_list, args.max_seq_length, tokenizer)
            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label for f in eval_features], dtype=torch.long)
            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
            eval_dataloader = DataLoader(eval_data,
                                         sampler=SequentialSampler(eval_data),
                                         batch_size=args.eval_batch_size)

            # Labels are part of the dataset but are not needed for scoring.
            for input_ids, input_mask, segment_ids, _label_ids in tqdm(eval_dataloader, desc="Query"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)

                with torch.no_grad():
                    _, logits = model(input_ids, segment_ids, input_mask)
                batch_logits.append(logits.detach().cpu().numpy())
        else:  # baseline_tfidf or *-head model
            eval_data = LazyDatasetClassifier(eval_examples, label_list, args.max_seq_length, tokenizer, tfidf_dict)
            eval_dataloader = DataLoader(eval_data,
                                         sampler=SequentialSampler(eval_data),
                                         batch_size=args.eval_batch_size)

            for batch in tqdm(eval_dataloader, desc="Query"):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_weights, input_mask, segment_ids, _label_ids = batch

                with torch.no_grad():
                    _, logits = model(input_ids, input_weights, segment_ids, input_mask)
                batch_logits.append(logits.detach().cpu().numpy())

        if not batch_logits:
            # No examples for this query: record an empty ranking rather than
            # crashing on an empty prediction list.
            logger.warning("No examples produced for query %s; storing an empty ranking", q_num)
            results_softmax[query] = []
            continue

        preds = np.concatenate(batch_logits, axis=0)

        # Prediction i is paired with the i-th key of `documents`; build the key
        # list once per query instead of once per prediction.
        doc_ids = list(documents.keys())
        scored = [
            {"score": pred[1], "doc_id": doc_ids[i]}
            for i, pred in enumerate(softmax(preds))  # using softmax
        ]
        scored.sort(reverse=True, key=lambda x: x["score"])
        results_softmax[query] = [pair["doc_id"] for pair in scored]

    save_name2 = args.model_path.split('/')[0] + '_' + args.model_file.split('.')[0] \
                 + '_' + args.dataset_name + '_output.json'
    path2 = os.path.join(args.output_dir, save_name2)

    with open(path2, 'w', encoding="utf8") as f:
        json.dump(results_softmax, f, indent=4, sort_keys=True, ensure_ascii=False)
Example #6
0
def plot_examples(examples, name):
    """Draw a batch of images as a single grid and save the figure to ``name``.

    Args:
        examples: tensor of images; values are clamped to the range [0, 1]
            before plotting. It may live on any device.
            # NOTE(review): presumably shaped (batch, channels, H, W) as
            # expected by ``make_grid`` -- confirm against the caller.
        name: used both as the plot title and as the output file path passed
            to ``canvas.print_figure``.
    """
    # Detach from the autograd graph and move to host memory: Tensor.numpy()
    # below only works for CPU tensors, so CUDA inputs used to raise here.
    clipped = torch.clamp(examples.detach().cpu(), 0, 1)
    image = make_grid(clipped)
    fig = Figure()
    canvas = backend.FigureCanvasAgg(fig)
    ax = fig.subplots()
    ax.set_title(name)
    # Move the first axis last before handing the array to imshow.
    ax.imshow(image.permute(1, 2, 0).numpy())
    canvas.print_figure(name)


# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Whatever torch.load returns for the training file is wrapped in a
# one-tensor TensorDataset and served in shuffled batches of 16.
# NOTE(review): presumably a single tensor of images -- confirm file contents.
train_file = 'data/train.pt'
dataset = TensorDataset(torch.load(train_file))
loader = DataLoader(dataset, batch_size=16, shuffle=True)
writer = SummaryWriter()

# Build the VAE with the decoder variant named by `decoder`, move it to
# `device`, and optimise all of its parameters with Adam.
decoder = 'sbd'
model = VAE(im_size=64, decoder=decoder)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

steps = 0
# Message template: two integer fields followed by MSE, KL and total values.
log = '[{:d}/{:d}] MSE: {:.6f}  KL: {:.6f}  Total: {:.6f}'
for epoch in range(100):
    print('Epoch {:d}'.format(epoch + 1))
    # Reset to zero at the start of every epoch; the code that updates these
    # values is not visible in this chunk.
    train_loss = 0
    train_mse = 0
    train_kl = 0
Example #7
0
    if args.deterministic:
        logging.info(
            "Running with deterministic sequence. Performance will be slower")
        torch.backends.cudnn.deterministic = True
        #         torch.backends.cudnn.enabled = False
        torch.backends.cudnn.benchmark = False

    #################################
    ####### Read data and precompute ######
    img = np.load(gdict['ip_fname'],
                  mmap_mode='r')[:gdict['num_imgs']].transpose(0, 1, 2,
                                                               3).copy()
    t_img = torch.from_numpy(img)
    logging.info("%s, %s" % (img.shape, t_img.shape))

    dataset = TensorDataset(t_img)
    dataloader = DataLoader(dataset,
                            batch_size=gdict['batchsize'],
                            shuffle=True,
                            num_workers=0,
                            drop_last=True)

    # Precompute metrics with validation data for computing losses
    with torch.no_grad():
        val_img = np.load(gdict['ip_fname'])[-3000:].transpose(0, 1, 2,
                                                               3).copy()
        t_val_img = torch.from_numpy(val_img).to(gdict['device'])

        # Precompute radial coordinates
        r, ind = f_get_rad(img)
        r = r.to(gdict['device'])
class processer():
    def __init__(self):
        pass

    def get_labels(self):
        """Return the label names as a fresh list: ``'0'`` and ``'1'``."""
        return [str(label_id) for label_id in range(2)]

    def read_txt(self, filename):
        """Open ``filename`` in text mode and return all of its lines as a list.

        Each returned line keeps its trailing line terminator, if it has one.
        """
        with open(filename, 'r') as handle:
            return list(handle)

    def create_examples(self, data, type):
        """Turn tab-separated lines into a list of ``InputExample`` objects.

        Args:
            data: iterable of text lines; each is split on tab characters.
            type: split name. When it equals ``'test'`` every example gets the
                placeholder label ``'0'``; otherwise the label is the fourth
                tab-separated field with newline characters removed.

        Returns:
            One ``InputExample`` per line, with ``text_a`` taken from the second
            field, ``text_b`` set to ``None`` and ``guid`` built from the line
            index and the raw line.
        """
        has_label = type != 'test'
        collected = []
        for index, row in enumerate(data):
            fields = row.split('\t')
            sentence = fields[1]
            if has_label:
                label = fields[3].replace('\n', '')
            else:
                label = '0'
            collected.append(
                InputExample(guid=f'{index}-{row}',
                             text_a=sentence,
                             text_b=None,
                             label=label))
        return collected

    def convert_examples_to_features(self,
                                     examples,
                                     tokenizer,
                                     max_length=512,
                                     label_list=None,
                                     output_mode=None,
                                     pad_on_left=False,
                                     pad_token=0,
                                     pad_token_segment_id=0,
                                     mask_padding_with_zero=True,
                                     split_num=4):
        """
        Loads a data file into a list of ``InputFeatures``
        Args:
            examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
            tokenizer: Instance of a tokenizer that will tokenize the examples
            max_length: Maximum example length
            task: CLUE task
            label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
            output_mode: String indicating the output mode. Either ``regression`` or ``classification``
            pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
            pad_token: Padding token
            pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
            mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
                and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
                actual values)

        Returns:
            If the input is a list of ``InputExamples``, will return
            a list of task-specific ``InputFeatures`` which can be fed to the model.

        """
        label_map = {label: i for i, label in enumerate(label_list)}
        features = []
        for (ex_index, example) in enumerate(examples):
            if ex_index % 10000 == 0:
                logger.info("Writing example %d" % (ex_index))

                #***对长文本进行切分,将切分后的每一个子句作为一个单独完整的句子,计算feature***
                split_text_length = int(len(example.text_a) / split_num)
                split_features = []

                for i in range(split_num):
                    split_text = example.text_a[split_text_length *
                                                i:split_text_length * (i + 1)]

                    inputs = tokenizer.encode_plus(split_text,
                                                   example.text_b,
                                                   add_special_tokens=True,
                                                   max_length=max_length)
                    input_ids, token_type_ids = inputs["input_ids"], inputs[
                        "token_type_ids"]

                    # The mask has 1 for real tokens and 0 for padding tokens. Only real
                    # tokens are attended to.
                    attention_mask = [1 if mask_padding_with_zero else 0
                                      ] * len(input_ids)
                    input_len = len(input_ids)
                    # Zero-pad up to the sequence length.
                    padding_length = max_length - len(input_ids)

                    if pad_on_left:
                        input_ids = ([pad_token] * padding_length) + input_ids
                        attention_mask = ([0 if mask_padding_with_zero else 1]
                                          * padding_length) + attention_mask
                        token_type_ids = ([pad_token_segment_id] *
                                          padding_length) + token_type_ids
                    else:
                        input_ids = input_ids + ([pad_token] * padding_length)
                        attention_mask = attention_mask + (
                            [0 if mask_padding_with_zero else 1] *
                            padding_length)
                        token_type_ids = token_type_ids + (
                            [pad_token_segment_id] * padding_length)

                    assert len(
                        input_ids
                    ) == max_length, "Error with input length {} vs {}".format(
                        len(input_ids), max_length)
                    assert len(
                        attention_mask
                    ) == max_length, "Error with input length {} vs {}".format(
                        len(attention_mask), max_length)
                    assert len(
                        token_type_ids
                    ) == max_length, "Error with input length {} vs {}".format(
                        len(token_type_ids), max_length)
                    if output_mode == "classification":
                        label = label_map[example.label]
                    elif output_mode == "regression":
                        label = float(example.label)
                    else:
                        raise KeyError(output_mode)

                    if ex_index < 5:
                        logger.info("*** Example ***")
                        logger.info("guid: %s" % (example.guid))
                        logger.info("input_ids: %s" %
                                    " ".join([str(x) for x in input_ids]))
                        logger.info("attention_mask: %s" %
                                    " ".join([str(x) for x in attention_mask]))
                        logger.info("token_type_ids: %s" %
                                    " ".join([str(x) for x in token_type_ids]))
                        logger.info("label: %s (id = %s)" %
                                    (example.label, label))
                        logger.info("input length: %d" % (input_len))

                    split_features.append(
                        InputFeatures(input_ids=input_ids,
                                      attention_mask=attention_mask,
                                      token_type_ids=token_type_ids,
                                      label=label,
                                      input_len=input_len)
                    )  #split_features中包含的就是split_num个子句的InputFeatures对象

                features.append(split_features)

        return features

    def create_dataset(self, features):
        """Flatten per-example split features into a single ``TensorDataset``.

        Args:
            features: Iterable of lists. Each inner list holds the feature
                objects produced for one example (one object per text split).
                Every object must expose ``input_ids``, ``attention_mask``,
                ``token_type_ids``, ``input_len`` and ``label``.

        Returns:
            A ``TensorDataset`` whose tensors are, in order: input ids,
            attention masks, token type ids, input lengths and labels. Each
            split is stored as its own row, so the dataset length equals the
            total number of split features across all examples.
        """
        features_input_ids = []
        features_attention_mask = []
        features_token_type_ids = []
        features_input_len = []
        features_label = []

        # Every split is treated as an independent sample, so the nested
        # (example -> splits) structure is flattened into plain row lists.
        for split_features in features:
            for feature in split_features:
                features_input_ids.append(feature.input_ids)
                features_attention_mask.append(feature.attention_mask)
                features_token_type_ids.append(feature.token_type_ids)
                features_input_len.append(feature.input_len)
                features_label.append(feature.label)

        # The last tensor carries the labels (the previous code passed the
        # attention mask a second time in this slot).
        dataset = TensorDataset(
            torch.tensor(features_input_ids),
            torch.tensor(features_attention_mask),
            torch.tensor(features_token_type_ids),
            torch.tensor(features_input_len),
            torch.tensor(features_label),
        )
        return dataset
def startAutoEncoderGrey(dataset):
    """Train a greyscale auto-encoder on the named dataset and save its weights.

    The function loads the training/test ``.npy`` files for ``dataset``, shows
    one sample image from each, builds an ``AutoEncoderWMGrey`` on ``cuda:0``
    and trains it for 4 epochs with MSE loss and Adamax (lr=4e-4). Training
    data is processed in chunks of roughly 1000 frames (batch size 1); after
    every chunk the model is run on a chunk of roughly 430 test frames (batch
    size 20). The median loss of each chunk is sent to the Visdom plotter and
    appended to the module-level ``myData*`` lists. Finally the model's
    ``state_dict`` is written to ``<evaluationsfolder>/Episode<episode>/ae.pt``.

    Args:
        dataset: Dataset name. Only 'SmallDataset_Q*Bert_Mixed_Greyscale' and
            'SmallDataset_SpaceInvaders_Greyscale' are handled; any other value
            leaves ``split``, ``datasetTrain``, ``datasetTest`` and ``title``
            unassigned, so the function fails at their first use.

    Side effects:
        Appends to the globals ``myDataXTrain``, ``myDataYTrain``,
        ``myDataXTest`` and ``myDataYTest``; rebinds the global ``plotter``;
        reads the globals ``episode`` and ``evaluationsfolder``; opens
        matplotlib windows; writes the model file.
    """
    global myDataYTrain
    global myDataXTrain
    global myDataYTest
    global myDataXTest
    # Select the data files. ``split`` records whether the training data is
    # spread over two files (datasetTrain and datasetTrain2).
    if (dataset == 'SmallDataset_Q*Bert_Mixed_Greyscale'):
        split = False
        datasetTrain = load_dataset(
            '/home/annika/BA-Datensaetze/SmallDataset_Q*Bert_Mixed_Greyscale/SmallDatasetTraining_Q*Bert_Mixed_Greyscale.npy'
        )
        datasetTest = load_dataset(
            '/home/annika/BA-Datensaetze/SmallDataset_Q*Bert_Mixed_Greyscale/SmallDatasetTest_Q*Bert_Mixed_Greyscale.npy'
        )
        title = 'Auto-Encoder mit SmallDatasetTest_Q*Bert_Mixed_Greyscale'

    elif (dataset == 'SmallDataset_SpaceInvaders_Greyscale'):
        split = True
        datasetTrain = load_dataset(
            '/home/annika/BA-Datensaetze/SmallDataset_SpaceInvaders_Greyscale/smallDatasetTraining1_SpaceInvaders_Greyscale.npy'
        )
        datasetTrain2 = load_dataset(
            '/home/annika/BA-Datensaetze/SmallDataset_SpaceInvaders_Greyscale/smallDatasetTraining2_SpaceInvaders_Greyscale.npy'
        )
        datasetTest = load_dataset(
            '/home/annika/BA-Datensaetze/SmallDataset_SpaceInvaders_Greyscale/smallDatasetTest_SpaceInvaders_Greyscale.npy'
        )
        title = 'Auto-Encoder mit SmallDataset_SpaceInvaders_Greyscale'

    # Visual sanity check: display frame 10 of the training and test sets.
    TrainingImage = Image.fromarray(datasetTrain[10])
    # firstImage.show()
    plt.imshow(TrainingImage, cmap=plt.get_cmap('gray'))
    plt.show()

    TestImage = Image.fromarray(datasetTest[10])
    # firstImage.show()
    plt.imshow(TestImage, cmap=plt.get_cmap('gray'))
    plt.show()

    # Model selection: alternative architectures are kept as commented-out
    # lines below.
    ae = AutoEncoderWMGrey()
    #ae = AutoEncoderMFGrey()
    #ae = AutoEncoderGrey()
    #ae = AutoEncoderVAEGrey()
    ae.to(torch.device("cuda:0"))
    print(ae)

    # define our optimizer and loss function
    loss_func = nn.MSELoss()
    optimizer = torch.optim.Adamax(ae.parameters(), lr=4e-4)

    # losses = []

    global plotter
    plotter = VisdomLinePlotter(env_name='Tutorial Plots')

    # Number of ~1000-frame chunks per epoch. ``firstIterationTrain`` is the
    # number of full chunks that come from the first training file.
    if (split):
        iterationsTrain = ((len(datasetTrain) + len(datasetTrain2)) // 1000)
        firstIterationTrain = (len(datasetTrain) // 1000)
    else:
        iterationsTrain = (len(datasetTrain) // 1000)
        firstIterationTrain = iterationsTrain
    # NOTE(review): ``rest`` is assigned here but never read in this function.
    # NOTE(review): the remainder check only looks at datasetTrain, also when
    # the training data is split over two files - confirm this is intended.
    rest = False
    if (len(datasetTrain) % 1000 != 0):
        iterationsTrain += 1
        rest = True

    # Reconstructions of the current test chunk; reset after every chunk.
    predictions = []

    epochs = 4

    for e in range(epochs):
        for i in range(iterationsTrain):
            # Global chunk counter across epochs; used as the plot x value.
            train_snippet = i + (e * iterationsTrain)
            losses = []
            startTrain = i * 1000
            # NOTE(review): slices are end-exclusive, so startTrain:stopTrain
            # yields 999 frames and the frame at index stopTrain is skipped.
            stopTrain = ((i + 1) * 1000) - 1
            if (split):
                if (i + 1 < firstIterationTrain):
                    trainSetSnippet = datasetTrain[startTrain:stopTrain, :, :]
                else:
                    if (i + 1 == firstIterationTrain):
                        # Last chunk of the first file: take everything left.
                        trainSetSnippet = datasetTrain[startTrain:, :, :]
                    else:
                        # Chunks from the second file: re-base the indices.
                        startTrain = (i - firstIterationTrain) * 1000
                        stopTrain = ((i - firstIterationTrain + 1) * 1000) - 1
                        if (i + 1 == iterationsTrain):
                            trainSetSnippet = datasetTrain2[startTrain:, :, :]
                        else:
                            trainSetSnippet = datasetTrain2[
                                startTrain:stopTrain, :, :]
            else:
                if (i + 1 < firstIterationTrain):
                    trainSetSnippet = datasetTrain[startTrain:stopTrain, :, :]
                else:
                    trainSetSnippet = datasetTrain[startTrain:, :, :]

            # Add a trailing channel axis (frames are reshaped to 210x160x1),
            # normalize, move to the GPU and reorder to (N, C, H, W).
            trainSetSnippet = trainSetSnippet.reshape(len(trainSetSnippet),
                                                      210, 160, 1)
            #print(trainSetSnippet.shape)
            trainSetSnippet = normalize(trainSetSnippet)
            trn_torch = torch.from_numpy(trainSetSnippet).type(
                torch.cuda.FloatTensor)
            trn_torch = trn_torch.permute(0, 3, 1, 2)
            trn_torch = trn_torch[:, :, :, :]
            # Input and target are the same tensor (auto-encoder setup).
            trn = TensorDataset(trn_torch, trn_torch)
            trn_dataloader = torch.utils.data.DataLoader(trn,
                                                         batch_size=1,
                                                         shuffle=False,
                                                         num_workers=0)

            # Matching test chunk for this training chunk (430-frame stride).
            # NOTE(review): same end-exclusive slice issue as for stopTrain.
            startTest = i * 430
            stopTest = ((i + 1) * 430) - 1
            if (i + 1 == iterationsTrain):
                testSetSnippet = datasetTest[startTest:, :, :]
            else:
                testSetSnippet = datasetTest[startTest:stopTest, :, :]

            testSetSnippet = testSetSnippet.reshape(len(testSetSnippet), 210,
                                                    160, 1)
            #print(testSetSnippet.shape)
            testSetSnippet = normalize(testSetSnippet)
            test_torch = torch.from_numpy(testSetSnippet).type(
                torch.cuda.FloatTensor)
            test_torch = test_torch.permute(0, 3, 1, 2)
            test_torch = test_torch[:, :, :, :]
            test = TensorDataset(test_torch, test_torch)
            test_dataloader = torch.utils.data.DataLoader(test,
                                                          batch_size=20,
                                                          shuffle=False,
                                                          num_workers=0)

            # Training pass over the current chunk, one frame per step.
            # last_loss = 1
            for batch_idx, (data, target) in enumerate(trn_dataloader):

                data = torch.autograd.Variable(data)

                optimizer.zero_grad()

                pred = ae(data)

                # Reconstruction loss against the input itself.
                loss = loss_func(pred, data)

                losses.append(loss.cpu().data.item())

                # Backpropagation
                loss.backward()

                optimizer.step()

                # Display
                if batch_idx % 25 == 1:
                    # NOTE(review): ``number`` and ``numberAll`` are computed
                    # but not used in the progress message below.
                    number = (((i + 1) * 1000))
                    if (i + 1 == iterationsTrain):
                        number = len(datasetTrain)

                    numberAll = number * (e + 1)
                    print(
                        '\r Images trained: {}/{} epochs: {}/{} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'
                        .format(i, iterationsTrain, e + 1, epochs,
                                batch_idx * len(data),
                                len(trn_dataloader.dataset),
                                100. * batch_idx / len(trn_dataloader),
                                loss.cpu().data.item()),
                        end='')

            # Record the median training loss of this chunk.
            # if(loss.cpu().data.item() <= last_loss):
            #    last_loss = loss.cpu().data.item()
            median_loss_train = statistics.median(losses)
            plotter.plot('loss', 'train', title, train_snippet + 1,
                         median_loss_train)
            # The very first loss value is stored as the x=0 data point.
            if (i == 0 and e == 0):
                myDataXTrain.append(0)
                myDataYTrain.append(losses[0])
            myDataXTrain.append(train_snippet + 1)
            myDataYTrain.append(median_loss_train)

            # NOTE(review): the model is switched to eval mode here and no
            # ae.train() call appears in this function, so all later training
            # chunks run in eval mode. The evaluation loop also runs without
            # torch.no_grad().
            ae.eval()

            loss_func_val = nn.MSELoss()
            losses_val = []
            for batch_idx, (data, target) in enumerate(test_dataloader):

                data = torch.autograd.Variable(data)

                pred = ae(data)

                # Keep every reconstruction so one can be displayed below.
                for prediction in pred:
                    predictions.append(prediction)

                loss_val = loss_func_val(pred, data)
                losses_val.append(loss_val.cpu().data.item())
            # Despite the label this prints the loss of the last test batch,
            # not a sum.
            print('\ntestLossSum = {}'.format(loss_val.cpu().data.item()))
            median_loss_test = statistics.median(losses_val)
            plotter.plot('loss', 'validation', title, train_snippet + 1,
                         median_loss_test)
            if (i == 0 and e == 0):
                myDataXTest.append(0)
                myDataYTest.append(losses_val[0])
            myDataXTest.append(train_snippet + 1)
            myDataYTest.append(median_loss_test)
            # On the last chunk of the first and the last epoch, show test
            # frame 2 next to its reconstruction.
            if ((i == (iterationsTrain - 1))
                    and (e == 0 or e == (epochs - 1))):
                # Back to (N, H, W, C) for display, scaled by 255.
                test_torch = test_torch.permute(0, 2, 3, 1)
                yay = torch.tensor([255],
                                   dtype=torch.int,
                                   device=torch.device("cuda:0"))
                testImg = test_torch[2] * yay
                show_torch_image_Grey(testImg.reshape(210, 160))

                # * torch.tensor([255,255,255])
                predImg = predictions[2].permute(
                    1, 2, 0).detach() * torch.tensor(
                        [255], dtype=torch.int, device=torch.device("cuda:0"))
                show_torch_image_Grey(predImg.reshape(210, 160))
                # Restore the (N, C, H, W) layout.
                test_torch = test_torch.permute(0, 3, 1, 2)
            predictions = []

    # Persist the trained weights under the current episode's folder.
    global episode
    global evaluationsfolder
    pathEvaluation = evaluationsfolder + "/" + 'Episode{}/ae.pt'.format(
        episode)
    torch.save(ae.state_dict(), pathEvaluation)
def main():
    """Extract per-token BERT layer activations and write them as JSON lines.

    Command-line arguments select the input text file, the output file, the
    pre-trained BERT model, which encoder layers to export (``--layers``,
    comma-separated indexes), the maximum sequence length, the batch size and
    the device / distributed settings.

    For every input example one JSON object is written to ``--output_file``
    with the keys ``"linex_index"`` (the feature's unique id) and
    ``"features"``: a list with one entry per token holding the ``"token"``
    and, under ``"layers"``, the requested layer ``"index"`` together with
    its activation ``"values"`` rounded to 6 decimals.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--input_file", default=None, type=str, required=True)
    parser.add_argument("--output_file", default=None, type=str, required=True)
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")

    ## Other parameters
    parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.")
    parser.add_argument("--layers", default="-1,-2,-3,-4", type=str)
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences longer "
                            "than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help = "local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {} distributed training: {}".format(device, n_gpu, bool(args.local_rank != -1)))

    layer_indexes = [int(x) for x in args.layers.split(",")]

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    examples = read_examples(args.input_file)

    features = convert_examples_to_features(
        examples=examples, seq_length=args.max_seq_length, tokenizer=tokenizer)

    model = BertModel.from_pretrained(args.bert_model)
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    # Position of each row in ``features``; lets a batch row be mapped back
    # to its feature object regardless of the sampler used.
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
    if args.local_rank == -1:
        eval_sampler = SequentialSampler(eval_data)
    else:
        eval_sampler = DistributedSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)

    model.eval()
    with open(args.output_file, "w", encoding='utf-8') as writer:
        for input_ids, input_mask, example_indices in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)

            # Inference only: do not build the autograd graph.
            with torch.no_grad():
                all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask)

            # Copy each requested layer to host memory once per batch instead
            # of once per token; the values read below are unchanged.
            layer_outputs = [
                all_encoder_layers[layer_index].detach().cpu().numpy()
                for layer_index in layer_indexes
            ]

            for b, example_index in enumerate(example_indices):
                feature = features[example_index.item()]
                unique_id = int(feature.unique_id)
                output_json = collections.OrderedDict()
                # The key spelling "linex_index" is part of the output format
                # and is kept as is.
                output_json["linex_index"] = unique_id
                all_out_features = []
                for (i, token) in enumerate(feature.tokens):
                    all_layers = []
                    for layer_index, layer_output in zip(layer_indexes, layer_outputs):
                        layers = collections.OrderedDict()
                        layers["index"] = layer_index
                        layers["values"] = [
                            round(x.item(), 6) for x in layer_output[b][i]
                        ]
                        all_layers.append(layers)
                    out_features = collections.OrderedDict()
                    out_features["token"] = token
                    out_features["layers"] = all_layers
                    all_out_features.append(out_features)
                output_json["features"] = all_out_features
                writer.write(json.dumps(output_json) + "\n")
Example #11
0
    print(x_validate0.shape)

    f = np.load('test_origin.npz')
    x_test0 = f['a']
    if testSizeEffect:
        sz = int(x_test0.shape[0] / 2)
        x_test0 = x_test0[:sz]
    print(x_test0.shape)
    return (x_train0, x_train, y_train, x_validate0, x_validate, y_validate,
            x_test0, x_test, y_test)


# Unpack everything loadData() returns into module-level names.
(
    x_train0, x_train, y_train,
    x_validate0, x_validate, y_validate,
    x_test0, x_test, y_test,
) = loadData()


def _as_sequential_loader(inputs, targets):
    """Return ``(dataset, loader)`` for one split: batches of ``seqLen`` rows, no shuffling."""
    paired = TensorDataset(Tensor(inputs), Tensor(targets))
    loader = torch.utils.data.DataLoader(paired,
                                         batch_size=seqLen,
                                         shuffle=False)
    return paired, loader


# One dataset/loader pair per split, created in train, validate, test order.
dataset_train, train_loader = _as_sequential_loader(x_train, y_train)
dataset_validate, validate_loader = _as_sequential_loader(x_validate, y_validate)
dataset_test, test_loader = _as_sequential_loader(x_test, y_test)

net = Net()
Example #12
0
def get_data(value, shape):
    """Return a tensor of ``shape`` filled with ``value`` plus squared Gaussian noise.

    The noise is ``torch.randn(shape) ** 2``, so it is never negative and
    every element of the result is at least ``value``.
    """
    noise = torch.randn(shape).pow(2)
    base = torch.ones(shape) * value
    base += noise
    return base


# Dataset: two groups of 100 samples shaped (1, 14, 14), generated by
# get_data around the values 0 and 0.5 and stacked along the first axis.
data = torch.cat(
    [get_data(center, (100, 1, 14, 14)) for center in (0, 0.5)], 0)
# Labels: 0 for the first group, 1 for the second.
labels = torch.cat([torch.zeros(100), torch.ones(100)], 0)
# Shuffled mini-batch generator over (sample, label) pairs.
gen = DataLoader(TensorDataset(data, labels), shuffle=True, batch_size=25)
# Network to train.
m = M()
# Loss function and optimizer.
loss = nn.NLLLoss()
optimizer = torch.optim.Adam(m.parameters())
# Training and logging settings.
num_epochs, embedding_log = 20, 5
writer = SummaryWriter(comment='mnist_embedding_training')

#TRAIN
for epoch in range(num_epochs):
    for j, sample in enumerate(gen):
        n_iter = (epoch * len(gen)) + j
        #reset grad
Example #13
0
                        pad_to_max_length = True,
                        return_attention_mask = True,  # Construct attn. masks.
                        return_tensors = 'pt',         # Return pytorch tensors.
                   )
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# Concatenate the collected per-sentence tensors along the first axis and
# turn the label list into a tensor.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Sanity check: show the first sentence next to its token ids.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

from torch.utils.data import TensorDataset, random_split

dataset = TensorDataset(input_ids, attention_masks, labels)

# 80/20 split: the validation set takes whatever the training set leaves.
train_size = int(len(dataset) * 0.8)
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('%d training samples' % train_size)
print('%d validation samples' % val_size)

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 32

train_dataloader = DataLoader(
            train_dataset,  
Example #14
0
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument("--vocab_file",
                        default='bert-base-uncased-vocab.txt',
                        type=str,
                        required=True)
    parser.add_argument("--model_file",
                        default='bert-base-uncased.tar.gz',
                        type=str,
                        required=True)
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model checkpoints and predictions will be written."
    )
    parser.add_argument(
        "--predict_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the predictions will be written.")
    parser.add_argument('--predict_output_file',
                        type=str,
                        default='predictions.json')
    parser.add_argument('--label_output_file',
                        type=str,
                        default='evidence_predictions.json')

    # Other parameters
    parser.add_argument("--train_file",
                        default=None,
                        type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument(
        "--predict_file",
        default=None,
        type=str,
        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json"
    )
    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help=
        "The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=2.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% "
        "of training.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help=
        "The total number of n-best predictions to generate in the nbest_predictions.json "
        "output file.")
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help=
        "The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.")
    parser.add_argument(
        "--verbose_logging",
        default=False,
        action='store_true',
        help=
        "If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        "--do_lower_case",
        default=True,
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        '--fp16',
        default=False,
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")

    # Base setting
    parser.add_argument('--pretrain', type=str, default=None)
    parser.add_argument('--max_ctx', type=int, default=2)
    parser.add_argument('--task_name', type=str, default='coqa_yesno')
    parser.add_argument('--bert_name', type=str, default='baseline')
    parser.add_argument('--reader_name', type=str, default='coqa')
    # model parameters
    parser.add_argument('--evidence_lambda', type=float, default=0.8)
    parser.add_argument('--negative_lambda', type=float, default=1.0)
    parser.add_argument('--add_entropy', default=False, action='store_true')
    parser.add_argument('--split_num', type=int, default=3)
    parser.add_argument('--split_index', type=int, default=0)
    # Parameters for running labeling model
    parser.add_argument('--do_label', default=False, action='store_true')
    parser.add_argument('--sentence_id_file', type=str, default=None)
    parser.add_argument('--weight_threshold', type=float, default=0.0)
    parser.add_argument('--label_threshold', type=float, default=0.0)
    # negative sample parameters
    parser.add_argument('--do_negative_sampling',
                        default=False,
                        action='store_true')
    parser.add_argument('--read_extra_self',
                        default=False,
                        action='store_true')
    parser.add_argument('--sample_ratio', type=float, default=0.5)
    parser.add_argument('--extra_sen_file', type=str, default=None)
    parser.add_argument('--multi_inputs', default=False, action='store_true')

    args = parser.parse_args()

    logger = setting_logger(args.output_dir)
    logger.info('================== Program start. ========================')

    # model parameters
    model_params = prepare_model_params(args)

    # read parameters
    read_params = prepare_read_params(args)

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")

    if args.do_train:
        if not args.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified."
            )

    if args.do_train:
        if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
            raise ValueError(
                "Output directory () already exists and is not empty.")
        os.makedirs(args.output_dir, exist_ok=True)

    if args.do_predict:
        os.makedirs(args.predict_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.vocab_file)

    data_reader = initialize_reader(args.reader_name)

    num_train_steps = None
    if args.do_train or args.do_label:
        train_examples = data_reader.read(input_file=args.train_file,
                                          **read_params)

        cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}_{4}_{5}'.format(
            args.bert_model, str(args.max_seq_length), str(args.doc_stride),
            str(args.max_query_length), str(args.max_ctx), str(args.task_name))

        try:
            with open(cached_train_features_file, "rb") as reader:
                train_features = pickle.load(reader)
        except FileNotFoundError:
            train_features = data_reader.convert_examples_to_features(
                examples=train_examples,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length,
                doc_stride=args.doc_stride,
                max_query_length=args.max_query_length,
                is_training=True)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving train features into cached file %s",
                            cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)

        num_train_steps = int(
            len(train_features) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

    # Just for test
    no_evidence = 0
    for feature in train_features:
        if feature.sentence_id == -1:
            no_evidence += 1
    logger.info(
        f'No evidence ratio: {no_evidence} / {len(train_features)} = {no_evidence * 1.0 / len(train_features)}'
    )

    # Prepare model
    if args.pretrain is not None:
        logger.info('Load pretrained model from {}'.format(args.pretrain))
        model_state_dict = torch.load(args.pretrain, map_location='cuda:0')
        model = initialize_model(args.bert_name,
                                 args.model_file,
                                 state_dict=model_state_dict,
                                 **model_params)
    else:
        model = initialize_model(args.bert_name, args.model_file,
                                 **model_params)

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # hack to remove pooler, which is not used
    # thus it produce None grad that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)

    # Prepare data
    if 'read_state' in read_params:
        read_params['read_state'] = ReadState.NoNegative
    eval_examples = data_reader.read(input_file=args.predict_file,
                                     **read_params)
    eval_features = data_reader.convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        is_training=False)

    eval_tensors = data_reader.data_to_tensors(eval_features)
    eval_data = TensorDataset(*eval_tensors)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.predict_batch_size)

    if args.do_train:

        if args.sentence_id_file is not None:
            logger.info('Training with evidence self-labeled data.')
            data_reader.generate_features_sentence_ids(train_features,
                                                       args.sentence_id_file)
        else:
            logger.info('No sentence id file found. Train in traditional way.')

        logger.info("Start training")
        train_loss = AverageMeter()
        best_acc = 0.0
        summary_writer = SummaryWriter(log_dir=args.output_dir)
        global_step = 0
        eval_loss = AverageMeter()

        train_tensors = data_reader.data_to_tensors(train_features)
        train_data = TensorDataset(*train_tensors)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            # Train
            model.train()
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                if n_gpu == 1:
                    batch = batch_to_device(
                        batch, device)  # multi-gpu does scattering it-self
                inputs = data_reader.generate_inputs(
                    batch, train_features, model_state=ModelState.Train)
                output_dict = model(**inputs)
                loss = output_dict['loss']
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / t_total, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                train_loss.update(loss.item(), args.train_batch_size)
                summary_writer.add_scalar('train_loss', train_loss.avg,
                                          global_step)

            # Evaluation
            model.eval()
            all_results = []
            logger.info("Start evaluating")
            for eval_step, batch in enumerate(
                    tqdm(eval_dataloader, desc="Evaluating")):
                if n_gpu == 1:
                    batch = batch_to_device(
                        batch, device)  # multi-gpu does scattering it-self
                inputs = data_reader.generate_inputs(
                    batch, eval_features, model_state=ModelState.Evaluate)
                with torch.no_grad():
                    output_dict = model(**inputs)
                    loss, batch_choice_logits = output_dict[
                        'loss'], output_dict['yesno_logits']
                    eval_loss.update(loss.item(), args.predict_batch_size)
                    summary_writer.add_scalar(
                        'eval_loss', eval_loss.avg,
                        epoch * len(eval_dataloader) + eval_step)
                example_indices = batch[-1]
                for i, example_index in enumerate(example_indices):
                    choice_logits = batch_choice_logits[i].detach().cpu(
                    ).tolist()

                    eval_feature = eval_features[example_index.item()]
                    unique_id = int(eval_feature.unique_id)
                    all_results.append(
                        RawResultChoice(unique_id=unique_id,
                                        choice_logits=choice_logits))

            data_reader.write_predictions(eval_examples,
                                          eval_features,
                                          all_results,
                                          None,
                                          null_score_diff_threshold=0.0)
            yes_metric = data_reader.yesno_cate.f1_measure('yes', 'no')
            no_metric = data_reader.yesno_cate.f1_measure('no', 'yes')
            current_acc = yes_metric['accuracy']
            summary_writer.add_scalar('eval_yes_f1', yes_metric['f1'], epoch)
            summary_writer.add_scalar('eval_yes_recall', yes_metric['recall'],
                                      epoch)
            summary_writer.add_scalar('eval_yes_precision',
                                      yes_metric['precision'], epoch)
            summary_writer.add_scalar('eval_no_f1', no_metric['f1'], epoch)
            summary_writer.add_scalar('eval_no_recall', no_metric['recall'],
                                      epoch)
            summary_writer.add_scalar('eval_no_precision',
                                      no_metric['precision'], epoch)
            summary_writer.add_scalar('eval_yesno_acc', current_acc, epoch)
            torch.cuda.empty_cache()

            if current_acc > best_acc:
                best_acc = current_acc
                model_to_save = model.module if hasattr(
                    model, 'module') else model  # Only save the model it-self
                output_model_file = os.path.join(args.output_dir,
                                                 "pytorch_model.bin")
                torch.save(model_to_save.state_dict(), output_model_file)
            logger.info('Epoch: %d, Accuracy: %f (Best Accuracy: %f)' %
                        (epoch, current_acc, best_acc))
            data_reader.yesno_cate.reset()

        summary_writer.close()

    # Loading trained model.
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    model_state_dict = torch.load(output_model_file, map_location='cuda:0')
    model = initialize_model(args.bert_name,
                             args.model_file,
                             state_dict=model_state_dict,
                             **model_params)
    model.to(device)

    # Write Yes/No predictions
    if args.do_predict and (args.local_rank == -1
                            or torch.distributed.get_rank() == 0):

        test_examples = eval_examples
        test_features = eval_features

        test_tensors = data_reader.data_to_tensors(test_features)
        test_data = TensorDataset(*test_tensors)
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data,
                                     sampler=test_sampler,
                                     batch_size=args.predict_batch_size)

        logger.info("***** Running predictions *****")
        logger.info("  Num orig examples = %d", len(test_examples))
        logger.info("  Num split examples = %d", len(test_features))
        logger.info("  Batch size = %d", args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start predicting yes/no on Dev set.")
        for batch in tqdm(test_dataloader, desc="Testing"):
            if n_gpu == 1:
                batch = batch_to_device(
                    batch, device)  # multi-gpu does scattering it-self
            inputs = data_reader.generate_inputs(batch,
                                                 test_features,
                                                 model_state=ModelState.Test)
            with torch.no_grad():
                output_dict = model(**inputs)
                batch_choice_logits = output_dict['yesno_logits']
            example_indices = batch[-1]
            for i, example_index in enumerate(example_indices):
                choice_logits = batch_choice_logits[i].detach().cpu().tolist()

                test_feature = test_features[example_index.item()]
                unique_id = int(test_feature.unique_id)

                all_results.append(
                    RawResultChoice(unique_id=unique_id,
                                    choice_logits=choice_logits))

        output_prediction_file = os.path.join(args.predict_dir,
                                              'predictions.json')
        data_reader.write_predictions(eval_examples,
                                      eval_features,
                                      all_results,
                                      output_prediction_file,
                                      null_score_diff_threshold=0.0)
        yes_metric = data_reader.yesno_cate.f1_measure('yes', 'no')
        no_metric = data_reader.yesno_cate.f1_measure('no', 'yes')
        logger.info('Yes Metrics: %s' % json.dumps(yes_metric, indent=2))
        logger.info('No Metrics: %s' % json.dumps(no_metric, indent=2))

    # Labeling sentence id.
    if args.do_label and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):

        test_examples = train_examples
        test_features = train_features

        test_tensors = data_reader.data_to_tensors(test_features)
        test_data = TensorDataset(*test_tensors)
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data,
                                     sampler=test_sampler,
                                     batch_size=args.predict_batch_size)

        logger.info("***** Running labeling *****")
        logger.info("  Num orig examples = %d", len(test_examples))
        logger.info("  Num split examples = %d", len(test_features))
        logger.info("  Batch size = %d", args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start labeling.")
        for batch in tqdm(test_dataloader, desc="Testing"):
            if n_gpu == 1:
                batch = batch_to_device(batch, device)
            inputs = data_reader.generate_inputs(batch,
                                                 test_features,
                                                 model_state=ModelState.Test)
            with torch.no_grad():
                output_dict = model(**inputs)
                batch_choice_logits = output_dict['yesno_logits']
                batch_max_weight_indexes = output_dict['max_weight_index']
                batch_max_weights = output_dict['max_weight']
            example_indices = batch[-1]
            for i, example_index in enumerate(example_indices):
                choice_logits = batch_choice_logits[i].detach().cpu().tolist()
                max_weight_index = batch_max_weight_indexes[i].detach().cpu(
                ).tolist()
                max_weight = batch_max_weights[i].detach().cpu().tolist()

                test_feature = test_features[example_index.item()]
                unique_id = int(test_feature.unique_id)

                all_results.append(
                    FullResult(unique_id=unique_id,
                               choice_logits=choice_logits,
                               max_weight_index=max_weight_index,
                               max_weight=max_weight))

        output_prediction_file = os.path.join(args.predict_dir,
                                              args.label_output_file)
        data_reader.write_sentence_predictions(
            test_examples,
            test_features,
            all_results,
            output_prediction_file,
            weight_threshold=args.weight_threshold,
            label_threshold=args.label_threshold)
  def _get_data(self, flag):
    """Creates the dataloader(s) for the split selected by `flag`.

    As a side effect this (re)initializes per-sample state on `self` for the
    chosen split: a zero matrix of past errors and, for every split except
    "pred", a gate-weight matrix filled with 1 / num_experts. Each matrix has
    one row per sample and one column per expert.

    Args:
      flag: "test", "pred" or "val" select those splits; any other value is
        handled as the training split.

    Returns:
      data_loader: A single unshuffled DataLoader, or, when flag == "train",
        a tuple (unshuffled_loader, shuffled_loader) over the same dataset.
    """
    device = self.device
    expert_count = self.num_experts
    source = self.data

    def build_dataset(x, index, y):
      # Features, sample indices and targets travel together in each batch.
      return TensorDataset(
          torch.Tensor(x), torch.Tensor(index), torch.Tensor(y))

    def zero_errors(x):
      # Past error of every expert on every sample starts at zero.
      return torch.zeros(
          (len(x), expert_count), requires_grad=False).to(device)

    def uniform_gates(x):
      # Every expert starts with the same selection weight.
      ones = torch.ones(
          (len(x), expert_count), requires_grad=False).to(device)
      return ones * 1 / expert_count

    # The primary loader is never shuffled, whatever the split.
    shuffle_flag = False

    if flag == "test":
      drop_last = True
      batch_size = self.args.batch_size
      data_set = build_dataset(
          source.test_x, source.test_index, source.test_y)
      self.past_test_error = zero_errors(source.test_x)
      self.gate_weights_test = uniform_gates(source.test_x)
    elif flag == "pred":
      drop_last = False
      # To take advantage of past error the test set is processed one
      # sample at a time during final prediction.
      batch_size = 1
      data_set = build_dataset(
          source.test_x, source.test_index, source.test_y)
      self.past_test_error = zero_errors(source.test_x)
    elif flag == "val":
      drop_last = False
      batch_size = self.args.batch_size
      data_set = build_dataset(
          source.valid_x, source.valid_index, source.valid_y)
      self.past_val_error = zero_errors(source.valid_x)
      self.gate_weights_val = uniform_gates(source.valid_x)
    else:
      drop_last = True
      batch_size = self.args.batch_size
      data_set = build_dataset(
          source.train_x, source.train_index, source.train_y)
      self.past_train_error = zero_errors(source.train_x)
      self.gate_weights_train = uniform_gates(source.train_x)

      # Fit the error scaler on the (all-zero) past error matrix, flattened
      # into a single column.
      flat_errors = self.past_train_error.detach().cpu().numpy().flatten()
      self.error_scaler.fit(flat_errors.reshape(-1, 1))

    print("Data for", flag, "dataset size", len(data_set))

    shared_kwargs = dict(
        batch_size=batch_size,
        num_workers=self.args.num_workers,
        drop_last=drop_last)
    data_loader = DataLoader(data_set, shuffle=shuffle_flag, **shared_kwargs)

    if flag != "train":
      return data_loader

    # Training additionally gets a shuffled loader over the same dataset.
    data_loader_shuffled = DataLoader(data_set, shuffle=True, **shared_kwargs)
    return data_loader, data_loader_shuffled
def _train_and_select(model, optimizer, lr, num_epochs, train_loader,
                      val_loader, criterion, n_classes, metric_fn, val_metric,
                      writer, train_baseline_type):
    """Train `model` and keep deep copies of the best-performing states.

    Args:
        model: model to optimize in place.
        optimizer: optimizer bound to `model`'s parameters.
        lr: learning rate the optimizer was created with; it is halved every
            30 epochs and written to every param group.
        num_epochs: number of train/validate rounds.
        train_loader: loader passed to `trainer_baseline`.
        val_loader: loader passed to `eval_baseline`.
        criterion: loss passed to both baseline helpers.
        n_classes: number of output classes, forwarded to `eval_baseline`.
        metric_fn: metric function forwarded to `eval_baseline`.
        val_metric: key into the metrics dict used for model selection.
        writer: tensorboard writer receiving per-epoch scalars.
        train_baseline_type: `baseline_type` forwarded to `trainer_baseline`.

    Returns:
        A two-element list: [copy with lowest validation loss, copy with
        highest mean validation metric]. An entry stays None if no epoch ever
        improved on the initial sentinel.
    """
    best_models = [None, None]
    best_val_loss = float('inf')
    best_val_metric = -1.0

    for epoch in tqdm(range(num_epochs)):
        # drop learning rate every 30 epochs; halve once per decay step and
        # give every param group the same value
        if (epoch > 0) and (epoch % 30 == 0):
            lr = lr * 0.5
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

        train_loss = trainer_baseline(model,
                                      train_loader,
                                      optimizer,
                                      criterion,
                                      baseline_type=train_baseline_type)
        print('#### Training ####')
        print('Loss: {}'.format(train_loss))

        val_loss, metrics = eval_baseline(model,
                                          val_loader,
                                          criterion,
                                          n_classes,
                                          metric_fn,
                                          baseline_type=1)
        avg_val_metric = np.mean(metrics[val_metric])
        print('#### Validation ####')
        print('Loss: {}\t Macro F1 score: {}'.format(val_loss, avg_val_metric))

        # log to tensorboard
        writer.add_scalar("train/loss", train_loss, epoch)
        writer.add_scalar("val/loss_loss", val_loss, epoch)
        writer.add_scalar(f"val/{val_metric}", avg_val_metric, epoch)

        # Save best models
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_models[0] = deepcopy(model)

        if avg_val_metric > best_val_metric:
            best_val_metric = avg_val_metric
            best_models[1] = deepcopy(model)

    return best_models


def run(config):
    """Run the two-phase label-pruning experiment described by `config`.

    Phase 1 trains a model with `baseline_type=0`, then uses the copy with
    the lowest validation loss to compute an enhanced label mask for the
    training data. Phase 2 trains a fresh model with `baseline_type=1` on the
    updated mask. Both best models of phase 2 are evaluated on the test
    loader, and their weights, metrics and the config are written to the
    run directory.

    Args:
        config: dict with keys 'seed', 'dataset' ('openmic' or 'sonyc') and
            'hparams' (lr, wd, n_features, dropout, n_layers, num_epochs,
            prune_thres, batch_size). For 'sonyc', 'coarse' is also read and
            'label_drop_rate' is optional.

    Raises:
        ValueError: if config['dataset'] is not a supported dataset.
    """
    seed = config['seed']
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)

    dataset_name = config['dataset']
    if dataset_name not in ('openmic', 'sonyc'):
        # Previously this surfaced much later as a NameError on `n_classes`.
        raise ValueError('Unknown dataset: {}'.format(dataset_name))

    exp_dir = get_experiment_dir(config)

    run_dir = os.path.join(exp_dir, 'seed_{}'.format(config['seed']))
    # tensorboard logger
    writer = SummaryWriter(run_dir)

    # get data loaders and metrics function
    if dataset_name == 'openmic':
        (train_loader, val_loader,
         test_loader), (full_dataset, train_inds) = get_openmic_loaders(config)
        n_classes = 20
        metric_fn = evaluate.metrics.metric_fn_openmic
    else:
        (train_loader, val_loader,
         test_loader), train_dataset = get_sonyc_loaders(config)
        n_classes = 8 if config['coarse'] else 23
        metric_fn = evaluate.metrics.metric_fn_sonycust

        # Randomly remove labels: an entry survives only if it was set in the
        # mask and its uniform draw exceeds the drop rate.
        if 'label_drop_rate' in config:
            label_drop_rate = config['label_drop_rate']
            drop_mask = np.random.rand(*train_dataset.Y_mask.shape)
            drop_mask = train_dataset.Y_mask + drop_mask
            train_dataset.Y_mask = drop_mask > (1 + label_drop_rate)

    # hyper params
    hparams = config['hparams']
    lr = hparams['lr']
    wd = hparams['wd']
    model_params = {
        'n_features': hparams['n_features'],
        'drop_rate': hparams['dropout'],
        'n_classes': n_classes,
        'n_layers': hparams['n_layers']
    }
    num_epochs = hparams['num_epochs']
    prune_thres = hparams['prune_thres']
    batch_size = hparams['batch_size']
    val_metric = 'F1_macro' if dataset_name == 'openmic' else 'auprc_macro'

    criterion = nn.BCELoss()

    # Phase 1: teacher training, treating all missing labels as negatives.
    model = create_model(model_params)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    best_models = _train_and_select(model, optimizer, lr, num_epochs,
                                    train_loader, val_loader, criterion,
                                    n_classes, metric_fn, val_metric, writer,
                                    train_baseline_type=0)

    # Perform label pruning with the lowest-validation-loss teacher.
    if dataset_name == 'openmic':
        X = full_dataset.X[train_inds]
        Y_mask = full_dataset.Y_mask[train_inds]
    else:
        X = train_dataset.X
        Y_mask = train_dataset.Y_mask
    X_dataset = TensorDataset(
        torch.tensor(X, requires_grad=False, dtype=torch.float32))
    loader = DataLoader(X_dataset, batch_size)
    all_predictions = forward(best_models[0], loader, n_classes)
    new_mask = get_enhanced_labels(Y_mask, all_predictions, prune_thres)
    if dataset_name == 'openmic':
        full_dataset.Y_mask[train_inds] = new_mask
    else:
        train_dataset.Y_mask = new_mask

    # Phase 2: retrain a fresh model with the pruned labels. `lr` is still
    # the configured initial rate here; the phase-1 decay happened on a local
    # copy and no longer leaks into this optimizer.
    model = create_model(model_params)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    best_models = _train_and_select(model, optimizer, lr, num_epochs,
                                    train_loader, val_loader, criterion,
                                    n_classes, metric_fn, val_metric, writer,
                                    train_baseline_type=1)

    # Test best models
    for i, best_model in enumerate(best_models):
        test_loss, metrics = eval_baseline(best_model,
                                           test_loader,
                                           criterion,
                                           n_classes,
                                           metric_fn,
                                           baseline_type=1)

        print('#### Testing ####')
        print('Test Loss: ', test_loss)
        for key, val in metrics.items():
            print(f'Test {key}: {np.mean(val)}')

        # save metrics and model
        torch.save(best_model.state_dict(),
                   os.path.join(run_dir, f'model_{i}.pth'))
        np.save(os.path.join(run_dir, f'metrics_{i}'), metrics)

        # jsonify metrics and write to json as well for manual inspection
        js = {}
        for key, val in metrics.items():
            js[key] = val if np.ndim(val) == 0 else val.tolist()
        with open(os.path.join(run_dir, f'metrics_{i}.json'), 'w') as fh:
            json.dump(js, fh)

    with open(os.path.join(run_dir, 'config.json'), 'w') as fh:
        json.dump(config, fh)

    # Flush pending tensorboard events and release the event file.
    writer.close()
Example #17
0
from torch.utils.data import TensorDataset, DataLoader
from torch import nn


class SNN(nn.Module):
    """Single linear layer without bias, weights re-drawn from N(0, 1)."""

    def __init__(self, in_d, out_d):
        """Create the layer mapping `in_d` input features to `out_d` outputs."""
        super().__init__()
        layer = nn.Linear(in_d, out_d, bias=False)
        # Overwrite the default initialization with a standard normal.
        nn.init.normal_(layer.weight, mean=0.0, std=1.0)
        self.fc = layer

    def forward(self, x):
        """Apply the linear layer to `x` and return the result."""
        projected = self.fc(x)
        return projected


# Train the bias-free linear classifier (300 input features, 4 outputs)
# with plain SGD on the training tensors.
model = SNN(300, 4)
dataset = TensorDataset(X_train, y_train)
# No batch_size is passed, so DataLoader's default applies; samples are
# reshuffled every epoch.
dataloader = DataLoader(dataset, shuffle=True)

# Fixed: the loss used to be bound as `los_fun` while the loop below calls
# `loss_fun`, which raised NameError on the first batch.
loss_fun = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-1)

for epoch in range(10):
    for X, y in dataloader:
        # Clear gradients from the previous step before backpropagating.
        optimizer.zero_grad()
        outputs = model(X)
        loss = loss_fun(outputs, y)
        loss.backward()
        optimizer.step()
        return x


def sigmoid(x):
    """Return the logistic function 1 / (1 + e^-x) of `x` via NumPy."""
    decay = np.exp(-x)
    return 1 / (decay + 1)


# 5-fold cross-validation setup with a fixed seed so the folds are
# reproducible between runs.
folds = KFold(n_splits=5, shuffle=True, random_state=2019)
# Zero-filled buffers with one slot per test row / per training row.
NN_predictions = np.zeros((test_X.shape[0], ))
oof_preds = np.zeros((train_X.shape[0], ))

# The test features are converted once and moved to the GPU when available;
# the loader keeps the original row order (shuffle=False).
x_test = np.array(test_X)
x_test = torch.tensor(x_test, dtype=torch.float)
if torch.cuda.is_available():
    x_test = x_test.cuda()
test = TensorDataset(x_test)
test_loader = DataLoader(test, batch_size=batch_size, shuffle=False)

avg_losses_f = []
avg_val_losses_f = []

# Fixed: the split used to be taken over `train_x`, a name that differs in
# case from the `train_X` array indexed below. KFold derives its indices
# from the number of samples only, so splitting `train_X` yields the same
# folds for any array of equal length and cannot index out of step.
for fold_, (trn_, val_) in enumerate(folds.split(train_X)):
    print("fold {}".format(fold_ + 1))

    # Targets get a trailing axis via np.newaxis so each row holds one value.
    x_train = Variable(torch.Tensor(train_X[trn_.astype(int)]))
    y_train = Variable(torch.Tensor(train_y[trn_.astype(int), np.newaxis]))

    x_valid = Variable(torch.Tensor(train_X[val_.astype(int)]))
    y_valid = Variable(torch.Tensor(train_y[val_.astype(int), np.newaxis]))

    # A fresh model per fold, sized to the number of feature columns.
    model = MLP(x_train.shape[1], 512, classes, dropout=0.3)
"""import data"""

# input_ids_train,attention_masks_train,role_type_ids_train,entity_type_ids_train,labels_train = prepare_input('ACE05_events_three_level_train_with_sent_id.json',event_type_dict,entity_type_dict,role_type_dict,tokenizer,tokenizer_max_len)

# input_ids_dev,attention_masks_dev,role_type_ids_dev,entity_type_ids_dev,labels_dev = prepare_input_emma('test_temp_three_level.json',event_type_dict,entity_type_dict,role_type_dict,tokenizer,tokenizer_max_len)
# input_ids_dev,attention_masks_dev,role_type_ids_dev,entity_type_ids_dev,labels_dev = prepare_input('ACE05_events_three_level_dev_with_sent_id.json',event_type_dict,entity_type_dict,role_type_dict,tokenizer,tokenizer_max_len)

# Build all model inputs in a single call. The five returned values are
# passed straight to TensorDataset below, so they are presumably tensors
# sharing the same first dimension -- TODO confirm against
# prepare_input_withIBO_multi_pair.
input_ids, attention_masks, role_type_ids, entity_type_ids, labels = prepare_input_withIBO_multi_pair(
    event_type_dict, entity_type_dict, role_type_dict, tokenizer,
    tokenizer_max_len)

# input_ids_dev,attention_masks_dev,role_type_ids_dev,entity_type_ids_dev,labels_dev =prepare_input_withIBO_multi_pair('ACE05_events_three_level_dev_with_sent_id.json',event_type_dict,entity_type_dict,role_type_dict,tokenizer,tokenizer_max_len)
# NOTE(review): mid-file import; random_split is not used in the lines
# visible here -- presumably the actual split happens further down.
from torch.utils.data import TensorDataset, random_split

# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, role_type_ids,
                        entity_type_ids, labels)

# Create a 90-10 train-validation split.

# Calculate the number of samples to include in each set: 90% (rounded
# down) for training, the remainder for validation.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
## train dev separate version
# # """split train and val dataset"""
# # from torch.utils.data import TensorDataset, random_split

# # # Combine the training inputs into a TensorDataset.
# # dataset_train = TensorDataset(input_ids_train, attention_masks_train,role_type_ids_train,entity_type_ids_train, labels_train)
# # dataset_dev = TensorDataset(input_ids_dev, attention_masks_dev,role_type_ids_dev,entity_type_ids_dev, labels_dev)

# # Create a 90-10 train-validation split.
def main():
    """Fine-tune a BERT sequence classifier and keep the best checkpoint.

    Steps, in order:

    1. Parse arguments, create the output directory, and attach TensorBoard
       and file logging.
    2. Load train/eval text and integer labels via ``get_examples_and_labels``.
    3. Optionally append adversarially perturbed examples read from a CSV
       (selected by ``args.augmentation_recipe``) to the training set.
    4. Tokenize with a WordPiece tokenizer padded/truncated to
       ``args.max_seq_len``.
    5. Train ``BertForSequenceClassification`` with AdamW and a linear
       warm-up/decay schedule, evaluating accuracy after every epoch,
       saving the best model, and stopping early when accuracy stalls.

    Raises:
        ValueError: if the augmentation recipe is unknown, or if the number
            of texts and labels differ for the train or eval split.
    """
    start_time = time.time()
    args = parse_args()
    make_directories(args.output_dir)

    # Start Tensorboard and log hyperparams.
    tb_writer = SummaryWriter(args.output_dir)
    tb_writer.add_hparams(vars(args), {})

    file_log_handler = logging.FileHandler(
        os.path.join(args.output_dir, 'log.txt'))
    logger.addHandler(file_log_handler)

    # Get list of text and list of label (integers) from disk.
    train_text, train_label_id_list, eval_text, eval_label_id_list = \
        get_examples_and_labels(args.dataset)

    # Augment training data.
    if (args.augmentation_recipe is not None) and len(
            args.augmentation_recipe):
        import pandas as pd

        if args.augmentation_recipe == 'textfooler':
            aug_csv = '/p/qdata/jm8wx/research/text_attacks/textattack/outputs/attack-1590551967800.csv'
        elif args.augmentation_recipe == 'tf-adjusted':
            aug_csv = '/p/qdata/jm8wx/research/text_attacks/textattack/outputs/attack-1590564015768.csv'
        else:
            raise ValueError(
                f'Unknown augmentation recipe {args.augmentation_recipe}')

        aug_df = pd.read_csv(aug_csv)

        # Filter skipped outputs (rows where the attack changed nothing).
        aug_df = aug_df[aug_df['original_text'] != aug_df['perturbed_text']]

        print(
            f'Augmentation recipe {args.augmentation_recipe} / augmentation num. examples {args.augmentation_num}/ len {len(aug_df)}'
        )

        original_text = aug_df['original_text']
        perturbed_text = aug_df['perturbed_text']

        # Convert to np arrays so the per-example lookup below is vectorized.
        train_text = np.array(train_text)
        train_label_id_list = np.array(train_label_id_list)

        x_adv_list = []
        x_adv_id_list = []
        for (x, x_adv) in zip(original_text, perturbed_text):
            # Strip the [[...]] markers around perturbed words.
            x = x.replace('[[', '').replace(']]', '')
            x_adv = x_adv.replace('[[', '').replace(']]', '')
            # The perturbed example inherits the label of the first training
            # example whose text matches the original. Raises IndexError if
            # the original text is not present in the training set.
            x_idx = (train_text == x).nonzero()[0][0]
            x_adv_label = train_label_id_list[x_idx]
            x_adv_id_list.append(x_adv_label)
            x_adv_list.append(x_adv)

        # Truncate to a random subset of `args.augmentation_num` examples.
        if (args.augmentation_num >= 0):
            perm = list(range(len(x_adv_list)))
            random.shuffle(perm)
            perm = perm[:args.augmentation_num]
            x_adv_list = [x_adv_list[i] for i in perm]
            x_adv_id_list = [x_adv_id_list[i] for i in perm]

        train_text = train_text.tolist() + x_adv_list
        train_label_id_list = train_label_id_list.tolist() + x_adv_id_list

        print(
            f'Augmentation added {len(x_adv_list)} examples, for a total of {len(train_text)}'
        )

    num_labels = len(set(train_label_id_list))
    logger.info('num_labels: %s', num_labels)

    train_examples_len = len(train_text)

    if len(train_label_id_list) != train_examples_len:
        raise ValueError(
            f'Number of train examples ({train_examples_len}) does not match number of labels ({len(train_label_id_list)})'
        )
    if len(eval_label_id_list) != len(eval_text):
        raise ValueError(
            f'Number of test examples ({len(eval_text)}) does not match number of labels ({len(eval_label_id_list)})'
        )

    print_cuda_memory(args)
    # old INFO:__main__:Loaded data and tokenized in 189.66675066947937s

    # @TODO support other vocabularies, or at least, support case
    tokenizer = BertWordPieceTokenizer('bert-base-uncased-vocab.txt',
                                       lowercase=True)
    tokenizer.enable_padding(max_length=args.max_seq_len)
    tokenizer.enable_truncation(max_length=args.max_seq_len)

    logger.info(f'Tokenizing training data. (len: {train_examples_len})')
    train_text_ids = [
        encoding.ids for encoding in tokenizer.encode_batch(train_text)
    ]
    logger.info(f'Tokenizing test data (len: {len(eval_label_id_list)})')
    eval_text_ids = [
        encoding.ids for encoding in tokenizer.encode_batch(eval_text)
    ]
    load_time = time.time()
    logger.info(f'Loaded data and tokenized in {load_time-start_time}s')

    print_cuda_memory(args)

    # Load pre-trained model (weights).
    logger.info('Loading model: %s', args.model_dir)
    logger.info('Model class: (vanilla) BertForSequenceClassification.')
    model = BertForSequenceClassification.from_pretrained(
        args.model_dir, num_labels=num_labels)

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    model.to(device)

    # multi-gpu training
    if args.num_gpus > 1:
        model = torch.nn.DataParallel(model)
    logger.info(f'Training model across {args.num_gpus} GPUs')

    num_train_optimization_steps = int(
        train_examples_len / args.batch_size /
        args.grad_accum_steps) * args.num_train_epochs

    # Biases and LayerNorm parameters are excluded from weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # The scheduler expects a step *count*; convert the warm-up proportion
    # into a number of optimizer steps (passing the raw fraction disables
    # warm-up entirely).
    num_warmup_steps = int(num_train_optimization_steps *
                           args.warmup_proportion)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_train_optimization_steps)

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", train_examples_len)
    logger.info("  Batch size = %d", args.batch_size)
    logger.info("  Max sequence length = %d", args.max_seq_len)
    logger.info("  Num steps = %d", num_train_optimization_steps)

    wandb.log({'train_examples_len': train_examples_len})

    train_input_ids = torch.tensor(train_text_ids, dtype=torch.long)
    train_label_ids = torch.tensor(train_label_id_list, dtype=torch.long)
    train_data = TensorDataset(train_input_ids, train_label_ids)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.batch_size)

    eval_input_ids = torch.tensor(eval_text_ids, dtype=torch.long)
    eval_label_ids = torch.tensor(eval_label_id_list, dtype=torch.long)
    eval_data = TensorDataset(eval_input_ids, eval_label_ids)
    # Sampling order does not affect accuracy; kept random as before.
    eval_sampler = RandomSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.batch_size)

    def get_eval_acc():
        """Return accuracy on the eval set, as a float in [0, 1]."""
        correct = 0
        total = 0
        # Disable dropout while evaluating, and restore train mode afterwards.
        model.eval()
        try:
            for input_ids, label_ids in tqdm.tqdm(eval_dataloader,
                                                  desc="Evaluating accuracy"):
                input_ids = input_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    logits = model(input_ids)[0]

                correct += (logits.argmax(dim=1) == label_ids).sum().item()
                total += len(label_ids)
        finally:
            model.train()

        return float(correct) / total

    def save_model():
        """Write the weights and config of the best model to the output dir."""
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, args.weights_name)
        output_config_file = os.path.join(args.output_dir, args.config_name)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)

        logger.info(
            f'Best acc found. Saved tokenizer, model config, and model to {args.output_dir}.'
        )

    # Number of optimizer updates performed so far.
    global_step = 0

    def save_model_checkpoint(checkpoint_name=None):
        """Save the model and training args under a checkpoint sub-directory.

        The directory is `checkpoint_name`, or ``checkpoint-<global_step>``
        when no name is given.
        """
        checkpoint_name = checkpoint_name or 'checkpoint-{}'.format(
            global_step)
        output_dir = os.path.join(args.output_dir, checkpoint_name)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir)
        torch.save(args, os.path.join(output_dir, 'training_args.bin'))
        logger.info('Checkpoint saved to %s.', output_dir)

    print_cuda_memory(args)
    model.train()
    best_eval_acc = 0
    steps_since_best_eval_acc = 0

    def loss_backward(loss):
        """Scale the loss for multi-GPU / accumulation and backpropagate."""
        if args.num_gpus > 1:
            loss = loss.mean(
            )  # mean() to average on multi-gpu parallel training
        if args.grad_accum_steps > 1:
            loss = loss / args.grad_accum_steps
        loss.backward()

    # The loss module is stateless; build it once instead of once per batch.
    loss_fct = torch.nn.CrossEntropyLoss()

    for epoch in tqdm.trange(int(args.num_train_epochs), desc="Epoch"):
        prog_bar = tqdm.tqdm(train_dataloader, desc="Iteration")
        # Drop any partially accumulated gradients left over from the
        # previous epoch so they do not leak into this epoch's first update.
        optimizer.zero_grad()
        for step, batch in enumerate(prog_bar):
            print_cuda_memory(args)
            batch = tuple(t.to(device) for t in batch)
            input_ids, labels = batch
            logits = model(input_ids)[0]
            loss = loss_fct(logits.view(-1, num_labels), labels.view(-1))
            if global_step % args.tb_writer_step == 0:
                tb_writer.add_scalar('loss', loss.item(), global_step)
                # Log the actual learning rate of the first parameter group.
                tb_writer.add_scalar('lr', optimizer.param_groups[0]['lr'],
                                     global_step)
            loss_backward(loss)
            prog_bar.set_description(f"Loss {loss.item()}")
            # Gradients are only cleared after an optimizer update, so they
            # accumulate across `grad_accum_steps` micro-batches.
            if (step + 1) % args.grad_accum_steps == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1
                # Save model checkpoint to file.
                if global_step % args.checkpoint_steps == 0:
                    save_model_checkpoint()

        # Check accuracy after each epoch.
        eval_acc = get_eval_acc()
        tb_writer.add_scalar('epoch_eval_acc', eval_acc, global_step)
        wandb.log({'epoch_eval_acc': eval_acc, 'epoch': epoch})

        if args.checkpoint_every_epoch:
            save_model_checkpoint(f'epoch-{epoch}')

        logger.info(f'Eval acc: {eval_acc*100}%')
        if eval_acc > best_eval_acc:
            best_eval_acc = eval_acc
            steps_since_best_eval_acc = 0
            save_model()
        else:
            # Counts epochs (not batches) without improvement.
            steps_since_best_eval_acc += 1
            if (args.early_stopping_epochs > 0) and (
                    steps_since_best_eval_acc > args.early_stopping_epochs):
                logger.info(
                    f'Stopping early since it\'s been {args.early_stopping_epochs} steps since validation acc increased'
                )
                break
Example #21
0
def main():
    """Fine-tune BERT for SQuAD question answering and/or write predictions.

    Parses command-line arguments, sets up the device (single process,
    DataParallel, or distributed via apex), optionally trains
    ``BertForQuestionAnswering`` on ``--train_file``, saves the weights to
    ``<output_dir>/pytorch_model.bin``, reloads them, and optionally writes
    predictions for ``--predict_file`` into the output directory.

    Raises:
        ValueError: on invalid argument combinations or a non-empty output
            directory.
        ImportError: if apex is required (distributed / fp16) but missing.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model checkpoints and predictions will be written."
    )

    ## Other parameters
    parser.add_argument("--train_file",
                        default=None,
                        type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument(
        "--predict_file",
        default=None,
        type=str,
        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json"
    )
    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help=
        "The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% "
        "of training.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help=
        "The total number of n-best predictions to generate in the nbest_predictions.json "
        "output file.")
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help=
        "The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.")
    parser.add_argument(
        "--verbose_logging",
        default=False,
        action='store_true',
        help=
        "If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    # NOTE(review): `store_true` with default=True means this flag is always
    # True and cannot be turned off from the command line. Left unchanged to
    # keep the CLI backward-compatible.
    parser.add_argument(
        "--do_lower_case",
        default=True,
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        '--fp16',
        default=False,
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # The CLI value is the effective batch size; each forward/backward pass
    # processes only this fraction of it.
    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")

    if args.do_train:
        if not args.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified."
            )

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        # Include the offending path; the placeholder used to be left empty.
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = read_squad_examples(input_file=args.train_file,
                                             is_training=True)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    model = BertForQuestionAnswering.from_pretrained(
        args.bert_model,
        cache_dir=PYTORCH_PRETRAINED_BERT_CACHE /
        'distributed_{}'.format(args.local_rank))

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # hack to remove pooler, which is not used
    # thus it produce None grad that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    # Biases and LayerNorm parameters are excluded from weight decay.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    t_total = num_train_steps
    # `t_total` is None in predict-only runs; only split it across workers
    # when there is a step count to split.
    if args.local_rank != -1 and t_total is not None:
        t_total = t_total // torch.distributed.get_world_size()
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)

    global_step = 0
    if args.do_train:
        # Features are cached next to the training file, keyed by the
        # settings that affect featurization.
        cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}'.format(
            args.bert_model, str(args.max_seq_length), str(args.doc_stride),
            str(args.max_query_length))
        train_features = None
        try:
            # NOTE(review): pickle.load executes arbitrary code from the
            # file; only use cache files produced by this script.
            with open(cached_train_features_file, "rb") as reader:
                train_features = pickle.load(reader)
        except Exception:
            # Missing or unreadable cache: rebuild the features. A bare
            # `except:` would also swallow KeyboardInterrupt/SystemExit.
            train_features = convert_examples_to_features(
                examples=train_examples,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length,
                doc_stride=args.doc_stride,
                max_query_length=args.max_query_length,
                is_training=True)
            # Only one process writes the cache file.
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving train features into cached file %s",
                            cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)
        logger.info("***** Running training *****")
        logger.info("  Num orig examples = %d", len(train_examples))
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_start_positions = torch.tensor(
            [f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor(
            [f.end_position for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_start_positions,
                                   all_end_positions)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                if n_gpu == 1:
                    batch = tuple(
                        t.to(device)
                        for t in batch)  # multi-gpu does scattering it-self
                input_ids, input_mask, segment_ids, start_positions, end_positions = batch
                loss = model(input_ids, segment_ids, input_mask,
                             start_positions, end_positions)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / t_total, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    # Save a trained model. This runs even when `do_train` is False, in
    # which case the pre-trained weights are what gets written.
    model_to_save = model.module if hasattr(
        model, 'module') else model  # Only save the model it-self
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    torch.save(model_to_save.state_dict(), output_model_file)

    # Load a trained model that you have fine-tuned
    model_state_dict = torch.load(output_model_file)
    model = BertForQuestionAnswering.from_pretrained(
        args.bert_model, state_dict=model_state_dict)
    model.to(device)

    # Prediction runs on a single process only (rank 0 when distributed).
    if args.do_predict and (args.local_rank == -1
                            or torch.distributed.get_rank() == 0):
        eval_examples = read_squad_examples(input_file=args.predict_file,
                                            is_training=False)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)

        logger.info("***** Running predictions *****")
        logger.info("  Num orig examples = %d", len(eval_examples))
        logger.info("  Num split examples = %d", len(eval_features))
        logger.info("  Batch size = %d", args.predict_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        # Carry each feature's index through the DataLoader so the logits
        # can be matched back to `eval_features`.
        all_example_index = torch.arange(all_input_ids.size(0),
                                         dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_example_index)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start evaluating")
        for input_ids, input_mask, segment_ids, example_indices in tqdm(
                eval_dataloader, desc="Evaluating"):
            if len(all_results) % 1000 == 0:
                logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            with torch.no_grad():
                batch_start_logits, batch_end_logits = model(
                    input_ids, segment_ids, input_mask)
            for i, example_index in enumerate(example_indices):
                start_logits = batch_start_logits[i].detach().cpu().tolist()
                end_logits = batch_end_logits[i].detach().cpu().tolist()
                eval_feature = eval_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                all_results.append(
                    RawResult(unique_id=unique_id,
                              start_logits=start_logits,
                              end_logits=end_logits))
        output_prediction_file = os.path.join(args.output_dir,
                                              "predictions.json")
        output_nbest_file = os.path.join(args.output_dir,
                                         "nbest_predictions.json")
        write_predictions(eval_examples, eval_features, all_results,
                          args.n_best_size, args.max_answer_length,
                          args.do_lower_case, output_prediction_file,
                          output_nbest_file, args.verbose_logging)
Example #22
0
def fit_neural(x, y):
    """Fit a small feed-forward regressor mapping scalar ``x`` to scalar ``y``.

    On the first call (module-level ``count10 == 0``) a fresh network is
    trained; on later calls training resumes from the weights saved by the
    previous call. The trained weights are always written back to the same
    checkpoint file.

    Args:
        x: sequence of scalar inputs.
        y: sequence of scalar targets, same length as ``x``.

    Returns:
        The trained ``NeuralNet`` module.
    """
    # Single checkpoint file used both for warm-starting and for saving.
    model_path = "/home/ppl/Documents/Universitet/KUKandidat/Speciale/DeepPricing/python/deepStopping/saveModel/ModelAM1.pth"

    # Convert to column tensors of shape (len(x), 1).
    X = torch.Tensor(x)
    X = X.view(len(x), 1)
    Y = torch.Tensor(y)
    Y = Y.view(len(x), 1)

    my_dataset = TensorDataset(X, Y)

    batchSize = 512
    train_loader = DataLoader(dataset=my_dataset,
                              batch_size=batchSize,
                              num_workers=2,
                              shuffle=True)
    # Hyperparameters
    inputSize = 1
    hidden_size1 = 40
    hidden_size2 = 40
    outputSize = 1
    learning_rate = 0.001

    class NeuralNet(nn.Module):
        """Two hidden layers with LeakyReLU activations and a linear output."""

        def __init__(self, input_size, hidden_size1, hidden_size2,
                     output_size):
            super(NeuralNet, self).__init__()
            self.input_size = input_size
            self.l1 = nn.Linear(input_size, hidden_size1)
            self.leaky_relu_1 = nn.LeakyReLU(negative_slope=0.3)
            self.l2 = nn.Linear(hidden_size1, hidden_size2)
            self.leaky_relu_2 = nn.LeakyReLU(negative_slope=0.3)
            # Use the constructor argument rather than the enclosing
            # function's `outputSize` variable.
            self.l3 = nn.Linear(hidden_size2, output_size)

        def forward(self, x):
            out = self.l1(x)
            out = self.leaky_relu_1(out)
            out = self.l2(out)
            out = self.leaky_relu_2(out)
            out = self.l3(out)
            return out

    global count10
    model = NeuralNet(inputSize, hidden_size1, hidden_size2, outputSize)
    if count10 == 0:
        # First call: train from scratch and mark that a checkpoint will exist.
        count10 += 1
    else:
        # Later calls: warm-start from the previously saved weights.
        model.load_state_dict(torch.load(model_path))

    # Loss and optimizer
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Train the model
    num_epochs = 1
    es = earlyStop.EarlyStopping(patience=1)
    for epoch in range(num_epochs):
        total_loss = 0
        n_samples = 0
        for batch_x, batch_y in train_loader:  # one batch of samples
            optimizer.zero_grad()  # zero the gradient buffer
            # Forward pass and loss
            y_predicted = model(batch_x)
            loss = criterion(y_predicted, batch_y)
            # Backward and optimize
            loss.backward()
            optimizer.step()
            # Accumulate the sample-weighted loss so the epoch average is
            # correct even when the last batch is smaller.
            total_loss += loss.item() * batch_x.shape[0]
            n_samples += batch_x.shape[0]
        total_loss /= n_samples
        if es.step(total_loss):
            break  # early stop criterion is met, we can stop now

    torch.save(model.state_dict(), model_path)
    return model
Example #23
0
def get_bert_out(output_path, local_rank, no_cuda, batch_size):
    """Run BERT over pre-tokenised sentences and append the outputs to disk.

    Token ids are read from ``output_path + "sen_bert.npy"`` and attention
    masks from ``output_path + "sen_mask_bert.npy"``. The model is loaded from
    the module-level ``args.bert_dir``. For every batch, the last entry of the
    model's first return value is converted to NumPy and written with
    ``numpy.save`` to ``output_path + "sent_output_bert.npy"``.

    The output file is opened in append mode, so it ends up holding one saved
    array per batch, and repeated runs keep appending to the same file.

    Args:
        output_path: Prefix the file names are concatenated to directly; when
            it names a directory it must already end with a path separator.
        local_rank: ``-1`` for a single process; otherwise the CUDA device
            index used by this distributed worker.
        no_cuda: When true, the device is forced to CPU.
        batch_size: Number of sentences per forward pass.

    Returns:
        None. The elapsed time is reported on ``sys.stderr``.
    """
    startt = timeit.default_timer()

    if local_rank == -1 or no_cuda:
        device = torch.device(
            "cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of
        # synchronizing nodes/GPUs.
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {} distributed training: {}".format(
        device, n_gpu, bool(local_rank != -1)))

    model = BertModel.from_pretrained(args.bert_dir)
    model.to(device)

    if local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    model.eval()
    sent_bert = numpy.load(output_path + "sen_bert.npy")
    sent_mask_bert = numpy.load(output_path + "sen_mask_bert.npy")

    all_input_ids = torch.tensor(sent_bert, dtype=torch.int64).to(device)
    all_input_mask = torch.tensor(sent_mask_bert, dtype=torch.int64).to(device)
    all_example_index = torch.tensor(list(range(len(sent_bert))),
                                     dtype=torch.int64).to(device)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
    if local_rank == -1:
        eval_sampler = SequentialSampler(eval_data)
    else:
        eval_sampler = DistributedSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=batch_size)

    # The context manager guarantees the output file is flushed and closed even
    # if a forward pass fails; no_grad avoids building autograd graphs that
    # would only be thrown away during pure inference.
    with open(output_path + "sent_output_bert.npy", 'ab') as f, \
            torch.no_grad():
        for input_ids, input_mask, example_indices in eval_dataloader:
            all_encoder_layers, _ = model(input_ids,
                                          token_type_ids=None,
                                          attention_mask=input_mask)
            # Convert the last layer once per batch instead of once per
            # example.
            last_layer = all_encoder_layers[-1].detach().cpu().numpy()
            # Keep every token of every sentence in the batch, truncated to at
            # most 512 entries along the third axis.
            # NOTE(review): presumably that axis is the hidden dimension, so
            # wider models lose features here - confirm this cap is intended.
            outs = numpy.array(last_layer[:len(example_indices), :, :512])
            numpy.save(f, outs)

    endt = timeit.default_timer()
    print(file=sys.stderr)
    print("Total use %.3f seconds for BERT Data Generating" % (endt - startt),
          file=sys.stderr)
def _load_eval_model(args):
    """Load the checkpoint selected by ``args.model`` and move it to ``args.device``."""
    if args.model == 'manifold-smoothing':
        dirname = '{}/BERT-mf-{}-{}-{}-{}'.format(args.in_dataset, args.index,
                                                  args.eps_in, args.eps_y,
                                                  args.eps_out)
        print(dirname)
    else:
        # 'base', 'mc-dropout' and 'temperature' all start from the same
        # fine-tuned checkpoint directory.
        dirname = '{}/BERT-base-{}'.format(args.in_dataset, args.index)
    pretrained_dir = './model_save/{}'.format(dirname)

    model = BertForSequenceClassification.from_pretrained(pretrained_dir)
    model.to(args.device)

    if args.model == 'base':
        print('Load Tekenizer')
    elif args.model == 'temperature':
        # The wrapper is calibrated later via set_temperature().
        model = ModelWithTemperature(model)
        model.to(args.device)
    return model


def _encode_split(tokenizer, sentences, max_len):
    """Encode ``sentences`` and return ``(input_ids, attention_masks)`` tensors."""
    encoded = [
        tokenizer.encode(
            sent,
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            truncation=True,
            max_length=max_len,  # Truncate all sentences.
        ) for sent in sentences
    ]
    input_ids = pad_sequences(encoded,
                              maxlen=max_len,
                              dtype="long",
                              truncating="post",
                              padding="post")
    # Mask is 1.0 wherever the token id is positive and 0.0 elsewhere. Plain
    # Python floats are kept so torch.tensor() produces the default float dtype.
    masks = [[float(token_id > 0) for token_id in seq] for seq in input_ids]
    return torch.tensor(input_ids), torch.tensor(masks)


def _prepare_eval_datasets(args):
    """Return ``(val_data, test_data, nt_test_data)``.

    With ``--saved_dataset n`` the datasets are built from raw text and saved
    under ``dataset/test``; otherwise the previously saved files are loaded.
    """
    dataset_dir = 'dataset/test'
    val_path = dataset_dir + '/{}_val_in_domain.pt'.format(args.in_dataset)
    test_path = dataset_dir + '/{}_test_in_domain.pt'.format(args.in_dataset)
    ood_path = dataset_dir + '/{}_test_out_of_domain.pt'.format(
        args.out_dataset)

    if args.saved_dataset != 'n':
        return torch.load(val_path), torch.load(test_path), torch.load(
            ood_path)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True)
    _, val_sentences, test_sentences, _, val_labels, test_labels = load_dataset(
        args.in_dataset)
    _, _, nt_test_sentences, _, _, nt_test_labels = load_dataset(
        args.out_dataset)

    max_len = 150 if args.in_dataset in ('20news', '20news-15') else 256

    val_inputs, val_masks = _encode_split(tokenizer, val_sentences, max_len)
    test_inputs, test_masks = _encode_split(tokenizer, test_sentences, max_len)
    nt_test_inputs, nt_test_masks = _encode_split(tokenizer, nt_test_sentences,
                                                  max_len)

    val_data = TensorDataset(val_inputs, val_masks, torch.tensor(val_labels))
    test_data = TensorDataset(test_inputs, test_masks,
                              torch.tensor(test_labels))
    nt_test_data = TensorDataset(nt_test_inputs, nt_test_masks,
                                 torch.tensor(nt_test_labels))

    os.makedirs(dataset_dir, exist_ok=True)
    torch.save(val_data, val_path)
    torch.save(test_data, test_path)
    torch.save(nt_test_data, ood_path)
    return val_data, test_data, nt_test_data


def _sequential_loader(data, batch_size):
    """Wrap ``data`` in a DataLoader that iterates in dataset order."""
    return DataLoader(data,
                      sampler=SequentialSampler(data),
                      batch_size=batch_size)


def _mean_softmax_batches(model, dataloader, args):
    """Return per-batch lists of averaged softmax outputs and labels.

    Each batch is passed through the model ``args.eva_iter`` times and the
    softmax outputs are averaged over those passes.
    """
    probs_list = []
    labels_list = []
    with torch.no_grad():
        for batch in dataloader:
            b_input_ids, b_input_mask, b_labels = tuple(
                t.to(args.device) for t in batch)
            batch_output = 0
            for _ in range(args.eva_iter):
                logits = model(input_ids=b_input_ids,
                               token_type_ids=None,
                               attention_mask=b_input_mask)
                if args.model != 'temperature':
                    # The unwrapped classifier returns a tuple whose first
                    # element holds the logits; the temperature wrapper
                    # returns the logits directly.
                    logits = logits[0]
                batch_output = batch_output + F.softmax(logits, dim=1)
            probs_list.append(batch_output / args.eva_iter)
            labels_list.append(b_labels)
    return probs_list, labels_list


def _accuracy(probs_list, labels_list):
    """Return the fraction of examples whose arg-max class equals the label."""
    correct = 0
    total = 0
    for probs, labels in zip(probs_list, labels_list):
        _, predicted = probs.max(1)
        correct += predicted.eq(labels).sum().item()
        total += labels.shape[0]
    return correct / total


def main():
    """Evaluate a fine-tuned classifier on in-domain and out-of-domain data.

    Parses the command line, loads the requested model variant and the
    validation / test / out-of-distribution datasets, then prints:

    * accuracy and ECE on the validation set,
    * accuracy and ECE on the in-distribution test set,
    * NBAUCC scores (at upper thresholds 0.5 and 1.0) for out-of-distribution
      detection and for misclassification detection, both using the maximum
      softmax probability as the confidence score.
    """
    parser = argparse.ArgumentParser(
        description='Test code - measure the detection peformance')
    parser.add_argument('--eva_iter',
                        default=1,
                        type=int,
                        help='number of passes for mc-dropout when evaluation')
    parser.add_argument(
        '--model',
        type=str,
        choices=['base', 'manifold-smoothing', 'mc-dropout', 'temperature'],
        default='base')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='random seed for test')
    parser.add_argument("--epochs",
                        default=10,
                        type=int,
                        help="Number of epochs for training.")
    parser.add_argument('--index',
                        type=int,
                        default=0,
                        help='random seed you used during training')
    parser.add_argument('--in_dataset',
                        required=True,
                        help='target dataset: 20news')
    parser.add_argument('--out_dataset',
                        required=True,
                        help='out-of-dist dataset')
    parser.add_argument('--eval_batch_size', type=int, default=32)
    parser.add_argument('--saved_dataset', type=str, default='n')
    parser.add_argument(
        '--eps_out',
        default=0.001,
        type=float,
        help="Perturbation size of out-of-domain adversarial training")
    parser.add_argument("--eps_y",
                        default=0.1,
                        type=float,
                        help="Perturbation size of label")
    parser.add_argument(
        '--eps_in',
        default=0.0001,
        type=float,
        help="Perturbation size of in-domain adversarial training")

    args = parser.parse_args()

    args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    set_seed(args)

    # The output directory is only created here; nothing below writes to it.
    outf = 'test/' + args.model + '-' + str(args.index)
    os.makedirs(outf, exist_ok=True)

    model = _load_eval_model(args)
    val_data, test_data, nt_test_data = _prepare_eval_datasets(args)

    test_dataloader = _sequential_loader(test_data, args.eval_batch_size)
    nt_test_dataloader = _sequential_loader(nt_test_data,
                                            args.eval_batch_size)
    val_dataloader = _sequential_loader(val_data, args.eval_batch_size)

    if args.model == 'temperature':
        model.set_temperature(val_dataloader, args)

    model.eval()

    if args.model == 'mc-dropout':
        # Applied after eval(), so whatever apply_dropout changes stays in
        # effect for the evaluation passes below.
        model.apply(apply_dropout)

    # ---- validation data: accuracy and ECE
    val_probs, val_labels = _mean_softmax_batches(model, val_dataloader, args)
    val_eval_accuracy = _accuracy(val_probs, val_labels)
    print("Val Accuracy: {}".format(val_eval_accuracy))
    ece_criterion = ECE_v2().to(args.device)
    val_ece = ece_criterion(torch.cat(val_probs),
                            torch.cat(val_labels)).item()
    print('ECE on Val data: {}'.format(val_ece))

    # ---- in-distribution test data: accuracy and ECE
    test_probs, test_labels = _mean_softmax_batches(model, test_dataloader,
                                                    args)
    eval_accuracy = _accuracy(test_probs, test_labels)
    print("Test Accuracy: {}".format(eval_accuracy))

    ece_criterion = ECE_v2().to(args.device)
    softmaxes_ece = torch.cat(test_probs)
    labels_ece = torch.cat(test_labels)
    ece = ece_criterion(softmaxes_ece, labels_ece).item()
    print('ECE on Test data: {}'.format(ece))

    # Confidence (max softmax) for in-distribution data and a mask of the
    # examples that were classified correctly.
    score_in, predicted_in = softmaxes_ece.max(1)
    correct_mask = (predicted_in == labels_ece)

    # ---- out-of-distribution test data: only the confidence is needed
    ood_probs, _ = _mean_softmax_batches(model, nt_test_dataloader, args)
    score_ood, _ = torch.cat(ood_probs).max(1)

    score_ood_array = score_ood.cpu().numpy()
    score_in_array = score_in.cpu().numpy()
    # Force a boolean dtype: older PyTorch releases return uint8 from
    # comparisons, which NumPy would treat as integer indices, not a mask.
    correct_array = correct_mask.cpu().numpy().astype(bool)

    # ---- NBAUCC for the two detection tasks
    n_in = len(score_in_array)
    n_ood = len(score_ood_array)
    predict_o = np.zeros(n_in + n_ood)
    true_o = np.ones(n_in + n_ood)
    true_o[:n_in] = 0  # in-distribution data as false, ood data as positive
    true_mis = np.ones(n_in)
    true_mis[correct_array] = 0  # correct as false, misclassified as positive
    predict_mis = np.zeros(n_in)

    ood_sum = 0
    mis_sum = 0
    ood_sum_list = []
    mis_sum_list = []

    # Upper bounds of the threshold tau at which NBAUCC is reported.
    stop_points = [0.50, 1.]
    step = 0.02

    # Thresholds only increase, so the positive predictions accumulate and the
    # prediction arrays never need resetting between iterations.
    for threshold in np.arange(0., 1.01, step):
        flagged_in = (score_in_array < threshold)
        flagged_ood = (score_ood_array < threshold)
        predict_o[np.concatenate((flagged_in, flagged_ood), axis=0)] = 1
        predict_mis[flagged_in] = 1

        # Detection F1 scores at this specific threshold.
        ood = f1_score(true_o, predict_o, average='binary')
        mis = f1_score(true_mis, predict_mis, average='binary')

        ood_sum += ood * step
        mis_sum += mis * step

        # Compare with a tolerance: arange values are computed in floating
        # point, and a missed exact match would leave the lists too short.
        if any(np.isclose(threshold, stop) for stop in stop_points):
            ood_sum_list.append(ood_sum)
            mis_sum_list.append(mis_sum)

    for stop, ood_area, mis_area in zip(stop_points, ood_sum_list,
                                        mis_sum_list):
        print('OOD detection, NBAUCC {}: {}'.format(stop, ood_area / stop))
        print('misclassification detection, NBAUCC {}: {}'.format(
            stop, mis_area / stop))
Example #25
0
# Tokenise every sentence, map tokens to ids and pad/truncate to MAX_LEN.
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

# Label sequences are padded with the id of the "O" tag.
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["O"], padding="post",
                     dtype="long", truncating="post")

# Mask is 1.0 where the token id is positive and 0.0 elsewhere.
attention_masks = [[float(i>0) for i in ii] for ii in input_ids]

test_inputs = torch.tensor(input_ids)
test_masks = torch.tensor(attention_masks)
test_tags = torch.tensor(tags)


test_data = TensorDataset(test_inputs, test_masks, test_tags)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=bs)



device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()



model = RobertaForTokenClassification.from_pretrained(output_dir)
# Honour the CPU fallback computed above; an unconditional .cuda() call fails
# on hosts without a GPU.
model.to(device)


Example #26
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval or not.")
    parser.add_argument("--eval_on",
                        default="dev",
                        help="Whether to run eval on the dev set or test set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--weight_decay",
                        default=0.01,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {"ner": NerProcessor}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % task_name)

    processor = processors[task_name]()
    label_list = processor.get_labels()
    num_labels = len(label_list) + 1

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = 0
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)

        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    # Prepare model
    config = BertConfig.from_pretrained(args.bert_model,
                                        num_labels=num_labels,
                                        finetuning_task=args.task_name)
    model = Ner.from_pretrained(args.bert_model, from_tf=False, config=config)

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        args.weight_decay
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    warmup_steps = int(args.warmup_proportion * num_train_optimization_steps)
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=warmup_steps,
                                     t_total=num_train_optimization_steps)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    label_map = {i: label for i, label in enumerate(label_list, 1)}
    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)
        all_valid_ids = torch.tensor([f.valid_ids for f in train_features],
                                     dtype=torch.long)
        all_lmask_ids = torch.tensor([f.label_mask for f in train_features],
                                     dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids,
                                   all_valid_ids, all_lmask_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        # def inplace_relu(m):
        #     classname = m.__class__.__name__
        #     if classname.find('ReLU') != -1:
        #         m.inplace = True
        #
        # model.apply(inplace_relu)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids,
                             valid_ids, l_mask)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

            # Save a trained model and the associated configuration
            model_to_save = model.module if hasattr(
                model, 'module') else model  # Only save the model it-self
            model_to_save.save_pretrained(args.output_dir)
            tokenizer.save_pretrained(args.output_dir)
            label_map = {i: label for i, label in enumerate(label_list, 1)}
            model_config = {
                "bert_model": args.bert_model,
                "do_lower": args.do_lower_case,
                "max_seq_length": args.max_seq_length,
                "num_labels": len(label_list) + 1,
                "label_map": label_map
            }
            json.dump(
                model_config,
                open(os.path.join(args.output_dir, "model_config.json"), "w"))
            # Load a trained model and config that you have fine-tuned
    else:
        # Load a trained model and vocabulary that you have fine-tuned
        model = Ner.from_pretrained(args.output_dir)
        tokenizer = BertTokenizer.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)

    model.to(device)

    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        if args.eval_on == "dev":
            eval_examples = processor.get_dev_examples(args.data_dir)
        elif args.eval_on == "test":
            eval_examples = processor.get_test_examples(args.data_dir)
        else:
            raise ValueError("eval on dev or test set only")
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                     dtype=torch.long)
        all_valid_ids = torch.tensor([f.valid_ids for f in eval_features],
                                     dtype=torch.long)
        all_lmask_ids = torch.tensor([f.label_mask for f in eval_features],
                                     dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids,
                                  all_valid_ids, all_lmask_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        y_true = []
        y_pred = []
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        for input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            valid_ids = valid_ids.to(device)
            label_ids = label_ids.to(device)
            l_mask = l_mask.to(device)

            with torch.no_grad():
                logits = model(input_ids,
                               segment_ids,
                               input_mask,
                               valid_ids=valid_ids,
                               attention_mask_label=l_mask)

            logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            input_mask = input_mask.to('cpu').numpy()

            for i, label in enumerate(label_ids):
                temp_1 = []
                temp_2 = []
                for j, m in enumerate(label):
                    if j == 0:
                        continue
                    elif label_ids[i][j] == len(label_map):
                        y_true.append(temp_1)
                        y_pred.append(temp_2)
                        break
                    else:
                        temp_1.append(label_map[label_ids[i][j]])
                        temp_2.append(label_map[logits[i][j]])

        report = classification_report(y_true, y_pred, digits=4)
        logger.info("\n%s", report)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            logger.info("\n%s", report)
            writer.write(report)
Example #27
0
def main():
    """Predict multiple-choice answers with a restored BERT/ALBERT model.

    Steps: parse the command line, build evaluation features from
    ``--predict_file`` via ``generate_input``, instantiate the model described
    by ``--bert_config_file``, load the weights from ``--init_restore_dir``,
    score every feature batch by batch and write the decoded predictions to
    ``<output_dir>/<output_file>``.

    Raises:
        AssertionError: if ``--init_restore_dir`` is not itself a checkpoint
            file and ``<init_restore_dir>*.pth`` does not match exactly one
            file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--gpu_ids", default='0', type=str)
    parser.add_argument("--bert_config_file",
                        default='check_points/pretrain_models/bert_wwm_ext_base/bert_config.json',
                        type=str,
                        help="The config json file corresponding to the pre-trained BERT model. "
                             "This specifies the model architecture.")
    parser.add_argument("--vocab_file", default='check_points/pretrain_models/bert_wwm_ext_base/vocab.txt',
                        type=str,
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--init_restore_dir",
                        required=True,
                        type=str,
                        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument("--input_dir", required=True, default='dataset/CHID')
    parser.add_argument("--output_dir", required=True, type=str,
                        help="The output directory where the model checkpoints and predictions will be written.")
    parser.add_argument("--predict_file",
                        required=True,
                        type=str,
                        help="The input data file to run predictions on.")
    parser.add_argument('--output_file', type=str, default='predictions_test.json')

    ## Other parameters
    parser.add_argument("--max_seq_length", default=64, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                             "longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--max_num_choices", default=10, type=int,
                        help="The maximum number of cadicate answer,  shorter than this will be padded.")
    parser.add_argument("--predict_batch_size", default=16, type=int, help="Total batch size for predictions.")
    # NOTE: `store_true` combined with `default=True` means the two flags
    # below are always True; kept as-is so existing invocations behave the same.
    parser.add_argument("--do_lower_case",
                        default=True,
                        action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument('--fp16',
                        default=True,
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    # The two options below are read further down but were never declared,
    # which made the script fail with AttributeError right after parsing.
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="Rank handed over by a distributed launcher; only reported in the startup message.")

    args = parser.parse_args()
    print(args)
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_ids

    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    print("device: {}, distributed training: {}, 16-bits training: {}".format(device, bool(args.local_rank != -1),
                                                                              args.fp16))

    tokenizer = BertTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

    test_example_file = os.path.join(args.input_dir, 'test_examples_{}.pkl'.format(str(args.max_seq_length)))
    test_feature_file = os.path.join(args.input_dir, 'test_features_{}.pkl'.format(str(args.max_seq_length)))

    eval_features = generate_input(args.predict_file, None, test_example_file, test_feature_file, tokenizer,
                                   max_seq_length=args.max_seq_length, max_num_choices=args.max_num_choices,
                                   is_training=False)

    # Prepare model: the architecture is chosen from the config file path.
    if 'albert' in args.bert_config_file:
        if 'google' in args.bert_config_file:
            bert_config = AlbertConfig.from_json_file(args.bert_config_file)
            model = AlbertForMultipleChoice(bert_config, num_choices=args.max_num_choices)
        else:
            bert_config = ALBertConfig.from_json_file(args.bert_config_file)
            model = ALBertForMultipleChoice(bert_config, num_choices=args.max_num_choices)
    else:
        bert_config = BertConfig.from_json_file(args.bert_config_file)
        model = BertForMultipleChoice(bert_config, num_choices=args.max_num_choices)
    model = model.to(device)

    # A path that is not a checkpoint file is treated as a prefix that must
    # resolve to exactly one ``*.pth`` file.
    if not args.init_restore_dir.endswith(('.pth', '.pt', '.bin')):
        matches = glob(args.init_restore_dir + '*.pth')
        assert len(matches) == 1
        args.init_restore_dir = matches[0]
    torch_init_model(model, args.init_restore_dir)
    # Half precision is only applied on CUDA; on CPU the model stays in fp32.
    if args.fp16 and device.type == "cuda":
        model = model.half()

    print("***** Running predictions *****")
    print("Num split examples = %d" % len(eval_features))
    print("Batch size = %d" % args.predict_batch_size)

    all_example_ids = [f.example_id for f in eval_features]
    all_tags = [f.tag for f in eval_features]
    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_masks = torch.tensor([f.input_masks for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_choice_masks = torch.tensor([f.choice_masks for f in eval_features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_masks, all_segment_ids, all_choice_masks,
                              all_example_index)
    # Run prediction for full data
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)

    model.eval()
    all_results = []
    print("Start evaluating")
    # The choice masks travel with the dataset but are not fed to the model.
    for input_ids, input_masks, segment_ids, _choice_masks, example_indices in tqdm(eval_dataloader,
                                                                                    desc="Evaluating",
                                                                                    disable=None):
        if not all_results:
            # Report the batch shape once, on the first batch.
            print('shape of input_ids: {}'.format(input_ids.shape))
        input_ids = input_ids.to(device)
        input_masks = input_masks.to(device)
        segment_ids = segment_ids.to(device)
        with torch.no_grad():
            batch_logits = model(input_ids=input_ids,
                                 token_type_ids=segment_ids,
                                 attention_mask=input_masks,
                                 labels=None)
        for i, example_index in enumerate(example_indices):
            logits = batch_logits[i].detach().cpu().tolist()
            eval_feature = eval_features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            # NOTE(review): unique_id is used as a position into the lists
            # built from eval_features - confirm it always equals the feature index.
            all_results.append(RawResult(unique_id=unique_id,
                                         example_id=all_example_ids[unique_id],
                                         tag=all_tags[unique_id],
                                         logit=logits))
    print("prediction is over")

    print('decoder raw results')
    tmp_predict_file = os.path.join(args.output_dir, "test_raw_predictions.pkl")
    output_prediction_file = os.path.join(args.output_dir, args.output_file)
    results = get_final_predictions(all_results, tmp_predict_file, g=True)
    write_predictions(results, output_prediction_file)
    print('predictions saved to {}'.format(output_prediction_file))
Example #28
0
def get_data_loaders(args, tokenizer):
    """Build the train/valid ``DataLoader`` objects and their samplers.

    The dialogs returned by ``get_dataset`` are expanded into one model input
    per (utterance, candidate) pair, padded, converted to tensors and wrapped
    in ``TensorDataset`` objects.

    Returns:
        ``(train_loader, valid_loader, train_sampler, valid_sampler)``; both
        samplers are ``None`` unless ``args.distributed`` is truthy.
    """
    corpus = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)

    logger.info("Build inputs and labels")
    datasets = {"train": defaultdict(list), "valid": defaultdict(list)}
    for split, dialogs in corpus.items():
        # Candidate count is taken from the first utterance of the first
        # dialog and, for training only, capped by args.num_candidates.
        num_candidates = len(dialogs[0]["utterances"][0]["candidates"])
        if dataset_is_train(split) and args.num_candidates > 0:
            num_candidates = min(args.num_candidates, num_candidates)
        gold_position = num_candidates - 1

        for dialog in dialogs:
            persona = dialog["personality"].copy()
            for _ in range(args.personality_permutations):
                for utterance in dialog["utterances"]:
                    window = 2 * args.max_history + 1
                    history = utterance["history"][-window:]
                    bucket = datasets[split]
                    kept_candidates = utterance["candidates"][-num_candidates:]
                    for position, candidate in enumerate(kept_candidates):
                        # Only the last kept candidate carries LM labels.
                        with_lm_labels = position == gold_position
                        instance, _unused = build_input_from_segments(
                            persona, history, candidate, tokenizer,
                            with_lm_labels)
                        for input_name, input_array in instance.items():
                            bucket[input_name].append(input_array)
                    bucket["mc_labels"].append(gold_position)
                    bucket["n_candidates"] = num_candidates
                # Rotate the persona sentences for the next permutation.
                persona = [persona[-1]] + persona[:-1]

    logger.info("Pad inputs and convert to Tensor")
    tensor_datasets = {"train": [], "valid": []}
    for split, raw in datasets.items():
        padded = pad_dataset(
            raw, padding=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]))
        for input_name in MODEL_INPUTS:
            tensor = torch.tensor(padded[input_name])
            if input_name != "mc_labels":
                # Regroup the flat instance axis into (batch, candidates, ...).
                grouped_shape = (-1, raw["n_candidates"]) + tensor.shape[1:]
                tensor = tensor.view(grouped_shape)
            tensor_datasets[split].append(tensor)

    logger.info("Build train and validation dataloaders")
    train_dataset = TensorDataset(*tensor_datasets["train"])
    valid_dataset = TensorDataset(*tensor_datasets["valid"])
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
        valid_sampler = torch.utils.data.distributed.DistributedSampler(
            valid_dataset)
    else:
        train_sampler = None
        valid_sampler = None
    train_loader = DataLoader(train_dataset,
                              batch_size=args.train_batch_size,
                              sampler=train_sampler,
                              shuffle=(not args.distributed))
    valid_loader = DataLoader(valid_dataset,
                              batch_size=args.valid_batch_size,
                              sampler=valid_sampler,
                              shuffle=False)

    train_shape = train_dataset.tensors[0].shape
    logger.info("Train dataset (Batch, Candidates, Seq length): {}".format(
        train_shape))
    valid_shape = valid_dataset.tensors[0].shape
    logger.info("Valid dataset (Batch, Candidates, Seq length): {}".format(
        valid_shape))
    return train_loader, valid_loader, train_sampler, valid_sampler


def dataset_is_train(dataset_name):
    """Return True when *dataset_name* designates the training split."""
    return dataset_name == 'train'
Example #29
0
# %%

# Convert every prepared split to a tensor, training side first.
tr_inputs, tr_tags, tr_masks, tr_segs = (
    torch.tensor(values)
    for values in (tr_inputs, tr_tags, tr_masks, tr_segs))
val_inputs, val_tags, val_masks, val_segs = (
    torch.tensor(values)
    for values in (val_inputs, val_tags, val_masks, val_segs))

# %%

# Each dataset bundles token ids, attention masks and tags; the segment
# tensors are not included.
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
valid_data = TensorDataset(val_inputs, val_masks, val_tags)

# Training batches are drawn in random order, validation batches in order.
train_sampler = RandomSampler(train_data)
valid_sampler = SequentialSampler(valid_data)

# drop_last=True discards the final, smaller training batch.
train_dataloader = DataLoader(train_data,
                              batch_size=batch_num,
                              sampler=train_sampler,
                              drop_last=True)
valid_dataloader = DataLoader(valid_data,
                              batch_size=batch_num,
                              sampler=valid_sampler)
# %%
model_file_address = 'bert-base-cased'
model = BertForTokenClassification.from_pretrained(
Example #30
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {
        "fever": FeverProcessor,
    }

    output_modes = {
        "fever": "classification",
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]

    label_list = processor.get_labels()
    num_labels = len(label_list)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        # train_examples=train_examples[0:50] #debugging
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(
            args.local_rank))
    model = BertForSequenceClassification.from_pretrained(
        args.bert_model, cache_dir=cache_dir, num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
        warmup_linear = WarmupLinearSchedule(
            warmup=args.warmup_proportion,
            t_total=num_train_optimization_steps)

    # else:  #testing
    #     optimizer = BertAdam(optimizer_grouped_parameters,
    #                          lr=args.learning_rate,
    #                          warmup=args.warmup_proportion,
    #                          t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        # Convert the training examples into per-example feature objects; the
        # code below reads input_ids, input_mask, segment_ids and label_id
        # from each of them.
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer, output_mode)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        # Stack the per-example feature lists into one long tensor per field.
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)

        # NOTE(review): only the classification branch assigns all_label_ids.
        # Unless the name is bound earlier in this function (not visible
        # here), output_mode == "regression" reaches the TensorDataset call
        # below with all_label_ids undefined, even though the training loop
        # still carries a regression loss branch. The disabled branch is kept
        # for reference.
        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in train_features],
                                         dtype=torch.long)
        # elif output_mode == "regression":
        #     all_label_ids = torch.tensor([f.athene_label_id for f in train_features], dtype=torch.float)

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        # Random sampling for a non-distributed run (local_rank == -1),
        # DistributedSampler otherwise.
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            # Running totals, reset at the start of every epoch.
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                # Move every tensor of the batch to the training device; the
                # unpacking order matches the TensorDataset built above.
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                # The model is called with labels=None and its output is used
                # as logits, so the loss for either output mode is computed
                # here rather than inside the model.
                logits = model(input_ids, segment_ids, input_mask, labels=None)

                if output_mode == "classification":
                    loss_fct = CrossEntropyLoss()
                    loss = loss_fct(logits.view(-1, num_labels),
                                    label_ids.view(-1))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    loss = loss_fct(logits.view(-1), label_ids.view(-1))

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                # Scale the loss so gradients summed over the accumulation
                # window correspond to one averaged batch.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                # With fp16 the backward pass goes through the optimizer
                # object (presumably an fp16 optimizer wrapper that handles
                # loss scaling; confirm where `optimizer` is constructed).
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                # tr_loss accumulates the loss after the scaling above.
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                # Apply an optimizer update only once every
                # gradient_accumulation_steps batches.
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
            # Report the mean of the accumulated (scaled) batch losses for
            # this epoch.
            # NOTE(review): raises ZeroDivisionError if the dataloader yielded
            # no batches (nb_tr_steps == 0).
            print('printing loss')
            print('training_loss~=', tr_loss / nb_tr_steps)

    # After training, a non-distributed run (local_rank == -1) or rank 0 of a
    # distributed run writes the fine-tuned weights, configuration and
    # vocabulary into args.output_dir.
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Save the wrapped model (`.module`) when a wrapper is present,
        # otherwise the model itself.
        model_to_save = getattr(model, 'module', model)

        # The predefined file names keep the directory loadable through
        # `from_pretrained`.
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

    # Whether or not this process saved anything above, the model and the
    # tokenizer are (re)loaded from args.output_dir (not from args.bert_model)
    # and the model is moved to the target device.
    model = BertForSequenceClassification.from_pretrained(
        args.output_dir, num_labels=num_labels)
    tokenizer = BertTokenizer.from_pretrained(
        args.output_dir, do_lower_case=args.do_lower_case)
    model.to(device)

    # Evaluation: run the model over the dev examples without labels and dump
    # the raw logits to "<output_dir>/test_logits.p". Runs in a
    # non-distributed process (local_rank == -1) or on rank 0 only.
    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        # 'no label' is passed in place of an output mode; no label tensor is
        # built for evaluation.
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer, 'no label')
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)

        # Split every example guid into two integer ids: the text between the
        # first '-' and the first '_', and the text after the first '_'.
        # Guid1/Guid2 are not consumed by the evaluation code in this block;
        # they are kept for any later code in this function that reads them.
        guid_list = [f.guid for f in eval_examples]
        guid1 = []
        guid2 = []
        for guid in guid_list:
            pair_id = guid.find('-')
            evidence_id = guid.find('_')
            guid1.append(int(guid[pair_id + 1:evidence_id]))
            guid2.append(int(guid[evidence_id + 1:]))
        Guid1 = torch.tensor(guid1, dtype=torch.long)
        Guid2 = torch.tensor(guid2, dtype=torch.long)

        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids)
        # Run prediction for full data, in dataset order.
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        store_output = []  # one logits row per example, pickled below
        batch_logits_list = []  # one array per batch, concatenated once
        for input_ids, input_mask, segment_ids in tqdm(eval_dataloader,
                                                       desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)

            with torch.no_grad():
                logits = model(input_ids, segment_ids, input_mask, labels=None)

            # Copy the logits to host memory once per batch and reuse the
            # array for both the pickled rows and the prediction matrix.
            batch_logits = logits.detach().cpu().numpy()
            store_output.extend(batch_logits)
            batch_logits_list.append(batch_logits)
            nb_eval_steps += 1

        if nb_eval_steps == 0:
            # Without any batch there is nothing to average or concatenate;
            # fail with an explicit message instead of a ZeroDivisionError.
            raise ValueError(
                "Evaluation produced no batches: no dev examples were loaded "
                "from %s" % args.data_dir)

        # No labels are available here, so eval_loss is never accumulated and
        # stays 0; the division is kept so the name holds the same value as
        # before for any later code.
        eval_loss = eval_loss / nb_eval_steps
        # Single concatenation instead of growing an array with np.append on
        # every batch (which re-copies all previous rows each time).
        preds = np.concatenate(batch_logits_list, axis=0)
        if output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif output_mode == "regression":
            preds = np.squeeze(preds)

        print("Storing dev scores")
        output_eval_file = os.path.join(args.output_dir,
                                        "test_logits.p")  #testing
        # The context manager closes the file even if pickling fails.
        with open(output_eval_file, 'wb') as logits_file:
            pickle.dump(store_output, logits_file)