Example no. 1
def get_train_examples(self, filename=None):
    # Load the gzipped JSON training file and build train examples.
    input_data = readGZip(filename)
    return self._create_examples(input_data, "train")
Example no. 2
def get_dev_examples(self, filename=None):
    # Load the gzipped JSON dev file and build dev examples.
    input_data = readGZip(filename)
    return self._create_examples(input_data, "dev")
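
A minimal usage sketch for the two methods above (the enclosing processor class name and the file paths are assumptions; both methods simply decompress a gzipped JSON file with readGZip and hand it to self._create_examples):

processor = DataProcessor()  # hypothetical class defining get_train_examples / get_dev_examples
train_examples = processor.get_train_examples("train.json.gz")  # illustrative path
dev_examples = processor.get_dev_examples("dev.json.gz")
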
Example no. 3
def main():
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument(
        "--option",
        default=None,
        type=str,
        required=True,
        help="Stage to run, selected in the list: ['stage1', 'stage2', 'stage12'].",
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        help=
        "The output directory where the model checkpoints and predictions will be written.",
    )
    parser.add_argument(
        "--model_type",
        default='bert',
        type=str,
        help="Model type selected in the list: " +
        ", ".join(MODEL_CLASSES.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default="bert-base-uncased",
        type=str,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS),
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded.",
    )
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--train_file",
        default=None,
        type=str,
        help="The input training file (gzipped JSON).",
    )
    parser.add_argument("--logging_steps",
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument("--save_steps",
                        type=int,
                        default=5000,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        "--predict_file",
        default=None,
        type=str,
        help="The input evaluation file (gzipped JSON).",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of update steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--resource_dir",
        type=str,
        default='data/',
        help="Directory containing the preprocessed resource data used to build examples.",
    )
    parser.add_argument("--table_path",
                        type=str,
                        default='traindev_tables_tok/',
                        help="table path.")
    parser.add_argument("--request_path",
                        type=str,
                        default='traindev_request_tok/',
                        help="request path.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="Set this flag if you are using an uncased model.")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="/tmp/",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--stage1_model",
        default=None,
        type=str,
        help="Directory containing the trained stage-1 (filter) model checkpoint.",
    )
    parser.add_argument(
        "--stage2_model",
        default=None,
        type=str,
        help="Directory containing the trained stage-2 (jump) model checkpoint.",
    )
    parser.add_argument(
        "--dim",
        default=None,
        type=int,
        help="Hidden dimension of the encoder; overwritten with config.hidden_size below.",
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action="store_true",
                        help="Whether to run eval on the dev set.")
    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    args.device = device

    if args.do_train:
        args.output_dir = args.option
        args.output_dir = os.path.join(
            args.output_dir,
            datetime.now().strftime('%Y_%m_%d_%H_%M_%S'))
    else:
        assert args.output_dir is not None or (
            args.stage1_model
            and args.stage2_model), "You must set an output dir"

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    # Set seed
    set_seed(args)

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    args.dim = config.hidden_size

    if args.option in ['stage1', 'stage2']:
        if args.option == 'stage1':
            model = FilterModel(model_class,
                                args.model_name_or_path,
                                config,
                                args.cache_dir,
                                dim=args.dim)
            model.to(args.device)
        else:
            model = JumpModel(model_class,
                              args.model_name_or_path,
                              config,
                              args.cache_dir,
                              dim=args.dim)
            model.to(args.device)
    elif args.option == 'stage12':
        filter_model = FilterModel(model_class,
                                   args.model_name_or_path,
                                   config,
                                   args.cache_dir,
                                   dim=args.dim)
        filter_model.to(args.device)
        jump_model = JumpModel(model_class,
                               args.model_name_or_path,
                               config,
                               args.cache_dir,
                               dim=args.dim)
        jump_model.to(args.device)
    else:
        raise NotImplementedError
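    # Note: the --do_train path below uses the single `model` built for stage1/stage2;
    # the (filter_model, jump_model) pair from the stage12 branch is only used by --do_eval.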

    logger.info("Training/evaluation parameters %s", args)

    if args.do_train:
        train_data = readGZip(args.train_file)
        dataset = Stage12Dataset(args.resource_dir, train_data, tokenizer, args.max_seq_length, args.option,
                                 tables=args.table_path, retain_label=True, shuffle=True)
        loader = DataLoader(dataset,
                            batch_size=None,
                            batch_sampler=None,
                            num_workers=8,
                            shuffle=False,
                            pin_memory=True)

        tb_writer = SummaryWriter(log_dir=args.output_dir)
        # Prepare optimizer and schedule (linear warmup and decay)
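        # Bias and LayerNorm weights are excluded from weight decay, following the standard BERT fine-tuning recipe.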
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                args.weight_decay,
            },
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)

        t_total = len(
            train_data
        ) // args.gradient_accumulation_steps * args.num_train_epochs

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args.warmup_steps,
            num_training_steps=t_total)

        # Check if saved optimizer or scheduler states exist
        if os.path.isfile(os.path.join(
                args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
                    os.path.join(args.model_name_or_path, "scheduler.pt")):
            # Load in optimizer and scheduler states
            optimizer.load_state_dict(
                torch.load(
                    os.path.join(args.model_name_or_path, "optimizer.pt")))
            scheduler.load_state_dict(
                torch.load(
                    os.path.join(args.model_name_or_path, "scheduler.pt")))

        if args.fp16:
            try:
                from apex import amp
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                )

            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level=args.fp16_opt_level)

        # multi-gpu training (should be after apex fp16 initialization)
        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        global_step = 0

        tr_loss, logging_loss = 0.0, 0.0
        model.train()
        model.zero_grad()

        train_iterator = trange(0, int(args.num_train_epochs), desc="Epoch")
        for epoch in train_iterator:
            for step, batch in enumerate(tqdm(loader, desc="Iteration")):
                *data, labels = tuple(
                    Variable(t).to(args.device) for t in batch)
                probs = model(*data)
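                # Negative log-likelihood over candidate nodes: `labels` marks the gold
                # candidate(s); the 1e-8 term guards against log(0).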

                loss = torch.sum(-torch.log(probs + 1e-8) * labels)

                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                tr_loss += loss.item()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), args.max_grad_norm)
                    else:
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       args.max_grad_norm)

                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

                    # Log metrics (only after an actual optimizer update, so gradient
                    # accumulation does not trigger repeated logging at the same step)
                    if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        #if args.local_rank == -1 and args.evaluate_during_training:
                        #    results = evaluate(args, model, tokenizer)
                        #    for key, value in results.items():
                        #        tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                        tb_writer.add_scalar("{}_lr".format(args.option),
                                             scheduler.get_last_lr()[0],
                                             global_step)
                        tb_writer.add_scalar("{}_loss".format(args.option),
                                             (tr_loss - logging_loss) /
                                             args.logging_steps, global_step)
                        logging_loss = tr_loss

            # Save model checkpoint
            output_dir = os.path.join(args.output_dir,
                                      "checkpoint-epoch{}".format(epoch))
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            # Take care of distributed/parallel training
            model_to_save = model.module if hasattr(model, "module") else model
            model_to_save.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)
            torch.save(args, os.path.join(output_dir, "training_args.bin"))

        tb_writer.close()

    elif args.do_eval and args.option == 'stage12':
        with open(args.table_path, 'r') as f:
            tables = json.load(f)
        with open(args.request_path, 'r') as f:
            requests = json.load(f)
        logger.info('finished reading tables and requests.')

        dev_data = readGZip(args.predict_file)
        filtered_dev_data = []
        for d in dev_data:
            if len(d['nodes']) > 0:
                filtered_dev_data.append(d)
            else:
                print("filter out {}".format(d['table_id']))
        dev_data = filtered_dev_data

        filter_model.eval()
        jump_model.eval()

        #assert args.model_name_or_path is not None, "please provide the load_from argument"
        model_path = os.path.join(args.stage1_model, 'pytorch_model.bin')
        filter_model.load_state_dict(torch.load(model_path))

        model_path = os.path.join(args.stage2_model, 'pytorch_model.bin')
        jump_model.load_state_dict(torch.load(model_path))

        pred_data = copy.copy(dev_data)
        succ, total = 0, 0

        dataset = Stage12Dataset(args.resource_dir,
                                 dev_data,
                                 tokenizer,
                                 args.max_seq_length,
                                 'stage1',
                                 tables=tables,
                                 requests=requests,
                                 retain_label=False,
                                 shuffle=False)
        loader = DataLoader(dataset,
                            batch_size=None,
                            batch_sampler=None,
                            num_workers=8,
                            shuffle=False,
                            pin_memory=True)
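        # Two-stage inference: the stage-1 filter model scores candidate nodes and the best
        # one is kept; generate_target_nodes then expands it, and the stage-2 jump model
        # scores the reachable target nodes.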

        for step, batch in enumerate(tqdm(loader, desc="Evaluation")):
            data = tuple(Variable(t).to(args.device) for t in batch[:-1])
            probs = filter_model(*data)

            info = dev_data[batch[-1]]
            info['nodes'] = [info['nodes'][torch.argmax(probs, 0).item()]]
            info = dataset.generate_target_nodes(info)

            selected_target_nodes = []
            inner_dataset = Stage12Dataset(args.resource_dir,
                                           info,
                                           tokenizer,
                                           args.max_seq_length,
                                           'stage2',
                                           tables=tables,
                                           retain_label=False,
                                           shuffle=False)
            for b in inner_dataset:
                data = tuple(Variable(t).to(args.device) for t in b[:-1])
                probs = jump_model(*data)
                tmp = info[b[-1]]['target']
                selected_target_nodes.append(tmp[torch.argmax(probs,
                                                              0).item()])

            discovered_node = selected_target_nodes[0]
            pred_data[step]['target'] = discovered_node
            if not discovered_node[2]:
                pred_data[step]['pred'] = discovered_node[0]
            else:
                pred_data[step]['pred'] = [
                    discovered_node[0], discovered_node[2]
                ]

        #print("FINAL: correct = {}, total = {}, correct rate = {} \n".format(succ, total, succ / total))
        with open('predictions.intermediate.json', 'w') as f:
            json.dump(pred_data, f, indent=2)
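
For reference, a minimal sketch of reading back the prediction file written above (a sketch only: the 'table_id', 'target', and 'pred' keys come from the loop above; everything else is an assumption):

import json

with open('predictions.intermediate.json') as f:
    predictions = json.load(f)

for entry in predictions:
    # 'pred' is either a single value or a two-element list, depending on the
    # third field of the discovered node (see the branch above).
    print(entry['table_id'], entry['pred'])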