Example 1
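The scripts in these examples omit their import block and rely on project helpers (InferenceModel, set_seed, collate_fn, QueryRewriteDataset, special_tokens_dict, NUM_FOLD, get_command) that are defined elsewhere in the repository. A minimal sketch of the library imports the code below appears to assume, written against an older transformers release that still exports AdamW:

# Assumed imports for the examples below (sketch only; project helpers not shown).
import argparse
import json
import logging
import os

import torch
from torch.utils.data import DataLoader, RandomSampler
from tqdm import tqdm, trange
from transformers import (AdamW, get_linear_schedule_with_warmup,
                          GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
                          T5Config, T5ForConditionalGeneration, T5Tokenizer)

logger = logging.getLogger(__name__)
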
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", default='../models/query-simplifier-bs2-e4', type=str,
                        help="Path to pre-trained model or shortcut name")
    parser.add_argument('--input_file', default='../data/ms_marco/marco_ann_session.dev.all.filtered.tsv', type=str,
                        help="Input json file for predictions. Do not add fold suffix when cross validate, i.e. use 'data/eval_topics.jsonl' instead of 'data/eval_topics.jsonl.0'")
    parser.add_argument('--output_file', default='../data/weak_data_div/self-learn.jsonl', type=str,
                        help="Output json file for predictions")

    parser.add_argument("--length", type=int, default=20,
                        help="Maximum length of output sequence")
    parser.add_argument("--temperature", type=float, default=0.0,
                        help="temperature of 0 implies greedy sampling")
    parser.add_argument("--top_p", type=float, default=0.9)
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    args = parser.parse_args()

    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    set_seed(args)

    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO)

    MAX_LENGTH = 100
    if args.length < 0:
        args.length = MAX_LENGTH  # avoid infinite loop

    model_path = args.model_path
    i = 1  # fixed suffix appended to the model path and output file below
    args.model_path = "%s-%d" % (model_path, i)
    logger.info("Predict using Model {}".format(args.model_path))
    inference_model = InferenceModel(args)
    output_file = "%s.%d" % (args.output_file, i)
    with open(args.input_file, 'r') as fin, open(output_file, 'w') as fout:
        all_lines = fin.readlines()
        for line in tqdm(all_lines, desc="Predict"):
            splitted = line.rstrip('\n').split('\t')
            queries = splitted[1:]
            topic_number = splitted[0]
            i = 1
            predictions = [queries[0]]
            for query in queries[1:]:
                i += 1
                input_sents = queries[:i]
                prediction = inference_model.predict(input_sents).strip()
                predictions.append(prediction)
                target_sent = query
                if prediction == target_sent.strip():
                    continue

                output_line = json.dumps(
                    {"topic_number": topic_number, "query_number": i, "input": predictions, "target": target_sent})
                fout.write(output_line + "\n")
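
All three examples call a set_seed(args) helper that is not shown here; a minimal sketch of the usual pattern, assuming it only needs to seed the Python, NumPy, and (multi-)GPU PyTorch generators:

import random

import numpy as np
import torch

def set_seed(args):
    # Seed every RNG the scripts may touch; args.seed and args.n_gpu are
    # set in each main() before this helper is called.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)
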
Example 2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", default=None, type=str, required=True,
                        help="Path to pre-trained model or shortcut name")
    parser.add_argument('--input_file', type=str, required=True,
                        help="Input json file for predictions. Do not add fold suffix when cross validate, i.e. use 'data/eval_topics.jsonl' instead of 'data/eval_topics.jsonl.0'")
    parser.add_argument('--output_file', type=str, required=True,
                        help="Output json file for predictions")
    parser.add_argument("--cross_validate", action='store_true',
                        help="Set when doing cross validation")

    parser.add_argument("--length", type=int, default=20,
                        help="Maximum length of output sequence")
    parser.add_argument("--temperature", type=float, default=0.0,
                        help="temperature of 0 implies greedy sampling")
    parser.add_argument("--top_p", type=float, default=0.9)
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    args = parser.parse_args()

    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    set_seed(args)

    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO)

    MAX_LENGTH = 100
    if args.length < 0:
        args.length = MAX_LENGTH  # avoid infinite loop

    if not args.cross_validate:
        inference_model = InferenceModel(args)
        with open(args.input_file, 'r') as fin, open(args.output_file, 'w') as fout:
            for line in tqdm(fin, desc="Predict"):
                record = json.loads(line)
                prediction = inference_model.predict(record['input'])
                record['output'] = prediction
                fout.write(json.dumps(record) + '\n')
    else:
        # K-Fold Cross Validation
        model_path = args.model_path
        with open(args.output_file, 'w') as fout:
            for i in range(NUM_FOLD):
                logger.info("Predict Fold #{}".format(i))
                args.model_path = "%s-%d" % (model_path, i)
                inference_model = InferenceModel(args)
                input_file = "%s.%d" % (args.input_file, i)
                with open(input_file, 'r') as fin:
                    for line in tqdm(fin, desc="Predict"):
                        record = json.loads(line)
                        prediction = inference_model.predict(record['input'])
                        record['output'] = prediction
                        fout.write(json.dumps(record) + '\n')
    logger.info("Prediction saved to %s", args.output_file)
def train(args, train_dataset, model, tokenizer, logger, cross_validate_id=-1):
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=collate_fn)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': args.weight_decay,
        },
        {
            'params': [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps)
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            inputs, labels = (batch[2], batch[3])  # get ids and labels
            inputs = inputs.to(args.device)  # batch_size * block_size
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            del inputs
            del outputs
            torch.cuda.empty_cache()

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()

            tr_loss += loss.item()
            del loss
            torch.cuda.empty_cache()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = 'checkpoint'
                    output_dir = args.output_dir + (
                        ('-' + str(cross_validate_id))
                        if cross_validate_id != -1 else "")
                    # Save model checkpoint
                    output_dir = os.path.join(
                        output_dir, '{}-{}'.format(checkpoint_prefix,
                                                   global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module') else model
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break

        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    return global_step, tr_loss / global_step
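
The DataLoader in train() relies on a project-defined collate_fn, and the inner loop assumes each batch exposes token ids at index 2 and labels at index 3. A minimal sketch of such a collator under those assumptions, with invented padding conventions (pad id 0, label ignore index -100), not the project's actual implementation:

import torch

def collate_fn(batch):
    # Pad the id/label sequences (elements 2 and 3 of every example) to the
    # longest sequence in the batch. Pad id 0 and ignore index -100 are
    # placeholders; the real tokenizer/loss setup may use different values.
    max_len = max(len(example[2]) for example in batch)
    input_ids, labels = [], []
    for example in batch:
        pad = max_len - len(example[2])
        input_ids.append(list(example[2]) + [0] * pad)
        labels.append(list(example[3]) + [-100] * pad)
    return (
        [example[0] for example in batch],  # e.g. topic numbers
        [example[1] for example in batch],  # e.g. query numbers
        torch.tensor(input_ids, dtype=torch.long),
        torch.tensor(labels, dtype=torch.long),
    )
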
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument(
        "--model_name_or_path",
        default="gpt2-medium",
        type=str,
        help="The model checkpoint for weights initialization.")
    parser.add_argument(
        "--block_size",
        default=150,
        type=int,
        help="Optional input sequence length after tokenization."
        "The training dataset will be truncated in block of this size for training."
        "Default to the model max input length for single sentence inputs (take into account special tokens)."
    )
    parser.add_argument(
        "--train_file",
        default=None,
        type=str,
        required=True,
        help=
        "Path of training file. Do not add fold suffix when cross validate, i.e. use 'data/eval_topics.jsonl' instead of 'data/eval_topics.jsonl.0'"
    )
    parser.add_argument("--cross_validate",
                        action='store_true',
                        help="Set when doing cross validation")
    parser.add_argument(
        "--init_from_multiple_models",
        action='store_true',
        help=
        "Set when initialize from different models during cross validation (Model-based+CV)"
    )

    parser.add_argument("--per_gpu_train_batch_size",
                        default=4,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=1.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--save_steps',
                        type=int,
                        default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    args = parser.parse_args()

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger.warning("device: %s, n_gpu: %s", device, args.n_gpu)

    # Set seed
    set_seed(args)

    config_class, model_class, tokenizer_class = GPT2Config, GPT2LMHeadModel, GPT2Tokenizer

    if not args.cross_validate:
        config = config_class.from_pretrained(args.model_name_or_path)
        tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
        tokenizer.add_special_tokens(special_tokens_dict)
        model = model_class.from_pretrained(args.model_name_or_path)
        model.resize_token_embeddings(len(tokenizer))  # resize
        model.to(args.device)

        if args.block_size <= 0:
            args.block_size = tokenizer.max_len_single_sentence
        args.block_size = min(args.block_size,
                              tokenizer.max_len_single_sentence)

        # Training
        logger.info("Training/evaluation parameters %s", args)
        train_dataset = QueryRewriteDataset([args.train_file], tokenizer, args)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer,
                                     logger)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

        # Saving
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

    else:
        # K-Fold Cross Validation
        for i in range(NUM_FOLD):
            logger.info("Training Fold #{}".format(i))
            suffix = ('-' + str(i)) if args.init_from_multiple_models else ''
            config = config_class.from_pretrained(args.model_name_or_path +
                                                  suffix)
            tokenizer = tokenizer_class.from_pretrained(
                args.model_name_or_path + suffix)
            tokenizer.add_special_tokens(special_tokens_dict)
            model = model_class.from_pretrained(args.model_name_or_path +
                                                suffix)
            model.resize_token_embeddings(len(tokenizer))  # resize
            model.to(args.device)

            if args.block_size <= 0:
                args.block_size = tokenizer.max_len_single_sentence
            args.block_size = min(args.block_size,
                                  tokenizer.max_len_single_sentence)

            logger.info("Training/evaluation parameters %s", args)
            train_files = [
                "%s.%d" % (args.train_file, j) for j in range(NUM_FOLD)
                if j != i
            ]
            logger.info("train_files: {}".format(train_files))
            train_dataset = QueryRewriteDataset(train_files, tokenizer, args)
            global_step, tr_loss = train(args,
                                         train_dataset,
                                         model,
                                         tokenizer,
                                         logger,
                                         cross_validate_id=i)
            logger.info(" global_step = %s, average loss = %s", global_step,
                        tr_loss)

            # Create output directory if needed
            output_dir = args.output_dir + '-' + str(i)
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)

            logger.info("Saving model checkpoint to %s", output_dir)
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)
            torch.save(args, os.path.join(output_dir, 'training_args.bin'))

            del model
            torch.cuda.empty_cache()
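
The training and prediction code above also references two module-level constants, NUM_FOLD and special_tokens_dict, defined elsewhere in the repository. Purely illustrative placeholders (the real fold count and special tokens are project-specific):

# Illustrative placeholders only; the project's actual values may differ.
NUM_FOLD = 5

special_tokens_dict = {
    'bos_token': '<bos>',
    'eos_token': '<eos>',
    'sep_token': '<sep>',
    'pad_token': '<pad>',
}
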
Example 5
def main():

    args = get_command()

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    # args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger.warning("device: %s, n_gpu: %s", device, args.n_gpu)

    # Set seed
    set_seed(args)

    config_class, model_class, tokenizer_class = T5Config, T5ForConditionalGeneration, T5Tokenizer

    if not args.cross_validate:
        config = config_class.from_pretrained(args.model_name_or_path)
        tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
        tokenizer.add_special_tokens(special_tokens_dict)
        model = model_class.from_pretrained(args.model_name_or_path)
        model.resize_token_embeddings(len(tokenizer))  # resize
        model.to(args.device)

        if args.block_size <= 0:
            args.block_size = tokenizer.max_len_single_sentence
        args.block_size = min(args.block_size,
                              tokenizer.max_len_single_sentence)

        # Training
        logger.info("Training/evaluation parameters %s", args)
        train_dataset = QueryRewriteDataset([args.train_file], tokenizer, args)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer,
                                     logger)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

        # Saving
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

    else:
        # K-Fold Cross Validation
        for i in range(NUM_FOLD):
            logger.info("Training Fold #{}".format(i))
            suffix = ('-' + str(i)) if args.init_from_multiple_models else ''
            config = config_class.from_pretrained(args.model_name_or_path +
                                                  suffix)
            tokenizer = tokenizer_class.from_pretrained(
                args.model_name_or_path + suffix)
            tokenizer.add_special_tokens(special_tokens_dict)
            model = model_class.from_pretrained(args.model_name_or_path +
                                                suffix)
            model.resize_token_embeddings(len(tokenizer))  # resize
            model.to(args.device)

            if args.block_size <= 0:
                args.block_size = tokenizer.max_len_single_sentence
            args.block_size = min(args.block_size,
                                  tokenizer.max_len_single_sentence)

            logger.info("Training/evaluation parameters %s", args)
            train_files = [
                "%s.%d" % (args.train_file, j) for j in range(NUM_FOLD)
                if j != i
            ]
            logger.info("train_files: {}".format(train_files))
            train_dataset = QueryRewriteDataset(train_files, tokenizer, args)
            global_step, tr_loss = train(args,
                                         train_dataset,
                                         model,
                                         tokenizer,
                                         logger,
                                         cross_validate_id=i)
            logger.info(" global_step = %s, average loss = %s", global_step,
                        tr_loss)

            # Create output directory if needed
            output_dir = args.output_dir + '-' + str(i)
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)

            logger.info("Saving model checkpoint to %s", output_dir)
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)
            torch.save(args, os.path.join(output_dir, 'training_args.bin'))

            del model
            torch.cuda.empty_cache()
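
Once a fold finishes, the saved checkpoint directory can be reloaded with the standard transformers API for downstream inference; a small sketch with an illustrative path:

from transformers import T5ForConditionalGeneration, T5Tokenizer

# Path is illustrative: the fold-0 checkpoint written by the loop above.
checkpoint_dir = "output/t5-rewriter-0"
tokenizer = T5Tokenizer.from_pretrained(checkpoint_dir)
model = T5ForConditionalGeneration.from_pretrained(checkpoint_dir)
model.eval()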