Example no. 1

import random

import cv2
import numpy as np
import torch

# get_config, layout_arguments, prepare_directories, volume_normalize, dataset
# (a name -> dataset-class registry) and SupervisedTrainer are assumed to be
# provided by this repository's own modules.


def train_model(config):
    transformer = volume_normalize('background')
    if config.test:
        layoutdata = dataset[config.data][1]
    else:
        layoutdata = dataset[config.data][0]
    train_db = layoutdata(config, split='train', transform=transformer)
    val_db = layoutdata(config, split='val', transform=transformer)
    # test_db  = layoutdata(config, split='val',  transform=transformer)

    trainer = SupervisedTrainer(train_db)
    # val_db is reused as the test split while the dedicated test_db above
    # stays commented out.
    trainer.train(train_db, val_db, val_db)


if __name__ == '__main__':
    cv2.setNumThreads(0)
    config, unparsed = get_config()
    config.bert = True
    config.if_sample = False
    config = layout_arguments(config)
    np.random.seed(config.seed)
    random.seed(config.seed)
    torch.manual_seed(config.seed)
    if config.cuda:
        torch.cuda.manual_seed_all(config.seed)
    prepare_directories(config)

    train_model(config)


import argparse

from torch.utils.data import DataLoader, RandomSampler

# BertTokenizer, CodahProcessor, CodahClassifier, convert_examples_to_features,
# train_and_validate, get_config, volume_normalize and layout_coco are assumed
# to be provided by this repository and its pretrained-BERT dependency.


def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--codah_dir",
        type=str,
        required=True,
        help=
        "The input data dir. Should contain train.tsv and dev.tsv files for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--train_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--train_size",
                        default=0.8,
                        type=float,
                        help="Percentage of the data use for training.")
    parser.add_argument("--learning_rate",
                        default=1e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=6,
                        type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument(
        "--categories",
        type=str,
        default="all",
        help=
        'String with the categories to include or exclude, separated by "-", e.g. "o-i-q-n".'
    )
    parser.add_argument(
        "--exclude_categories",
        default=False,
        action="store_true",
        help=
        "The categories listed in `--categories` will be excluded; must not be used when all categories are listed."
    )
    parser.add_argument(
        "--local_model",
        default=False,
        action="store_true",
        help=
        "Load the BERT model from a local TensorFlow checkpoint index instead of downloading it."
    )
    parser.add_argument(
        "--use_pooled",
        default=False,
        action="store_true",
        help="Use the pooler output instead of the normal cls.")
    parser.add_argument("--bert_dir",
                        type=str,
                        help="The directori to load ckpt index of bert model.")
    parser.add_argument("--use_bert_adam",
                        default=False,
                        action="store_true",
                        help="Use build in BertAdam class instead of Adam.")

    args = parser.parse_args()

    print(args)

    if args.categories == "all":
        categories = CodahProcessor.get_all_categories()
    else:
        categories = set(args.categories.split('-'))

    cfg, _ = get_config()
    cfg.cuda = True
    transformer = volume_normalize('background')
    db = layout_coco(cfg, split='train', transform=transformer)

    processor = CodahProcessor(path=args.codah_dir,
                               categories=categories,
                               exclude=args.exclude_categories)
    print(" Initializing tokenizer ")
    tokenizer = BertTokenizer.from_pretrained(args.bert_model)

    print(" Creating train and dev datasets ")
    train_examples, eval_examples = processor.get_train_dev_examples(
        args.train_size)
    num_train_examples = len(train_examples)
    train_data = convert_examples_to_features(train_examples,
                                              processor.get_labels(),
                                              args.max_seq_length, tokenizer,
                                              db)
    train_sampler = RandomSampler(train_data)
    train_loader = DataLoader(train_data,
                              sampler=train_sampler,
                              batch_size=args.train_batch_size)

    eval_data = convert_examples_to_features(eval_examples,
                                             processor.get_labels(),
                                             args.max_seq_length, tokenizer,
                                             db)
    eval_sampler = RandomSampler(eval_data)
    eval_loader = DataLoader(eval_data, sampler=eval_sampler, batch_size=1)

    print(" Initializing bert model ")

    # model = CodahClasifier(model_type=args.bert_model,
    #                          from_tf=args.local_model,
    #                          tf_dir=args.model_dir,
    #                          use_pooled_output=args.use_pooled,
    #                          freeze_bert=False).cuda()

    # model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=1)
    model = CodahClassifier(args.bert_model, db)
    model.cuda()
    train_and_validate(model,
                       train_loader,
                       eval_loader,
                       tokenizer,
                       processor,
                       args.num_train_epochs,
                       args.learning_rate,
                       args.train_batch_size,
                       num_train_examples,
                       args.warmup,
                       print_every=10,
                       use_bert_adam=args.use_bert_adam)


if __name__ == '__main__':
    main()
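
# A hypothetical invocation of this script (the script name and the data path
# are placeholders, not taken from the original code; the flags are the ones
# defined by the parser above):
#
#   python run_codah.py \
#       --codah_dir ./data/codah \
#       --bert_model bert-base-uncased \
#       --max_seq_length 128 \
#       --train_batch_size 8 \
#       --num_train_epochs 6 \
#       --use_bert_adam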