예제 #1
0
def init_experiment(args):
    """Prepare the device, output directory and logger for an experiment.

    Fills in ``args.experiment_dir`` and ``args.experiment_name`` when they
    are empty, then derives ``args.path``, ``args.model_dir``,
    ``args.checkpoint_dir`` and ``args.log_file`` from them.

    Args:
        args: Argument namespace. Reads ``cuda``, ``experiment_dir``,
            ``experiment_name`` and, when present, ``seed``. It is modified
            in place.

    Returns:
        A ``(args, logger, device)`` tuple.
    """
    # Device selection is delegated to the project helper.
    device = utils.init_device(args.cuda)

    # A missing or negative seed means "do not seed".
    seed = getattr(args, 'seed', -1)
    if seed >= 0:
        utils.set_seed(seed)

    # Default location: results/adhoc/<YYYYMMDD>/<first 8 chars of a UUID4>.
    if not args.experiment_dir:
        today = time.strftime('%Y%m%d')
        args.experiment_dir = os.path.join('results/adhoc', today)
    if not args.experiment_name:
        random_id = str(uuid.uuid4())
        args.experiment_name = random_id[:8]

    args.path = os.path.join(args.experiment_dir, args.experiment_name)
    utils.init_dir(args.path)

    # Paths of the artifacts kept inside the experiment directory.
    for attr, leaf in (('model_dir', 'model'),
                       ('checkpoint_dir', 'checkpoint'),
                       ('log_file', 'log.txt')):
        setattr(args, attr, os.path.join(args.path, leaf))

    # The logger is handed the log file path inside the experiment directory.
    logger = utils.init_logger(args.log_file)
    logger.info('============ Initialized logger ============')
    command = ' '.join(sys.argv)
    logger.info('Command run: %s' % command)
    logger.info('The experiment will be stored in %s' % args.path)

    # Dump the full (now completed) configuration for later reference.
    config_dump = json.dumps(vars(args), indent=2, sort_keys=True)
    logger.info('Config: %s' % config_dump)

    return args, logger, device
예제 #2
0
def main(args):
    """Fine-tune a saved parser on one treebank and evaluate it.

    Training sentences are drawn from the treebank's ``valid.conllu`` file,
    and the treebank's ``train.conllu`` file is used for evaluation.

    Args:
        args: Argument namespace. Reads ``seed``, ``cuda``, ``model``,
            ``optimizer``, ``treebank`` (a treebank directory), ``N``
            (subsample size for the training trees), ``batch_size``,
            ``steps`` (number of fine-tuning updates; values <= 0 skip
            fine-tuning) and ``output`` (optional output directory).
            ``args.num_lang`` is set as a side effect.
    """
    utils.set_seed(args.seed)
    device = utils.init_device(args.cuda)
    logger = utils.init_logger()

    logger.info('=' * 50)
    logger.info('Fine-tuning Script')
    logger.info('=' * 50)

    logger.info('Loading model...')
    model = Parser.load(args.model)
    model = model.to(device)
    dicos = model.dicos
    # The optimizer named on the command line replaces the one stored in the
    # model's own arguments before the trainer is built from them.
    model.args.optimizer = args.optimizer

    # NOTE(review): args.treebank is used below as a single directory path,
    # so this is the length of that string rather than a language count.
    # Left unchanged; confirm what Evaluator expects here.
    args.num_lang = len(args.treebank)
    trainer = Trainer(model.args, model, device)
    evaluator = Evaluator(args, model, device)
    logger.info('=' * 50)

    logger.info('Loading data...')
    name = utils.get_name_from_dir(args.treebank)

    # We use the validation split for sourcing training sentences, as the
    # true training set is used for testing.
    train_file_map = {name: os.path.join(args.treebank, 'valid.conllu')}

    # The treebank's train split serves as the test data.
    test_file_map = {name: os.path.join(args.treebank, 'train.conllu')}
    train_trees = load_treebanks(train_file_map,
                                 subsample=args.N,
                                 subsampling_key='trees')
    test_trees = load_treebanks(test_file_map)
    train_iterator = get_iterator(treebank=train_trees,
                                  dicos=dicos,
                                  batch_size=args.batch_size,
                                  loop=True)
    test_iterator = get_iterator(treebank=test_trees,
                                 dicos=dicos,
                                 batch_size=500)

    logger.info('=' * 50)
    logger.info("Fine-tuning...")
    # The training iterator is created with loop=True, so the step budget is
    # the only stopping condition. Guard against a non-positive budget, which
    # would otherwise never satisfy the break test and never terminate.
    if args.steps > 0:
        for step, inputs in tqdm.tqdm(enumerate(train_iterator, 1),
                                      total=args.steps,
                                      ncols=130):
            trainer.step(inputs)
            if step >= args.steps:
                break

    logger.info("Evaluating...")
    metrics = evaluator.run_official(test_iterator, test_file_map)

    if args.output:
        # Persist both the fine-tuned model and its test metrics.
        utils.init_dir(args.output)
        model.save(os.path.join(args.output, 'model'))
        with open(os.path.join(args.output, 'test-metrics.json'), 'w') as f:
            json.dump(metrics, f)
예제 #3
0
def main(args):
    """Evaluate a saved parser on one or more treebank files.

    Args:
        args: Argument namespace. Reads ``cuda``, ``model``, ``model_type``
            (``'baseline'``, ``'taf'`` or ``'tass'``), ``strict``,
            ``treebanks`` (a collection of treebank files), ``as_lang``,
            ``batch_size`` and ``output`` (optional path of the JSON metrics
            file). ``args.num_lang`` is set as a side effect.

    Raises:
        ValueError: If ``args.model_type`` is not one of the known types.
    """
    device = utils.init_device(args.cuda)
    logger = utils.init_logger()

    logger.info('=' * 50)
    logger.info('Evaluation Script')
    logger.info('=' * 50)

    logger.info('Loading model...')
    # Each model type pairs a parser class with the evaluator that drives it.
    if args.model_type == 'baseline':
        parser_cls = Parser
        evaluator_cls = Evaluator
    elif args.model_type == 'taf':
        parser_cls = TaFParser
        evaluator_cls = TaFEvaluator
    elif args.model_type == 'tass':
        parser_cls = TaSSParser
        evaluator_cls = Evaluator
    else:
        raise ValueError('Unknown model type: %r.' % (args.model_type,))

    model = parser_cls.load(args.model).to(device)
    dicos = model.dicos

    # Make dictionary access non-strict?
    for d in dicos.values():
        d.strict = args.strict

    args.num_lang = len(args.treebanks)
    evaluator = evaluator_cls(args, model, device)

    logger.info('=' * 50)
    logger.info('Loading data...')
    file_map = {utils.get_name_from_file(f): f for f in args.treebanks}

    # Check that languages are in the dictionary. If not, alias each missing
    # language to the index of "as_lang".
    if args.as_lang:
        lang_dico = dicos['lang']
        for lang in file_map:
            if lang not in lang_dico:
                lang_dico.t2i[lang] = lang_dico.t2i[args.as_lang]

    trees = load_treebanks(file_map)
    iterator = get_iterator(trees, dicos, args.batch_size)
    logger.info('=' * 50)

    logger.info("Evaluating...")
    metrics = evaluator.run_official(iterator, file_map)

    if args.output:
        logger.info('Saving results to %s' % args.output)
        dirname = os.path.dirname(args.output)
        # A bare filename has an empty dirname; there is no directory to
        # prepare in that case, so do not hand '' to init_dir.
        if dirname:
            utils.init_dir(dirname)
        with open(args.output, 'w') as f:
            json.dump(metrics, f)
예제 #4
0
def main(args):
    """Run a saved parser over treebank files and dump its predictions.

    Args:
        args: Argument namespace. Reads ``cuda``, ``model``, ``model_type``
            (``'baseline'``, ``'taf'`` or ``'tass'``), ``treebanks`` (a
            collection of treebank files), ``batch_size`` and ``output``
            (passed to ``utils.init_dir`` and to the evaluator's
            ``dump_predictions``). ``args.num_lang`` is set as a side effect.

    Raises:
        ValueError: If ``args.model_type`` is not one of the known types.
    """
    device = utils.init_device(args.cuda)
    logger = utils.init_logger()

    logger.info('=' * 50)
    logger.info('Prediction Script')
    logger.info('=' * 50)

    logger.info('Loading model...')
    # Each model type pairs a parser class with the evaluator that drives it.
    if args.model_type == 'baseline':
        parser_cls = Parser
        evaluator_cls = Evaluator
    elif args.model_type == 'taf':
        parser_cls = TaFParser
        evaluator_cls = TaFEvaluator
    elif args.model_type == 'tass':
        parser_cls = TaSSParser
        evaluator_cls = Evaluator
    else:
        # Name the offending value so a mistyped option is easy to spot.
        raise ValueError('Unknown model type: %r.' % (args.model_type,))

    model = parser_cls.load(args.model).to(device)
    dicos = model.dicos

    args.num_lang = len(args.treebanks)
    evaluator = evaluator_cls(args, model, device)

    logger.info('=' * 50)
    logger.info('Loading data...')
    # Map each treebank's name (derived from its file) to the file itself.
    file_map = {utils.get_name_from_file(f): f for f in args.treebanks}
    trees = load_treebanks(file_map)
    iterator = get_iterator(trees, dicos, args.batch_size)
    logger.info('=' * 50)

    logger.info("Predicting...")
    utils.init_dir(args.output)
    evaluator.dump_predictions(iterator, args.output)
예제 #5
0
def main(args):
    """Encode subsampled train/valid/test trees with a saved parser.

    For every treebank listed in the model's own ``src_treebanks``, the
    ``train.conllu``, ``valid.conllu`` and ``test.conllu`` files are looked
    up under ``args.treebanks/<name>``. The three splits are passed through
    ``encode`` and the results are stored in a single file with
    ``torch.save``.

    Args:
        args: Argument namespace. Reads ``seed``, ``cuda``, ``model``,
            ``treebanks`` (root directory holding one sub-directory per
            treebank), ``batch_size`` and ``output`` (path of the saved
            file).
    """
    utils.set_seed(args.seed)
    device = utils.init_device(args.cuda)
    logger = utils.init_logger()

    logger.info('Loading model...')
    model = Parser.load(args.model).to(device)
    model.eval()
    dicos = model.dicos

    # Data is in-domain (loaded from source datasets).
    logger.info('Loading data...')
    train_file_map = {}
    valid_file_map = {}
    test_file_map = {}
    for f in model.args.src_treebanks:
        lang = utils.get_name_from_dir(f)
        basedir = os.path.join(args.treebanks, lang)
        train_file_map[lang] = os.path.join(basedir, 'train.conllu')
        valid_file_map[lang] = os.path.join(basedir, 'valid.conllu')
        test_file_map[lang] = os.path.join(basedir, 'test.conllu')

    # Load up to 2k trees from each language for training.
    train_trees = load_treebanks(train_file_map,
                                 subsample=2000,
                                 subsampling_key='trees')
    train_iterator = get_iterator(train_trees, dicos, args.batch_size)

    # Load up to 500 trees from each language for validation.
    valid_trees = load_treebanks(valid_file_map,
                                 subsample=500,
                                 subsampling_key='trees')
    valid_iterator = get_iterator(valid_trees, dicos, args.batch_size)

    # Load up to 500 trees from each language for testing.
    test_trees = load_treebanks(test_file_map,
                                subsample=500,
                                subsampling_key='trees')
    test_iterator = get_iterator(test_trees, dicos, args.batch_size)

    logger.info('=' * 50)
    logger.info("Encoding...")

    # encode() yields a pair per split; the first element is converted with
    # .numpy() below, the second is stored as-is.
    # NOTE(review): .numpy() presumably requires encode() to return CPU
    # tensors -- confirm against encode().
    train_examples, train_langs = encode(model, device, train_iterator)
    valid_examples, valid_langs = encode(model, device, valid_iterator)
    test_examples, test_langs = encode(model, device, test_iterator)
    data = {
        'X_train': train_examples.numpy(),
        'Y_train': train_langs,
        'X_valid': valid_examples.numpy(),
        'Y_valid': valid_langs,
        'X_test': test_examples.numpy(),
        'Y_test': test_langs
    }

    # Save.
    dirname = os.path.dirname(args.output)
    # A bare filename has an empty dirname; there is no directory to prepare
    # in that case, so do not hand '' to init_dir.
    if dirname:
        utils.init_dir(dirname)
    torch.save(data, args.output)