def init_experiment(args):
    """Initialize an experiment: device, seed, directories, and logging.

    Args:
        args: Argument namespace. Reads ``cuda`` and (optionally) ``seed``,
            ``experiment_dir``, ``experiment_name``; augments it in place
            with ``path``, ``model_dir``, ``checkpoint_dir``, ``log_file``.

    Returns:
        Tuple of ``(args, logger, device)``.
    """
    # Check for CUDA.
    device = utils.init_device(args.cuda)

    # Set random seed for reproducibility; a negative (or absent) seed
    # leaves the RNGs unseeded.
    if getattr(args, 'seed', -1) >= 0:
        utils.set_seed(args.seed)

    # Setup experiment directory: default to a dated ad-hoc folder plus a
    # short random run name so repeated runs do not collide.
    if not args.experiment_dir:
        args.experiment_dir = os.path.join('results/adhoc',
                                           time.strftime('%Y%m%d'))
    if not args.experiment_name:
        args.experiment_name = str(uuid.uuid4())[:8]
    args.path = os.path.join(args.experiment_dir, args.experiment_name)
    utils.init_dir(args.path)

    # Filenames inside experiment directory.
    args.model_dir = os.path.join(args.path, 'model')
    args.checkpoint_dir = os.path.join(args.path, 'checkpoint')
    args.log_file = os.path.join(args.path, 'log.txt')

    # Initialize logger. Pass format arguments lazily (logging's %-style)
    # instead of pre-formatting with the % operator.
    logger = utils.init_logger(args.log_file)
    logger.info('============ Initialized logger ============')
    logger.info('Command run: %s', ' '.join(sys.argv))
    logger.info('The experiment will be stored in %s', args.path)

    # Log the full config for reproducibility.
    logger.info('Config: %s',
                json.dumps(vars(args), indent=2, sort_keys=True))

    return args, logger, device
def main(args):
    """Fine-tune a pretrained parser on a target treebank, then evaluate."""
    utils.set_seed(args.seed)
    device = utils.init_device(args.cuda)
    logger = utils.init_logger()

    rule = '=' * 50
    logger.info(rule)
    logger.info('Fine-tuning Script')
    logger.info(rule)

    logger.info('Loading model...')
    model = Parser.load(args.model).to(device)
    dicos = model.dicos
    # Override the stored optimizer choice with the one requested now.
    model.args.optimizer = args.optimizer
    # NOTE(review): args.treebank is a single directory path here, so
    # len() is the string length — confirm this is intended.
    args.num_lang = len(args.treebank)
    trainer = Trainer(model.args, model, device)
    evaluator = Evaluator(args, model, device)

    logger.info(rule)
    logger.info('Loading data...')
    tb_name = utils.get_name_from_dir(args.treebank)
    # We use the validation split for sourcing training sentences, as the
    # true training set is used for testing.
    train_file_map = {tb_name: os.path.join(args.treebank, 'valid.conllu')}
    # Train data is the test data.
    test_file_map = {tb_name: os.path.join(args.treebank, 'train.conllu')}
    train_trees = load_treebanks(
        train_file_map, subsample=args.N, subsampling_key='trees')
    test_trees = load_treebanks(test_file_map)
    train_iterator = get_iterator(
        treebank=train_trees,
        dicos=dicos,
        batch_size=args.batch_size,
        loop=True)
    test_iterator = get_iterator(
        treebank=test_trees, dicos=dicos, batch_size=500)

    logger.info(rule)
    logger.info("Fine-tuning...")
    # The training iterator loops forever; stop after args.steps updates.
    progress = tqdm.tqdm(
        enumerate(train_iterator, 1), total=args.steps, ncols=130)
    for step, inputs in progress:
        trainer.step(inputs)
        if step == args.steps:
            break

    logger.info("Evaluating...")
    metrics = evaluator.run_official(test_iterator, test_file_map)

    # Optionally persist the fine-tuned model and its test metrics.
    if args.output:
        utils.init_dir(args.output)
        model.save(os.path.join(args.output, 'model'))
        with open(os.path.join(args.output, 'test-metrics.json'), 'w') as f:
            json.dump(metrics, f)
def main(args):
    """Evaluate a trained parser on one or more treebank files.

    Args:
        args: Argument namespace with ``cuda``, ``model``, ``model_type``,
            ``strict``, ``treebanks`` (list of files), ``as_lang``,
            ``batch_size``, and optional ``output`` path for metrics JSON.

    Raises:
        ValueError: If ``args.model_type`` is not a known model type.
    """
    device = utils.init_device(args.cuda)
    logger = utils.init_logger()
    logger.info('=' * 50)
    logger.info('Evaluation Script')
    logger.info('=' * 50)

    logger.info('Loading model...')
    # Dispatch table keeps each parser class paired with its evaluator.
    model_classes = {
        'baseline': (Parser, Evaluator),
        'taf': (TaFParser, TaFEvaluator),
        'tass': (TaSSParser, Evaluator),
    }
    if args.model_type not in model_classes:
        # Fixed typo ("Unkown") and included the offending value.
        raise ValueError('Unknown model type: %s' % args.model_type)
    parser_cls, evaluator_cls = model_classes[args.model_type]

    model = parser_cls.load(args.model).to(device)
    dicos = model.dicos

    # Make dictionary access non-strict if requested, so out-of-vocabulary
    # lookups do not fail hard.
    for d in dicos.values():
        d.strict = args.strict

    args.num_lang = len(args.treebanks)
    evaluator = evaluator_cls(args, model, device)

    logger.info('=' * 50)
    logger.info('Loading data...')
    file_map = {utils.get_name_from_file(f): f for f in args.treebanks}

    # Check that languages are in the dictionary. If not, alias each
    # unseen language id to the "as_lang" id.
    if args.as_lang:
        for lang in file_map:
            if lang not in dicos['lang']:
                dicos['lang'].t2i[lang] = dicos['lang'].t2i[args.as_lang]

    trees = load_treebanks(file_map)
    iterator = get_iterator(trees, dicos, args.batch_size)

    logger.info('=' * 50)
    logger.info("Evaluating...")
    metrics = evaluator.run_official(iterator, file_map)

    # Optionally save metrics as JSON.
    if args.output:
        logger.info('Saving results to %s', args.output)
        dirname = os.path.dirname(args.output)
        utils.init_dir(dirname)
        with open(args.output, 'w') as f:
            json.dump(metrics, f)
def main(args):
    """Run a trained parser over treebank files and dump predictions.

    Args:
        args: Argument namespace with ``cuda``, ``model``, ``model_type``,
            ``treebanks`` (list of files), ``batch_size``, and ``output``
            directory for the predictions.

    Raises:
        ValueError: If ``args.model_type`` is not a known model type.
    """
    device = utils.init_device(args.cuda)
    logger = utils.init_logger()
    logger.info('=' * 50)
    logger.info('Prediction Script')
    logger.info('=' * 50)

    logger.info('Loading model...')
    # Dispatch table keeps each parser class paired with its evaluator.
    model_classes = {
        'baseline': (Parser, Evaluator),
        'taf': (TaFParser, TaFEvaluator),
        'tass': (TaSSParser, Evaluator),
    }
    if args.model_type not in model_classes:
        # Fixed typo ("Unkown") and included the offending value.
        raise ValueError('Unknown model type: %s' % args.model_type)
    parser_cls, evaluator_cls = model_classes[args.model_type]

    model = parser_cls.load(args.model).to(device)
    dicos = model.dicos
    args.num_lang = len(args.treebanks)
    evaluator = evaluator_cls(args, model, device)

    logger.info('=' * 50)
    logger.info('Loading data...')
    file_map = {utils.get_name_from_file(f): f for f in args.treebanks}
    trees = load_treebanks(file_map)
    iterator = get_iterator(trees, dicos, args.batch_size)

    logger.info('=' * 50)
    logger.info("Predicting...")
    utils.init_dir(args.output)
    evaluator.dump_predictions(iterator, args.output)
def main(args):
    """Encode train/valid/test splits of the model's source treebanks.

    Loads a trained parser, subsamples trees from each source language's
    splits, encodes them into feature vectors, and saves the resulting
    (X, Y) arrays with ``torch.save``.

    Args:
        args: Argument namespace with ``seed``, ``cuda``, ``model``,
            ``treebanks`` (root directory holding per-language subdirs),
            ``batch_size``, and ``output`` file path.
    """
    utils.set_seed(args.seed)
    device = utils.init_device(args.cuda)
    logger = utils.init_logger()

    logger.info('Loading model...')
    model = Parser.load(args.model).to(device)
    model.eval()
    dicos = model.dicos

    # Data is in-domain (loaded from source datasets).
    logger.info('Loading data...')
    train_file_map = {}
    valid_file_map = {}
    test_file_map = {}
    for f in model.args.src_treebanks:
        lang = utils.get_name_from_dir(f)
        basedir = os.path.join(args.treebanks, lang)
        # Use os.path.join consistently (original mixed join with '+').
        train_file_map[lang] = os.path.join(basedir, 'train.conllu')
        valid_file_map[lang] = os.path.join(basedir, 'valid.conllu')
        test_file_map[lang] = os.path.join(basedir, 'test.conllu')

    def _load_split(file_map, subsample):
        # Load up to `subsample` trees per language and wrap in an iterator.
        trees = load_treebanks(
            file_map, subsample=subsample, subsampling_key='trees')
        return get_iterator(trees, dicos, args.batch_size)

    # Up to 2k trees per language for training; 500 for valid and test.
    train_iterator = _load_split(train_file_map, 2000)
    valid_iterator = _load_split(valid_file_map, 500)
    test_iterator = _load_split(test_file_map, 500)

    logger.info('=' * 50)
    logger.info("Encoding...")
    train_examples, train_langs = encode(model, device, train_iterator)
    valid_examples, valid_langs = encode(model, device, valid_iterator)
    test_examples, test_langs = encode(model, device, test_iterator)
    data = {
        'X_train': train_examples.numpy(),
        'Y_train': train_langs,
        'X_valid': valid_examples.numpy(),
        'Y_valid': valid_langs,
        'X_test': test_examples.numpy(),
        'Y_test': test_langs,
    }

    # Save the encoded splits.
    dirname = os.path.dirname(args.output)
    utils.init_dir(dirname)
    torch.save(data, args.output)