def main():
    """Evaluate a SentenceVectorizer on SentEval tasks.

    Reads the vectorizer YAML config, resolves the batch size (the
    --batch-size CLI flag overrides the config's "batch-size" entry),
    builds the vectorizer, and runs SentEval on the requested tasks.
    """
    parser = yaap.ArgParser(allow_config=True)
    parser.add("--vectorizer-config", type=yaap.path, required=True)
    parser.add("--senteval-data", type=yaap.path, required=True)
    parser.add("--tasks", type=str, action="append", required=True)
    parser.add("--batch-size", type=int, default=None)
    args = parser.parse_args()

    # Raise instead of assert: asserts are stripped under `python -O`.
    if not os.path.exists(args.vectorizer_config):
        raise FileNotFoundError(
            "vectorizer config not found: {}".format(args.vectorizer_config))
    with open(args.vectorizer_config, "r") as f:
        # safe_load: plain yaml.load without an explicit Loader is deprecated
        # and can construct arbitrary Python objects from the config file.
        vec_conf = yaml.safe_load(f)

    if args.batch_size is None:
        if "batch-size" not in vec_conf:
            raise KeyError("batch-size missing from vectorizer config "
                           "and --batch-size not given")
        batch_size = vec_conf.get("batch-size")
    else:
        batch_size = args.batch_size

    sv = SentenceVectorizer(args.vectorizer_config)
    params = {
        "usepytorch": True,
        "task_path": args.senteval_data,
        "batch_size": batch_size,
        "model": sv,
    }
    se = senteval.SentEval(dotdict(params), batcher, prepare)
    se.eval(args.tasks)
def main(_):
    """Build a TF discourse classifier, restore a checkpoint, and run
    SentEval transfer tasks using an InferSent model loaded on CPU.
    """
    # build the model here
    if not os.path.exists(FLAGS.run_dir):
        os.makedirs(FLAGS.run_dir)
    # assert FLAGS.embed_path is not "None", "must pick a loading path"
    # Mirror all log records into a file inside the run directory.
    file_handler = logging.FileHandler("{0}/log.txt".format(FLAGS.run_dir))
    logging.getLogger().addHandler(file_handler)
    embed_path = PATH_TO_GLOVE  # FLAGS.embed_path
    embed_size = FLAGS.embed_size
    # Load InferSent onto CPU (map_location keeps storages off the GPU)
    # so TF can own the GPU memory below.
    params_senteval.infersent = torch.load(
        'infersent.allnli.pickle', map_location=lambda storage, loc: storage)
    params_senteval.infersent.set_glove_path(PATH_TO_GLOVE)
    params_senteval.infersent.use_cuda = False
    # Snapshot the run's flags for reproducibility.
    with open(os.path.join(FLAGS.run_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)
    # limit amount of GPU being used so PyTorch can use it.
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=FLAGS.gpu_frac)
    with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(
            gpu_options=gpu_options)) as session:
        tf.set_random_seed(FLAGS.seed)
        initializer = tf.random_uniform_initializer(-FLAGS.init_scale,
                                                    FLAGS.init_scale,
                                                    seed=FLAGS.seed)
        with tf.variable_scope("model", initializer=initializer):
            encoder = Encoder(size=FLAGS.state_size, num_layers=FLAGS.layers)
            sc = SequenceClassifier(session, encoder, FLAGS, embed_size,
                                    FLAGS.label_size, embed_path)
        params_senteval.discourse = sc
        params_senteval.batch_size = FLAGS.batch_size
        # restore the model here
        best_epoch = FLAGS.best_epoch
        model_saver = tf.train.Saver(max_to_keep=FLAGS.keep)
        assert FLAGS.restore_checkpoint is not None, "we must be able to reload the model"
        logging.info("restore model from best epoch %d" % best_epoch)
        # Checkpoint files are named "dis.ckpt-<epoch>" under the restore dir.
        checkpoint_path = pjoin(FLAGS.restore_checkpoint, "dis.ckpt")
        model_saver.restore(session, checkpoint_path + ("-%d" % best_epoch))
        se = senteval.SentEval(params_senteval, batcher, prepare)
        logging.info("evaluation starts")
        results_transfer = se.eval(transfer_tasks)
        print results_transfer
def main(arguments):
    """Run SentEval transfer tasks with a pretrained SDAE encoder."""
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    # Logistics
    parser.add_argument("--cuda", help="CUDA id to use", type=int, default=0)
    parser.add_argument("--use_pytorch", help="1 to use PyTorch", type=int,
                        default=1)
    parser.add_argument("--log_file", help="File to log to", type=str)
    parser.add_argument("--model_file", help="File to load model from",
                        type=str)
    parser.add_argument("--dictionary", help="File to log to", type=str,
                        default='/misc/vlgscratch4/BowmanGroup/awang/data/wikipedia/wiki_lower_small.txt.dict.pkl')
    parser.add_argument("--emb_file",
                        help="File to load pretrained embeddings from",
                        type=str, default='')
    # Task options
    parser.add_argument("--tasks",
                        help="Tasks to evaluate on, as a comma separated list",
                        type=str)
    parser.add_argument("--max_seq_len", help="Max sequence length", type=int,
                        default=40)
    parser.add_argument("--batch_size", help="Batch size to use", type=int,
                        default=64)
    args = parser.parse_args(arguments)

    # Log to stderr (basicConfig) and additionally to the requested file.
    logging.basicConfig(format='%(asctime)s : %(message)s',
                        level=logging.DEBUG)
    logging.getLogger().addHandler(logging.FileHandler(args.log_file))

    # SentEval configuration
    senteval_params = dotdict({
        'usepytorch': True,
        'task_path': PATH_TO_DATA,
        'batch_size': args.batch_size,
    })

    # Build the model; pretrained embeddings are used only when a file is given.
    use_preemb = bool(args.emb_file)
    model, model_options, worddict, wv_embs = sdae.load_model(
        saveto=args.model_file,
        dictionary=args.dictionary,
        embeddings=args.emb_file,
        reload_=True,
        use_preemb=use_preemb)
    senteval_params.encoder = model
    senteval_params.model_options = model_options
    senteval_params.worddict = worddict
    senteval_params.wv_embs = wv_embs

    evaluator = senteval.SentEval(senteval_params, batcher, prepare)
    results = evaluator.eval(args.tasks.split(','))
    print(results)
def main(arguments):
    """Evaluate a trained FastSent model on the Quora and Reasoning tasks."""
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--use_pytorch", help="1 to use PyTorch", type=int,
                        default=1)
    parser.add_argument("--log_file", help="File to log to", type=str)
    parser.add_argument("--model_file", help="File containing trained model",
                        type=str)
    parser.add_argument("--small", help="Use small training data if available",
                        type=int, default=1)
    parser.add_argument("--lower", help="Lower case data", type=int, default=0)
    args = parser.parse_args(arguments)

    # SentEval parameters (batch size is fixed at 512 for this model).
    senteval_params = dotdict({
        'usepytorch': True,
        'task_path': PATH_TO_DATA,
        'batch_size': 512,
    })

    # Log to stderr (basicConfig) and additionally to the requested file.
    logging.basicConfig(format='%(asctime)s : %(message)s',
                        level=logging.DEBUG)
    logging.getLogger().addHandler(logging.FileHandler(args.log_file))

    senteval_params.encoder = FastSent.load(args.model_file)
    evaluator = senteval.SentEval(senteval_params, batcher, prepare)
    # Only these two tasks are run here; the broader SentEval suite
    # (MR/CR/SUBJ/.../SQuAD) is deliberately not evaluated.
    evaluator.eval(['Quora', 'Reasoning'], small=args.small, lower=args.lower)
'fc_dim': 512, 'bsize': 32, 'pool_type': 'max', 'encoder_type': 'BLSTMEncoder', 'tied_weights': False, 'use_cuda': True, } if params.random: # initialize randomly logging.info("initialize network randomly") params_senteval.infersent = BLSTMEncoder(config_dis_model) else: params_senteval.infersent = AVGEncoder(config_dis_model) params_senteval.infersent.set_glove_path(GLOVE_PATH) se = senteval.SentEval(params_senteval, batcher, prepare) results_transfer = se.eval(transfer_tasks) logging.info(results_transfer) else: filtered_epoch_numbers = filter(lambda i: params.search_start_epoch <= i <= params.search_end_epoch, epoch_numbers) assert len( filtered_epoch_numbers) >= 1, "the epoch search criteria [{}, {}] returns null, available epochs are: {}".format( params.search_start_epoch, params.search_end_epoch, epoch_numbers) first = True for epoch in filtered_epoch_numbers: logging.info("******* Epoch {} Evaluation *******".format(epoch)) model_name = params.outputmodelname + '-{}.pickle'.format(epoch) model_path = pjoin(params.outputdir, model_name)
level=logging.DEBUG, filename=log_file) logging.info("ARGUMENTS<<<<<") for arg, value in sorted(vars(options).items()): print arg, value logging.info("Argument %s: %r", arg, value) logging.info(">>>>>ARGUMENTS") # config for transfer tasks if options.random: params_senteval = DotDict({ 'usepytorch': True, 'task_path': pjoin(SENTEVAL_PATH, 'data/senteval_data') }) evaluator = senteval.SentEval(params_senteval, batcher_random, prepare) elif options.bow: params_senteval = DotDict({ 'usepytorch': True, 'transfer_tasks': [ 'MR', 'CR', 'SUBJ', 'MPQA', 'SST', 'TREC', 'SICKRelatedness', 'SICKEntailment', 'MRPC', 'STS14' ], 'task_path': pjoin(SENTEVAL_PATH, 'data/senteval_data'), 'w2v': load_w2v(options.word_embedding, options.cut_voc) }) evaluator = senteval.SentEval(params_senteval, batcher_bow, prepare) else:
""" Evaluation of trained model on Transfer Tasks (SentEval) """ # define transfer tasks transfer_tasks = ['MR', 'CR', 'SUBJ', 'MPQA', 'SST', 'TREC', 'SICKRelatedness',\ 'SICKEntailment', 'MRPC', 'STS14'] # define senteval params params_senteval = dotdict({ 'usepytorch': True, 'task_path': PATH_TO_DATA, 'seed': 1111, 'kfold': 5 }) # Set up logger logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG) if __name__ == "__main__": # Load model params_senteval.infersent = torch.load(MODEL_PATH) params_senteval.infersent.set_glove_path(GLOVE_PATH) se = senteval.SentEval(batcher, prepare, params_senteval) results_transfer = se.eval(transfer_tasks) print results_transfer
def main(arguments):
    """Evaluate DisSent snapshot(s) on SentEval tasks.

    By default loads the single saved ".pickle.encoder" model; when both
    --search_start_epoch and --search_end_epoch are given, evaluates every
    per-epoch snapshot in that inclusive range instead. Results are logged
    and written to --out_dir via write_results().
    """
    # Fix: the original constructed a throwaway parser (description=__doc__)
    # that was immediately shadowed by this one; only this parser is used.
    parser = argparse.ArgumentParser(description='DisSent SentEval Evaluation')
    parser.add_argument("--seed", help="Random seed", type=int, default=19)
    parser.add_argument("--gpu_id", type=int, default=0,
                        help="GPU ID, we map all model's gpu to this id")
    parser.add_argument("--use_pytorch", help="1 to use PyTorch", type=int, default=1)
    parser.add_argument("--log_file", help="File to log to", type=str)
    parser.add_argument("--load_data", help="0 to read data from scratch", type=int, default=1)
    parser.add_argument("--out_dir", help="Dir to write preds to", type=str, default='')
    # Task options
    parser.add_argument("--tasks", help="Tasks to evaluate on, as a comma separated list", type=str)
    parser.add_argument("--max_seq_len", help="Max sequence length", type=int, default=40)
    # Model options
    parser.add_argument("--word_vec_file", type=str)
    parser.add_argument("--model_dir", type=str, help="Directory containing model snapshots")
    parser.add_argument("--outputmodelname", type=str, default='dis-model')
    parser.add_argument("--search_start_epoch", type=int, default=-1,
                        help="Search from [start, end] epochs ")
    parser.add_argument("--search_end_epoch", type=int, default=-1,
                        help="Search from [start, end] epochs")
    parser.add_argument("--batch_size", help="Batch size to use", type=int, default=64)
    # Classifier options
    parser.add_argument("--cls_batch_size", help="Batch size to use", type=int, default=64)
    args = parser.parse_args(arguments)

    logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)
    log_file = os.path.join(args.out_dir, "results.log")
    file_handler = logging.FileHandler(log_file)
    logging.getLogger().addHandler(file_handler)
    logging.info(args)

    # define senteval params
    params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': args.use_pytorch,
                       'kfold': 10, 'max_seq_len': args.max_seq_len,
                       'batch_size': args.batch_size, 'load_data': args.load_data,
                       'seed': args.seed}
    params_senteval['classifier'] = {'nhid': 0, 'optim': 'adam',
                                     'batch_size': args.cls_batch_size,
                                     'tenacity': 5, 'epoch_size': 4,
                                     'cudaEfficient': args.gpu_id > 0}

    # set gpu device
    torch.cuda.set_device(args.gpu_id)
    # Map every other cuda device's storages onto the chosen device; this
    # only works when the snapshot was saved on a device in range(4).
    map_locations = {}
    for d in range(4):
        if d != args.gpu_id:
            map_locations['cuda:{}'.format(d)] = "cuda:{}".format(args.gpu_id)

    tasks = get_tasks(args.tasks)

    # Collect the epoch numbers trained in the model directory.
    # Fix: the original used bare filter()/map(); on Python 3 those return
    # lazy iterators that break the later len() check and re-iteration.
    # List/comprehension forms behave identically on both Python versions.
    model_files = [s for s in os.listdir(args.model_dir)
                   if args.outputmodelname + '-' in s and 'encoder' not in s]
    # e.g. ['8', '7', '9', ...] -> sorted ints; epochs may be discontinuous.
    epoch_numbers = sorted(
        int(s.split(args.outputmodelname + '-')[1].replace('.pickle', ''))
        for s in model_files)

    if args.search_start_epoch == -1 or args.search_end_epoch == -1:
        # original setting: evaluate the single saved encoder
        MODEL_PATH = pjoin(args.model_dir, args.outputmodelname + ".pickle.encoder")
        params_senteval['infersent'] = torch.load(MODEL_PATH, map_location=map_locations)
        params_senteval['infersent'].set_glove_path(args.word_vec_file)
        se = senteval.engine.SE(params_senteval, batcher, prepare)
        results = se.eval(tasks)
        write_results(results, args.out_dir)
        logging.info(results)
    else:
        # search through all snapshot epochs in the inclusive range
        filtered_epoch_numbers = [i for i in epoch_numbers
                                  if args.search_start_epoch <= i <= args.search_end_epoch]
        assert len(filtered_epoch_numbers) >= 1, \
            "the epoch search criteria [{}, {}] returns null, available epochs are: {}".format(
                args.search_start_epoch, args.search_end_epoch, epoch_numbers)
        for epoch in filtered_epoch_numbers:
            logging.info("******* Epoch {} Evaluation *******".format(epoch))
            model_name = args.outputmodelname + '-{}.pickle'.format(epoch)
            model_path = pjoin(args.model_dir, model_name)
            dissent = torch.load(model_path, map_location=map_locations)
            if args.gpu_id > -1:
                dissent = dissent.cuda()
            # the snapshot's encoder is the sentence embedder SentEval needs
            params_senteval['infersent'] = dissent.encoder  # this might be good enough
            params_senteval['infersent'].set_glove_path(args.word_vec_file)
            se = senteval.SentEval(params_senteval, batcher, prepare)
            results = se.eval(tasks)
            write_results(results, args.out_dir)
            logging.info(results)
def main(_):
    """Build discourse and SNLI classifiers in one TF graph, restore them
    from a shared checkpoint, and run SentEval transfer tasks.
    """
    # build the model here
    if not os.path.exists(FLAGS.run_dir):
        os.makedirs(FLAGS.run_dir)
    # assert FLAGS.embed_path is not "None", "must pick a loading path"
    # Mirror all log records into a file inside the run directory.
    file_handler = logging.FileHandler("{0}/log.txt".format(FLAGS.run_dir))
    logging.getLogger().addHandler(file_handler)
    embed_path = PATH_TO_GLOVE  # FLAGS.embed_path
    embed_size = FLAGS.embed_size
    # Snapshot the run's flags for reproducibility.
    with open(os.path.join(FLAGS.run_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)
    # Cap TF's GPU memory so other frameworks can share the device.
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
    with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(
            gpu_options=gpu_options)) as session:
        # config = tf.ConfigProto(allow_soft_placement=True)
        tf.set_random_seed(FLAGS.seed)
        initializer = tf.random_uniform_initializer(-FLAGS.init_scale,
                                                    FLAGS.init_scale,
                                                    seed=FLAGS.seed)
        # Two classifiers live in separate variable scopes of the same graph.
        with tf.variable_scope("discourse", reuse=None, initializer=initializer):
            encoder = Encoder(size=FLAGS.state_size, num_layers=FLAGS.layers)
            discourse_sc = SequenceClassifier(session, encoder, FLAGS,
                                              embed_size, FLAGS.label_size,
                                              embed_path)
        with tf.variable_scope("snli", reuse=None, initializer=initializer):
            # preparation for SNLI
            snli_encoder = Encoder(size=FLAGS.state_size, num_layers=FLAGS.layers)
            snli_sc = SequenceClassifier(session, snli_encoder, FLAGS,
                                         embed_size, FLAGS.label_size,
                                         embed_path)
        params_senteval.discourse = discourse_sc
        params_senteval.snli = snli_sc
        params_senteval.batch_size = FLAGS.batch_size
        # restore the model here
        # (two models are stored together)
        best_epoch = FLAGS.best_epoch
        model_saver = tf.train.Saver(max_to_keep=FLAGS.keep)
        assert FLAGS.restore_checkpoint is not None, "we must be able to reload the model"
        logging.info("restore model from best epoch %d" % best_epoch)
        # Checkpoint files are named "dis.ckpt-<epoch>" under the restore dir.
        checkpoint_path = pjoin(FLAGS.restore_checkpoint, "dis.ckpt")
        model_saver.restore(session, checkpoint_path + ("-%d" % best_epoch))
        se = senteval.SentEval(params_senteval, batcher, prepare)
        results_transfer = se.eval(transfer_tasks)
        print results_transfer
def main(arguments):
    """Evaluate a ConvSent encoder on SentEval tasks."""
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    # Logistics
    parser.add_argument("--cuda", help="CUDA id to use", type=int, default=0)
    parser.add_argument("--use_pytorch", help="1 to use PyTorch", type=int,
                        default=1)
    parser.add_argument("--log_file", help="File to log to", type=str)
    parser.add_argument("--model_file", help="File to load model from",
                        type=str)
    parser.add_argument("--dict_file", help="File to load dict from",
                        type=str)
    # Task options
    parser.add_argument("--tasks",
                        help="Tasks to evaluate on, as a comma separated list",
                        type=str)
    parser.add_argument("--max_seq_len", help="Max sequence length", type=int,
                        default=40)
    # Model options
    parser.add_argument("--batch_size", help="Batch size to use", type=int,
                        default=32)
    # Classifier options
    parser.add_argument("--cls_batch_size",
                        help="Batch size to use for classifier", type=int,
                        default=32)
    args = parser.parse_args(arguments)

    # Log to stderr (basicConfig) and additionally to the requested file.
    logging.basicConfig(format='%(asctime)s : %(message)s',
                        level=logging.DEBUG)
    logging.getLogger().addHandler(logging.FileHandler(args.log_file))

    # SentEval parameters, including the downstream classifier settings.
    senteval_params = dotdict({
        'usepytorch': True,
        'task_path': PATH_TO_DATA,
        'max_seq_len': args.max_seq_len,
        'batch_size': args.batch_size,
        'classifier': {
            'nhid': 0,
            'optim': 'adam',
            'batch_size': args.cls_batch_size,
            'tenacity': 5,
            'epoch_size': 4,
        },
    })

    # Load the vocabulary and append a padding token at the end.
    with open(args.dict_file, 'rb') as fh:
        word2idx = pkl.load(fh)[0]
    word2idx['<pad>'] = len(word2idx)

    # Load model sized to the (padded) vocabulary.
    senteval_params.encoder = convsent.load_model(args.model_file,
                                                  n_words=len(word2idx))
    senteval_params.word2idx = word2idx

    evaluator = senteval.SentEval(senteval_params, batcher, prepare)
    results = evaluator.eval(args.tasks.split(','))
    print(results)