Ejemplo n.º 1
0
def main():
    """Evaluate a SentenceVectorizer on SentEval tasks.

    Reads the vectorizer YAML config, resolves the batch size from the CLI
    (falling back to the config's "batch-size" entry), and runs SentEval on
    the requested tasks.
    """
    parser = yaap.ArgParser(allow_config=True)
    parser.add("--vectorizer-config", type=yaap.path, required=True)
    parser.add("--senteval-data", type=yaap.path, required=True)
    parser.add("--tasks", type=str, action="append", required=True)
    parser.add("--batch-size", type=int, default=None)

    args = parser.parse_args()

    assert os.path.exists(args.vectorizer_config)

    with open(args.vectorizer_config, "r") as f:
        # safe_load avoids arbitrary Python object construction; yaml.load
        # without an explicit Loader is unsafe and deprecated in PyYAML.
        vec_conf = yaml.safe_load(f)

    if args.batch_size is None:
        # No CLI override: the config must provide the batch size.
        assert "batch-size" in vec_conf

        batch_size = vec_conf.get("batch-size")
    else:
        batch_size = args.batch_size

    sv = SentenceVectorizer(args.vectorizer_config)

    params = {
        "usepytorch": True,
        "task_path": args.senteval_data,
        "batch_size": batch_size,
        "model": sv,
    }

    se = senteval.SentEval(dotdict(params), batcher, prepare)
    se.eval(args.tasks)
Ejemplo n.º 2
0
def main(_):
    """Restore the best TF checkpoint and evaluate on SentEval transfer tasks.

    Loads an InferSent encoder (PyTorch, CPU) plus a TensorFlow
    SequenceClassifier, restores the classifier from
    FLAGS.restore_checkpoint at FLAGS.best_epoch, and evaluates on the
    module-level `transfer_tasks`.
    """
    # build the model here

    if not os.path.exists(FLAGS.run_dir):
        os.makedirs(FLAGS.run_dir)

    # assert FLAGS.embed_path is not "None", "must pick a loading path"

    # Mirror log output into the run directory.
    file_handler = logging.FileHandler("{0}/log.txt".format(FLAGS.run_dir))
    logging.getLogger().addHandler(file_handler)
    embed_path = PATH_TO_GLOVE  # FLAGS.embed_path
    embed_size = FLAGS.embed_size

    # NOTE(review): torch.load unpickles arbitrary objects — only load
    # checkpoints from trusted sources. map_location keeps tensors on CPU.
    params_senteval.infersent = torch.load(
        'infersent.allnli.pickle', map_location=lambda storage, loc: storage)
    params_senteval.infersent.set_glove_path(PATH_TO_GLOVE)
    params_senteval.infersent.use_cuda = False

    # Persist the run configuration for reproducibility.
    with open(os.path.join(FLAGS.run_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    # limit amount of GPU being used so PyTorch can use it.
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=FLAGS.gpu_frac)

    with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(
            gpu_options=gpu_options)) as session:
        tf.set_random_seed(FLAGS.seed)

        initializer = tf.random_uniform_initializer(-FLAGS.init_scale,
                                                    FLAGS.init_scale,
                                                    seed=FLAGS.seed)

        with tf.variable_scope("model", initializer=initializer):
            encoder = Encoder(size=FLAGS.state_size, num_layers=FLAGS.layers)
            sc = SequenceClassifier(session, encoder, FLAGS, embed_size,
                                    FLAGS.label_size, embed_path)

        params_senteval.discourse = sc
        params_senteval.batch_size = FLAGS.batch_size

        # restore the model here
        best_epoch = FLAGS.best_epoch
        model_saver = tf.train.Saver(max_to_keep=FLAGS.keep)

        assert FLAGS.restore_checkpoint is not None, "we must be able to reload the model"
        logging.info("restore model from best epoch %d" % best_epoch)
        checkpoint_path = pjoin(FLAGS.restore_checkpoint, "dis.ckpt")
        model_saver.restore(session, checkpoint_path + ("-%d" % best_epoch))

        se = senteval.SentEval(params_senteval, batcher, prepare)

        logging.info("evaluation starts")
        results_transfer = se.eval(transfer_tasks)

        # print(x) with a single argument behaves identically under Python 2
        # and 3; the original used a py2-only print statement.
        print(results_transfer)
Ejemplo n.º 3
0
def main(arguments):
    """Load an SDAE model and evaluate it on SentEval tasks.

    Parameters
    ----------
    arguments : list of str
        Command-line arguments (passed to argparse).
    """
    parser = argparse.ArgumentParser(description=__doc__,
                    formatter_class=argparse.RawDescriptionHelpFormatter)
    # Logistics
    parser.add_argument("--cuda", help="CUDA id to use", type=int, default=0)
    parser.add_argument("--use_pytorch", help="1 to use PyTorch", type=int, default=1)
    parser.add_argument("--log_file", help="File to log to", type=str)
    parser.add_argument("--model_file", help="File to load model from", type=str)
    # Fixed copy-pasted help text (previously said "File to log to").
    parser.add_argument("--dictionary", help="File to load dictionary from", type=str,
                        default='/misc/vlgscratch4/BowmanGroup/awang/data/wikipedia/wiki_lower_small.txt.dict.pkl')
    parser.add_argument("--emb_file", help="File to load pretrained embeddings from", type=str, default='')

    # Task options
    parser.add_argument("--tasks", help="Tasks to evaluate on, as a comma separated list", type=str)
    parser.add_argument("--max_seq_len", help="Max sequence length", type=int, default=40)
    parser.add_argument("--batch_size", help="Batch size to use", type=int, default=64)

    args = parser.parse_args(arguments)
    logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
    fileHandler = logging.FileHandler(args.log_file)
    logging.getLogger().addHandler(fileHandler)

    # Set params for SentEval
    params_senteval = {'usepytorch': True, 'task_path': PATH_TO_DATA, 'batch_size': args.batch_size}
    params_senteval = dotdict(params_senteval)

    # Build model; pretrained embeddings are only used when a file is given.
    use_preemb = bool(args.emb_file)
    model, model_options, worddict, wv_embs = \
            sdae.load_model(saveto=args.model_file,
                            dictionary=args.dictionary,
                            embeddings=args.emb_file,
                            reload_=True, use_preemb=use_preemb)
    params_senteval.encoder = model
    params_senteval.model_options = model_options
    params_senteval.worddict = worddict
    params_senteval.wv_embs = wv_embs

    se = senteval.SentEval(params_senteval, batcher, prepare)
    tasks = args.tasks.split(',')
    results = se.eval(tasks)
    print(results)
Ejemplo n.º 4
0
def main(arguments):
    """Load a FastSent model and evaluate it on the Quora/Reasoning tasks.

    Parameters
    ----------
    arguments : list of str
        Command-line arguments (passed to argparse).
    """
    parser = argparse.ArgumentParser(description=__doc__,
                    formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument("--use_pytorch", help="1 to use PyTorch", type=int,
            default=1)
    parser.add_argument("--log_file", help="File to log to", type=str)
    parser.add_argument("--model_file", help="File containing trained model",
                        type=str)
    parser.add_argument("--small", help="Use small training data if available", type=int, default=1)
    parser.add_argument("--lower", help="Lower case data", type=int, default=0)

    args = parser.parse_args(arguments)

    # Set params for SentEval
    params_senteval = {'usepytorch': True,
                       'task_path': PATH_TO_DATA,
                       'batch_size': 512}
    params_senteval = dotdict(params_senteval)

    # Set up logger
    logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
    fileHandler = logging.FileHandler(args.log_file)
    logging.getLogger().addHandler(fileHandler)

    params_senteval.encoder = FastSent.load(args.model_file)
    se = senteval.SentEval(params_senteval, batcher, prepare)
    # Only these two tasks are run; the broader SentEval task list
    # (MR/CR/SUBJ/...) was removed as dead commented-out code.
    tasks = ['Quora', 'Reasoning']

    se.eval(tasks, small=args.small, lower=args.lower)
Ejemplo n.º 5
0
                'fc_dim': 512,
                'bsize': 32,
                'pool_type': 'max',
                'encoder_type': 'BLSTMEncoder',
                'tied_weights': False,
                'use_cuda': True,
            }
            if params.random:
                # initialize randomly
                logging.info("initialize network randomly")
                params_senteval.infersent = BLSTMEncoder(config_dis_model)
            else:
                params_senteval.infersent = AVGEncoder(config_dis_model)
        params_senteval.infersent.set_glove_path(GLOVE_PATH)

        se = senteval.SentEval(params_senteval, batcher, prepare)
        results_transfer = se.eval(transfer_tasks)

        logging.info(results_transfer)
    else:
        filtered_epoch_numbers = filter(lambda i: params.search_start_epoch <= i <= params.search_end_epoch,
                                        epoch_numbers)
        assert len(
            filtered_epoch_numbers) >= 1, "the epoch search criteria [{}, {}] returns null, available epochs are: {}".format(
            params.search_start_epoch, params.search_end_epoch, epoch_numbers)

        first = True
        for epoch in filtered_epoch_numbers:
            logging.info("******* Epoch {} Evaluation *******".format(epoch))
            model_name = params.outputmodelname + '-{}.pickle'.format(epoch)
            model_path = pjoin(params.outputdir, model_name)
                     level=logging.DEBUG,
                     filename=log_file)
 logging.info("ARGUMENTS<<<<<")
 for arg, value in sorted(vars(options).items()):
     print arg, value
     logging.info("Argument %s: %r", arg, value)
 logging.info(">>>>>ARGUMENTS")
 # config for transfer tasks
 if options.random:
     params_senteval = DotDict({
         'usepytorch':
         True,
         'task_path':
         pjoin(SENTEVAL_PATH, 'data/senteval_data')
     })
     evaluator = senteval.SentEval(params_senteval, batcher_random, prepare)
 elif options.bow:
     params_senteval = DotDict({
         'usepytorch':
         True,
         'transfer_tasks': [
             'MR', 'CR', 'SUBJ', 'MPQA', 'SST', 'TREC', 'SICKRelatedness',
             'SICKEntailment', 'MRPC', 'STS14'
         ],
         'task_path':
         pjoin(SENTEVAL_PATH, 'data/senteval_data'),
         'w2v':
         load_w2v(options.word_embedding, options.cut_voc)
     })
     evaluator = senteval.SentEval(params_senteval, batcher_bow, prepare)
 else:
Ejemplo n.º 7
0

"""
Evaluation of trained model on Transfer Tasks (SentEval)
"""

# define transfer tasks
transfer_tasks = ['MR', 'CR', 'SUBJ', 'MPQA', 'SST', 'TREC', 'SICKRelatedness',\
                  'SICKEntailment', 'MRPC', 'STS14']

# define senteval params
params_senteval = dotdict({
    'usepytorch': True,
    'task_path': PATH_TO_DATA,
    'seed': 1111,
    'kfold': 5
})

# Set up logger
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)

if __name__ == "__main__":
    # Load model
    params_senteval.infersent = torch.load(MODEL_PATH)
    params_senteval.infersent.set_glove_path(GLOVE_PATH)

    se = senteval.SentEval(batcher, prepare, params_senteval)
    results_transfer = se.eval(transfer_tasks)

    print results_transfer
Ejemplo n.º 8
0
def main(arguments):
    """Evaluate DisSent checkpoints on SentEval transfer tasks.

    Default mode loads a single saved encoder
    (``<outputmodelname>.pickle.encoder``). When both --search_start_epoch
    and --search_end_epoch are given, every per-epoch snapshot in
    --model_dir within that range is evaluated instead.

    Parameters
    ----------
    arguments : list of str
        Command-line arguments (passed to argparse).
    """
    # Logistics.  (The original built a throwaway parser first and then
    # shadowed it with this one; the dead parser has been removed.)
    parser = argparse.ArgumentParser(description='DisSent SentEval Evaluation')
    parser.add_argument("--seed", help="Random seed", type=int, default=19)
    parser.add_argument("--gpu_id", type=int, default=0, help="GPU ID, we map all model's gpu to this id")
    parser.add_argument("--use_pytorch", help="1 to use PyTorch", type=int, default=1)
    parser.add_argument("--log_file", help="File to log to", type=str)
    parser.add_argument("--load_data", help="0 to read data from scratch", type=int, default=1)
    parser.add_argument("--out_dir", help="Dir to write preds to", type=str, default='')

    # Task options
    parser.add_argument("--tasks", help="Tasks to evaluate on, as a comma separated list", type=str)
    parser.add_argument("--max_seq_len", help="Max sequence length", type=int, default=40)

    # Model options
    parser.add_argument("--word_vec_file", type=str)
    parser.add_argument("--model_dir", type=str, help="Directory containing model snapshots")
    parser.add_argument("--outputmodelname", type=str, default='dis-model')
    parser.add_argument("--search_start_epoch", type=int, default=-1, help="Search from [start, end] epochs ")
    parser.add_argument("--search_end_epoch", type=int, default=-1, help="Search from [start, end] epochs")
    parser.add_argument("--batch_size", help="Batch size to use", type=int, default=64)

    # Classifier options
    parser.add_argument("--cls_batch_size", help="Batch size to use", type=int, default=64)

    args = parser.parse_args(arguments)
    logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)
    log_file = os.path.join(args.out_dir, "results.log")
    file_handler = logging.FileHandler(log_file)
    logging.getLogger().addHandler(file_handler)
    logging.info(args)

    # define senteval params
    params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': args.use_pytorch, 'kfold': 10,
                       'max_seq_len': args.max_seq_len, 'batch_size': args.batch_size,
                       'load_data': args.load_data, 'seed': args.seed}
    params_senteval['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': args.cls_batch_size,
                                     'tenacity': 5, 'epoch_size': 4, 'cudaEfficient': args.gpu_id > 0}

    # set gpu device
    torch.cuda.set_device(args.gpu_id)
    # Map every saved cuda device onto the chosen GPU when reloading
    # checkpoints (only meaningful when args.gpu_id == 0 per the original note).
    map_locations = {}
    for d in range(4):
        if d != args.gpu_id:
            map_locations['cuda:{}'.format(d)] = "cuda:{}".format(args.gpu_id)

    tasks = get_tasks(args.tasks)

    # Collect the epoch numbers of the snapshots in the model directory.
    # Materialized as lists so len() and repeated iteration work under
    # Python 3, where filter/map return one-shot iterators.
    model_files = [s for s in os.listdir(args.model_dir)
                   if args.outputmodelname + '-' in s and 'encoder' not in s]
    epoch_numbers = sorted(
        int(s.split(args.outputmodelname + '-')[1].replace('.pickle', ''))
        for s in model_files)

    # original setting
    if args.search_start_epoch == -1 or args.search_end_epoch == -1:
        # Load model
        MODEL_PATH = pjoin(args.model_dir, args.outputmodelname + ".pickle.encoder")

        params_senteval['infersent'] = torch.load(MODEL_PATH, map_location=map_locations)
        params_senteval['infersent'].set_glove_path(args.word_vec_file)

        se = senteval.engine.SE(params_senteval, batcher, prepare)
        results = se.eval(tasks)
        write_results(results, args.out_dir)
        logging.info(results)
    else:
        # search through all epochs
        filtered_epoch_numbers = [i for i in epoch_numbers
                                  if args.search_start_epoch <= i <= args.search_end_epoch]
        assert len(filtered_epoch_numbers) >= 1, \
                "the epoch search criteria [{}, {}] returns null, available epochs are: {}".format(
                args.search_start_epoch, args.search_end_epoch, epoch_numbers)

        for epoch in filtered_epoch_numbers:
            logging.info("******* Epoch {} Evaluation *******".format(epoch))
            model_name = args.outputmodelname + '-{}.pickle'.format(epoch)
            model_path = pjoin(args.model_dir, model_name)

            dissent = torch.load(model_path, map_location=map_locations)
            if args.gpu_id > -1:
                dissent = dissent.cuda()
            params_senteval['infersent'] = dissent.encoder  # this might be good enough
            params_senteval['infersent'].set_glove_path(args.word_vec_file)

            # NOTE(review): this branch uses senteval.SentEval while the
            # single-model branch uses senteval.engine.SE — confirm both
            # entry points exist in the installed SentEval version.
            se = senteval.SentEval(params_senteval, batcher, prepare)
            results = se.eval(tasks)
            write_results(results, args.out_dir)
            logging.info(results)
Ejemplo n.º 9
0
def main(_):
    """Restore discourse + SNLI classifiers and evaluate on transfer tasks.

    Builds two TF SequenceClassifiers (variable scopes "discourse" and
    "snli"), restores both from a single checkpoint at FLAGS.best_epoch,
    and runs SentEval on the module-level `transfer_tasks`.
    """
    # build the model here

    if not os.path.exists(FLAGS.run_dir):
        os.makedirs(FLAGS.run_dir)

    # assert FLAGS.embed_path is not "None", "must pick a loading path"

    # Mirror log output into the run directory.
    file_handler = logging.FileHandler("{0}/log.txt".format(FLAGS.run_dir))
    logging.getLogger().addHandler(file_handler)
    embed_path = PATH_TO_GLOVE  # FLAGS.embed_path
    embed_size = FLAGS.embed_size

    # Persist the run configuration for reproducibility.
    with open(os.path.join(FLAGS.run_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    # Cap TF's share of GPU memory so other processes can use the rest.
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)

    with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(
            gpu_options=gpu_options)) as session:
        # config = tf.ConfigProto(allow_soft_placement=True)
        tf.set_random_seed(FLAGS.seed)

        initializer = tf.random_uniform_initializer(-FLAGS.init_scale,
                                                    FLAGS.init_scale,
                                                    seed=FLAGS.seed)

        with tf.variable_scope("discourse",
                               reuse=None,
                               initializer=initializer):
            encoder = Encoder(size=FLAGS.state_size, num_layers=FLAGS.layers)
            discourse_sc = SequenceClassifier(session, encoder, FLAGS,
                                              embed_size, FLAGS.label_size,
                                              embed_path)

        with tf.variable_scope("snli", reuse=None, initializer=initializer):
            # preparation for SNLI
            snli_encoder = Encoder(size=FLAGS.state_size,
                                   num_layers=FLAGS.layers)
            snli_sc = SequenceClassifier(session, snli_encoder, FLAGS,
                                         embed_size, FLAGS.label_size,
                                         embed_path)

        params_senteval.discourse = discourse_sc
        params_senteval.snli = snli_sc
        params_senteval.batch_size = FLAGS.batch_size

        # restore the model here
        # (two models are stored together)
        best_epoch = FLAGS.best_epoch
        model_saver = tf.train.Saver(max_to_keep=FLAGS.keep)

        assert FLAGS.restore_checkpoint is not None, "we must be able to reload the model"
        logging.info("restore model from best epoch %d" % best_epoch)
        checkpoint_path = pjoin(FLAGS.restore_checkpoint, "dis.ckpt")
        model_saver.restore(session, checkpoint_path + ("-%d" % best_epoch))

        se = senteval.SentEval(params_senteval, batcher, prepare)
        results_transfer = se.eval(transfer_tasks)

        # print(x) with one argument is valid in both Python 2 and 3
        # (the original used a py2-only print statement).
        print(results_transfer)
Ejemplo n.º 10
0
def main(arguments):
    """Load a ConvSent encoder and run SentEval on the requested tasks.

    Parameters
    ----------
    arguments : list of str
        Command-line arguments (passed to argparse).
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # Logistics
    parser.add_argument("--cuda", help="CUDA id to use", type=int, default=0)
    parser.add_argument("--use_pytorch", help="1 to use PyTorch", type=int,
                        default=1)
    parser.add_argument("--log_file", help="File to log to", type=str)
    parser.add_argument("--model_file", help="File to load model from",
                        type=str)
    parser.add_argument("--dict_file", help="File to load dict from", type=str)

    # Task options
    parser.add_argument("--tasks",
                        help="Tasks to evaluate on, as a comma separated list",
                        type=str)
    parser.add_argument("--max_seq_len", help="Max sequence length", type=int,
                        default=40)

    # Model options
    parser.add_argument("--batch_size", help="Batch size to use", type=int,
                        default=32)

    # Classifier options
    parser.add_argument("--cls_batch_size",
                        help="Batch size to use for classifier", type=int,
                        default=32)

    args = parser.parse_args(arguments)

    # Log to both stderr and the requested file.
    logging.basicConfig(format='%(asctime)s : %(message)s',
                        level=logging.DEBUG)
    logging.getLogger().addHandler(logging.FileHandler(args.log_file))

    # Assemble the SentEval configuration in one literal.
    senteval_config = dotdict({
        'usepytorch': True,
        'task_path': PATH_TO_DATA,
        'max_seq_len': args.max_seq_len,
        'batch_size': args.batch_size,
        'classifier': {
            'nhid': 0,
            'optim': 'adam',
            'batch_size': args.cls_batch_size,
            'tenacity': 5,
            'epoch_size': 4
        },
    })

    # Recover the vocabulary used at training time and append a pad token.
    with open(args.dict_file, 'rb') as fh:
        word2idx = pkl.load(fh)[0]
    word2idx['<pad>'] = len(word2idx)

    # Load the trained encoder sized to the (pad-extended) vocabulary.
    senteval_config.encoder = convsent.load_model(args.model_file,
                                                  n_words=len(word2idx))
    senteval_config.word2idx = word2idx

    evaluator = senteval.SentEval(senteval_config, batcher, prepare)
    results = evaluator.eval(args.tasks.split(','))
    print(results)