Ejemplo n.º 1
0
def main(arguments):
    """Evaluate a trained SDAE sentence encoder on SentEval transfer tasks.

    Parses CLI options, configures logging, restores the model via
    ``sdae.load_model`` and runs ``senteval.SentEval`` on the requested tasks.

    Args:
        arguments: list of command-line argument strings, e.g. ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # Logistics
    parser.add_argument("--cuda", help="CUDA id to use", type=int, default=0)
    parser.add_argument("--use_pytorch", help="1 to use PyTorch", type=int, default=1)
    parser.add_argument("--log_file", help="File to log to", type=str)
    parser.add_argument("--model_file", help="File to load model from", type=str)
    # Bug fix: help text previously said "File to log to" (copy-paste error).
    parser.add_argument("--dictionary", help="File to load dictionary from", type=str,
                        default='/misc/vlgscratch4/BowmanGroup/awang/data/wikipedia/wiki_lower_small.txt.dict.pkl')
    parser.add_argument("--emb_file", help="File to load pretrained embeddings from", type=str, default='')

    # Task options
    parser.add_argument("--tasks", help="Tasks to evaluate on, as a comma separated list", type=str)
    # NOTE(review): --max_seq_len is parsed but never forwarded to SentEval in
    # this script; confirm whether it should be added to params_senteval.
    parser.add_argument("--max_seq_len", help="Max sequence length", type=int, default=40)
    parser.add_argument("--batch_size", help="Batch size to use", type=int, default=64)

    args = parser.parse_args(arguments)

    # Log to stderr (basicConfig) and additionally to the requested file.
    logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
    fileHandler = logging.FileHandler(args.log_file)
    logging.getLogger().addHandler(fileHandler)

    # Set params for SentEval.
    # Bug fix: honor --use_pytorch instead of hard-coding True
    # (default of 1 keeps the previous behavior).
    params_senteval = {'usepytorch': bool(args.use_pytorch),
                       'task_path': PATH_TO_DATA,
                       'batch_size': args.batch_size}
    params_senteval = dotdict(params_senteval)

    # Build model: only request pretrained embeddings when a file was supplied.
    use_preemb = bool(args.emb_file)
    model, model_options, worddict, wv_embs = \
            sdae.load_model(saveto=args.model_file,
                            dictionary=args.dictionary,
                            embeddings=args.emb_file,
                            reload_=True, use_preemb=use_preemb)
    params_senteval.encoder = model
    params_senteval.model_options = model_options
    params_senteval.worddict = worddict
    params_senteval.wv_embs = wv_embs

    se = senteval.SentEval(params_senteval, batcher, prepare)
    tasks = args.tasks.split(',')
    results = se.eval(tasks)
    print(results)
Ejemplo n.º 2
0
def main(arguments):
    """Evaluate a trained FastSent model on SentEval transfer tasks.

    Parses CLI options, configures logging, loads the model with
    ``FastSent.load`` and evaluates it on the Quora and Reasoning tasks.

    Args:
        arguments: list of command-line argument strings, e.g. ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument("--use_pytorch", help="1 to use PyTorch", type=int,
            default=1)
    parser.add_argument("--log_file", help="File to log to", type=str)
    parser.add_argument("--model_file", help="File containing trained model",
                        type=str)
    parser.add_argument("--small", help="Use small training data if available", type=int, default=1)
    parser.add_argument("--lower", help="Lower case data", type=int, default=0)

    args = parser.parse_args(arguments)

    # Set params for SentEval.
    # Bug fix: honor --use_pytorch instead of hard-coding True
    # (default of 1 keeps the previous behavior).
    params_senteval = dotdict({'usepytorch': bool(args.use_pytorch),
                               'task_path': PATH_TO_DATA,
                               'batch_size': 512})

    # Set up logger: stderr (basicConfig) plus the requested log file.
    logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
    fileHandler = logging.FileHandler(args.log_file)
    logging.getLogger().addHandler(fileHandler)

    params_senteval.encoder = FastSent.load(args.model_file)
    se = senteval.SentEval(params_senteval, batcher, prepare)
    # Removed dead commented-out code (pickle-based loading, unused task list).
    tasks = ['Quora', 'Reasoning']

    se.eval(tasks, small=args.small, lower=args.lower)
Ejemplo n.º 3
0
            results.append("{0:.4f}/{0:.4f}".format(stsbenchmark_dev_pear, stsbenchmark_test_pear))

        writer.writerow(results)


"""
Evaluation of trained model on Transfer Tasks (SentEval)
"""

# define transfer tasks
# Chinese runs evaluate ABSA only; otherwise (presumably Spanish, given the
# _SP suffix — confirm against params.lang's value set) ABSA plus STS.
transfer_tasks = ['ABSA_CH'] if params.lang == 'CH' else ['ABSA_SP', 'STS_SP']

# define senteval params
# Can choose to use MLP instead
# usepytorch=True selects the PyTorch classifier; fixed seed 1111 and
# 5-fold cross-validation for reproducible transfer-task scores.
params_senteval = dotdict({'usepytorch': True, 'task_path': PATH_TO_DATA,
                           'seed': 1111, 'kfold': 5})

# Set up logger
# Timestamped DEBUG-level records to the root logger's default handler.
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
if __name__ == "__main__":

    # We map cuda to the current cuda device
    # this only works when we set params.gpu_id = 0
    map_locations = {}
    for d in range(4):
        if d != params.gpu_id:
            map_locations['cuda:{}'.format(d)] = "cuda:{}".format(params.gpu_id)

    # collect number of epochs trained in directory
    model_files = filter(lambda s: params.outputmodelname + '-' in s and 'encoder' not in s,
Ejemplo n.º 4
0
        sentvec = []
        for word in sent:
            if word in params.word_vec:
                sentvec.append(params.word_vec[word])
        if not sentvec:
            sentvec.append(params.word_vec['.'])
        sentvec = np.mean(sentvec, 0)
        embeddings.append(sentvec)

    embeddings = np.vstack(embeddings)
    return embeddings


# SentEval configuration: sklearn classifier (no PyTorch), 5-fold CV.
params_senteval = dotdict({
    'task_path': PATH_TO_DATA,
    'usepytorch': False,
    'kfold': 5,
})

# Pin computation to GPU 0.
torch.cuda.set_device(0)

# Timestamped DEBUG-level logging to the root logger.
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)

if __name__ == "__main__":
    se = senteval.SentEval(params_senteval, batcher, prepare)
    transfer_tasks = [
        'MR', 'CR', 'MPQA', 'SUBJ', 'SST', 'TREC', 'MRPC', 'SICKEntailment',
        'SICKRelatedness', 'STSBenchmark', 'STS14'
    ]
    results = se.eval(transfer_tasks)
Ejemplo n.º 5
0
# define transfer tasks
# Exactly one special-purpose evaluation is selected by flag; when none is
# set, fall back to the standard SentEval transfer-task suite.
if params.dis:
    transfer_tasks = ['DIS']
elif params.pdtb:
    transfer_tasks = ['PDTB_IMEX']  # 'PDTB_EX'
elif params.dat:
    transfer_tasks = ['DAT']
else:
    transfer_tasks = ['MR', 'CR', 'SUBJ', 'MPQA', 'SST', 'TREC', 'SICKRelatedness',
                      'SICKEntailment', 'MRPC', 'STS14']

# define senteval params
# Both configurations use the PyTorch path, a fixed seed, and 5-fold CV;
# the MLP variant additionally sets the classifier type and hidden size.
if params.mlp:
    # keep nhid the same as DisSent model (otherwise we can try 1024)
    params_senteval = dotdict({'usepytorch': True, 'task_path': PATH_TO_DATA,
                               'seed': 1111, 'kfold': 5, 'classifier': 'MLP', 'nhid': 512,
                               'bilinear': params.bilinear})
else:
    params_senteval = dotdict({'usepytorch': True, 'task_path': PATH_TO_DATA,
                               'seed': 1111, 'kfold': 5, 'bilinear': params.bilinear})

# Set up logger
# Timestamped DEBUG-level records to the root logger's default handler.
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)

if __name__ == "__main__":

    # We map cuda to the current cuda device
    # this only works when we set params.gpu_id = 0
    map_locations = {}
    for d in range(4):
        if d != params.gpu_id:
Ejemplo n.º 6
0
def main(arguments):
    """Evaluate a trained ConvSent encoder on SentEval transfer tasks.

    Loads the word dictionary, restores the model with
    ``convsent.load_model`` and runs ``senteval.SentEval`` on the
    comma-separated task list.

    Args:
        arguments: list of command-line argument strings, e.g. ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # Logistics
    parser.add_argument("--cuda", help="CUDA id to use", type=int, default=0)
    parser.add_argument("--use_pytorch",
                        help="1 to use PyTorch",
                        type=int,
                        default=1)
    parser.add_argument("--log_file", help="File to log to", type=str)
    parser.add_argument("--model_file",
                        help="File to load model from",
                        type=str)
    parser.add_argument("--dict_file", help="File to load dict from", type=str)

    # Task options
    parser.add_argument("--tasks",
                        help="Tasks to evaluate on, as a comma separated list",
                        type=str)
    parser.add_argument("--max_seq_len",
                        help="Max sequence length",
                        type=int,
                        default=40)

    # Model options
    parser.add_argument("--batch_size",
                        help="Batch size to use",
                        type=int,
                        default=32)

    # Classifier options
    parser.add_argument("--cls_batch_size",
                        help="Batch size to use for classifier",
                        type=int,
                        default=32)

    args = parser.parse_args(arguments)

    # Log to stderr (basicConfig) and additionally to the requested file.
    logging.basicConfig(format='%(asctime)s : %(message)s',
                        level=logging.DEBUG)
    fileHandler = logging.FileHandler(args.log_file)
    logging.getLogger().addHandler(fileHandler)

    # Set params for SentEval.
    # Bug fix: honor --use_pytorch instead of hard-coding True
    # (default of 1 keeps the previous behavior).
    params_senteval = {
        'usepytorch': bool(args.use_pytorch),
        'task_path': PATH_TO_DATA,
        'max_seq_len': args.max_seq_len,
        'batch_size': args.batch_size
    }
    params_senteval['classifier'] = {
        'nhid': 0,
        'optim': 'adam',
        'batch_size': args.cls_batch_size,
        'tenacity': 5,
        'epoch_size': 4
    }
    params_senteval = dotdict(params_senteval)

    # Load the vocabulary and append a <pad> token at the end of the id space.
    # SECURITY NOTE: pickle.load executes arbitrary code — only load dict
    # files from trusted sources.
    with open(args.dict_file, 'rb') as fh:
        data = pkl.load(fh)
        word2idx = data[0]
    word2idx['<pad>'] = len(word2idx)
    n_words = len(word2idx)

    # Load model
    params_senteval.encoder = convsent.load_model(args.model_file,
                                                  n_words=n_words)
    params_senteval.word2idx = word2idx

    se = senteval.SentEval(params_senteval, batcher, prepare)
    tasks = args.tasks.split(',')
    results = se.eval(tasks)
    print(results)