Example 1
def test(config):
    print('config.best_model_path', config.best_model_path)
    model = torch.load(config.best_model_path)

    test_samples_path = os.path.join(
        config.setup_dir, 'test_samples.jsonl')

    # test_samples_path is already a full path under setup_dir, so it is passed
    # to the Batcher directly (as in the train functions below).
    batcher = Batcher(input_file=test_samples_path)
    # a lookup table of torch.Tensor objects, keyed by user/paper ID.
    bert_lookup = utils.load_pkl(os.path.join(config.kp_setup_dir, 'bert_lookup.pkl'))

    predictions = centroid_scibert.generate_predictions(config, model, batcher, bert_lookup)

    prediction_filename = config.test_save(predictions,
        'test.predictions.jsonl')

    print('prediction filename', prediction_filename)
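    # Evaluate ranking quality of the test predictions: mean average precision
    # (MAP) and Hits@k for k in {1, 3, 5, 10}.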
    map_score = float(centroid_scibert.eval_map_file(prediction_filename))
    hits_at_1 = float(centroid_scibert.eval_hits_at_k_file(prediction_filename, 1))
    hits_at_3 = float(centroid_scibert.eval_hits_at_k_file(prediction_filename, 3))
    hits_at_5 = float(centroid_scibert.eval_hits_at_k_file(prediction_filename, 5))
    hits_at_10 = float(centroid_scibert.eval_hits_at_k_file(prediction_filename, 10))

    score_lines = [
        [config.name, text, data] for text, data in [
            ('MAP', map_score),
            ('Hits@1', hits_at_1),
            ('Hits@3', hits_at_3),
            ('Hits@5', hits_at_5),
            ('Hits@10', hits_at_10)
        ]
    ]
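    # Note (assumption): config.test_save is expected to write score_lines as
    # tab-separated rows, one metric per row, e.g.
    #   <config.name>    MAP       0.4123
    #   <config.name>    Hits@1    0.3001
    # (the values shown are hypothetical).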
    config.test_save(score_lines, 'test.scores.tsv')
Example 2
def infer(config):
    experiment_dir = Path(config['experiment_dir']).resolve()

    model = utils.load_pkl(config['tfidf_model'])

    dataset = Dataset(**config['dataset'])

    paperids = list(model.bow_archives_by_paperid.keys())
    paperidx_by_id = {paperid: index for index, paperid in enumerate(paperids)}

    score_file_path = experiment_dir.joinpath(config['name'] + '-scores.csv')

    bids_by_forum = expertise.utils.get_bids_by_forum(dataset)
    submission_ids = list(dataset.submission_ids)
    reviewer_ids = list(dataset.reviewer_ids)
    # samples = expertise.utils.format_bid_labels(submission_ids, bids_by_forum)

    scores = {}
    max_score = 0.0
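    # Score each reviewer once: take the element-wise max similarity of the
    # reviewer's bag-of-words archive against every indexed paper, and track
    # the global maximum for later normalization.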
    for paperid, userid in itertools.product(submission_ids, reviewer_ids):

        if userid not in scores:
            # bow_archive is a list of BOWs.
            if userid in model.bow_archives_by_userid and len(
                    model.bow_archives_by_userid[userid]) > 0:
                bow_archive = model.bow_archives_by_userid[userid]
            else:
                bow_archive = [[]]

            best_scores = np.amax(model.index[bow_archive], axis=0)
            scores[userid] = best_scores

            user_max_score = max(best_scores)
            if user_max_score > max_score:
                max_score = user_max_score

    print('max score', max_score)

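    # Write one (paper id, reviewer id, score) row per pair, normalizing each
    # score by the global maximum.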
    with open(score_file_path, 'w') as w:
        for userid, user_scores in scores.items():
            for paperidx, paper_score in enumerate(user_scores):
                paperid = paperids[paperidx]
                # Guard against a zero max_score (e.g. all-empty archives).
                score = paper_score / max_score if max_score > 0 else 0.0

                w.write('{0},{1},{2:.3f}'.format(paperid, userid, score))
                w.write('\n')

    return config
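Example 3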
def setup(config):

    print('starting setup')
    dataset = Dataset(**config.dataset)
    bids_by_forum = utils.get_bids_by_forum(dataset)
    vocab = utils.load_pkl(
        os.path.join(config.kp_setup_dir, 'textrank_vocab.pkl'))

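    # Split the submission ids into train/dev/test folds, seeded for
    # reproducibility.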
    (train_set_ids, dev_set_ids,
     test_set_ids) = utils.split_ids(list(dataset.submission_ids),
                                     seed=config.random_seed)

    def fold_reader(id):
        fold_file = f'{id}.jsonl'
        fold_path = os.path.join(config.kp_setup_dir, 'folds', fold_file)
        return utils.jsonl_reader(fold_path)

    train_folds = [fold_reader(i) for i in train_set_ids]
    dev_folds = [fold_reader(i) for i in dev_set_ids]
    test_folds = [fold_reader(i) for i in test_set_ids]

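    # Convert each fold record into a sample with data_to_sample and stream
    # each split to its own JSONL file; generators avoid holding a full split
    # in memory.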
    train_samples = (data_to_sample(data, vocab, config.max_num_keyphrases)
                     for data in itertools.chain(*train_folds))

    train_samples_path = os.path.join(config.setup_dir, 'train_samples.jsonl')

    utils.dump_jsonl(train_samples_path, train_samples)

    dev_samples = (data_to_sample(data, vocab, config.max_num_keyphrases)
                   for data in itertools.chain(*dev_folds))

    dev_samples_path = os.path.join(config.setup_dir, 'dev_samples.jsonl')

    utils.dump_jsonl(dev_samples_path, dev_samples)

    test_samples = (data_to_sample(data, vocab, config.max_num_keyphrases)
                    for data in itertools.chain(*test_folds))

    test_samples_path = os.path.join(config.setup_dir, 'test_samples.jsonl')

    utils.dump_jsonl(test_samples_path, test_samples)

    # features_dir = './scibert_features/akbc19/setup/archives-features/'
    features_dir = config.bert_features_dir
    archive_features_dir = os.path.join(features_dir, 'archives-features')
    submission_features_dir = os.path.join(features_dir,
                                           'submissions-features')
Example 4
def setup(config):
    print('starting setup')
    setup_dir = os.path.join(config.experiment_dir, 'setup')
    if not os.path.exists(setup_dir):
        os.mkdir(setup_dir)

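    # Load the dataset and the TextRank keyphrase vocabulary from
    # config.kp_setup_dir.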
    dataset = Dataset(**config.dataset)
    vocab = utils.load_pkl(
        os.path.join(config.kp_setup_dir, 'textrank_vocab.pkl'))

    (train_set_ids, dev_set_ids,
     test_set_ids) = utils.split_ids(list(dataset.submission_ids),
                                     seed=config.random_seed)

    def fold_reader(id):
        fold_file = f'{id}.jsonl'
        fold_path = os.path.join(config.bpr_samples, fold_file)
        return utils.jsonl_reader(fold_path)

    train_folds = [fold_reader(i) for i in train_set_ids]
    dev_folds = [fold_reader(i) for i in dev_set_ids]
    test_folds = [fold_reader(i) for i in test_set_ids]

    train_samples = (data_to_sample(data, vocab, config.max_num_keyphrases)
                     for data in itertools.chain(*train_folds))

    train_samples_path = os.path.join(setup_dir, 'train_samples.jsonl')

    utils.dump_jsonl(train_samples_path, train_samples)

    dev_samples = (data_to_sample(data, vocab, config.max_num_keyphrases)
                   for data in itertools.chain(*dev_folds))

    dev_samples_path = os.path.join(setup_dir, 'dev_samples.jsonl')

    utils.dump_jsonl(dev_samples_path, dev_samples)

    test_samples = (data_to_sample(data, vocab, config.max_num_keyphrases)
                    for data in itertools.chain(*test_folds))

    test_samples_path = os.path.join(setup_dir, 'test_samples.jsonl')

    utils.dump_jsonl(test_samples_path, test_samples)

    return config
Example 5
def setup_bert_kps_lookup(config):

    print('starting setup')
    # features_dir = config.bert_features_dir
    archive_features_dir = os.path.join(config.experiment_dir, 'setup',
                                        'archives-features')
    submission_features_dir = os.path.join(config.experiment_dir, 'setup',
                                           'submissions-features')
    textrank_kps = utils.load_pkl(
        os.path.join(config.setup_dir, 'textrank_kps_by_id.pkl'))

    bert_lookup = {}

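    # For every archive/submission feature file, collect the BERT vector of
    # each TextRank keyphrase found among the document features (first
    # occurrence only), then truncate/pad to exactly max_num_keyphrases rows.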
    for target_dir in [archive_features_dir, submission_features_dir]:
        for filename in os.listdir(target_dir):
            print(filename)
            item_id = filename.replace('.npy', '')
            filepath = os.path.join(target_dir, filename)
            # The saved feature arrays contain Python dicts, so allow_pickle is
            # required with recent numpy versions.
            archives = np.load(filepath, allow_pickle=True)

            document_kps = textrank_kps[item_id]
            kps_seen = []
            kp_features = []

            for document in archives:
                for feature in document['features']:
                    token = feature['token']
                    if token in document_kps and token not in kps_seen:
                        kps_seen.append(token)
                        kp_features.append(feature['layers'][-1]['values'])

            kp_features = kp_features[:config.max_num_keyphrases]

            while len(kp_features) < config.max_num_keyphrases:
                kp_features.append(np.zeros(config.bert_dim))

            result = np.array(kp_features)
            bert_lookup[item_id] = torch.Tensor(result)

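    # Note (assumption): the train/test functions below load this table from
    # kp_setup_dir/bert_lookup.pkl, so the returned dict is presumably pickled
    # there by the caller; the dump helper is not shown in these examples.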
    return bert_lookup
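Example 6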
def train(config):

    for train_subdir in ['dev_scores', 'dev_predictions']:
        train_subdir_path = os.path.join(config.train_dir, train_subdir)
        if not os.path.exists(train_subdir_path):
            os.mkdir(train_subdir_path)

    vocab_path = os.path.join(config.kp_setup_dir, 'textrank_vocab.pkl')
    vocab = utils.load_pkl(vocab_path)

    torch.manual_seed(config.random_seed)

    train_samples_path = os.path.join(config.setup_dir, 'train_samples.jsonl')

    dev_samples_path = os.path.join(config.setup_dir, 'dev_samples.jsonl')

    print('reading train samples from ', train_samples_path)
    batcher = Batcher(input_file=train_samples_path)
    batcher_dev = Batcher(input_file=dev_samples_path)

    model = centroid_scibert.Model(config, vocab)
    if config.use_cuda:
        model = model.cuda()

    optimizer = optim.Adam(model.parameters(),
                           lr=config.learning_rate,
                           weight_decay=config.l2penalty)

    # Stats
    best_map = 0
    sum_loss = 0.0

    # a lookup table of torch.Tensor objects, keyed by user/paper ID.
    bert_lookup = utils.load_pkl(
        os.path.join(config.kp_setup_dir, 'bert_lookup.pkl'))

    print('Begin Training')

    # Training loop
    for counter, batch in enumerate(
            batcher.batches(batch_size=config.batch_size)):

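        # Each training sample is a triplet of ids (source, positive, negative);
        # gather the precomputed BERT keyphrase tensors for each and stack them
        # into batch tensors for the triplet loss.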
        batch_source = []
        batch_pos = []
        batch_neg = []

        for data in batch:
            batch_source.append(bert_lookup[data['source_id']])
            batch_pos.append(bert_lookup[data['positive_id']])
            batch_neg.append(bert_lookup[data['negative_id']])

        print('num_batches: {}'.format(counter))
        optimizer.zero_grad()

        loss_parameters = (torch.stack(batch_source), torch.stack(batch_pos),
                           torch.stack(batch_neg))

        loss = model.compute_loss(*loss_parameters)
        loss.backward()

        # torch.nn.utils.clip_grad_norm(model.parameters(), config.clip)
        optimizer.step()

        # Monitoring only: every 100 batches, report the current batch loss and
        # the running average over the losses logged so far.
        if counter % 100 == 0:

            this_loss = loss.cpu().data.numpy()
            sum_loss += this_loss

            print(
                'Processed {} batches, Loss of batch {}: {}. Average loss: {}'.
                format(counter, counter, this_loss,
                       sum_loss / (counter // 100 + 1)))

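        # Periodic dev-set evaluation: generate predictions, compute MAP/Hits@k,
        # and checkpoint the model whenever dev MAP improves.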
        if counter % config.eval_every == 0:

            # Reset the dev batcher so evaluation always iterates the dev set
            # from the beginning.
            batcher_dev.reset()

            predictions = centroid_scibert.generate_predictions(
                config, model, batcher_dev, bert_lookup)

            prediction_filename = config.train_save(
                predictions,
                'dev_predictions/dev.predictions.{}.jsonl'.format(counter))

            print('prediction filename', prediction_filename)
            map_score = float(
                centroid_scibert.eval_map_file(prediction_filename))
            hits_at_1 = float(
                centroid_scibert.eval_hits_at_k_file(prediction_filename, 1))
            hits_at_3 = float(
                centroid_scibert.eval_hits_at_k_file(prediction_filename, 3))
            hits_at_5 = float(
                centroid_scibert.eval_hits_at_k_file(prediction_filename, 5))
            hits_at_10 = float(
                centroid_scibert.eval_hits_at_k_file(prediction_filename, 10))

            score_lines = [
                [config.name, counter, text, data] for text, data in [
                    ('MAP', map_score),
                    ('Hits@1', hits_at_1),
                    ('Hits@3', hits_at_3),
                    ('Hits@5', hits_at_5),
                    ('Hits@10', hits_at_10)
                ]
            ]
            config.train_save(score_lines,
                              'dev_scores/dev.scores.{}.tsv'.format(counter))

            if map_score > best_map:
                best_map = map_score

                best_model_path = os.path.join(
                    config.train_dir,
                    'model_{}_{}.torch'.format(config.name, 'best'))

                torch.save(model, best_model_path)
                config.best_model_path = best_model_path
                config.best_map_score = best_map
                config.hits_at_1 = hits_at_1
                config.hits_at_3 = hits_at_3
                config.hits_at_5 = hits_at_5
                config.hits_at_10 = hits_at_10
                config.save_config()

                config.train_save(score_lines, 'dev.scores.best.tsv')

        if counter == config.num_minibatches:
            break
Example 7
def train(config):

    train_dir = os.path.join(config.experiment_dir, 'train')
    if not os.path.isdir(train_dir):
        os.mkdir(train_dir)

    for train_subdir in ['dev_scores', 'dev_predictions']:
        train_subdir_path = os.path.join(train_dir, train_subdir)
        if not os.path.exists(train_subdir_path):
            os.mkdir(train_subdir_path)

    vocab_file = os.path.join(config.kp_setup_dir, 'textrank_vocab.pkl')
    vocab = utils.load_pkl(vocab_file)

    torch.manual_seed(config.random_seed)

    batcher = Batcher(input_file=os.path.join(config.experiment_dir, 'setup',
                                              'train_samples.jsonl'))
    batcher_dev = Batcher(input_file=os.path.join(
        config.experiment_dir, 'setup', 'dev_samples.jsonl'))

    model = centroid.Model(config, vocab)
    if config.use_cuda:
        model = model.cuda()

    optimizer = optim.Adam(model.parameters(),
                           lr=config.learning_rate,
                           weight_decay=config.l2penalty)

    # Stats
    best_map = 0
    sum_loss = 0.0

    print('Begin Training')

    # Training loop
    for counter, batch in enumerate(
            batcher.batches(batch_size=config.batch_size)):
        batch_source = []
        batch_pos = []
        batch_neg = []
        batch_source_lens = []
        batch_pos_lens = []
        batch_neg_lens = []

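        # Collect the keyphrase-id sequences for each (source, positive,
        # negative) triplet along with their true lengths, as numpy arrays.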
        for data in batch:
            batch_source.append(np.asarray(data['source']))
            batch_pos.append(np.asarray(data['positive']))
            batch_neg.append(np.asarray(data['negative']))
            batch_source_lens.append(
                np.asarray(data['source_length'], dtype=np.float32))
            batch_pos_lens.append(
                np.asarray(data['positive_length'], dtype=np.float32))
            batch_neg_lens.append(
                np.asarray(data['negative_length'], dtype=np.float32))

        print('num_batches: {}'.format(counter))
        optimizer.zero_grad()

        loss_parameters = (np.asarray(batch_source), np.asarray(batch_pos),
                           np.asarray(batch_neg),
                           np.asarray(batch_source_lens, dtype=np.float32),
                           np.asarray(batch_pos_lens, dtype=np.float32),
                           np.asarray(batch_neg_lens, dtype=np.float32))

        loss = model.compute_loss(*loss_parameters)
        loss.backward()

        # torch.nn.utils.clip_grad_norm(model.parameters(), config.clip)
        optimizer.step()

        # Monitoring only: every 100 batches, report the current batch loss and
        # the running average over the losses logged so far.
        if counter % 100 == 0:

            this_loss = loss.cpu().data.numpy()
            sum_loss += this_loss

            print(
                'Processed {} batches, Loss of batch {}: {}. Average loss: {}'.
                format(counter, counter, this_loss,
                       sum_loss / (counter // 100 + 1)))

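        # Periodic dev-set evaluation: generate predictions with the centroid
        # model, compute MAP/Hits@k, and checkpoint whenever dev MAP improves.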
        if counter % config.eval_every == 0:

            # Reset the dev batcher so evaluation always iterates the dev set
            # from the beginning.
            batcher_dev.reset()

            predictions = centroid.generate_predictions(
                config, model, batcher_dev)

            prediction_filename = os.path.join(
                train_dir,
                'dev_predictions/dev.predictions.{}.jsonl'.format(counter))

            utils.dump_jsonl(prediction_filename, predictions)

            print('prediction filename', prediction_filename)
            map_score = float(centroid.eval_map_file(prediction_filename))
            hits_at_1 = float(
                centroid.eval_hits_at_k_file(prediction_filename, 1))
            hits_at_3 = float(
                centroid.eval_hits_at_k_file(prediction_filename, 3))
            hits_at_5 = float(
                centroid.eval_hits_at_k_file(prediction_filename, 5))
            hits_at_10 = float(
                centroid.eval_hits_at_k_file(prediction_filename, 10))

            score_lines = [
                [config.name, counter, text, data] for text, data in [
                    ('MAP', map_score),
                    ('Hits@1', hits_at_1),
                    ('Hits@3', hits_at_3),
                    ('Hits@5', hits_at_5),
                    ('Hits@10', hits_at_10)
                ]
            ]
            dev_scores_file = os.path.join(
                train_dir, 'dev_scores/dev.scores.{}.tsv'.format(counter))
            utils.dump_csv(dev_scores_file, score_lines)

            if map_score > best_map:
                best_map = map_score

                best_model_path = os.path.join(
                    train_dir, 'model_{}_{}.torch'.format(config.name, 'best'))

                torch.save(model, best_model_path)
                config.update(best_model_path=best_model_path)

                best_scores_file = os.path.join(train_dir,
                                                'dev.scores.best.tsv')

                utils.dump_csv(best_scores_file, score_lines)

        if counter == config.num_minibatches:
            return config
    return config