import itertools
import json
import os
from collections import defaultdict
from pathlib import Path

import numpy as np
from tqdm import tqdm

# Project-level imports; module paths are assumed from the usages below and
# may differ from the package's actual layout.
import expertise
from expertise import utils
from expertise.dataset import Dataset
from expertise.evaluators.hits_at_k import eval_hits_at_k
from expertise.evaluators.mean_avg_precision import eval_map
from expertise.preprocessors.textrank import keyphrases
from expertise.utils.vocab import Vocab


def run_textrank(config):
    '''First define the dataset, vocabulary, and keyphrase extractor.'''
    experiment_path = os.path.dirname(config.experiment_dir)
    kps_dir = os.path.join(experiment_path, 'keyphrases')
    if not os.path.isdir(kps_dir):
        os.makedirs(kps_dir)
    config.update(kp_setup_dir=kps_dir)

    print('starting setup')
    dataset = Dataset(directory=config.dataset['directory'])
    textrank_vocab = Vocab()  # vocab built from textrank-based keyphrases
    full_vocab = Vocab()      # vocab built from the full text

    print('keyphrase extraction')
    textrank_kps_by_id = {}
    full_kps_by_id = {}

    all_archives = itertools.chain(
        dataset.submissions(return_batches=True),
        dataset.archives(return_batches=True))

    for archive_id, content_list in tqdm(
            all_archives,
            total=dataset.total_archive_count + dataset.submission_count):
        scored_kps = []
        full_kps = []
        for content in content_list:
            text = utils.content_to_text(content)
            top_tokens, full_tokens = keyphrases(
                text, include_scores=True, include_tokenlist=True)
            scored_kps.extend(top_tokens)
            full_kps.append(full_tokens)

        # Rank keyphrases by score, then keep the top unique ones.
        sorted_kps = [
            kp for kp, _
            in sorted(scored_kps, key=lambda x: x[1], reverse=True)]
        top_kps = []
        for kp in sorted_kps:
            if kp not in top_kps:
                top_kps.append(kp)
            if len(top_kps) >= config.max_num_keyphrases:
                break

        textrank_vocab.load_items(top_kps)
        full_vocab.load_items(
            [kp for archive in full_kps for kp in archive])

        assert archive_id not in textrank_kps_by_id
        textrank_kps_by_id[archive_id] = top_kps
        full_kps_by_id[archive_id] = full_kps

    utils.dump_pkl(
        os.path.join(config.kp_setup_dir, 'textrank_kps_by_id.pkl'),
        textrank_kps_by_id)
    utils.dump_pkl(
        os.path.join(config.kp_setup_dir, 'full_kps_by_id.pkl'),
        full_kps_by_id)
    utils.dump_pkl(
        os.path.join(config.kp_setup_dir, 'textrank_vocab.pkl'),
        textrank_vocab)
    utils.dump_pkl(
        os.path.join(config.kp_setup_dir, 'full_vocab.pkl'), full_vocab)

    return config
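# The `keyphrases` helper used above is defined elsewhere in this package.
# Below is a minimal, illustrative stand-in built on the `summa` TextRank
# library (an assumption, not the project's own extractor) that matches the
# (top_tokens, full_tokens) return shape run_textrank() expects.
import re

from summa import keywords as summa_keywords


def _keyphrases_sketch(text, include_scores=False, include_tokenlist=False):
    # summa returns [(phrase, score), ...] when scores=True.
    scored = summa_keywords.keywords(text, scores=True)
    top_tokens = scored if include_scores else [kp for kp, _ in scored]
    if include_tokenlist:
        # Full token list: a simple lowercase word tokenization.
        full_tokens = re.findall(r'[a-z]+', text.lower())
        return top_tokens, full_tokens
    return top_tokens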
def test(config):
    dataset = Dataset(**config.dataset)

    # Map each (forum, reviewer) pair to a binary relevance label.
    labels_by_reviewer_by_forum = defaultdict(dict)
    for bid in dataset.bids():
        label = 1 if bid.tag in dataset.positive_bid_values else 0
        labels_by_reviewer_by_forum[bid.forum][bid.signatures[0]] = label

    inferred_scores_path = os.path.join(
        config.infer_dir, config.name + '-scores.jsonl')

    labeled_data_list = []
    for data in utils.jsonl_reader(inferred_scores_path):
        forum = data['source_id']
        reviewer = data['target_id']
        score = float(data['score'])
        if not score >= 0.0:  # clamp negative scores (and NaN) to zero
            score = 0.0

        if reviewer in labels_by_reviewer_by_forum[forum]:
            label = labels_by_reviewer_by_forum[forum][reviewer]
            labeled_data = dict(data)
            labeled_data.update({'label': label, 'score': score})
            labeled_data_list.append(labeled_data)

    config.test_save(labeled_data_list, 'score_labels.jsonl')

    labels_file = config.test_path('score_labels.jsonl')
    list_of_list_of_labels, list_of_list_of_scores = utils.load_labels(
        labels_file)

    map_score = float(eval_map(list_of_list_of_labels, list_of_list_of_scores))
    hits_at_1 = float(
        eval_hits_at_k(list_of_list_of_labels, list_of_list_of_scores, k=1))
    hits_at_3 = float(
        eval_hits_at_k(list_of_list_of_labels, list_of_list_of_scores, k=3))
    hits_at_5 = float(
        eval_hits_at_k(list_of_list_of_labels, list_of_list_of_scores, k=5))
    hits_at_10 = float(
        eval_hits_at_k(list_of_list_of_labels, list_of_list_of_scores, k=10))

    score_lines = [
        [config.name, text, data] for text, data in [
            ('MAP', map_score),
            ('Hits@1', hits_at_1),
            ('Hits@3', hits_at_3),
            ('Hits@5', hits_at_5),
            ('Hits@10', hits_at_10)]]
    config.test_save(score_lines, 'test.scores.tsv')
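# eval_map and eval_hits_at_k are imported metric helpers; plausible
# reference implementations are sketched below, assuming each inner list
# holds one submission's reviewer labels/scores. The project's own
# definitions may differ (e.g. Hits@k as a fraction rather than a binary
# hit per submission).
def _eval_hits_at_k_sketch(list_of_list_of_labels, list_of_list_of_scores,
                           k=1):
    hits = []
    for labels, scores in zip(list_of_list_of_labels, list_of_list_of_scores):
        top_k = np.argsort(scores)[::-1][:k]  # indices of the k best scores
        hits.append(1.0 if any(labels[i] for i in top_k) else 0.0)
    return float(np.mean(hits))


def _eval_map_sketch(list_of_list_of_labels, list_of_list_of_scores):
    average_precisions = []
    for labels, scores in zip(list_of_list_of_labels, list_of_list_of_scores):
        ranked = [labels[i] for i in np.argsort(scores)[::-1]]
        # Precision at each rank where a relevant reviewer appears.
        precisions = [sum(ranked[:i + 1]) / (i + 1)
                      for i, hit in enumerate(ranked) if hit]
        average_precisions.append(np.mean(precisions) if precisions else 0.0)
    return float(np.mean(average_precisions))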
def infer(config):
    experiment_dir = Path(config['experiment_dir']).resolve()
    model = utils.load_pkl(config['tfidf_model'])
    dataset = Dataset(**config['dataset'])

    paperids = list(model.bow_archives_by_paperid.keys())
    score_file_path = experiment_dir.joinpath(config['name'] + '-scores.csv')

    reviewer_ids = list(dataset.reviewer_ids)

    # For each reviewer, score every paper against every document in the
    # reviewer's archive and keep each paper's best-matching document.
    scores = {}
    max_score = 0.0
    for userid in reviewer_ids:
        # bow_archive is a list of BOWs; fall back to a single empty BOW.
        if userid in model.bow_archives_by_userid and len(
                model.bow_archives_by_userid[userid]) > 0:
            bow_archive = model.bow_archives_by_userid[userid]
        else:
            bow_archive = [[]]

        best_scores = np.amax(model.index[bow_archive], axis=0)
        scores[userid] = best_scores

        user_max_score = max(best_scores)
        if user_max_score > max_score:
            max_score = user_max_score

    print('max score', max_score)

    # Normalize by the global maximum; guard against an all-zero index.
    if max_score <= 0.0:
        max_score = 1.0

    with open(score_file_path, 'w') as w:
        for userid, user_scores in scores.items():
            for paperidx, paper_score in enumerate(user_scores):
                paperid = paperids[paperidx]
                score = paper_score / max_score
                w.write('{0},{1},{2:.3f}\n'.format(paperid, userid, score))

    return config
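def _tfidf_scoring_demo():
    """Toy illustration of the scoring step in infer(). It assumes gensim
    backs `model.index` (an assumption; the real model is pickled
    elsewhere): indexing a similarity index over the papers' BOWs with a
    reviewer's archive yields one row of per-paper similarities per archive
    document, and np.amax(..., axis=0) keeps each paper's best match."""
    from gensim import corpora, similarities

    papers = [['deep', 'learning'], ['graph', 'neural', 'networks']]
    archive = [['deep', 'networks'], ['graph', 'matching']]  # one reviewer
    dictionary = corpora.Dictionary(papers + archive)
    # One similarity row per paper; querying with a reviewer's BOWs yields
    # a (num_archive_docs, num_papers) score matrix.
    index = similarities.SparseMatrixSimilarity(
        [dictionary.doc2bow(d) for d in papers],
        num_features=len(dictionary))
    bow_archive = [dictionary.doc2bow(d) for d in archive]
    best_scores = np.amax(index[bow_archive], axis=0)  # shape: (num_papers,)
    return best_scores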
def setup(config):
    print('starting setup')
    dataset = Dataset(**config.dataset)
    bids_by_forum = utils.get_bids_by_forum(dataset)

    vocab = utils.load_pkl(
        os.path.join(config.kp_setup_dir, 'textrank_vocab.pkl'))

    (train_set_ids,
     dev_set_ids,
     test_set_ids) = utils.split_ids(
        list(dataset.submission_ids), seed=config.random_seed)

    def fold_reader(fold_id):
        fold_path = os.path.join(
            config.kp_setup_dir, 'folds', f'{fold_id}.jsonl')
        return utils.jsonl_reader(fold_path)

    train_folds = [fold_reader(i) for i in train_set_ids]
    dev_folds = [fold_reader(i) for i in dev_set_ids]
    test_folds = [fold_reader(i) for i in test_set_ids]

    # Convert each fold's records into model-ready samples, lazily.
    train_samples = (data_to_sample(data, vocab, config.max_num_keyphrases)
                     for data in itertools.chain(*train_folds))
    train_samples_path = os.path.join(config.setup_dir, 'train_samples.jsonl')
    utils.dump_jsonl(train_samples_path, train_samples)

    dev_samples = (data_to_sample(data, vocab, config.max_num_keyphrases)
                   for data in itertools.chain(*dev_folds))
    dev_samples_path = os.path.join(config.setup_dir, 'dev_samples.jsonl')
    utils.dump_jsonl(dev_samples_path, dev_samples)

    test_samples = (data_to_sample(data, vocab, config.max_num_keyphrases)
                    for data in itertools.chain(*test_folds))
    test_samples_path = os.path.join(config.setup_dir, 'test_samples.jsonl')
    utils.dump_jsonl(test_samples_path, test_samples)

    # features_dir = './scibert_features/akbc19/setup/archives-features/'
    features_dir = config.bert_features_dir
    archive_features_dir = os.path.join(features_dir, 'archives-features')
    submission_features_dir = os.path.join(
        features_dir, 'submissions-features')
def setup(config):
    print('starting setup')

    setup_dir = os.path.join(config.experiment_dir, 'setup')
    if not os.path.exists(setup_dir):
        os.mkdir(setup_dir)

    dataset = Dataset(**config.dataset)
    vocab = utils.load_pkl(
        os.path.join(config.kp_setup_dir, 'textrank_vocab.pkl'))

    (train_set_ids,
     dev_set_ids,
     test_set_ids) = utils.split_ids(
        list(dataset.submission_ids), seed=config.random_seed)

    def fold_reader(fold_id):
        fold_path = os.path.join(config.bpr_samples, f'{fold_id}.jsonl')
        return utils.jsonl_reader(fold_path)

    train_folds = [fold_reader(i) for i in train_set_ids]
    dev_folds = [fold_reader(i) for i in dev_set_ids]
    test_folds = [fold_reader(i) for i in test_set_ids]

    train_samples = (data_to_sample(data, vocab, config.max_num_keyphrases)
                     for data in itertools.chain(*train_folds))
    train_samples_path = os.path.join(setup_dir, 'train_samples.jsonl')
    utils.dump_jsonl(train_samples_path, train_samples)

    dev_samples = (data_to_sample(data, vocab, config.max_num_keyphrases)
                   for data in itertools.chain(*dev_folds))
    dev_samples_path = os.path.join(setup_dir, 'dev_samples.jsonl')
    utils.dump_jsonl(dev_samples_path, dev_samples)

    test_samples = (data_to_sample(data, vocab, config.max_num_keyphrases)
                    for data in itertools.chain(*test_folds))
    test_samples_path = os.path.join(setup_dir, 'test_samples.jsonl')
    utils.dump_jsonl(test_samples_path, test_samples)

    return config
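# `data_to_sample` (used by both setup() variants above) is defined
# elsewhere; a plausible minimal sketch follows. The field names and the
# Vocab.to_ints() lookup are assumptions based on how the samples are
# consumed downstream, not the project's actual implementation.
def _data_to_sample_sketch(data, vocab, max_num_keyphrases):
    def encode(keyphrase_list):
        # Map keyphrases to vocab ids, truncated to the configured cap.
        return vocab.to_ints(keyphrase_list)[:max_num_keyphrases]

    return {
        'source': encode(data['source']),      # the submission's keyphrases
        'positive': encode(data['positive']),  # a positively-bidding reviewer
        'negative': encode(data['negative']),  # a sampled negative reviewer
    }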
def setup(config):
    assert os.path.exists(config.tpms_scores_file), \
        'This model requires a pre-computed TPMS score file.'

    dataset = Dataset(**config.dataset)

    experiment_dir = os.path.abspath(config.experiment_dir)
    setup_dir = os.path.join(experiment_dir, 'setup')
    if not os.path.exists(setup_dir):
        os.mkdir(setup_dir)
    config.update(setup_dir=setup_dir)  # register the directory used below

    (train_set_ids,
     dev_set_ids,
     test_set_ids) = utils.split_ids(
        list(dataset.submission_ids), seed=config.random_seed)

    bids_by_forum = utils.get_bids_by_forum(dataset)
    test_labels = utils.format_bid_labels(test_set_ids, bids_by_forum)

    utils.dump_jsonl(
        os.path.join(config.setup_dir, 'test_labels.jsonl'), test_labels)

    return config
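# `format_bid_labels` pairs each held-out submission with the reviewers who
# bid on it. A plausible sketch, assuming bids_by_forum maps a forum id to
# its bid objects and that `positive_bid_values` marks positive tags (the
# real helper lives in utils and may differ):
def _format_bid_labels_sketch(submission_ids, bids_by_forum,
                              positive_bid_values=('High', 'Very High')):
    for forum in submission_ids:
        for bid in bids_by_forum.get(forum, []):
            yield {
                'source_id': forum,
                'target_id': bid.signatures[0],
                'label': 1 if bid.tag in positive_bid_values else 0,
            }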
def test(config):
    dataset = Dataset(**config.dataset)
    model = expertise.utils.load_pkl(
        os.path.join(config.train_dir, 'model.pkl'))

    paperidx_by_id = {
        paperid: index for index, paperid
        in enumerate(model.bow_archives_by_paperid.keys())
    }

    test_dir = os.path.join(config.experiment_dir, 'test')
    if not os.path.isdir(test_dir):
        os.mkdir(test_dir)
    config.update(test_dir=test_dir)

    score_file_path = os.path.join(config.test_dir, 'test_scores.jsonl')
    labels_file_path = os.path.join(config.setup_dir, 'test_labels.jsonl')

    scores = {}
    with open(score_file_path, 'w') as w:
        for data in expertise.utils.jsonl_reader(labels_file_path):
            paperid = data['source_id']
            userid = data['target_id']
            label = data['label']

            if userid not in scores:
                # bow_archive is a list of BOWs; fall back to one empty BOW.
                if userid in model.bow_archives_by_userid and len(
                        model.bow_archives_by_userid[userid]) > 0:
                    bow_archive = model.bow_archives_by_userid[userid]
                else:
                    bow_archive = [[]]
                best_scores = np.amax(model.index[bow_archive], axis=0)
                scores[userid] = best_scores

            if paperid in paperidx_by_id:
                paper_index = paperidx_by_id[paperid]
                score = scores[userid][paper_index]
                result = {
                    'source_id': paperid,
                    'target_id': userid,
                    'score': float(score),
                    'label': int(label)
                }
                w.write(json.dumps(result) + '\n')

    (list_of_list_of_labels,
     list_of_list_of_scores) = expertise.utils.load_labels(score_file_path)

    map_score = float(eval_map(list_of_list_of_labels, list_of_list_of_scores))
    hits_at_1 = float(
        eval_hits_at_k(list_of_list_of_labels, list_of_list_of_scores, k=1))
    hits_at_3 = float(
        eval_hits_at_k(list_of_list_of_labels, list_of_list_of_scores, k=3))
    hits_at_5 = float(
        eval_hits_at_k(list_of_list_of_labels, list_of_list_of_scores, k=5))
    hits_at_10 = float(
        eval_hits_at_k(list_of_list_of_labels, list_of_list_of_scores, k=10))

    score_lines = [
        [config.name, text, data] for text, data in [
            ('MAP', map_score),
            ('Hits@1', hits_at_1),
            ('Hits@3', hits_at_3),
            ('Hits@5', hits_at_5),
            ('Hits@10', hits_at_10)]]
    expertise.utils.dump_csv(
        os.path.join(config.test_dir, 'test.scores.tsv'), score_lines)
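# Both test() functions rely on `utils.load_labels` to turn per-pair score
# records into parallel per-submission lists for the rankers. A plausible
# sketch (grouping key and field names inferred from the records written
# above; the project's own version may differ):
def _load_labels_sketch(labels_file):
    by_forum = defaultdict(list)
    for data in utils.jsonl_reader(labels_file):
        by_forum[data['source_id']].append((data['label'], data['score']))

    list_of_list_of_labels = []
    list_of_list_of_scores = []
    for pairs in by_forum.values():
        list_of_list_of_labels.append([label for label, _ in pairs])
        list_of_list_of_scores.append([score for _, score in pairs])
    return list_of_list_of_labels, list_of_list_of_scores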