def build_test_corpus(source_file, target_file):
    # Remove any stale corpus DB before rebuilding.
    try:
        os.unlink(target_file)
    except OSError:
        pass

    # Write 100 synthetic papers as JSON lines, then build the corpus DB from them.
    with open(source_file, 'w') as tf:
        for i in range(100):
            json.dump(
                {
                    FieldNames.TITLE: ' '.join(random.sample(WORDS, 10)),
                    FieldNames.ABSTRACT: ' '.join(random.sample(WORDS, 1000)),
                    FieldNames.AUTHORS: [],
                    FieldNames.OUT_CITATIONS: [
                        str(x) for x in random.sample(range(100), 2)
                    ],
                    FieldNames.IN_CITATION_COUNT: len(
                        [str(x) for x in random.sample(range(100), 2)]
                    ),
                    FieldNames.KEY_PHRASES: random.sample(WORDS, 3),
                    FieldNames.YEAR: 2011,
                    FieldNames.PAPER_ID: str(i),
                    FieldNames.VENUE: 'v-{}'.format(random.randint(1, 5))
                },
                tf
            )
            tf.write('\n')

    Corpus.build(target_file, source_file)
def _verify(db_filename, corpus_json):
    try:
        Corpus.build(db_filename=db_filename, source_json=corpus_json)
    except Exception as e:
        logging.critical(
            "Failed to build corpus {} for file {}".format(
                db_filename, corpus_json
            )
        )
        print(e)
def main(self, args):
    dp = DatasetPaths()
    if self.dataset_type == 'oc':
        corpus = Corpus.load_pkl(dp.get_pkl_path(self.dataset_type))
    else:
        corpus = Corpus.load(dp.get_db_path(self.dataset_type))

    authors = Counter()
    key_phrases = Counter()
    years = Counter()
    venues = Counter()
    num_docs_with_kp = 0
    in_citations_counts = []
    out_citations_counts = []

    for doc in corpus:
        authors.update(doc.authors)
        key_phrases.update(doc.key_phrases)
        if len(doc.key_phrases) > 0:
            num_docs_with_kp += 1
        in_citations_counts.append(doc.in_citation_count)
        out_citations_counts.append(doc.out_citation_count)
        years.update([doc.year])
        venues.update([doc.venue])

    training_years = [corpus[doc_id].year for doc_id in corpus.train_ids]
    validation_years = [corpus[doc_id].year for doc_id in corpus.valid_ids]
    testing_years = [corpus[doc_id].year for doc_id in corpus.test_ids]

    print("No. of documents = {}".format(len(corpus)))
    print("Unique number of authors = {}".format(len(authors)))
    print("Unique number of key phrases = {}".format(len(key_phrases)))
    print("Unique number of venues = {}".format(len(venues)))
    print("No. of docs with key phrases = {}".format(num_docs_with_kp))
    print("Average in citations = {} (+/- {})".format(
        np.mean(in_citations_counts), np.std(in_citations_counts)))
    print("Average out citations = {} (+/- {})".format(
        np.mean(out_citations_counts), np.std(out_citations_counts)))
    print("No. of training examples = {} ({} to {})".format(
        len(corpus.train_ids), np.min(training_years), np.max(training_years)))
    print("No. of validation examples = {} ({} to {})".format(
        len(corpus.valid_ids), np.min(validation_years), np.max(validation_years)))
    print("No. of testing examples = {} ({} to {})".format(
        len(corpus.test_ids), np.min(testing_years), np.max(testing_years)))
    print(authors.most_common(10))
def test_corpus_iterator():
    corpus = Corpus.load('/tmp/foo.sqlite')
    iter_ids = []
    for doc in corpus:
        iter_ids.append(doc.id)
    overlap_n = len(set(iter_ids).intersection(set(corpus.all_ids)))
    assert overlap_n == corpus.n_docs
def test_featurizer_and_data_gen():
    build_test_corpus('/tmp/foo.json', '/tmp/foo.sqlite')
    corpus = Corpus.load('/tmp/foo.sqlite')

    featurizer = features.Featurizer()
    featurizer.fit(corpus, max_df_frac=1.0)

    dg = features.DataGenerator(corpus, featurizer)
    gen = dg.triplet_generator(
        paper_ids=corpus.all_ids,
        candidate_ids=corpus.all_ids,
        batch_size=128,
        neg_to_pos_ratio=5
    )

    # make sure we can get features
    for i in range(10):
        print(i)
        X, y = next(gen)

    # correct batch size
    assert len(y) >= 128
    # positives, hard negatives, easy negatives
    assert len(np.unique(y)) == 3
    # correct padding
    assert X['query-abstract-txt'].shape[1] == featurizer.max_abstract_len
    assert X['query-title-txt'].shape[1] == featurizer.max_title_len
    # no new words
    assert set(featurizer.word_indexer.word_to_index.keys()).difference(WORDS) == set()

    q, ex, labels = next(dg._listwise_examples(corpus.all_ids, corpus.all_ids))

    # query id should not be in candidates
    assert q.id not in [i.id for i in ex]

    # pos ids should be out_citations
    pos_docs = [i.id for i, j in zip(ex, labels) if j == np.max(labels)]
    assert set(pos_docs) == set(q.out_citations)

    # neg ids should be NOT out_citations
    neg_docs = [i.id for i, j in zip(ex, labels) if j < np.max(labels)]
    assert np.all([i not in neg_docs for i in q.out_citations])

    # test variable margin off
    dg = features.DataGenerator(corpus, featurizer, use_variable_margin=False)
    gen = dg.triplet_generator(
        paper_ids=corpus.all_ids,
        candidate_ids=corpus.all_ids,
        batch_size=128,
        neg_to_pos_ratio=5
    )

    X, y = next(gen)
    print(dg.margins_offset_dict)
    assert len(np.unique(y)) == 2
def _gold_citations(doc_id: str, corpus: Corpus, min_citations: int,
                    candidate_ids_pool: set):
    # Level-1 gold set: papers cited directly by the query document.
    gold_citations_1 = set(corpus.get_citations(doc_id))
    if doc_id in gold_citations_1:
        gold_citations_1.remove(doc_id)

    # Level-2 gold set: union of direct citations and citations of citations.
    citations_of_citations = []
    for c in gold_citations_1:
        citations_of_citations.extend(corpus.get_citations(c))
    gold_citations_2 = set(citations_of_citations).union(gold_citations_1)
    if doc_id in gold_citations_2:
        gold_citations_2.remove(doc_id)

    # Restrict both gold sets to the candidate pool being evaluated.
    gold_citations_1.intersection_update(candidate_ids_pool)
    gold_citations_2.intersection_update(candidate_ids_pool)

    if len(gold_citations_1) < min_citations:
        return [], []

    return gold_citations_1, gold_citations_2
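# Illustrative sketch only: a toy version of the two-hop gold-citation expansion
# above, using a plain dict in place of the real Corpus.get_citations() lookup.
# The paper ids and edges below are made up for demonstration.
toy_citations = {
    'A': ['B', 'C'],
    'B': ['C', 'D'],
    'C': ['A'],
    'D': [],
}

def toy_gold_citations(doc_id, citations, candidate_pool, min_citations=1):
    # One-hop gold set: papers the query cites (excluding itself).
    one_hop = set(citations.get(doc_id, [])) - {doc_id}
    # Two-hop gold set: one-hop papers plus the papers they cite.
    two_hop = set(one_hop)
    for c in one_hop:
        two_hop.update(citations.get(c, []))
    two_hop.discard(doc_id)
    # Keep only ids that are actually in the candidate pool.
    one_hop &= candidate_pool
    two_hop &= candidate_pool
    if len(one_hop) < min_citations:
        return [], []
    return one_hop, two_hop

# For query 'A' with candidates {'B', 'C', 'D'}: the one-hop gold set is
# {'B', 'C'} and the two-hop set adds 'D' (cited by 'B').
print(toy_gold_citations('A', toy_citations, {'B', 'C', 'D'}))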
def main(self, args): logging.info("Reading Open Corpus file from: {}".format( self.input_path)) logging.info("Writing json file to: {}".format(self.output_path)) dp = DatasetPaths() assert os.path.exists(self.input_path) assert not os.path.exists(self.output_path) assert not os.path.exists(dp.get_pkl_path('oc')) with open(self.output_path, 'w') as f: for obj in tqdm.tqdm(file_util.read_json_lines(self.input_path)): if 'year' not in obj: continue translated_obj = { FieldNames.PAPER_ID: obj['id'], FieldNames.TITLE_RAW: obj['title'], FieldNames.ABSTRACT_RAW: obj['paperAbstract'], FieldNames.AUTHORS: [a['name'] for a in obj['authors']], FieldNames.IN_CITATION_COUNT: len(obj['inCitations']), FieldNames.KEY_PHRASES: obj['keyPhrases'], FieldNames.OUT_CITATIONS: obj['outCitations'], FieldNames.URLS: obj['pdfUrls'], FieldNames.S2_URL: obj['s2Url'], FieldNames.VENUE: obj['venue'], FieldNames.YEAR: obj['year'], FieldNames.TITLE: ' '.join(global_tokenizer(obj['title'])), FieldNames.ABSTRACT: ' '.join(global_tokenizer(obj['paperAbstract'])) } f.write(json.dumps(translated_obj)) f.write("\n") f.close() oc_corpus = Corpus.build(dp.get_db_path('oc'), dp.get_json_path('oc')) pickle.dump(oc_corpus, open(dp.get_pkl_path('oc')))
def setUpClass(cls):
    build_test_corpus('/tmp/foo.json', '/tmp/foo.sqlite')
    corpus = Corpus.load('/tmp/foo.sqlite')

    options = ModelOptions(**{})

    featurizer = Featurizer(
        max_title_len=options.max_title_len,
        max_abstract_len=options.max_abstract_len
    )
    featurizer.fit(corpus, max_df_frac=1.0)

    options.n_features = featurizer.n_features
    options.n_authors = featurizer.n_authors
    options.n_venues = featurizer.n_venues
    options.n_keyphrases = featurizer.n_keyphrases

    cls.corpus = corpus
    cls.featurizer = featurizer
    cls.options = options
def test_data_isolation():
    build_test_corpus('/tmp/foo.json', '/tmp/foo.sqlite')
    corpus = Corpus.load('/tmp/foo.sqlite')

    assert len(set(corpus.train_ids).intersection(set(corpus.valid_ids))) == 0
    assert len(set(corpus.train_ids).intersection(set(corpus.test_ids))) == 0
    assert len(set(corpus.valid_ids).intersection(set(corpus.test_ids))) == 0

    featurizer = features.Featurizer()
    featurizer.fit(corpus, max_df_frac=1.0)

    dg = features.DataGenerator(corpus, featurizer)
    query, examples, labels = next(dg._listwise_examples(corpus.train_ids))
    examples_ids = [doc.id for doc in examples]
    assert len(set(examples_ids).intersection(set(corpus.valid_ids))) == 0
    assert len(set(examples_ids).intersection(set(corpus.test_ids))) == 0

    dg = features.DataGenerator(corpus, featurizer)
    query, examples, labels = next(
        dg._listwise_examples(
            paper_ids=corpus.valid_ids,
            candidate_ids=corpus.valid_ids + corpus.train_ids
        )
    )
    examples_ids = [doc.id for doc in examples]
    assert len(set(examples_ids).intersection(set(corpus.train_ids))) > 0
    assert len(set(examples_ids).intersection(set(corpus.test_ids))) == 0

    dg = features.DataGenerator(corpus, featurizer)
    query, examples, labels = next(
        dg._listwise_examples(
            paper_ids=corpus.test_ids,
            candidate_ids=corpus.valid_ids + corpus.train_ids
        )
    )
    examples_ids = [doc.id for doc in examples]
    assert len(set(examples_ids).intersection(set(corpus.test_ids))) == 0

    dg = features.DataGenerator(corpus, featurizer)
    query, examples, labels = next(
        dg._listwise_examples(
            paper_ids=corpus.test_ids,
            candidate_ids=corpus.valid_ids + corpus.train_ids + corpus.test_ids
        )
    )
    examples_ids = [doc.id for doc in examples]
s = 0
with open(output_path, 'w') as f:
    for obj in tqdm.tqdm(file_util.read_json_lines(input_path)):
        if 'year' not in obj:
            continue
        translated_obj = {
            FieldNames.PAPER_ID: obj['id'],
            FieldNames.TITLE_RAW: obj['title'],
            FieldNames.ABSTRACT_RAW: obj['paperAbstract'],
            FieldNames.AUTHORS: [a['name'] for a in obj['authors']],
            FieldNames.IN_CITATION_COUNT: 0,
            FieldNames.KEY_PHRASES: obj['keyPhrases'],
            FieldNames.OUT_CITATIONS: obj['outCitations'],
            FieldNames.URLS: obj['pdfUrls'],
            FieldNames.S2_URL: obj['s2Url'],
            FieldNames.VENUE: obj['venue'],
            FieldNames.YEAR: obj['year'],
            FieldNames.TITLE: ' '.join(global_tokenizer(obj['title'])),
            FieldNames.ABSTRACT: ' '.join(global_tokenizer(obj['paperAbstract']))
        }
        # Only convert a small sample of records.
        s += 1
        if s == 10:
            break
        f.write(json.dumps(translated_obj))
        f.write("\n")

oc_corpus = Corpus.build(dp.get_db_path('oc'), [output_path])
with open(output_pkl_path, 'wb') as f:
    pickle.dump(oc_corpus, f, -1)
        # Continuation of the conversion loop above (same `with open(...) as f:`
        # block); the citation fields are commented out in this variant.
        translated_obj = {
            FieldNames.PAPER_ID: obj['id'],
            FieldNames.TITLE_RAW: obj['title'],
            FieldNames.ABSTRACT_RAW: obj['paperAbstract'],
            FieldNames.AUTHORS: [a['name'] for a in obj['authors']],
            # FieldNames.IN_CITATION_COUNT: len(obj['inCitations']),
            FieldNames.KEY_PHRASES: obj['keyPhrases'],
            # FieldNames.OUT_CITATIONS: obj['outCitations'],
            FieldNames.URLS: obj['pdfUrls'],
            FieldNames.S2_URL: obj['s2Url'],
            FieldNames.VENUE: obj['venue'],
            FieldNames.YEAR: obj['year'],
            FieldNames.TITLE: ' '.join(global_tokenizer(obj['title'])),
            FieldNames.ABSTRACT: ' '.join(global_tokenizer(obj['paperAbstract']))
        }
        f.write(json.dumps(translated_obj))
        f.write("\n")

oc_corpus = Corpus.load(dp.get_db_path('oc'))
with open(output_pkl_path, 'wb') as f:
    pickle.dump(oc_corpus, f)
def main(self, args):
    if self.dataset_name == 'dblp':
        input_path = DatasetPaths.DBLP_GOLD_DIR
        output_path = DatasetPaths.DBLP_CORPUS_JSON
    elif self.dataset_name == 'pubmed':
        input_path = DatasetPaths.PUBMED_GOLD_DIR
        output_path = DatasetPaths.PUBMED_CORPUS_JSON
    else:
        assert False

    logging.info("Reading Gold data from {}".format(input_path))
    logging.info("Writing corpus to {}".format(output_path))
    assert os.path.exists(input_path)
    assert not os.path.exists(output_path)

    papers_file = os.path.join(input_path, "papers.txt")
    abstracts_file = os.path.join(input_path, "abstracts.txt")
    keyphrases_file = os.path.join(input_path, "paper_keyphrases.txt")
    citations_file = os.path.join(input_path, "paper_paper.txt")
    authors_file = os.path.join(input_path, "paper_author.txt")
    venues_file = os.path.join(input_path, "paper_venue.txt")

    paper_titles = {}
    paper_years = {}
    paper_abstracts = {}
    paper_keyphrases = {}
    paper_citations = {}
    paper_in_citations = {}
    paper_authors = {}
    paper_venues = {}

    bad_ids = set()
    for line in file_util.read_lines(abstracts_file):
        parts = line.split("\t")
        paper_id = int(parts[0])
        if len(parts) == 2:
            paper_abstracts[paper_id] = parts[1]
        else:
            paper_abstracts[paper_id] = ""
        if paper_abstracts[paper_id] == "":
            bad_ids.add(paper_id)

    for line in file_util.read_lines(papers_file):
        parts = line.split('\t')
        paper_id = int(parts[0])
        paper_years[paper_id] = int(parts[2])
        paper_titles[paper_id] = parts[3]

    for line in file_util.read_lines(keyphrases_file):
        parts = line.split("\t")
        paper_id = int(parts[0])
        if paper_id not in paper_keyphrases:
            paper_keyphrases[paper_id] = []
        for kp in parts[1:]:
            kp = kp.strip()
            if len(kp) > 0:
                paper_keyphrases[paper_id].append(kp[:-4])

    for line in file_util.read_lines(citations_file):
        parts = line.split("\t")
        paper_id = int(parts[0])
        if paper_id not in paper_citations:
            paper_citations[paper_id] = []
        c = int(parts[1])
        if c in bad_ids:
            continue
        paper_citations[paper_id].append(str(c))
        if c not in paper_in_citations:
            paper_in_citations[c] = []
        if paper_id not in paper_in_citations:
            paper_in_citations[paper_id] = []
        paper_in_citations[c].append(paper_id)

    for line in file_util.read_lines(authors_file):
        parts = line.split("\t")
        paper_id = int(parts[0])
        if paper_id not in paper_authors:
            paper_authors[paper_id] = []
        paper_authors[paper_id].append(parts[1])

    for line in file_util.read_lines(venues_file):
        parts = line.split("\t")
        paper_id = int(parts[0])
        paper_venues[paper_id] = parts[1]

    test_paper_id = 13
    print("==== Test Paper Details ====")
    print(paper_titles[test_paper_id])
    print(paper_years[test_paper_id])
    print(paper_abstracts[test_paper_id])
    print(paper_keyphrases[test_paper_id])
    print(paper_citations[test_paper_id])
    print(paper_in_citations[test_paper_id])
    print(paper_authors[test_paper_id])
    print(paper_venues[test_paper_id])
    print("==== Test Paper Details ====")

    def _print_len(x, name=''):
        print("No. of {} = {}".format(name, len(x)))

    _print_len(paper_titles, 'Titles')
    _print_len(paper_years, 'Years')
    _print_len(paper_abstracts, 'Abstracts')
    _print_len(paper_keyphrases, 'KeyPhrases')
    _print_len(paper_citations, 'Paper Citations')
    _print_len(paper_in_citations, 'Paper In citations')
    _print_len(paper_authors, 'Authors')
    _print_len(paper_venues, 'Venues')

    logging.info("Skipped {} papers due to insufficient data".format(len(bad_ids)))

    corpus = {}
    for id, title in tqdm.tqdm(paper_titles.items()):
        if id in bad_ids:
            continue
        doc = document_from_dict(
            {
                FieldNames.PAPER_ID: str(id),
                FieldNames.TITLE: ' '.join(global_tokenizer(title)),
                FieldNames.ABSTRACT: ' '.join(global_tokenizer(paper_abstracts[id])),
                FieldNames.OUT_CITATIONS: paper_citations.get(id, []),
                FieldNames.YEAR: paper_years[id],
                FieldNames.AUTHORS: paper_authors.get(id, []),
                FieldNames.KEY_PHRASES: paper_keyphrases[id],
                FieldNames.OUT_CITATION_COUNT: len(paper_citations.get(id, [])),
                FieldNames.IN_CITATION_COUNT: len(paper_in_citations.get(id, [])),
                FieldNames.VENUE: paper_venues.get(id, ''),
                FieldNames.TITLE_RAW: title,
                FieldNames.ABSTRACT_RAW: paper_abstracts[id]
            }
        )
        corpus[id] = doc

    with open(output_path, 'w') as f:
        for _, doc in corpus.items():
            doc_json = dict_from_document(doc)
            f.write(json.dumps(doc_json))
            f.write("\n")

    dp = DatasetPaths()
    Corpus.build(dp.get_db_path(self.dataset_name), dp.get_json_path(self.dataset_name))
def main(self, args):
    dp = DatasetPaths()
    if self.dataset_type == 'oc':
        corpus = Corpus.load_pkl(dp.get_pkl_path(self.dataset_type))
    else:
        corpus = Corpus.load(dp.get_db_path(self.dataset_type))

    if self.ranker_type == 'none':
        citation_ranker = NoneRanker()
    elif self.ranker_type == 'neural':
        assert self.citation_ranker_dir is not None
        ranker_featurizer, ranker_models = model_from_directory(
            self.citation_ranker_dir, on_cpu=True
        )
        citation_ranker = Ranker(
            corpus=corpus,
            featurizer=ranker_featurizer,
            citation_ranker=ranker_models['citeomatic'],
            num_candidates_to_rank=100
        )
    else:
        assert False

    candidate_results_map = {}
    if self.num_candidates is None:
        if self.dataset_type == 'oc':
            num_candidates_list = [100]
        else:
            num_candidates_list = [1, 5, 10, 15, 25, 50, 75, 100]
    else:
        num_candidates_list = [self.num_candidates]

    for num_candidates in num_candidates_list:
        if self.candidate_selector_type == 'bm25':
            index_path = dp.get_bm25_index_path(self.dataset_type)
            candidate_selector = BM25CandidateSelector(
                corpus, index_path, num_candidates, False
            )
        elif self.candidate_selector_type == 'ann':
            assert self.paper_embedder_dir is not None
            featurizer, models = model_from_directory(
                self.paper_embedder_dir, on_cpu=True
            )
            candidate_selector = self._make_ann_candidate_selector(
                corpus=corpus,
                featurizer=featurizer,
                embedding_model=models['embedding'],
                num_candidates=num_candidates
            )
        elif self.candidate_selector_type == 'oracle':
            candidate_selector = OracleCandidateSelector(corpus)
        else:
            assert False

        results = eval_text_model(
            corpus,
            candidate_selector,
            citation_ranker,
            papers_source=self.split,
            n_eval=self.n_eval
        )
        candidate_results_map[num_candidates] = results

    best_k = -1
    best_metric = 0.0
    metric_key = self.metric + "_1"
    for k, v in candidate_results_map.items():
        if best_metric < v[metric_key][EVAL_DATASET_KEYS[self.dataset_type]]:
            best_k = k
            best_metric = v[metric_key][EVAL_DATASET_KEYS[self.dataset_type]]

    print(json.dumps(candidate_results_map, indent=4, sort_keys=True))
    print(best_k)
    print(best_metric)
def end_to_end_training(model_options: ModelOptions, dataset_type, models_dir,
                        models_ann_dir=None):
    # step 1: make the directory
    if not os.path.exists(models_dir):
        os.makedirs(models_dir)

    # step 2: load the corpus DB
    print("Loading corpus db...")
    dp = DatasetPaths()
    db_file = dp.get_db_path(dataset_type)
    json_file = dp.get_json_path(dataset_type)
    if not os.path.isfile(db_file):
        print("Have to build the database! This may take a while, but should only happen once.")
        Corpus.build(db_file, json_file)

    if dataset_type == 'oc':
        corpus = Corpus.load_pkl(dp.get_pkl_path(dataset_type))
    else:
        corpus = Corpus.load(db_file, model_options.train_frac)

    # step 3: load/make the featurizer (once per hyperopt run)
    print("Making featurizer")
    featurizer_file_prefix = 'pretrained_' if model_options.use_pretrained else 'corpus_fit_'
    featurizer_file = os.path.join(
        models_dir, featurizer_file_prefix + dp.FEATURIZER_FILENAME
    )
    if os.path.isfile(featurizer_file):
        featurizer = file_util.read_pickle(featurizer_file)
    else:
        featurizer = Featurizer(
            max_features=model_options.max_features,
            max_title_len=model_options.max_title_len,
            max_abstract_len=model_options.max_abstract_len,
            use_pretrained=model_options.use_pretrained,
            min_author_papers=model_options.min_author_papers,
            min_venue_papers=model_options.min_venue_papers,
            min_keyphrase_papers=model_options.min_keyphrase_papers
        )
        featurizer.fit(corpus, is_featurizer_for_test=model_options.train_for_test_set)
        file_util.write_pickle(featurizer_file, featurizer)

    # update model options after featurization
    model_options.n_authors = featurizer.n_authors
    model_options.n_venues = featurizer.n_venues
    model_options.n_keyphrases = featurizer.n_keyphrases
    model_options.n_features = featurizer.n_features
    if model_options.use_pretrained:
        model_options.dense_dim = model_options.dense_dim_pretrained

    # step 4: train the model
    citeomatic_model, embedding_model = train_text_model(
        corpus,
        featurizer,
        model_options,
        models_ann_dir=models_ann_dir,
        debug=True,
        tensorboard_dir=None
    )

    # step 5: save the model
    citeomatic_model.save_weights(
        os.path.join(models_dir, dp.CITEOMATIC_WEIGHTS_FILENAME), overwrite=True
    )
    if embedding_model is not None:
        embedding_model.save_weights(
            os.path.join(models_dir, dp.EMBEDDING_WEIGHTS_FILENAME), overwrite=True
        )

    file_util.write_json(
        os.path.join(models_dir, dp.OPTIONS_FILENAME),
        model_options.to_json(),
    )

    return corpus, featurizer, model_options, citeomatic_model, embedding_model
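# Illustrative sketch only: one way end_to_end_training might be invoked with
# default ModelOptions for the 'dblp' dataset. The models directory below is a
# made-up example path, not a path defined by the repo.
if __name__ == '__main__':
    options = ModelOptions(**{})
    corpus, featurizer, options, citeomatic_model, embedding_model = end_to_end_training(
        model_options=options,
        dataset_type='dblp',
        models_dir='/tmp/citeomatic-models/dblp'
    )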