def test_corpus_iterator():
    corpus = Corpus.load('/tmp/foo.sqlite')
    iter_ids = []
    for doc in corpus:
        iter_ids.append(doc.id)
    overlap_n = len(set(iter_ids).intersection(set(corpus.all_ids)))
    assert overlap_n == corpus.n_docs
def test_featurizer_and_data_gen():
    build_test_corpus('/tmp/foo.json', '/tmp/foo.sqlite')
    corpus = Corpus.load('/tmp/foo.sqlite')
    featurizer = features.Featurizer()
    featurizer.fit(corpus, max_df_frac=1.0)

    dg = features.DataGenerator(corpus, featurizer)
    gen = dg.triplet_generator(
        paper_ids=corpus.all_ids,
        candidate_ids=corpus.all_ids,
        batch_size=128,
        neg_to_pos_ratio=5
    )

    # make sure we can get features
    for i in range(10):
        print(i)
        X, y = next(gen)

    # correct batch size
    assert len(y) >= 128
    # positives, hard negatives, easy negatives
    assert len(np.unique(y)) == 3
    # correct padding
    assert X['query-abstract-txt'].shape[1] == featurizer.max_abstract_len
    assert X['query-title-txt'].shape[1] == featurizer.max_title_len
    # no new words
    assert set(featurizer.word_indexer.word_to_index.keys()).difference(WORDS) == set()

    q, ex, labels = next(dg._listwise_examples(corpus.all_ids, corpus.all_ids))
    # query id should not be in candidates
    assert q.id not in [i.id for i in ex]
    # pos ids should be out_citations
    pos_docs = [i.id for i, j in zip(ex, labels) if j == np.max(labels)]
    assert set(pos_docs) == set(q.out_citations)
    # neg ids should be NOT out_citations
    neg_docs = [i.id for i, j in zip(ex, labels) if j < np.max(labels)]
    assert np.all([i not in neg_docs for i in q.out_citations])

    # test variable margin off
    dg = features.DataGenerator(corpus, featurizer, use_variable_margin=False)
    gen = dg.triplet_generator(
        paper_ids=corpus.all_ids,
        candidate_ids=corpus.all_ids,
        batch_size=128,
        neg_to_pos_ratio=5
    )

    X, y = next(gen)
    print(dg.margins_offset_dict)
    assert len(np.unique(y)) == 2
def main(self, args):
    dp = DatasetPaths()
    if self.dataset_type == 'oc':
        corpus = Corpus.load_pkl(dp.get_pkl_path(self.dataset_type))
    else:
        corpus = Corpus.load(dp.get_db_path(self.dataset_type))

    authors = Counter()
    key_phrases = Counter()
    years = Counter()
    venues = Counter()
    num_docs_with_kp = 0
    in_citations_counts = []
    out_citations_counts = []

    for doc in corpus:
        authors.update(doc.authors)
        key_phrases.update(doc.key_phrases)
        if len(doc.key_phrases) > 0:
            num_docs_with_kp += 1
        in_citations_counts.append(doc.in_citation_count)
        out_citations_counts.append(doc.out_citation_count)
        years.update([doc.year])
        venues.update([doc.venue])

    training_years = [corpus[doc_id].year for doc_id in corpus.train_ids]
    validation_years = [corpus[doc_id].year for doc_id in corpus.valid_ids]
    testing_years = [corpus[doc_id].year for doc_id in corpus.test_ids]

    print("No. of documents = {}".format(len(corpus)))
    print("Unique number of authors = {}".format(len(authors)))
    print("Unique number of key phrases = {}".format(len(key_phrases)))
    print("Unique number of venues = {}".format(len(venues)))
    print("No. of docs with key phrases = {}".format(num_docs_with_kp))
    print("Average in citations = {} (+/- {})".format(
        np.mean(in_citations_counts), np.std(in_citations_counts)))
    print("Average out citations = {} (+/- {})".format(
        np.mean(out_citations_counts), np.std(out_citations_counts)))
    print("No. of training examples = {} ({} to {})".format(
        len(corpus.train_ids), np.min(training_years), np.max(training_years)))
    print("No. of validation examples = {} ({} to {})".format(
        len(corpus.valid_ids), np.min(validation_years), np.max(validation_years)))
    print("No. of testing examples = {} ({} to {})".format(
        len(corpus.test_ids), np.min(testing_years), np.max(testing_years)))
    print(authors.most_common(10))
@classmethod
def setUpClass(cls):
    build_test_corpus('/tmp/foo.json', '/tmp/foo.sqlite')
    corpus = Corpus.load('/tmp/foo.sqlite')

    options = ModelOptions(**{})

    featurizer = Featurizer(
        max_title_len=options.max_title_len,
        max_abstract_len=options.max_abstract_len
    )
    featurizer.fit(corpus, max_df_frac=1.0)

    options.n_features = featurizer.n_features
    options.n_authors = featurizer.n_authors
    options.n_venues = featurizer.n_venues
    options.n_keyphrases = featurizer.n_keyphrases

    cls.corpus = corpus
    cls.featurizer = featurizer
    cls.options = options
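# A minimal sketch (hypothetical, not part of the original suite) of a test method
# that reuses the fixtures prepared in setUpClass to exercise model training end to
# end. It assumes train_text_model (used in the training pipeline below) is
# importable from this test module; the assertion is intentionally weak.
def test_can_train_model_from_fixtures(self):
    citeomatic_model, embedding_model = train_text_model(
        self.corpus,
        self.featurizer,
        self.options,
        models_ann_dir=None,
        debug=True,
        tensorboard_dir=None
    )
    assert citeomatic_model is not None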
def test_data_isolation():
    build_test_corpus('/tmp/foo.json', '/tmp/foo.sqlite')
    corpus = Corpus.load('/tmp/foo.sqlite')

    # the train/validation/test splits must not overlap
    assert len(set(corpus.train_ids).intersection(set(corpus.valid_ids))) == 0
    assert len(set(corpus.train_ids).intersection(set(corpus.test_ids))) == 0
    assert len(set(corpus.valid_ids).intersection(set(corpus.test_ids))) == 0

    featurizer = features.Featurizer()
    featurizer.fit(corpus, max_df_frac=1.0)

    # candidates drawn for training queries must not leak validation or test documents
    dg = features.DataGenerator(corpus, featurizer)
    query, examples, labels = next(dg._listwise_examples(corpus.train_ids))
    examples_ids = [doc.id for doc in examples]
    assert len(set(examples_ids).intersection(set(corpus.valid_ids))) == 0
    assert len(set(examples_ids).intersection(set(corpus.test_ids))) == 0

    # validation queries may draw candidates from train + validation, but never test
    dg = features.DataGenerator(corpus, featurizer)
    query, examples, labels = next(
        dg._listwise_examples(paper_ids=corpus.valid_ids,
                              candidate_ids=corpus.valid_ids + corpus.train_ids))
    examples_ids = [doc.id for doc in examples]
    assert len(set(examples_ids).intersection(set(corpus.train_ids))) > 0
    assert len(set(examples_ids).intersection(set(corpus.test_ids))) == 0

    # test queries restricted to train + validation candidates see no test documents
    dg = features.DataGenerator(corpus, featurizer)
    query, examples, labels = next(
        dg._listwise_examples(paper_ids=corpus.test_ids,
                              candidate_ids=corpus.valid_ids + corpus.train_ids))
    examples_ids = [doc.id for doc in examples]
    assert len(set(examples_ids).intersection(set(corpus.test_ids))) == 0

    dg = features.DataGenerator(corpus, featurizer)
    query, examples, labels = next(
        dg._listwise_examples(paper_ids=corpus.test_ids,
                              candidate_ids=corpus.valid_ids + corpus.train_ids + corpus.test_ids))
    examples_ids = [doc.id for doc in examples]
    translated_obj = {
        FieldNames.PAPER_ID: obj['id'],
        FieldNames.TITLE_RAW: obj['title'],
        FieldNames.ABSTRACT_RAW: obj['paperAbstract'],
        FieldNames.AUTHORS: [a['name'] for a in obj['authors']],
        # FieldNames.IN_CITATION_COUNT: len(obj['inCitations']),
        FieldNames.KEY_PHRASES: obj['keyPhrases'],
        # FieldNames.OUT_CITATIONS: obj['outCitations'],
        FieldNames.URLS: obj['pdfUrls'],
        FieldNames.S2_URL: obj['s2Url'],
        FieldNames.VENUE: obj['venue'],
        FieldNames.YEAR: obj['year'],
        FieldNames.TITLE: ' '.join(global_tokenizer(obj['title'])),
        FieldNames.ABSTRACT: ' '.join(global_tokenizer(obj['paperAbstract']))
    }
    f.write(json.dumps(translated_obj))
    f.write("\n")
f.close()

oc_corpus = Corpus.load(dp.get_db_path('oc'))
pickle.dump(oc_corpus, open(output_pkl_path, 'wb'))
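# Note: downstream commands in this section read the pickled OC corpus back with
# Corpus.load_pkl, e.g. (sketch, path helper as used elsewhere in this section):
#   corpus = Corpus.load_pkl(DatasetPaths().get_pkl_path('oc'))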
def main(self, args):
    dp = DatasetPaths()
    if self.dataset_type == 'oc':
        corpus = Corpus.load_pkl(dp.get_pkl_path(self.dataset_type))
    else:
        corpus = Corpus.load(dp.get_db_path(self.dataset_type))

    # pick the ranker used to re-rank retrieved candidates
    if self.ranker_type == 'none':
        citation_ranker = NoneRanker()
    elif self.ranker_type == 'neural':
        assert self.citation_ranker_dir is not None
        ranker_featurizer, ranker_models = model_from_directory(
            self.citation_ranker_dir, on_cpu=True)
        citation_ranker = Ranker(
            corpus=corpus,
            featurizer=ranker_featurizer,
            citation_ranker=ranker_models['citeomatic'],
            num_candidates_to_rank=100)
    else:
        assert False

    candidate_results_map = {}
    if self.num_candidates is None:
        if self.dataset_type == 'oc':
            num_candidates_list = [100]
        else:
            num_candidates_list = [1, 5, 10, 15, 25, 50, 75, 100]
    else:
        num_candidates_list = [self.num_candidates]

    # evaluate each candidate-pool size
    for num_candidates in num_candidates_list:
        if self.candidate_selector_type == 'bm25':
            index_path = dp.get_bm25_index_path(self.dataset_type)
            candidate_selector = BM25CandidateSelector(
                corpus, index_path, num_candidates, False)
        elif self.candidate_selector_type == 'ann':
            assert self.paper_embedder_dir is not None
            featurizer, models = model_from_directory(
                self.paper_embedder_dir, on_cpu=True)
            candidate_selector = self._make_ann_candidate_selector(
                corpus=corpus,
                featurizer=featurizer,
                embedding_model=models['embedding'],
                num_candidates=num_candidates)
        elif self.candidate_selector_type == 'oracle':
            candidate_selector = OracleCandidateSelector(corpus)
        else:
            assert False

        results = eval_text_model(corpus, candidate_selector, citation_ranker,
                                  papers_source=self.split, n_eval=self.n_eval)
        candidate_results_map[num_candidates] = results

    # keep the candidate count that maximizes the selected metric
    best_k = -1
    best_metric = 0.0
    metric_key = self.metric + "_1"
    for k, v in candidate_results_map.items():
        if best_metric < v[metric_key][EVAL_DATASET_KEYS[self.dataset_type]]:
            best_k = k
            best_metric = v[metric_key][EVAL_DATASET_KEYS[self.dataset_type]]

    print(json.dumps(candidate_results_map, indent=4, sort_keys=True))
    print(best_k)
    print(best_metric)
def end_to_end_training(model_options: ModelOptions, dataset_type, models_dir, models_ann_dir=None):
    # step 1: make the directory
    if not os.path.exists(models_dir):
        os.makedirs(models_dir)

    # step 2: load the corpus DB
    print("Loading corpus db...")
    dp = DatasetPaths()
    db_file = dp.get_db_path(dataset_type)
    json_file = dp.get_json_path(dataset_type)
    if not os.path.isfile(db_file):
        print("Have to build the database! This may take a while, but should only happen once.")
        Corpus.build(db_file, json_file)

    if dataset_type == 'oc':
        corpus = Corpus.load_pkl(dp.get_pkl_path(dataset_type))
    else:
        corpus = Corpus.load(db_file, model_options.train_frac)

    # step 3: load/make the featurizer (once per hyperopt run)
    print("Making featurizer")
    featurizer_file_prefix = 'pretrained_' if model_options.use_pretrained else 'corpus_fit_'
    featurizer_file = os.path.join(models_dir, featurizer_file_prefix + dp.FEATURIZER_FILENAME)
    if os.path.isfile(featurizer_file):
        featurizer = file_util.read_pickle(featurizer_file)
    else:
        featurizer = Featurizer(
            max_features=model_options.max_features,
            max_title_len=model_options.max_title_len,
            max_abstract_len=model_options.max_abstract_len,
            use_pretrained=model_options.use_pretrained,
            min_author_papers=model_options.min_author_papers,
            min_venue_papers=model_options.min_venue_papers,
            min_keyphrase_papers=model_options.min_keyphrase_papers
        )
        featurizer.fit(corpus, is_featurizer_for_test=model_options.train_for_test_set)
        file_util.write_pickle(featurizer_file, featurizer)

    # update model options after featurization
    model_options.n_authors = featurizer.n_authors
    model_options.n_venues = featurizer.n_venues
    model_options.n_keyphrases = featurizer.n_keyphrases
    model_options.n_features = featurizer.n_features
    if model_options.use_pretrained:
        model_options.dense_dim = model_options.dense_dim_pretrained

    # step 4: train the model
    citeomatic_model, embedding_model = train_text_model(
        corpus,
        featurizer,
        model_options,
        models_ann_dir=models_ann_dir,
        debug=True,
        tensorboard_dir=None
    )

    # step 5: save the model
    citeomatic_model.save_weights(
        os.path.join(models_dir, dp.CITEOMATIC_WEIGHTS_FILENAME), overwrite=True)

    if embedding_model is not None:
        embedding_model.save_weights(
            os.path.join(models_dir, dp.EMBEDDING_WEIGHTS_FILENAME), overwrite=True)

    file_util.write_json(
        os.path.join(models_dir, dp.OPTIONS_FILENAME),
        model_options.to_json(),
    )

    return corpus, featurizer, model_options, citeomatic_model, embedding_model
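# A minimal usage sketch for the pipeline above (not from the original source). It
# assumes 'dblp' is a dataset_type known to DatasetPaths, that default ModelOptions
# are acceptable, and that the models_dir path is arbitrary and writable.
if __name__ == '__main__':
    opts = ModelOptions(**{})
    corpus, featurizer, opts, citeomatic_model, embedding_model = end_to_end_training(
        model_options=opts,
        dataset_type='dblp',
        models_dir='/tmp/citeomatic_model',
    )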