예제 #1
0
def test_corpus_iterator():
    """Check that iterating the corpus yields every id listed in ``all_ids``.

    The corpus is loaded from ``/tmp/foo.sqlite``. The number of ids seen
    both while iterating and in ``corpus.all_ids`` must equal
    ``corpus.n_docs``.
    """
    corpus = Corpus.load('/tmp/foo.sqlite')
    # Collect ids straight into a set; only membership matters below.
    visited_ids = {doc.id for doc in corpus}
    shared_ids = visited_ids & set(corpus.all_ids)
    assert len(shared_ids) == corpus.n_docs
예제 #2
0
def test_featurizer_and_data_gen():
    """Exercise featurizer fitting and data generation on the test corpus.

    Steps:
      1. Build and load the test corpus, then fit a ``Featurizer`` on it.
      2. Draw ten triplet batches and check the last one for batch size,
         number of distinct labels, padding widths and vocabulary.
      3. Draw one listwise example and check the query/candidate/label
         relationship against the query's ``out_citations``.
      4. Repeat the triplet generation with ``use_variable_margin=False``
         and check that only two distinct labels appear.
    """
    db_path = '/tmp/foo.sqlite'
    build_test_corpus('/tmp/foo.json', db_path)
    corpus = Corpus.load(db_path)
    featurizer = features.Featurizer()
    featurizer.fit(corpus, max_df_frac=1.0)

    data_gen = features.DataGenerator(corpus, featurizer)
    triplets = data_gen.triplet_generator(
        paper_ids=corpus.all_ids,
        candidate_ids=corpus.all_ids,
        batch_size=128,
        neg_to_pos_ratio=5,
    )

    # make sure we can get features (only the last batch is inspected below)
    for batch_no in range(10):
        print(batch_no)
        inputs, targets = next(triplets)

    # correct batch size
    assert len(targets) >= 128
    # positives, hard negatives, easy negatives
    assert len(np.unique(targets)) == 3
    # correct padding
    abstract_width = inputs['query-abstract-txt'].shape[1]
    assert abstract_width == featurizer.max_abstract_len
    title_width = inputs['query-title-txt'].shape[1]
    assert title_width == featurizer.max_title_len
    # no new words
    vocabulary = set(featurizer.word_indexer.word_to_index.keys())
    assert vocabulary.difference(WORDS) == set()

    query, candidates, labels = next(
        data_gen._listwise_examples(corpus.all_ids, corpus.all_ids))

    # query id should not be in candidates
    candidate_ids = [doc.id for doc in candidates]
    assert query.id not in candidate_ids

    # pos ids should be out_citations
    positive_ids = [
        doc.id for doc, label in zip(candidates, labels)
        if label == np.max(labels)
    ]
    assert set(positive_ids) == set(query.out_citations)

    # neg ids should be NOT out_citations
    negative_ids = [
        doc.id for doc, label in zip(candidates, labels)
        if label < np.max(labels)
    ]
    assert np.all(
        [cited_id not in negative_ids for cited_id in query.out_citations])

    # test variable margin off
    data_gen = features.DataGenerator(
        corpus, featurizer, use_variable_margin=False)
    triplets = data_gen.triplet_generator(
        paper_ids=corpus.all_ids,
        candidate_ids=corpus.all_ids,
        batch_size=128,
        neg_to_pos_ratio=5,
    )

    inputs, targets = next(triplets)
    print(data_gen.margins_offset_dict)
    assert len(np.unique(targets)) == 2
예제 #3
0
    def main(self, args):
        """Load the corpus for ``self.dataset_type`` and print summary statistics.

        The 'oc' dataset is read with ``Corpus.load_pkl``; every other
        dataset type is read with ``Corpus.load`` from its database path.
        Printed statistics cover document, author, key-phrase and venue
        counts, citation-count mean/std, and the size and year range of the
        train/validation/test splits, followed by the ten most common
        authors.

        ``args`` is accepted but not used by this method.
        """
        paths = DatasetPaths()
        if self.dataset_type != 'oc':
            corpus = Corpus.load(paths.get_db_path(self.dataset_type))
        else:
            corpus = Corpus.load_pkl(paths.get_pkl_path(self.dataset_type))

        # `years` is tallied alongside the others but is not reported below.
        authors, key_phrases, years, venues = (Counter() for _ in range(4))
        num_docs_with_kp = 0
        in_citations_counts, out_citations_counts = [], []

        for doc in corpus:
            authors.update(doc.authors)
            key_phrases.update(doc.key_phrases)
            if len(doc.key_phrases) > 0:
                num_docs_with_kp += 1
            in_citations_counts.append(doc.in_citation_count)
            out_citations_counts.append(doc.out_citation_count)
            years[doc.year] += 1
            venues[doc.venue] += 1

        def years_of(doc_ids):
            # Publication year of every document in one split.
            return [corpus[doc_id].year for doc_id in doc_ids]

        # All three year lists are resolved before anything is printed.
        training_years = years_of(corpus.train_ids)
        validation_years = years_of(corpus.valid_ids)
        testing_years = years_of(corpus.test_ids)

        single_value_lines = (
            ("No. of documents = {}", len(corpus)),
            ("Unique number of authors = {}", len(authors)),
            ("Unique number of key phrases = {}", len(key_phrases)),
            ("Unique number of venues = {}", len(venues)),
            ("No. of docs with key phrases = {}", num_docs_with_kp),
        )
        for template, value in single_value_lines:
            print(template.format(value))

        citation_lines = (
            ("Average in citations = {} (+/- {})", in_citations_counts),
            ("Average out citations = {} (+/- {})", out_citations_counts),
        )
        for template, counts in citation_lines:
            print(template.format(np.mean(counts), np.std(counts)))

        split_lines = (
            ("No. of training examples = {} ({} to {})",
             corpus.train_ids, training_years),
            ("No. of validation examples = {} ({} to {})",
             corpus.valid_ids, validation_years),
            ("No. of testing examples = {} ({} to {})",
             corpus.test_ids, testing_years),
        )
        for template, split_ids, split_years in split_lines:
            print(template.format(len(split_ids),
                                  np.min(split_years),
                                  np.max(split_years)))

        print(authors.most_common(10))
예제 #4
0
    def setUpClass(cls):
        """Prepare the corpus, featurizer and model options shared by the tests.

        Builds the test corpus at ``/tmp/foo.sqlite``, fits a ``Featurizer``
        sized from default ``ModelOptions``, copies the fitted vocabulary
        sizes back onto the options, and stores all three on the class.
        """
        db_path = '/tmp/foo.sqlite'
        build_test_corpus('/tmp/foo.json', db_path)
        loaded_corpus = Corpus.load(db_path)

        # No overrides: every option keeps its default value.
        model_options = ModelOptions()

        fitted_featurizer = Featurizer(
            max_title_len=model_options.max_title_len,
            max_abstract_len=model_options.max_abstract_len,
        )
        fitted_featurizer.fit(loaded_corpus, max_df_frac=1.0)

        # Mirror the fitted sizes onto the options, in the original order.
        for size_attr in ('n_features', 'n_authors', 'n_venues',
                          'n_keyphrases'):
            setattr(model_options, size_attr,
                    getattr(fitted_featurizer, size_attr))

        cls.corpus = loaded_corpus
        cls.featurizer = fitted_featurizer
        cls.options = model_options
예제 #5
0
def test_data_isolation():
    """Check that train/valid/test ids stay separated in listwise examples.

    First asserts that the three id splits are pairwise disjoint, then draws
    one listwise example for several (paper_ids, candidate_ids) combinations
    and asserts which splits the returned example ids may overlap with.
    """
    build_test_corpus('/tmp/foo.json', '/tmp/foo.sqlite')
    corpus = Corpus.load('/tmp/foo.sqlite')

    def overlap_size(left_ids, right_ids):
        # Number of ids present in both collections.
        return len(set(left_ids).intersection(set(right_ids)))

    assert overlap_size(corpus.train_ids, corpus.valid_ids) == 0
    assert overlap_size(corpus.train_ids, corpus.test_ids) == 0
    assert overlap_size(corpus.valid_ids, corpus.test_ids) == 0

    featurizer = features.Featurizer()
    featurizer.fit(corpus, max_df_frac=1.0)

    def first_example_ids(*args, **kwargs):
        # Fresh generator per draw; returns the ids of the first example list.
        data_gen = features.DataGenerator(corpus, featurizer)
        _query, examples, _labels = next(
            data_gen._listwise_examples(*args, **kwargs))
        return [doc.id for doc in examples]

    # Training queries with no explicit candidate ids passed.
    examples_ids = first_example_ids(corpus.train_ids)
    assert overlap_size(examples_ids, corpus.valid_ids) == 0
    assert overlap_size(examples_ids, corpus.test_ids) == 0

    # Validation queries, candidates drawn from valid + train.
    examples_ids = first_example_ids(
        paper_ids=corpus.valid_ids,
        candidate_ids=corpus.valid_ids + corpus.train_ids)
    assert overlap_size(examples_ids, corpus.train_ids) > 0
    assert overlap_size(examples_ids, corpus.test_ids) == 0

    # Test queries, candidates drawn from valid + train.
    examples_ids = first_example_ids(
        paper_ids=corpus.test_ids,
        candidate_ids=corpus.valid_ids + corpus.train_ids)
    assert overlap_size(examples_ids, corpus.test_ids) == 0

    # Test queries, candidates drawn from all three splits. The ids are
    # computed but nothing is asserted on them within this function.
    examples_ids = first_example_ids(
        paper_ids=corpus.test_ids,
        candidate_ids=corpus.valid_ids + corpus.train_ids + corpus.test_ids)
예제 #6
0
            FieldNames.PAPER_ID:
            obj['id'],
            FieldNames.TITLE_RAW:
            obj['title'],
            FieldNames.ABSTRACT_RAW:
            obj['paperAbstract'],
            FieldNames.AUTHORS: [a['name'] for a in obj['authors']],
            #FieldNames.IN_CITATION_COUNT: len(obj['inCitations']),
            FieldNames.KEY_PHRASES:
            obj['keyPhrases'],
            #FieldNames.OUT_CITATIONS: obj['outCitations'],
            FieldNames.URLS:
            obj['pdfUrls'],
            FieldNames.S2_URL:
            obj['s2Url'],
            FieldNames.VENUE:
            obj['venue'],
            FieldNames.YEAR:
            obj['year'],
            FieldNames.TITLE:
            ' '.join(global_tokenizer(obj['title'])),
            FieldNames.ABSTRACT:
            ' '.join(global_tokenizer(obj['paperAbstract']))
        }
        f.write(json.dumps(translated_obj))
        f.write("\n")
f.close()

# Load the 'oc' corpus from its database path and persist it as a pickle.
oc_corpus = Corpus.load(dp.get_db_path('oc'))
# A context manager guarantees the pickle file is flushed and closed even if
# dumping fails; the previous bare open() left the handle to the garbage
# collector.
with open(output_pkl_path, 'wb') as pkl_file:
    pickle.dump(oc_corpus, pkl_file)
예제 #7
0
    def main(self, args):
        """Evaluate a candidate selector plus ranker and print the results.

        Loads the corpus for ``self.dataset_type``, builds the ranker chosen
        by ``self.ranker_type`` ('none' or 'neural'), then for each candidate
        count builds the selector chosen by ``self.candidate_selector_type``
        ('bm25', 'ann' or 'oracle') and runs ``eval_text_model``. Finally
        prints all results as JSON, followed by the candidate count with the
        highest ``<metric>_1`` score and that score.

        ``args`` is accepted but not used by this method.
        """
        paths = DatasetPaths()
        if self.dataset_type != 'oc':
            corpus = Corpus.load(paths.get_db_path(self.dataset_type))
        else:
            corpus = Corpus.load_pkl(paths.get_pkl_path(self.dataset_type))

        # --- ranker -------------------------------------------------------
        if self.ranker_type == 'neural':
            assert self.citation_ranker_dir is not None
            ranker_featurizer, ranker_models = model_from_directory(
                self.citation_ranker_dir, on_cpu=True)
            citation_ranker = Ranker(
                corpus=corpus,
                featurizer=ranker_featurizer,
                citation_ranker=ranker_models['citeomatic'],
                num_candidates_to_rank=100)
        elif self.ranker_type == 'none':
            citation_ranker = NoneRanker()
        else:
            assert False

        # --- candidate counts to sweep ------------------------------------
        if self.num_candidates is not None:
            num_candidates_list = [self.num_candidates]
        elif self.dataset_type == 'oc':
            num_candidates_list = [100]
        else:
            num_candidates_list = [1, 5, 10, 15, 25, 50, 75, 100]

        # --- evaluation, one run per candidate count ----------------------
        candidate_results_map = {}
        for num_candidates in num_candidates_list:
            selector_type = self.candidate_selector_type

            if selector_type == 'bm25':
                candidate_selector = BM25CandidateSelector(
                    corpus,
                    dp_index_path := paths.get_bm25_index_path(
                        self.dataset_type),
                    num_candidates,
                    False) if False else BM25CandidateSelector(
                        corpus,
                        paths.get_bm25_index_path(self.dataset_type),
                        num_candidates,
                        False)
            elif selector_type == 'ann':
                assert self.paper_embedder_dir is not None
                featurizer, models = model_from_directory(
                    self.paper_embedder_dir, on_cpu=True)
                candidate_selector = self._make_ann_candidate_selector(
                    corpus=corpus,
                    featurizer=featurizer,
                    embedding_model=models['embedding'],
                    num_candidates=num_candidates)
            elif selector_type == 'oracle':
                candidate_selector = OracleCandidateSelector(corpus)
            else:
                assert False

            candidate_results_map[num_candidates] = eval_text_model(
                corpus,
                candidate_selector,
                citation_ranker,
                papers_source=self.split,
                n_eval=self.n_eval)

        # --- pick the best candidate count --------------------------------
        # Stays at -1 / 0.0 when no run scores strictly above zero.
        best_k, best_metric = -1, 0.0
        metric_key = self.metric + "_1"
        for k, results in candidate_results_map.items():
            score = results[metric_key][EVAL_DATASET_KEYS[self.dataset_type]]
            if best_metric < score:
                best_k, best_metric = k, score

        print(json.dumps(candidate_results_map, indent=4, sort_keys=True))
        print(best_k)
        print(best_metric)
예제 #8
0
def end_to_end_training(model_options: ModelOptions,
                        dataset_type,
                        models_dir,
                        models_ann_dir=None):
    """Train the text models for a dataset and save all artifacts.

    Steps: create ``models_dir``, build the corpus database if its file is
    missing, load the corpus, load or fit-and-pickle the featurizer, train
    via ``train_text_model``, then write the model weights and the options
    JSON into ``models_dir``.

    Args:
        model_options: Training options. Mutated in place: ``n_authors``,
            ``n_venues``, ``n_keyphrases`` and ``n_features`` are set from
            the featurizer, and ``dense_dim`` is replaced by
            ``dense_dim_pretrained`` when ``use_pretrained`` is truthy.
        dataset_type: Dataset identifier passed to ``DatasetPaths``; the
            value ``'oc'`` selects loading the corpus from a pickle.
        models_dir: Directory receiving the featurizer pickle, weights and
            options file. Created (with parents) if it does not exist.
        models_ann_dir: Forwarded unchanged to ``train_text_model``.

    Returns:
        Tuple ``(corpus, featurizer, model_options, citeomatic_model,
        embedding_model)``; ``embedding_model`` may be ``None``.
    """
    # step 1: make the directory
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(models_dir, exist_ok=True)

    # step 2: load the corpus DB
    print("Loading corpus db...")
    dp = DatasetPaths()
    db_file = dp.get_db_path(dataset_type)
    json_file = dp.get_json_path(dataset_type)
    # The DB is built whenever its file is missing, for every dataset type.
    if not os.path.isfile(db_file):
        print(
            "Have to build the database! This may take a while, but should only happen once."
        )
        Corpus.build(db_file, json_file)

    if dataset_type == 'oc':
        corpus = Corpus.load_pkl(dp.get_pkl_path(dataset_type))
    else:
        corpus = Corpus.load(db_file, model_options.train_frac)

    # step 3: load/make the featurizer (once per hyperopt run)
    print("Making feautrizer")
    featurizer_file_prefix = 'pretrained_' if model_options.use_pretrained else 'corpus_fit_'

    featurizer_file = os.path.join(
        models_dir, featurizer_file_prefix + dp.FEATURIZER_FILENAME)

    # Reuse a previously pickled featurizer from models_dir when present.
    if os.path.isfile(featurizer_file):
        featurizer = file_util.read_pickle(featurizer_file)
    else:
        featurizer = Featurizer(
            max_features=model_options.max_features,
            max_title_len=model_options.max_title_len,
            max_abstract_len=model_options.max_abstract_len,
            use_pretrained=model_options.use_pretrained,
            min_author_papers=model_options.min_author_papers,
            min_venue_papers=model_options.min_venue_papers,
            min_keyphrase_papers=model_options.min_keyphrase_papers)
        featurizer.fit(corpus,
                       is_featurizer_for_test=model_options.train_for_test_set)
        file_util.write_pickle(featurizer_file, featurizer)

    # update model options after featurization
    model_options.n_authors = featurizer.n_authors
    model_options.n_venues = featurizer.n_venues
    model_options.n_keyphrases = featurizer.n_keyphrases
    model_options.n_features = featurizer.n_features
    if model_options.use_pretrained:
        model_options.dense_dim = model_options.dense_dim_pretrained

    # step 4: train the model
    citeomatic_model, embedding_model = train_text_model(
        corpus,
        featurizer,
        model_options,
        models_ann_dir=models_ann_dir,
        debug=True,
        tensorboard_dir=None)

    # step 5: save the model
    citeomatic_model.save_weights(os.path.join(models_dir,
                                               dp.CITEOMATIC_WEIGHTS_FILENAME),
                                  overwrite=True)

    if embedding_model is not None:
        embedding_model.save_weights(os.path.join(
            models_dir, dp.EMBEDDING_WEIGHTS_FILENAME),
                                     overwrite=True)

    file_util.write_json(
        os.path.join(models_dir, dp.OPTIONS_FILENAME),
        model_options.to_json(),
    )

    return corpus, featurizer, model_options, citeomatic_model, embedding_model