Example #1
    def __init__(
        self,
        models,
        featurizer: Featurizer,
        corpus: Corpus = None,
        ann: ANN = None,
        ann_embedding_model=None,
        max_neighbors=1000,
        candidate_min_in_citations=4,
    ):
        self.model = models['citeomatic']
        self.embedding_model = (
            EmbeddingModel(featurizer, models['embedding'])
            if ann_embedding_model is None else ann_embedding_model)
        self.featurizer = featurizer
        self.explanation = None  # Explanation(self.model, featurizer)
        self._ann = ann
        self.corpus = corpus
        self.max_neighbors = max_neighbors
        self.candidate_min_in_citations = candidate_min_in_citations
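
For context, the constructor requires only the models dict and the featurizer; corpus, ann, and ann_embedding_model are optional, and the embedding model defaults to one built from models['embedding']. A minimal construction sketch, assuming model_from_directory returns (featurizer, models) as in the examples below (the path is a placeholder):

featurizer, models = model_from_directory('/path/to/model_dir')  # placeholder path
api_model = APIModel(models, featurizer)  # without corpus/ann, ANN-backed candidate lookup is unavailable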
Example #2
    def _re_embed(self, load_pkl=False):
        embedder = EmbeddingModel(self.featurizer, self.embedding_model)
        if load_pkl and self.corpus.corpus_type == 'oc' and os.path.exists(
                DatasetPaths.OC_ANN_FILE + ".pickle"):
            ann = ANN.load(DatasetPaths.OC_ANN_FILE)
        else:
            ann = ANN.build(embedder, self.corpus, ann_trees=10)
        candidate_selector = ANNCandidateSelector(
            corpus=self.corpus,
            ann=ann,
            paper_embedding_model=embedder,
            top_k=100,
            extend_candidate_citations=False)
        self.training_data_generator.ann = ann
        self.training_data_generator.candidate_selector = candidate_selector

        self.validation_data_generator.ann = self.training_data_generator.ann
        self.validation_data_generator.candidate_selector = self.training_data_generator.candidate_selector
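
Presumably _re_embed runs periodically during training so that candidate selection keeps pace with the evolving embedding model. A hedged sketch of the call sites, assuming a trainer object with the attributes used above:

trainer._re_embed(load_pkl=True)  # reuse the pickled OC ANN index when available
trainer._re_embed()               # otherwise embed the corpus and rebuild (ann_trees=10)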
Example #3
    def _init_model(self):
        featurizer, models = model_from_directory(self.model_dir)
        corpus = Corpus.load(self.corpus_path, featurizer.training_fraction)

        if self.filter_method == "ann":
            ann = ANN.load(self.ann_path)
            if self.ann_model_dir:
                featurizer_ann, models_ann = model_from_directory(
                    self.ann_model_dir)
            else:
                featurizer_ann, models_ann = featurizer, models

            ann_doc_embedding_model = EmbeddingModel(featurizer_ann,
                                                     models_ann['embedding'])
            api_model = APIModel(
                models,
                featurizer,
                ann=ann,
                ann_embedding_model=ann_doc_embedding_model,
                corpus=corpus,
                max_neighbors=self.max_neighbors,
                candidate_min_in_citations=self.candidate_min_in_citations,
                limit_candidate_to_train_ids=self.limit_candidate_to_train_ids,
                extend_candidate_citations=self.extend_candidate_citations,
                citation_source=self.citation_source)
        else:
            api_model = APIModel(
                models,
                featurizer,
                max_neighbors=self.max_neighbors,
                candidate_min_in_citations=self.candidate_min_in_citations,
                limit_candidate_to_train_ids=self.limit_candidate_to_train_ids,
                extend_candidate_citations=self.extend_candidate_citations,
                citation_source=self.citation_source)

        self.corpus = corpus
        self.model = api_model
        return corpus, api_model
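
Note that _init_model both stores the pair on self and returns it, so a caller can use either access path. A hypothetical invocation:

corpus, api_model = harness._init_model()  # `harness` is a hypothetical owner object
assert harness.model is api_model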
Example #4
# The snippet begins mid-function; the missing pieces (imports, the get_session
# header, and gpu_options) are reconstructed below as a hedged sketch for a
# TF1-era Keras setup. Project-internal imports (model_from_directory,
# EmbeddingModel, ANN, Corpus, service, setup_default_logging) come from the
# citeomatic package and are omitted here.
import logging
import os

import tensorflow as tf
from keras import backend as K


def get_session(num_threads=None):
    # allow_growth is an assumed default; the original GPU options are not shown.
    gpu_options = tf.GPUOptions(allow_growth=True)
    if num_threads:
        return tf.Session(config=tf.ConfigProto(
            gpu_options=gpu_options,
            intra_op_parallelism_threads=int(num_threads)))
    else:
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))


K.set_session(get_session())

setup_default_logging(logging.INFO)

featurizer, models = model_from_directory(os.environ['MODEL_PATH'])
if 'ANN_MODEL_PATH' in os.environ:
    # Keep the ANN featurizer separate so it does not overwrite the featurizer
    # loaded from MODEL_PATH above (cf. featurizer_ann in Example #3).
    featurizer_ann, ann_models = model_from_directory(os.environ['ANN_MODEL_PATH'])
    ann_model = EmbeddingModel(featurizer_ann, ann_models['embedding'])
    ann = ANN.load(os.environ['ANN_MODEL_PATH'] + '/citeomatic_ann')
    corpus = Corpus.load(os.environ['CORPUS_PATH'])
else:
    ann = None
    ann_model = None
    corpus = None

app = service.app
app.config['DEBUG'] = True
app.config['API_MODEL'] = service.APIModel(
    corpus=corpus,
    featurizer=featurizer,
    models=models,
    ann_embedding_model=ann_model,
    ann=ann,
)
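
service.app is configured but never started in this snippet. Assuming it is a standard Flask app (the DEBUG config key suggests so), a hypothetical entry point would be:

if __name__ == '__main__':
    app.run(host='127.0.0.1', port=8080)  # host and port are placeholders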
Example #5
    def embedder(self, featurizer, embedding_model) -> EmbeddingModel:
        if self._embedder is None:
            self._embedder = EmbeddingModel(featurizer, embedding_model)
        return self._embedder
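
This is the usual lazy-initialization pattern: the EmbeddingModel is built on first access and cached on self._embedder, and arguments passed on later calls are ignored. A usage sketch with a hypothetical owning object:

e1 = holder.embedder(featurizer, models['embedding'])
e2 = holder.embedder(featurizer, models['embedding'])
assert e1 is e2  # constructed once, then reused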
Example #6
class APIModel(object):
    def __init__(
        self,
        models,
        featurizer: Featurizer,
        corpus: Corpus = None,
        ann: ANN = None,
        ann_embedding_model=None,
        max_neighbors=1000,
        candidate_min_in_citations=4,
    ):
        self.model = models['citeomatic']
        self.embedding_model = (
            EmbeddingModel(featurizer, models['embedding'])
            if ann_embedding_model is None else ann_embedding_model)
        self.featurizer = featurizer
        self.explanation = None  # Explanation(self.model, featurizer)
        self._ann = ann
        self.corpus = corpus
        self.max_neighbors = max_neighbors
        self.candidate_min_in_citations = candidate_min_in_citations

    def get_ann_similar_documents(self, doc, top_n=NUM_ANN_CANDIDATES):
        doc_embedded = self.embedding_model.embed(doc)
        return self._ann.get_nns_by_vector(doc_embedded, top_n)

    @staticmethod
    def _sha_to_url(sha):
        return "https://pdfs.semanticscholar.org/" + sha[0:4] + "/" + sha[
            4:] + ".pdf"

    def predict(self, doc, top_n=DEFAULT_NUM_CITATIONS) -> List[Prediction]:
        candidate_ids = self.get_ann_similar_documents(
            doc, top_n=self.max_neighbors)
        candidate_ids = [
            bulk_id for bulk_id in candidate_ids
            if self.corpus[bulk_id].in_citation_count >= self.candidate_min_in_citations
        ]

        # Extend the candidate set with their citations
        citations_of_candidates = []
        for candidate_id in candidate_ids:
            citations_of_candidates.extend(self.corpus[candidate_id].citations)
        candidate_ids = list(set(citations_of_candidates + candidate_ids))

        logging.info('Fetching %d documents', len(candidate_ids))
        candidates = [self.corpus[paper_id] for paper_id in candidate_ids]

        logging.info('Featurizing... %d documents', len(candidates))
        features = self.featurizer.transform_query_and_results(doc, candidates)
        logging.info('Predicting...')
        scores = self.model.predict(features, batch_size=64).flatten()
        best_matches = np.argsort(scores)[::-1]

        predictions = []
        for i, match_idx in enumerate(best_matches[:top_n]):
            candidate = candidates[match_idx]
            # Skip the query document itself if it appears among the candidates
            if candidate.title.lower() == doc.title.lower():
                continue
            predictions.append(
                Prediction(
                    score=float(scores[match_idx]),
                    document=candidate,
                    pdf=APIModel._sha_to_url(str(candidate.id)),
                    position=i,
                    explanation={},
                    cited=candidate.title.lower() in doc.citations))
        logging.info("Done! Found %s predictions." % len(predictions))
        return predictions
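
A hedged end-to-end sketch of predict, assuming an APIModel wired with a corpus and ANN as in Example #3 (the query id is a placeholder):

query = corpus['QUERY_PAPER_ID']  # placeholder document id
for pred in api_model.predict(query, top_n=10):
    print(pred.position, round(pred.score, 3), pred.document.title, pred.pdf)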
Example #7
    def train_and_evaluate(self, eval_params):
        # Needed especially for hyperopt runs
        K.clear_session()

        model_kw = {
            name: getattr(self, name)
            for name in ModelOptions.class_traits().keys()
        }
        model_kw.update(eval_params)
        model_options = ModelOptions(**model_kw)

        if model_options.use_metadata:
            model_options.use_keyphrases = True
            model_options.use_authors = True
            model_options.use_venue = True

        print("====== OPTIONS =====")
        print(model_options)
        print("======")

        if model_options.train_for_test_set:
            logging.info(
                "\n\n============== TRAINING FOR TEST SET =============\n\n")

        training_outputs = end_to_end_training(model_options,
                                               self.dataset_type,
                                               self.models_dir,
                                               self.models_ann_dir)
        corpus, featurizer, model_options, citeomatic_model, embedding_model = training_outputs

        if self.candidate_selector_type == 'ann':
            # if no ann_dir is provided, then we use the model that was just trained
            # and have to rebuild the ANN
            if self.models_ann_dir is None:
                print(
                    'Using embedding model that was just trained for eval. Building...'
                )
                paper_embedding_model = EmbeddingModel(featurizer,
                                                       embedding_model)
                self.ann = ANN.build(paper_embedding_model, corpus)
            # if a dir is provided, then go ahead and load it
            else:
                featurizer_for_ann, ann_models = model_from_directory(
                    self.models_ann_dir, on_cpu=True)
                paper_embedding_model = EmbeddingModel(featurizer_for_ann,
                                                       ann_models['embedding'])
                # the ANN itself needs to be only built once
                if self.ann is None:
                    if corpus.corpus_type == 'oc' and os.path.exists(
                            DatasetPaths.OC_ANN_FILE + ".pickle"):
                        self.ann = ANN.load(DatasetPaths.OC_ANN_FILE)
                    else:
                        self.ann = ANN.build(paper_embedding_model, corpus)

            candidate_selector = ANNCandidateSelector(
                corpus=corpus,
                ann=self.ann,
                paper_embedding_model=paper_embedding_model,
                top_k=model_options.num_ann_nbrs_to_fetch,
                extend_candidate_citations=model_options.extend_candidate_citations)
        elif self.candidate_selector_type == 'bm25':
            dp = DatasetPaths()
            candidate_selector = BM25CandidateSelector(
                corpus=corpus,
                index_path=dp.get_bm25_index_path(self.dataset_type),
                top_k=model_options.num_ann_nbrs_to_fetch,
                extend_candidate_citations=model_options.extend_candidate_citations)
        else:
            # Unreachable for the supported selector types; fail loudly just in case.
            raise ValueError(
                'Unknown candidate_selector_type: %s' % self.candidate_selector_type)

        if self.citation_ranker_type == 'neural':
            ranker = Ranker(
                corpus=corpus,
                featurizer=featurizer,
                citation_ranker=citeomatic_model,
                num_candidates_to_rank=model_options.num_candidates_to_rank)
        elif self.citation_ranker_type == 'none':
            ranker = NoneRanker()
        else:
            # Unreachable for the supported ranker types; fail loudly just in case.
            raise ValueError(
                'Unknown citation_ranker_type: %s' % self.citation_ranker_type)

        if self.mode != 'hyperopt' or model_options.total_samples == self.total_samples_secondary:
            results_training = eval_text_model(corpus,
                                               candidate_selector,
                                               ranker,
                                               papers_source='train',
                                               n_eval=self.n_eval)
        else:
            results_training = {}

        results_validation = eval_text_model(corpus,
                                             candidate_selector,
                                             ranker,
                                             papers_source='valid',
                                             n_eval=self.n_eval)

        logging.info("===== Validation Results ===== ")
        logging.info("Validation Precision\n\n{}".format(
            results_validation['precision_1']))
        logging.info("Validation Recall\n\n{}".format(
            results_validation['recall_1']))

        eval_key = EVAL_DATASET_KEYS[self.dataset_type]
        p = results_validation['precision_1'][eval_key]
        r = results_validation['recall_1'][eval_key]
        f1 = results_validation['f1_1'][eval_key]

        if self.model_name == PAPER_EMBEDDING_MODEL:
            # The paper-embedding model is optimized for recall
            loss = -r
        else:
            # The full model is optimized for F1
            loss = -f1

        out = {
            'loss': loss,  # negated because hyperopt minimizes the objective
            'losses_training': results_training,
            'losses_validation': results_validation,
            'status': STATUS_FAIL if np.isnan(f1) else STATUS_OK,
            'params': eval_params
        }

        return out
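
The returned dict follows hyperopt's objective protocol ('loss', 'status', plus arbitrary extra keys), so train_and_evaluate can be passed straight to fmin. A sketch with a hypothetical driver instance and search space:

from hyperopt import Trials, fmin, hp, tpe

trials = Trials()
best = fmin(
    fn=driver.train_and_evaluate,               # `driver`: hypothetical instance of this class
    space={'lr': hp.loguniform('lr', -8, -2)},  # hypothetical hyperparameter space
    algo=tpe.suggest,
    max_evals=25,
    trials=trials)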