Example #1
    def run(self):
        featurizer = file_util.read_pickle(self.input()['featurizer'].path)
        corpus = Corpus.load(self.input()['corpus'].path)

        model_options = ModelOptions.load(self.model_config)
        model_options.n_authors = featurizer.n_authors
        model_options.n_features = featurizer.n_features

        citeomatic_model, embedding_model = train_text_model(
            corpus,
            featurizer,
            model_options,
            models_ann_dir=None,
            debug=False,
            tensorboard_dir=None
        )

        self.output().makedirs()
        citeomatic_model.save_weights(
            path.join(self.output().path, 'weights.h5'), overwrite=True
        )

        embedding_model.save_weights(
            path.join(self.output().path, 'embedding.h5'), overwrite=True
        )

        file_util.write_json(
            model_options.to_json(),
            path.join(self.output().path, 'options.json')
        )
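The run() method above follows the luigi Task pattern (self.input() / self.output()). A minimal driver sketch, assuming the surrounding class is a luigi.Task named TrainModel with a model_config parameter (both names hypothetical), whose requires() yields the 'featurizer' and 'corpus' targets read above:

import luigi

# Schedule the hypothetical TrainModel task on luigi's local scheduler.
luigi.build([TrainModel(model_config='/path/to/options.json')],
            local_scheduler=True)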
Example #2
    def test_pre_trained_layer(self):

        with h5py.File(EMBEDDINGS_FILE, 'r') as f:
            pretrained_embeddings = f['embedding'][...]

        options = ModelOptions()
        options.use_pretrained = True
        options.dense_dim = 300
        options.n_features = 200
        t_embedding_sum = TextEmbeddingSum(
            options=options,
            pretrained_embeddings=pretrained_embeddings,
            magnitudes_initializer='ones')

        embedding_model, outputs = t_embedding_sum.create_text_embedding_model(
            prefix='test', final_l2_norm=False)

        # randint is inclusive, so cap at n_features - 1 to stay in range
        idx = random.randint(0, options.n_features - 1)

        pred = embedding_model.predict(np.asarray([idx + 1]))[0]
        input_embedding = normalize(pretrained_embeddings[idx].reshape(1, -1))[0]
        assert all(map(almost_equal, pred, input_embedding))
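For reference, the assertion checks that the embedding of token id idx + 1 (index 0 being reserved for padding) reproduces the L2-normalized pretrained row idx. A standalone numpy restatement of that invariant, using a stand-in embedding matrix:

import numpy as np

emb = np.random.rand(200, 300)  # stand-in for pretrained_embeddings
idx = 42
expected = emb[idx] / np.linalg.norm(emb[idx])  # what normalize() returns
# With magnitudes initialized to ones, the model's output for token
# id idx + 1 should match expected element-wise (up to float tolerance).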
Example #3
    def setUpClass(cls):
        build_test_corpus('/tmp/foo.json', '/tmp/foo.sqlite')
        corpus = Corpus.load('/tmp/foo.sqlite')

        options = ModelOptions()

        featurizer = Featurizer(max_title_len=options.max_title_len,
                                max_abstract_len=options.max_abstract_len)
        featurizer.fit(corpus, max_df_frac=1.0)

        options.n_features = featurizer.n_features
        options.n_authors = featurizer.n_authors
        options.n_venues = featurizer.n_venues
        options.n_keyphrases = featurizer.n_keyphrases

        cls.corpus = corpus
        cls.featurizer = featurizer
        cls.options = options
Example #4
def model_from_directory(dirname: str, on_cpu=False) -> Tuple[Featurizer, Any]:
    dp = DatasetPaths()

    options_json = file_util.read_json(os.path.join(dirname, dp.OPTIONS_FILENAME))
    options = ModelOptions(**json.loads(options_json))

    featurizer_file_prefix = 'pretrained_' if options.use_pretrained else 'corpus_fit_'

    featurizer = file_util.read_pickle(
        os.path.join(dirname, featurizer_file_prefix +
                     dp.FEATURIZER_FILENAME))  # type: Featurizer

    options.n_authors = featurizer.n_authors
    options.n_features = featurizer.n_features
    options.n_venues = featurizer.n_venues
    options.n_keyphrases = featurizer.n_keyphrases
    create_model = import_from('citeomatic.models.%s' % options.model_name,
                               'create_model')
    if on_cpu:
        with tf.device('/cpu:0'):
            models = create_model(options)
    else:
        models = create_model(options)

    print("Loading model from %s " % dirname)
    print(models['citeomatic'].summary())
    if dirname.startswith('s3://'):
        models['citeomatic'].load_weights(
            file_util.cache_file(
                os.path.join(dirname, dp.CITEOMATIC_WEIGHTS_FILENAME)))
        models['embedding'].load_weights(
            file_util.cache_file(
                os.path.join(dirname, dp.EMBEDDING_WEIGHTS_FILENAME)))
    else:
        models['citeomatic'].load_weights(
            os.path.join(dirname, dp.CITEOMATIC_WEIGHTS_FILENAME))
        if models['embedding'] is not None:
            models['embedding'].load_weights(
                os.path.join(dirname, dp.EMBEDDING_WEIGHTS_FILENAME))
    return featurizer, models
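A minimal usage sketch, assuming a local directory previously written by end_to_end_training (Example #6) containing the options JSON, pickled featurizer, and weight files named per DatasetPaths:

featurizer, models = model_from_directory('/tmp/citeomatic-model', on_cpu=True)
citeomatic_model = models['citeomatic']  # the ranker network
embedding_model = models['embedding']    # may be None for some model types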
Example #5
    def main(self, args):
        if self.input_config_file is None:
            base_config = ModelOptions().to_json()
        else:
            with open(self.input_config_file) as f:
                base_config = json.load(f)

        changes_file_list = [
            ({
                'use_citations': False,
                'use_selector_confidence': False
            },
             "{}.citation_ranker.canonical-extra_features.options.json".format(
                 self.dataset_type)),
            ({
                'use_magdir': False
            }, "{}.citation_ranker.canonical-magdir.options.json".format(
                self.dataset_type)),
            ({
                'use_variable_margin': False
            }, "{}.citation_ranker.canonical-var_margin.options.json".format(
                self.dataset_type)),
            ({
                'use_metadata': False,
                'use_authors': False,
                'use_keyphrases': False,
                'use_venue': False,
            }, "{}.citation_ranker.canonical-metadata.options.json".format(
                self.dataset_type)),
            ({
                'use_src_tgt_embeddings': True
            }, "{}.citation_ranker.canonical-siamese.options.json".format(
                self.dataset_type)),
            ({
                'use_src_tgt_embeddings': False
            }, "{}.citation_ranker.canonical-non_siamese.options.json".format(
                self.dataset_type)),
            ({
                'use_pretrained': True,
                'enable_fine_tune': False
            },
             "{}.citation_ranker.canonical-pretrained_no_finetune.options.json"
             .format(self.dataset_type)),
            ({
                'use_pretrained': True,
                'enable_fine_tune': True
            },
             "{}.citation_ranker.canonical-pretrained_with_finetune.options.json"
             .format(self.dataset_type)),
            ({
                'use_sparse': False
            }, "{}.citation_ranker.canonical-sparse.options.json".format(
                self.dataset_type)),
            ({
                'batch_size': 512
            }, "{}.citation_ranker.canonical-large_batch.options.json".format(
                self.dataset_type)),
            ({
                'use_nn_negatives': False
            }, "{}.citation_ranker.canonical-nn_negatives.options.json".format(
                self.dataset_type)),
            ({
                'embedding_type': 'cnn2'
            }, "{}.citation_ranker.canonical+cnn.options.json".format(
                self.dataset_type))
        ]

        for change, filename in changes_file_list:
            self.write_change_to_file(filename=filename,
                                      base_options=base_config,
                                      change=change)
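write_change_to_file is not shown in this example. A plausible sketch, assuming it merges the overrides into the base options and dumps the result as JSON; note that base_config above is a JSON string in one branch and a dict in the other, so the sketch accepts both:

    def write_change_to_file(self, filename, base_options, change):
        # Accept either a JSON string or an already-parsed dict.
        options = (json.loads(base_options)
                   if isinstance(base_options, str) else dict(base_options))
        options.update(change)
        with open(filename, 'w') as f:
            json.dump(options, f, indent=2, sort_keys=True)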
Example #6
def end_to_end_training(model_options: ModelOptions,
                        dataset_type,
                        models_dir,
                        models_ann_dir=None):
    # step 1: make the directory
    if not os.path.exists(models_dir):
        os.makedirs(models_dir)

    # step 2: load the corpus DB
    print("Loading corpus db...")
    dp = DatasetPaths()
    db_file = dp.get_db_path(dataset_type)
    json_file = dp.get_json_path(dataset_type)
    if not os.path.isfile(db_file):
        print(
            "Have to build the database! This may take a while, but should only happen once."
        )
        Corpus.build(db_file, json_file)

    if dataset_type == 'oc':
        corpus = Corpus.load_pkl(dp.get_pkl_path(dataset_type))
    else:
        corpus = Corpus.load(db_file, model_options.train_frac)

    # step 3: load/make the featurizer (once per hyperopt run)
    print("Making feautrizer")
    featurizer_file_prefix = 'pretrained_' if model_options.use_pretrained else 'corpus_fit_'

    featurizer_file = os.path.join(
        models_dir, featurizer_file_prefix + dp.FEATURIZER_FILENAME)

    if os.path.isfile(featurizer_file):
        featurizer = file_util.read_pickle(featurizer_file)
    else:
        featurizer = Featurizer(
            max_features=model_options.max_features,
            max_title_len=model_options.max_title_len,
            max_abstract_len=model_options.max_abstract_len,
            use_pretrained=model_options.use_pretrained,
            min_author_papers=model_options.min_author_papers,
            min_venue_papers=model_options.min_venue_papers,
            min_keyphrase_papers=model_options.min_keyphrase_papers)
        featurizer.fit(corpus,
                       is_featurizer_for_test=model_options.train_for_test_set)
        file_util.write_pickle(featurizer_file, featurizer)

    # update model options after featurization
    model_options.n_authors = featurizer.n_authors
    model_options.n_venues = featurizer.n_venues
    model_options.n_keyphrases = featurizer.n_keyphrases
    model_options.n_features = featurizer.n_features
    if model_options.use_pretrained:
        model_options.dense_dim = model_options.dense_dim_pretrained

    # step 4: train the model
    citeomatic_model, embedding_model = train_text_model(
        corpus,
        featurizer,
        model_options,
        models_ann_dir=models_ann_dir,
        debug=True,
        tensorboard_dir=None)

    # step 5: save the model
    citeomatic_model.save_weights(os.path.join(models_dir,
                                               dp.CITEOMATIC_WEIGHTS_FILENAME),
                                  overwrite=True)

    if embedding_model is not None:
        embedding_model.save_weights(os.path.join(
            models_dir, dp.EMBEDDING_WEIGHTS_FILENAME),
                                     overwrite=True)

    file_util.write_json(
        os.path.join(models_dir, dp.OPTIONS_FILENAME),
        model_options.to_json(),
    )

    return corpus, featurizer, model_options, citeomatic_model, embedding_model
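A minimal driver sketch for the pipeline above, assuming the 'dblp' dataset (used here as an example alongside the 'oc' type referenced in the code) has already been prepared under DatasetPaths:

opts = ModelOptions()
corpus, featurizer, opts, model, embedding_model = end_to_end_training(
    model_options=opts,
    dataset_type='dblp',
    models_dir='/tmp/citeomatic-model')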
Example #7
def train_text_model(
    corpus: Corpus,
    featurizer: Featurizer,
    model_options: ModelOptions,
    models_ann_dir=None,
    debug=False,
    tensorboard_dir=None,
):
    """
    Utility function for training citeomatic models.
    """

    # load pretrained embeddings
    if model_options.use_pretrained:
        dp = DatasetPaths()
        pretrained_embeddings_file = dp.embeddings_weights_for_corpus('shared')
        with h5py.File(pretrained_embeddings_file, 'r') as f:
            pretrained_embeddings = f['embedding'][...]
    else:
        pretrained_embeddings = None

    create_model = import_from(
        'citeomatic.models.%s' % model_options.model_name, 'create_model')
    models = create_model(model_options, pretrained_embeddings)
    model, embedding_model = models['citeomatic'], models['embedding']

    model.summary(print_fn=logging.info)

    if model_options.train_for_test_set:
        paper_ids_for_training = corpus.train_ids + corpus.valid_ids
        candidates_for_training = corpus.train_ids + corpus.valid_ids + corpus.test_ids
    else:
        paper_ids_for_training = corpus.train_ids
        candidates_for_training = corpus.train_ids + corpus.valid_ids

    training_dg = DataGenerator(
        corpus=corpus,
        featurizer=featurizer,
        margin_multiplier=model_options.margin_multiplier,
        use_variable_margin=model_options.use_variable_margin)
    training_generator = training_dg.triplet_generator(
        paper_ids=paper_ids_for_training,
        candidate_ids=candidates_for_training,
        batch_size=model_options.batch_size,
        neg_to_pos_ratio=model_options.neg_to_pos_ratio)

    validation_dg = DataGenerator(
        corpus=corpus,
        featurizer=featurizer,
        margin_multiplier=model_options.margin_multiplier,
        use_variable_margin=model_options.use_variable_margin)
    validation_generator = validation_dg.triplet_generator(
        paper_ids=corpus.valid_ids,
        candidate_ids=corpus.train_ids + corpus.valid_ids,
        batch_size=1024,
        neg_to_pos_ratio=model_options.neg_to_pos_ratio)

    if model_options.optimizer == 'tfopt':
        optimizer = TFOptimizer(
            tf.contrib.opt.LazyAdamOptimizer(learning_rate=model_options.lr))
    else:
        optimizer = import_from('keras.optimizers',
                                model_options.optimizer)(lr=model_options.lr)

    model.compile(optimizer=optimizer, loss=layers.triplet_loss)

    # training calculation
    model_options.samples_per_epoch = int(
        np.minimum(model_options.samples_per_epoch,
                   model_options.total_samples))
    epochs = int(
        np.ceil(model_options.total_samples / model_options.samples_per_epoch))
    steps_per_epoch = int(model_options.samples_per_epoch /
                          model_options.batch_size)
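    # e.g. with total_samples=1_000_000, samples_per_epoch=100_000 and
    # batch_size=1024: epochs = ceil(10.0) = 10, steps_per_epoch = 97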

    # callbacks
    callbacks_list = []
    if debug:
        callbacks_list.append(MemoryUsageCallback())
    if model_options.tb_dir is not None:
        callbacks_list.append(
            TensorBoard(log_dir=model_options.tb_dir,
                        histogram_freq=1,
                        write_graph=True))
    if model_options.reduce_lr_flag:
        if model_options.optimizer != 'tfopt':
            callbacks_list.append(
                ReduceLROnPlateau(verbose=1,
                                  patience=2,
                                  epsilon=0.01,
                                  min_lr=1e-6,
                                  factor=0.5))

    if models_ann_dir is None:
        ann_featurizer = featurizer
        paper_embedding_model = embedding_model
        embed_at_epoch_end = True
        embed_at_train_begin = False
    else:
        ann_featurizer, ann_models = model_from_directory(models_ann_dir,
                                                          on_cpu=True)
        paper_embedding_model = ann_models['embedding']
        paper_embedding_model._make_predict_function()
        embed_at_epoch_end = False
        embed_at_train_begin = True
    callbacks_list.append(
        UpdateANN(corpus, ann_featurizer, paper_embedding_model, training_dg,
                  validation_dg, embed_at_epoch_end, embed_at_train_begin))

    if model_options.tb_dir is None:
        validation_data = validation_generator
    else:
        validation_data = next(validation_generator)

    # train; validation_data was selected above based on tb_dir
    model.fit_generator(generator=training_generator,
                        steps_per_epoch=steps_per_epoch,
                        epochs=epochs,
                        callbacks=callbacks_list,
                        validation_data=validation_data,
                        validation_steps=10)

    return model, embedding_model
Example #8
# torch and AllenNLP 0.x imports (assumed from the API used below)
import torch
from allennlp.data.iterators import BasicIterator
from allennlp.data.vocabulary import Vocabulary
from allennlp.nn.util import move_optimizer_to_cuda
from allennlp.training.trainer import Trainer
from citeomatic.toydatasetreader import ToyDatasetReader
from citeomatic.testreader import TestReader
from citeomatic.models.text_embedding import Text_Embedding
from citeomatic.models.paper_embedding import Paper_Embedding
from citeomatic.models.embeddingmodel import EmbeddingModel
from citeomatic.models.options import ModelOptions
from citeomatic.models.citationranker import CitationRanker

# just using ToyDatasetReader to build the vocab
reader = ToyDatasetReader()
dataset = reader.read("")
vocab = Vocabulary.from_instances(dataset)

print(vocab.get_vocab_size())

opts = ModelOptions()

reader = TestReader(vocab)
reader.set_compute_nnrank_features(True)
dataset = reader.read("")
text_embedder = Text_Embedding(opts, vocab)

nnrank = CitationRanker(vocab, opts, text_embedder)

iterator = BasicIterator()
iterator.index_with(vocab)

optimizer = torch.optim.SGD(nnrank.parameters(), lr=0.1)
move_optimizer_to_cuda(optimizer)

trainer = Trainer(model=nnrank,
Example #9
    def train_and_evaluate(self, eval_params):
        # Needed especially for hyperopt runs
        K.clear_session()

        model_kw = {
            name: getattr(self, name)
            for name in ModelOptions.class_traits().keys()
        }
        model_kw.update(eval_params)
        model_options = ModelOptions(**model_kw)

        if model_options.use_metadata:
            model_options.use_keyphrases = True
            model_options.use_authors = True
            model_options.use_venue = True

        print("====== OPTIONS =====")
        print(model_options)
        print("======")

        if model_options.train_for_test_set:
            logging.info(
                "\n\n============== TRAINING FOR TEST SET =============\n\n")

        training_outputs = end_to_end_training(model_options,
                                               self.dataset_type,
                                               self.models_dir,
                                               self.models_ann_dir)
        corpus, featurizer, model_options, citeomatic_model, embedding_model = training_outputs

        if self.candidate_selector_type == 'ann':
            # if no ann_dir is provided, then we use the model that was just trained
            # and have to rebuild the ANN
            if self.models_ann_dir is None:
                print(
                    'Using embedding model that was just trained for eval. Building...'
                )
                paper_embedding_model = EmbeddingModel(featurizer,
                                                       embedding_model)
                self.ann = ANN.build(paper_embedding_model, corpus)
            # if a dir is provided, then go ahead and load it
            else:
                featurizer_for_ann, ann_models = model_from_directory(
                    self.models_ann_dir, on_cpu=True)
                paper_embedding_model = EmbeddingModel(featurizer_for_ann,
                                                       ann_models['embedding'])
                # the ANN itself needs to be only built once
                if self.ann is None:
                    if corpus.corpus_type == 'oc' and os.path.exists(
                            DatasetPaths.OC_ANN_FILE + ".pickle"):
                        self.ann = ANN.load(DatasetPaths.OC_ANN_FILE)
                    else:
                        self.ann = ANN.build(paper_embedding_model, corpus)

            candidate_selector = ANNCandidateSelector(
                corpus=corpus,
                ann=self.ann,
                paper_embedding_model=paper_embedding_model,
                top_k=model_options.num_ann_nbrs_to_fetch,
                extend_candidate_citations=model_options.extend_candidate_citations)
        elif self.candidate_selector_type == 'bm25':
            dp = DatasetPaths()
            candidate_selector = BM25CandidateSelector(
                corpus=corpus,
                index_path=dp.get_bm25_index_path(self.dataset_type),
                top_k=model_options.num_ann_nbrs_to_fetch,
                extend_candidate_citations=model_options.extend_candidate_citations)
        else:
            # Should never get here; assert to satisfy static analysis.
            assert False

        if self.citation_ranker_type == 'neural':
            ranker = Ranker(
                corpus=corpus,
                featurizer=featurizer,
                citation_ranker=citeomatic_model,
                num_candidates_to_rank=model_options.num_candidates_to_rank)
        elif self.citation_ranker_type == 'none':
            ranker = NoneRanker()
        else:
            # Should never get here; assert to satisfy static analysis.
            assert False

        if self.mode != 'hyperopt' or model_options.total_samples == self.total_samples_secondary:
            results_training = eval_text_model(corpus,
                                               candidate_selector,
                                               ranker,
                                               papers_source='train',
                                               n_eval=self.n_eval)
        else:
            results_training = {}

        results_validation = eval_text_model(corpus,
                                             candidate_selector,
                                             ranker,
                                             papers_source='valid',
                                             n_eval=self.n_eval)

        logging.info("===== Validation Results ===== ")
        logging.info("Validation Precision\n\n{}".format(
            results_validation['precision_1']))
        logging.info("Validation Recall\n\n{}".format(
            results_validation['recall_1']))

        p = results_validation['precision_1'][EVAL_DATASET_KEYS[
            self.dataset_type]]
        r = results_validation['recall_1'][EVAL_DATASET_KEYS[
            self.dataset_type]]
        f1 = results_validation['f1_1'][EVAL_DATASET_KEYS[self.dataset_type]]

        if self.model_name == PAPER_EMBEDDING_MODEL:
            # optimizing for recall; negate because hyperopt minimizes
            loss = -r
        else:
            # optimizing for F1; negate because hyperopt minimizes
            loss = -f1

        out = {
            'loss': loss,
            'losses_training': results_training,
            'losses_validation': results_validation,
            'status': STATUS_FAIL if np.isnan(f1) else STATUS_OK,
            'params': eval_params
        }

        return out
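The returned dict follows hyperopt's objective convention ('loss', 'status', plus any extra bookkeeping keys). A minimal sketch of plugging train_and_evaluate into hyperopt's fmin, with a hypothetical search space over the learning rate (cmd stands in for an instance of the surrounding class):

import numpy as np
from hyperopt import Trials, fmin, hp, tpe

# hypothetical search space; any ModelOptions trait could be swept instead
space = {'lr': hp.loguniform('lr', np.log(1e-5), np.log(1e-2))}
trials = Trials()
best = fmin(fn=cmd.train_and_evaluate,
            space=space,
            algo=tpe.suggest,
            max_evals=25,
            trials=trials)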