def run(self):
    """Train a citeomatic model and persist its artifacts to the output dir.

    Reads the pickled featurizer and corpus produced by upstream tasks,
    configures ModelOptions from the featurizer's discovered sizes, trains,
    then writes:
      - weights.h5    (full citeomatic model weights)
      - embedding.h5  (embedding sub-model weights)
      - options.json  (serialized ModelOptions)
    """
    featurizer = file_util.read_pickle(self.input()['featurizer'].path)

    # BUG FIX: the original assigned to a local named `corpus`, which shadowed
    # the `corpus` module and raised UnboundLocalError on the RHS lookup.
    # Use a distinct local name instead.
    training_corpus = corpus.Corpus.load(self.input()['corpus'].path)

    model_options = ModelOptions.load(self.model_config)
    # The featurizer fixes the vocabulary/author table sizes for the model.
    model_options.n_authors = featurizer.n_authors
    model_options.n_features = featurizer.n_features

    citeomatic_model, embedding_model = train_text_model(
        training_corpus,
        featurizer,
        model_options,
        embedding_model_for_ann=None,
        debug=False,
        tensorboard_dir=None
    )

    self.output().makedirs()
    citeomatic_model.save_weights(
        path.join(self.output().path, 'weights.h5'), overwrite=True
    )
    embedding_model.save_weights(
        path.join(self.output().path, 'embedding.h5'), overwrite=True
    )
    # BUG FIX: write_json takes (path, payload) — argument order corrected to
    # match the call convention used in end_to_end_training elsewhere in this
    # codebase (the original passed the JSON payload as the filename).
    file_util.write_json(
        path.join(self.output().path, 'options.json'),
        model_options.to_json(),
    )
def test_pre_trained_layer(self):
    """TextEmbeddingSum seeded with pretrained vectors should reproduce them.

    Pretrained row i maps to embedding index i + 1 (index 0 appears to be
    reserved for padding — the prediction uses `idx + 1`), so predicting
    index idx + 1 should return the L2-normalized pretrained row idx.
    """
    with h5py.File(EMBEDDINGS_FILE, 'r') as f:
        pretrained_embeddings = f['embedding'][...]

    options = ModelOptions()
    options.use_pretrained = True
    options.dense_dim = 300
    options.n_features = 200

    t_embedding_sum = TextEmbeddingSum(
        options=options,
        pretrained_embeddings=pretrained_embeddings,
        magnitudes_initializer='ones')
    embedding_model, outputs = t_embedding_sum.create_text_embedding_model(
        prefix='test', final_l2_norm=False)

    # BUG FIX: random.randint is inclusive on BOTH ends. With n_features ==
    # 200 the valid pretrained rows are 0..199, so randint(0, 200) could draw
    # idx == 200 and index one past the end of the table.
    idx = random.randint(0, options.n_features - 1)
    pred = embedding_model.predict(np.asarray([idx + 1]))[0]
    input_embedding = normalize(pretrained_embeddings[idx].reshape(1, -1))[0]
    assert all(map(almost_equal, pred, input_embedding))
def setUpClass(cls):
    """Build a small test corpus and a fitted featurizer once for the class.

    Stores the corpus, featurizer, and size-populated ModelOptions on the
    class so individual tests can share them.
    """
    build_test_corpus('/tmp/foo.json', '/tmp/foo.sqlite')
    test_corpus = Corpus.load('/tmp/foo.sqlite')

    opts = ModelOptions(**{})

    feats = Featurizer(
        max_title_len=opts.max_title_len,
        max_abstract_len=opts.max_abstract_len)
    feats.fit(test_corpus, max_df_frac=1.0)

    # Propagate the vocabulary/entity sizes discovered during featurization.
    opts.n_features = feats.n_features
    opts.n_authors = feats.n_authors
    opts.n_venues = feats.n_venues
    opts.n_keyphrases = feats.n_keyphrases

    cls.corpus = test_corpus
    cls.featurizer = feats
    cls.options = opts
def model_from_directory(dirname: str, on_cpu=False) -> Tuple[Featurizer, Any]:
    """Load a featurizer and trained citeomatic models from a directory.

    Reads the saved ModelOptions, unpickles the matching featurizer
    (pretrained- vs corpus-fit variant), rebuilds the model architecture via
    the model module named in the options, and loads saved weights. Supports
    both local paths and s3:// URIs (the latter are cached locally first).

    :param dirname: directory (or s3:// URI) containing saved model artifacts
    :param on_cpu: if True, build the model graph pinned to /cpu:0
    :return: (featurizer, dict of models with keys 'citeomatic'/'embedding')
    """
    dp = DatasetPaths()

    options_json = file_util.read_json(
        os.path.join(dirname, dp.OPTIONS_FILENAME),
    )
    options = ModelOptions(**json.loads(options_json))

    featurizer_file_prefix = 'pretrained_' if options.use_pretrained else 'corpus_fit_'
    featurizer = file_util.read_pickle(
        os.path.join(dirname,
                     featurizer_file_prefix + dp.FEATURIZER_FILENAME))  # type: Featurizer

    # Sizes come from the featurizer, not the serialized options.
    options.n_authors = featurizer.n_authors
    options.n_features = featurizer.n_features
    options.n_venues = featurizer.n_venues
    options.n_keyphrases = featurizer.n_keyphrases

    create_model = import_from('citeomatic.models.%s' % options.model_name,
                               'create_model')
    if on_cpu:
        with tf.device('/cpu:0'):
            models = create_model(options)
    else:
        models = create_model(options)

    print("Loading model from %s " % dirname)
    print(models['citeomatic'].summary())
    if dirname.startswith('s3://'):
        models['citeomatic'].load_weights(
            file_util.cache_file(
                os.path.join(dirname, dp.CITEOMATIC_WEIGHTS_FILENAME)))
        # BUG FIX: guard against a missing embedding sub-model, mirroring the
        # local-path branch below; previously the s3 path crashed when
        # models['embedding'] was None.
        if models['embedding'] is not None:
            models['embedding'].load_weights(
                file_util.cache_file(
                    os.path.join(dirname, dp.EMBEDDING_WEIGHTS_FILENAME)))
    else:
        models['citeomatic'].load_weights(
            os.path.join(dirname, dp.CITEOMATIC_WEIGHTS_FILENAME))
        if models['embedding'] is not None:
            models['embedding'].load_weights(
                os.path.join(dirname, dp.EMBEDDING_WEIGHTS_FILENAME))
    return featurizer, models
def main(self, args):
    """Emit one options-JSON file per ablation experiment.

    Starts from either the default ModelOptions serialization or a
    user-supplied base config, then writes a series of variants, each
    toggling a small set of options, to canonically named files.
    """
    if self.input_config_file is None:
        base_config = ModelOptions().to_json()
    else:
        base_config = json.load(open(self.input_config_file))

    dataset_type = self.dataset_type
    # Each entry pairs the option overrides with its output filename.
    ablations = [
        ({'use_citations': False, 'use_selector_confidence': False},
         "{}.citation_ranker.canonical-extra_features.options.json".format(
             dataset_type)),
        ({'use_magdir': False},
         "{}.citation_ranker.canonical-magdir.options.json".format(
             dataset_type)),
        ({'use_variable_margin': False},
         "{}.citation_ranker.canonical-var_margin.options.json".format(
             dataset_type)),
        ({'use_metadata': False,
          'use_authors': False,
          'use_keyphrases': False,
          'use_venue': False},
         "{}.citation_ranker.canonical-metadata.options.json".format(
             dataset_type)),
        ({'use_src_tgt_embeddings': True},
         "{}.citation_ranker.canonical-siamese.options.json".format(
             dataset_type)),
        ({'use_src_tgt_embeddings': False},
         "{}.citation_ranker.canonical-non_siamese.options.json".format(
             dataset_type)),
        ({'use_pretrained': True, 'enable_fine_tune': False},
         "{}.citation_ranker.canonical-pretrained_no_finetune.options.json"
         .format(dataset_type)),
        ({'use_pretrained': True, 'enable_fine_tune': True},
         "{}.citation_ranker.canonical-pretrained_with_finetune.options.json"
         .format(dataset_type)),
        ({'use_sparse': False},
         "{}.citation_ranker.canonical-sparse.options.json".format(
             dataset_type)),
        ({'batch_size': 512},
         "{}.citation_ranker.canonical-large_batch.options.json".format(
             dataset_type)),
        ({'use_nn_negatives': False},
         "{}.citation_ranker.canonical-nn_negatives.options.json".format(
             dataset_type)),
        ({'embedding_type': 'cnn2'},
         "{}.citation_ranker.canonical+cnn.options.json".format(
             dataset_type)),
    ]

    for overrides, out_filename in ablations:
        self.write_change_to_file(filename=out_filename,
                                  base_options=base_config,
                                  change=overrides)
def end_to_end_training(model_options: ModelOptions, dataset_type, models_dir,
                        models_ann_dir=None):
    """Train a citeomatic model end-to-end and save all artifacts.

    Steps: (1) create the output directory, (2) load or build the corpus DB,
    (3) load a cached featurizer from disk or fit a new one, (4) train,
    (5) save weights and options.

    :param model_options: hyperparameters/config; mutated in place with the
        sizes discovered during featurization
    :param dataset_type: corpus identifier (e.g. 'oc' selects the pickled
        corpus variant)
    :param models_dir: output directory for weights/options/featurizer
    :param models_ann_dir: optional directory of a pre-trained ANN model
    :return: (corpus, featurizer, model_options, citeomatic_model,
        embedding_model)
    """
    # step 1: make the directory
    if not os.path.exists(models_dir):
        os.makedirs(models_dir)

    # step 2: load the corpus DB
    print("Loading corpus db...")
    dp = DatasetPaths()
    db_file = dp.get_db_path(dataset_type)
    json_file = dp.get_json_path(dataset_type)
    if not os.path.isfile(db_file):
        print(
            "Have to build the database! This may take a while, but should only happen once."
        )
        Corpus.build(db_file, json_file)

    if dataset_type == 'oc':
        corpus = Corpus.load_pkl(dp.get_pkl_path(dataset_type))
    else:
        corpus = Corpus.load(db_file, model_options.train_frac)

    # step 3: load/make the featurizer (once per hyperopt run)
    print("Making featurizer")  # fixed typo in log message ("feautrizer")
    featurizer_file_prefix = 'pretrained_' if model_options.use_pretrained else 'corpus_fit_'

    featurizer_file = os.path.join(
        models_dir, featurizer_file_prefix + dp.FEATURIZER_FILENAME)
    if os.path.isfile(featurizer_file):
        # Re-use a previously fitted featurizer — fitting is expensive.
        featurizer = file_util.read_pickle(featurizer_file)
    else:
        featurizer = Featurizer(
            max_features=model_options.max_features,
            max_title_len=model_options.max_title_len,
            max_abstract_len=model_options.max_abstract_len,
            use_pretrained=model_options.use_pretrained,
            min_author_papers=model_options.min_author_papers,
            min_venue_papers=model_options.min_venue_papers,
            min_keyphrase_papers=model_options.min_keyphrase_papers)
        featurizer.fit(
            corpus, is_featurizer_for_test=model_options.train_for_test_set)
        file_util.write_pickle(featurizer_file, featurizer)

    # update model options after featurization
    model_options.n_authors = featurizer.n_authors
    model_options.n_venues = featurizer.n_venues
    model_options.n_keyphrases = featurizer.n_keyphrases
    model_options.n_features = featurizer.n_features
    if model_options.use_pretrained:
        # Pretrained embeddings dictate the dense dimension.
        model_options.dense_dim = model_options.dense_dim_pretrained

    # step 4: train the model
    citeomatic_model, embedding_model = train_text_model(
        corpus,
        featurizer,
        model_options,
        models_ann_dir=models_ann_dir,
        debug=True,
        tensorboard_dir=None)

    # step 5: save the model
    citeomatic_model.save_weights(os.path.join(
        models_dir, dp.CITEOMATIC_WEIGHTS_FILENAME),
                                  overwrite=True)

    if embedding_model is not None:
        embedding_model.save_weights(os.path.join(
            models_dir, dp.EMBEDDING_WEIGHTS_FILENAME),
                                     overwrite=True)

    file_util.write_json(
        os.path.join(models_dir, dp.OPTIONS_FILENAME),
        model_options.to_json(),
    )

    return corpus, featurizer, model_options, citeomatic_model, embedding_model
def train_text_model(
    corpus: Corpus,
    featurizer: Featurizer,
    model_options: ModelOptions,
    models_ann_dir=None,
    debug=False,
    tensorboard_dir=None,
):
    """
    Utility function for training citeomatic models.

    Builds the model named in `model_options`, wires up triplet data
    generators for training/validation, configures callbacks (memory
    profiling, TensorBoard, LR reduction, ANN refresh), and fits the model.

    :param corpus: paper corpus with train/valid/test id splits
    :param featurizer: fitted Featurizer matching the corpus
    :param model_options: training/architecture hyperparameters
    :param models_ann_dir: optional directory of a pre-trained model whose
        embedding is used (frozen) for ANN-based negative mining
    :param debug: if True, adds a memory-usage callback
    :param tensorboard_dir: unused here; TensorBoard comes from
        model_options.tb_dir (NOTE(review): parameter looks vestigial —
        confirm callers)
    :return: (citeomatic model, embedding model)
    """
    # load pretrained embeddings
    if model_options.use_pretrained:
        dp = DatasetPaths()
        pretrained_embeddings_file = dp.embeddings_weights_for_corpus('shared')
        with h5py.File(pretrained_embeddings_file, 'r') as f:
            pretrained_embeddings = f['embedding'][...]
    else:
        pretrained_embeddings = None

    create_model = import_from(
        'citeomatic.models.%s' % model_options.model_name, 'create_model')
    models = create_model(model_options, pretrained_embeddings)
    model, embedding_model = models['citeomatic'], models['embedding']

    logging.info(model.summary())

    if model_options.train_for_test_set:
        # Fold validation papers into training when fitting the final model.
        paper_ids_for_training = corpus.train_ids + corpus.valid_ids
        candidates_for_training = corpus.train_ids + corpus.valid_ids + corpus.test_ids
    else:
        paper_ids_for_training = corpus.train_ids
        candidates_for_training = corpus.train_ids + corpus.valid_ids

    training_dg = DataGenerator(
        corpus=corpus,
        featurizer=featurizer,
        margin_multiplier=model_options.margin_multiplier,
        use_variable_margin=model_options.use_variable_margin)
    training_generator = training_dg.triplet_generator(
        paper_ids=paper_ids_for_training,
        candidate_ids=candidates_for_training,
        batch_size=model_options.batch_size,
        neg_to_pos_ratio=model_options.neg_to_pos_ratio)

    validation_dg = DataGenerator(
        corpus=corpus,
        featurizer=featurizer,
        margin_multiplier=model_options.margin_multiplier,
        use_variable_margin=model_options.use_variable_margin)
    validation_generator = validation_dg.triplet_generator(
        paper_ids=corpus.valid_ids,
        candidate_ids=corpus.train_ids + corpus.valid_ids,
        batch_size=1024,
        neg_to_pos_ratio=model_options.neg_to_pos_ratio)

    if model_options.optimizer == 'tfopt':
        optimizer = TFOptimizer(
            tf.contrib.opt.LazyAdamOptimizer(learning_rate=model_options.lr))
    else:
        optimizer = import_from('keras.optimizers',
                                model_options.optimizer)(lr=model_options.lr)

    model.compile(optimizer=optimizer, loss=layers.triplet_loss)

    # training calculation
    model_options.samples_per_epoch = int(
        np.minimum(model_options.samples_per_epoch,
                   model_options.total_samples))
    epochs = int(
        np.ceil(model_options.total_samples /
                model_options.samples_per_epoch))
    steps_per_epoch = int(model_options.samples_per_epoch /
                          model_options.batch_size)

    # callbacks
    callbacks_list = []
    if debug:
        callbacks_list.append(MemoryUsageCallback())
    if model_options.tb_dir is not None:
        callbacks_list.append(
            TensorBoard(log_dir=model_options.tb_dir,
                        histogram_freq=1,
                        write_graph=True))
    if model_options.reduce_lr_flag:
        # TFOptimizer does not expose `lr`, so ReduceLROnPlateau cannot drive it.
        if model_options.optimizer != 'tfopt':
            callbacks_list.append(
                ReduceLROnPlateau(verbose=1,
                                  patience=2,
                                  epsilon=0.01,
                                  min_lr=1e-6,
                                  factor=0.5))

    if models_ann_dir is None:
        # No external ANN model: embed with the model being trained, and
        # refresh the ANN index at the end of every epoch.
        ann_featurizer = featurizer
        paper_embedding_model = embedding_model
        embed_at_epoch_end = True
        embed_at_train_begin = False
    else:
        # Frozen external embedding model: embed once at the start of training.
        ann_featurizer, ann_models = model_from_directory(models_ann_dir,
                                                          on_cpu=True)
        paper_embedding_model = ann_models['embedding']
        paper_embedding_model._make_predict_function()
        embed_at_epoch_end = False
        embed_at_train_begin = True
    callbacks_list.append(
        UpdateANN(corpus, ann_featurizer, paper_embedding_model, training_dg,
                  validation_dg, embed_at_epoch_end, embed_at_train_begin))

    if model_options.tb_dir is None:
        validation_data = validation_generator
    else:
        # TensorBoard histograms require concrete validation data, not a
        # generator — materialize a single batch.
        validation_data = next(validation_generator)

    # logic
    # BUG FIX: pass the `validation_data` computed above; the original passed
    # the raw `validation_generator`, silently ignoring the tb_dir branch.
    model.fit_generator(generator=training_generator,
                        steps_per_epoch=steps_per_epoch,
                        epochs=epochs,
                        callbacks=callbacks_list,
                        validation_data=validation_data,
                        validation_steps=10)

    return model, embedding_model
from citeomatic.toydatasetreader import ToyDatasetReader from citeomatic.testreader import TestReader from citeomatic.models.text_embedding import Text_Embedding from citeomatic.models.paper_embedding import Paper_Embedding from citeomatic.models.embeddingmodel import EmbeddingModel from citeomatic.models.options import ModelOptions from citeomatic.models.citationranker import CitationRanker #just using toydatasetreader to build vocab reader = ToyDatasetReader() dataset = reader.read("") vocab = Vocabulary.from_instances(dataset) print(vocab.get_vocab_size()) opts = ModelOptions() reader = TestReader(vocab) reader.set_compute_nnrank_features(True) dataset = reader.read("") text_embedder = Text_Embedding(opts, vocab) nnrank = CitationRanker(vocab, opts, text_embedder) iterator = BasicIterator() iterator.index_with(vocab) optimizer = torch.optim.SGD(nnrank.parameters(), lr=0.1) move_optimizer_to_cuda(optimizer) trainer = Trainer(model=nnrank,
def train_and_evaluate(self, eval_params):
    """Train one model configuration, then evaluate it for hyperopt.

    Builds ModelOptions from this object's traits overridden by
    `eval_params`, runs end-to-end training, constructs the configured
    candidate selector (ANN or BM25) and ranker (neural or none), evaluates
    on the train and validation splits, and returns a hyperopt-style result
    dict whose 'loss' is the negated recall (paper-embedding model) or
    negated F1 (otherwise).

    Side effects: may build and cache an ANN index on `self.ann`.
    """
    # Needed especially for hyperopt runs
    K.clear_session()

    # Start from this object's trait values, then apply the trial's overrides.
    model_kw = {
        name: getattr(self, name)
        for name in ModelOptions.class_traits().keys()
    }
    model_kw.update(eval_params)
    model_options = ModelOptions(**model_kw)

    # use_metadata is a master switch for the individual metadata fields.
    if model_options.use_metadata:
        model_options.use_keyphrases = True
        model_options.use_authors = True
        model_options.use_venue = True

    print("====== OPTIONS =====")
    print(model_options)
    print("======")

    if model_options.train_for_test_set:
        logging.info(
            "\n\n============== TRAINING FOR TEST SET =============\n\n")

    training_outputs = end_to_end_training(model_options, self.dataset_type,
                                           self.models_dir,
                                           self.models_ann_dir)
    corpus, featurizer, model_options, citeomatic_model, embedding_model = training_outputs

    if self.candidate_selector_type == 'ann':
        # if no ann_dir is provided, then we use the model that was just trained
        # and have to rebuild the ANN
        if self.models_ann_dir is None:
            print(
                'Using embedding model that was just trained for eval. Building...'
            )
            paper_embedding_model = EmbeddingModel(featurizer,
                                                   embedding_model)
            self.ann = ANN.build(paper_embedding_model, corpus)
        # if a dir is provided, then go ahead and load it
        else:
            featurizer_for_ann, ann_models = model_from_directory(
                self.models_ann_dir, on_cpu=True)
            paper_embedding_model = EmbeddingModel(featurizer_for_ann,
                                                   ann_models['embedding'])
            # the ANN itself needs to be only built once
            if self.ann is None:
                # For the 'oc' corpus, prefer a pre-built on-disk ANN index.
                if corpus.corpus_type == 'oc' and os.path.exists(
                        DatasetPaths.OC_ANN_FILE + ".pickle"):
                    self.ann = ANN.load(DatasetPaths.OC_ANN_FILE)
                else:
                    self.ann = ANN.build(paper_embedding_model, corpus)

        candidate_selector = ANNCandidateSelector(
            corpus=corpus,
            ann=self.ann,
            paper_embedding_model=paper_embedding_model,
            top_k=model_options.num_ann_nbrs_to_fetch,
            extend_candidate_citations=model_options.
            extend_candidate_citations)
    elif self.candidate_selector_type == 'bm25':
        dp = DatasetPaths()
        candidate_selector = BM25CandidateSelector(
            corpus=corpus,
            index_path=dp.get_bm25_index_path(self.dataset_type),
            top_k=model_options.num_ann_nbrs_to_fetch,
            extend_candidate_citations=model_options.
            extend_candidate_citations)
    else:
        # Should not come here. Adding this to make pycharm happy.
        assert False

    if self.citation_ranker_type == 'neural':
        ranker = Ranker(
            corpus=corpus,
            featurizer=featurizer,
            citation_ranker=citeomatic_model,
            num_candidates_to_rank=model_options.num_candidates_to_rank)
    elif self.citation_ranker_type == 'none':
        ranker = NoneRanker()
    else:
        # Should not come here. Adding this to make pycharm happy.
        assert False

    # Skip the (expensive) training-split eval during hyperopt search runs,
    # except on the final full-sample configuration.
    if self.mode != 'hyperopt' or model_options.total_samples == self.total_samples_secondary:
        results_training = eval_text_model(corpus,
                                           candidate_selector,
                                           ranker,
                                           papers_source='train',
                                           n_eval=self.n_eval)
    else:
        results_training = {}

    results_validation = eval_text_model(corpus,
                                         candidate_selector,
                                         ranker,
                                         papers_source='valid',
                                         n_eval=self.n_eval)

    logging.info("===== Validation Results ===== ")
    logging.info("Validation Precision\n\n{}".format(
        results_validation['precision_1']))
    logging.info("Validation Recall\n\n{}".format(
        results_validation['recall_1']))

    # Pull the headline metrics at the dataset's canonical evaluation key.
    p = results_validation['precision_1'][EVAL_DATASET_KEYS[
        self.dataset_type]]
    r = results_validation['recall_1'][EVAL_DATASET_KEYS[
        self.dataset_type]]
    f1 = results_validation['f1_1'][EVAL_DATASET_KEYS[self.dataset_type]]

    if self.model_name == PAPER_EMBEDDING_MODEL:
        # optimizing for recall
        l = -r
    else:
        # optimizing for F1
        l = -f1

    out = {
        'loss': l,  # have to negate since we're minimizing
        'losses_training': results_training,
        'losses_validation': results_validation,
        'status': STATUS_FAIL if np.isnan(f1) else STATUS_OK,
        'params': eval_params
    }

    return out