def setUpClass(cls):
    """Prepare the corpus, featurizer and options shared by the test class.

    Calls ``build_test_corpus`` with fixed paths under ``/tmp``, loads the
    resulting sqlite file as a ``Corpus``, fits a ``Featurizer`` on it using
    the title/abstract lengths from a default ``ModelOptions``, copies the
    fitted vocabulary sizes back onto the options, and finally stores all
    three objects as class attributes.
    """
    json_path = '/tmp/foo.json'
    db_path = '/tmp/foo.sqlite'
    build_test_corpus(json_path, db_path)
    test_corpus = Corpus.load(db_path)

    # Default options; the size fields are filled in once the featurizer
    # has been fitted.
    opts = ModelOptions()
    fitted = Featurizer(
        max_title_len=opts.max_title_len,
        max_abstract_len=opts.max_abstract_len,
    )
    fitted.fit(test_corpus, max_df_frac=1.0)

    # Mirror the fitted sizes onto the options object.
    for size_attr in ('n_features', 'n_authors', 'n_venues', 'n_keyphrases'):
        setattr(opts, size_attr, getattr(fitted, size_attr))

    cls.corpus = test_corpus
    cls.featurizer = fitted
    cls.options = opts
def model_from_directory(dirname: str, on_cpu=False) -> Tuple[Featurizer, Any]:
    """Rebuild the models saved in ``dirname`` and load their weights.

    Reads the options JSON and the pickled featurizer from ``dirname``,
    copies the featurizer's vocabulary sizes onto the options, builds the
    models through ``citeomatic.models.<model_name>.create_model`` and loads
    the saved weights into them.

    Args:
        dirname: Directory containing the options file, the pickled
            featurizer and the weight files. When it starts with ``s3://``
            each weight file is first passed through
            ``file_util.cache_file`` and the returned path is loaded.
        on_cpu: When true, the models are created inside a
            ``tf.device('/cpu:0')`` scope.

    Returns:
        A ``(featurizer, models)`` tuple, where ``models`` is the object
        returned by ``create_model`` (indexed here by ``'citeomatic'`` and
        ``'embedding'``).
    """
    dp = DatasetPaths()
    options_json = file_util.read_json(
        os.path.join(dirname, dp.OPTIONS_FILENAME),
    )
    options = ModelOptions(**json.loads(options_json))

    # The featurizer pickle name depends on whether pretrained embeddings
    # were used.
    featurizer_file_prefix = 'pretrained_' if options.use_pretrained else 'corpus_fit_'
    featurizer = file_util.read_pickle(
        os.path.join(dirname, featurizer_file_prefix + dp.FEATURIZER_FILENAME)
    )  # type: Featurizer

    # The model architecture depends on the fitted vocabulary sizes.
    options.n_authors = featurizer.n_authors
    options.n_features = featurizer.n_features
    options.n_venues = featurizer.n_venues
    options.n_keyphrases = featurizer.n_keyphrases

    create_model = import_from(
        'citeomatic.models.%s' % options.model_name, 'create_model'
    )
    if on_cpu:
        with tf.device('/cpu:0'):
            models = create_model(options)
    else:
        models = create_model(options)

    print("Loading model from %s " % dirname)
    # NOTE: assumes a Keras model, whose summary() prints the table itself
    # and returns None; wrapping the call in print() emitted a stray "None".
    models['citeomatic'].summary()

    is_remote = dirname.startswith('s3://')

    def _weights_path(filename):
        # Remote files go through file_util.cache_file before being loaded.
        path = os.path.join(dirname, filename)
        return file_util.cache_file(path) if is_remote else path

    models['citeomatic'].load_weights(
        _weights_path(dp.CITEOMATIC_WEIGHTS_FILENAME))
    # The embedding model may be absent; previously only the local branch
    # guarded against None, so the s3 branch raised AttributeError here.
    if models['embedding'] is not None:
        models['embedding'].load_weights(
            _weights_path(dp.EMBEDDING_WEIGHTS_FILENAME))

    return featurizer, models
def test_pre_trained_layer(self):
    """Check the pretrained text-embedding model against the raw embeddings.

    Loads the ``'embedding'`` dataset from ``EMBEDDINGS_FILE``, builds a
    ``TextEmbeddingSum`` model on top of it (magnitudes initialised to
    ``'ones'``, no final L2 norm) and asserts that the prediction for a
    randomly chosen index is element-wise ``almost_equal`` to
    ``normalize(...)`` applied to the matching pretrained row.
    """
    with h5py.File(EMBEDDINGS_FILE, 'r') as embeddings_h5:
        pretrained_embeddings = embeddings_h5['embedding'][...]

    opts = ModelOptions()
    opts.use_pretrained = True
    opts.dense_dim = 300
    opts.n_features = 200

    text_embedder = TextEmbeddingSum(
        options=opts,
        pretrained_embeddings=pretrained_embeddings,
        magnitudes_initializer='ones',
    )
    embedding_model, _ = text_embedder.create_text_embedding_model(
        prefix='test', final_l2_norm=False)

    # NOTE(review): random.randint is inclusive at both ends, so ``row`` can
    # be 200 -- confirm the pretrained matrix has at least 201 rows.
    row = random.randint(0, 200)

    # The model is queried with ``row + 1`` while the expected vector comes
    # from ``row``; presumably index 0 is reserved by the model -- TODO confirm.
    model_input = np.asarray([row + 1])
    predicted = embedding_model.predict(model_input)[0]
    expected = normalize(pretrained_embeddings[row].reshape(1, -1))[0]

    assert all(almost_equal(p, e) for p, e in zip(predicted, expected))
def end_to_end_training(model_options: ModelOptions, dataset_type, models_dir, models_ann_dir=None):
    """Train a model end to end and save its artifacts under ``models_dir``.

    Steps: create ``models_dir``; build the corpus database if its file is
    missing and load the corpus; load a previously pickled featurizer from
    ``models_dir`` or fit and pickle a new one; train via
    ``train_text_model``; save the weights and the options JSON.

    Args:
        model_options: Training options. Mutated in place: ``n_authors``,
            ``n_venues``, ``n_keyphrases`` and ``n_features`` are set from
            the featurizer, and ``dense_dim`` is replaced by
            ``dense_dim_pretrained`` when ``use_pretrained`` is set.
        dataset_type: Dataset key passed to ``DatasetPaths``. The value
            ``'oc'`` loads the corpus from a pickle instead of the database.
        models_dir: Directory that receives the featurizer pickle, the
            weight files and the options JSON. Created if missing.
        models_ann_dir: Forwarded unchanged to ``train_text_model``.

    Returns:
        ``(corpus, featurizer, model_options, citeomatic_model,
        embedding_model)``.
    """
    dp = DatasetPaths()

    # step 1: make the directory
    # exist_ok avoids the check-then-create race of testing os.path.exists
    # first (another process may create the directory in between).
    os.makedirs(models_dir, exist_ok=True)

    # step 2: load the corpus DB
    print("Loading corpus db...")
    db_file = dp.get_db_path(dataset_type)
    json_file = dp.get_json_path(dataset_type)
    if not os.path.isfile(db_file):
        print(
            "Have to build the database! This may take a while, but should only happen once."
        )
        Corpus.build(db_file, json_file)

    if dataset_type == 'oc':
        corpus = Corpus.load_pkl(dp.get_pkl_path(dataset_type))
    else:
        corpus = Corpus.load(db_file, model_options.train_frac)

    # step 3: load/make the featurizer (once per hyperopt run)
    print("Making feautrizer")
    featurizer_file_prefix = 'pretrained_' if model_options.use_pretrained else 'corpus_fit_'
    featurizer_file = os.path.join(
        models_dir, featurizer_file_prefix + dp.FEATURIZER_FILENAME)

    if os.path.isfile(featurizer_file):
        # Reuse the featurizer pickled by an earlier call with this directory.
        featurizer = file_util.read_pickle(featurizer_file)
    else:
        featurizer = Featurizer(
            max_features=model_options.max_features,
            max_title_len=model_options.max_title_len,
            max_abstract_len=model_options.max_abstract_len,
            use_pretrained=model_options.use_pretrained,
            min_author_papers=model_options.min_author_papers,
            min_venue_papers=model_options.min_venue_papers,
            min_keyphrase_papers=model_options.min_keyphrase_papers,
        )
        featurizer.fit(
            corpus, is_featurizer_for_test=model_options.train_for_test_set)
        file_util.write_pickle(featurizer_file, featurizer)

    # update model options after featurization
    model_options.n_authors = featurizer.n_authors
    model_options.n_venues = featurizer.n_venues
    model_options.n_keyphrases = featurizer.n_keyphrases
    model_options.n_features = featurizer.n_features
    if model_options.use_pretrained:
        model_options.dense_dim = model_options.dense_dim_pretrained

    # step 4: train the model
    citeomatic_model, embedding_model = train_text_model(
        corpus,
        featurizer,
        model_options,
        models_ann_dir=models_ann_dir,
        debug=True,
        tensorboard_dir=None,
    )

    # step 5: save the model
    citeomatic_model.save_weights(
        os.path.join(models_dir, dp.CITEOMATIC_WEIGHTS_FILENAME),
        overwrite=True,
    )
    # train_text_model may return no embedding model; skip its weights then.
    if embedding_model is not None:
        embedding_model.save_weights(
            os.path.join(models_dir, dp.EMBEDDING_WEIGHTS_FILENAME),
            overwrite=True,
        )

    file_util.write_json(
        os.path.join(models_dir, dp.OPTIONS_FILENAME),
        model_options.to_json(),
    )

    return corpus, featurizer, model_options, citeomatic_model, embedding_model