Exemple #1
0
    def setUpClass(cls):
        """Build the shared test fixtures: a corpus, a fitted featurizer and options.

        Creates a test corpus database at a fixed temporary location, loads it,
        fits a ``Featurizer`` on it, copies the featurizer's vocabulary sizes
        onto a default ``ModelOptions`` and stores all three objects on ``cls``
        as ``corpus``, ``featurizer`` and ``options``.
        """
        json_path = '/tmp/foo.json'
        db_path = '/tmp/foo.sqlite'

        build_test_corpus(json_path, db_path)
        test_corpus = Corpus.load(db_path)

        # Default options; the length limits below are read from these defaults.
        opts = ModelOptions()

        fitted = Featurizer(
            max_title_len=opts.max_title_len,
            max_abstract_len=opts.max_abstract_len,
        )
        # NOTE(review): max_df_frac=1.0 presumably disables document-frequency
        # filtering for the tiny test corpus -- confirm against Featurizer.fit.
        fitted.fit(test_corpus, max_df_frac=1.0)

        # The sizes are only known after fitting, so mirror them onto the options.
        for size_attr in ('n_features', 'n_authors', 'n_venues', 'n_keyphrases'):
            setattr(opts, size_attr, getattr(fitted, size_attr))

        cls.corpus = test_corpus
        cls.featurizer = fitted
        cls.options = opts
Exemple #2
0
def model_from_directory(dirname: str, on_cpu=False) -> Tuple[Featurizer, Any]:
    """Restore a featurizer and its models from a saved model directory.

    Reads the serialized ``ModelOptions`` and the pickled featurizer from
    ``dirname``, rebuilds the models via the ``create_model`` function of
    ``citeomatic.models.<options.model_name>``, and loads the saved weights
    into them.

    Args:
        dirname: Directory holding the options file, the pickled featurizer
            and the weight files. When it starts with ``'s3://'`` the weight
            files are first passed through ``file_util.cache_file``.
        on_cpu: When true, the models are created inside a
            ``tf.device('/cpu:0')`` scope.

    Returns:
        A ``(featurizer, models)`` tuple, where ``models`` is whatever
        ``create_model`` returned; it is indexed here with the keys
        ``'citeomatic'`` and ``'embedding'``.
    """
    dp = DatasetPaths()

    options_json = file_util.read_json(
        os.path.join(dirname, dp.OPTIONS_FILENAME))
    options = ModelOptions(**json.loads(options_json))

    featurizer_file_prefix = 'pretrained_' if options.use_pretrained else 'corpus_fit_'

    # SECURITY NOTE: this unpickles a file from `dirname`; only load model
    # directories that come from a trusted source.
    featurizer = file_util.read_pickle(
        os.path.join(dirname, featurizer_file_prefix +
                     dp.FEATURIZER_FILENAME))  # type: Featurizer

    # The vocabulary sizes live on the featurizer, so refresh them on the
    # options before the models are built.
    options.n_authors = featurizer.n_authors
    options.n_features = featurizer.n_features
    options.n_venues = featurizer.n_venues
    options.n_keyphrases = featurizer.n_keyphrases
    create_model = import_from('citeomatic.models.%s' % options.model_name,
                               'create_model')
    if on_cpu:
        with tf.device('/cpu:0'):
            models = create_model(options)
    else:
        models = create_model(options)

    print("Loading model from %s " % dirname)
    # NOTE(review): if summary() prints the table itself and returns None (as
    # Keras' Model.summary does), this also prints "None"; kept unchanged to
    # preserve the existing output.
    print(models['citeomatic'].summary())

    is_remote = dirname.startswith('s3://')

    def _weights_path(filename):
        """Return a loadable path for `filename`, caching it first if remote."""
        path = os.path.join(dirname, filename)
        return file_util.cache_file(path) if is_remote else path

    models['citeomatic'].load_weights(
        _weights_path(dp.CITEOMATIC_WEIGHTS_FILENAME))
    # The embedding model may be None, and its weights are then never saved.
    # Guard both the remote and the local case (previously only the local
    # branch checked, so remote loading crashed on a None embedding model).
    if models['embedding'] is not None:
        models['embedding'].load_weights(
            _weights_path(dp.EMBEDDING_WEIGHTS_FILENAME))
    return featurizer, models
Exemple #3
0
def end_to_end_training(model_options: ModelOptions,
                        dataset_type,
                        models_dir,
                        models_ann_dir=None):
    """Train a model from scratch and persist everything needed to reload it.

    Steps: create ``models_dir``, build the corpus database if it is missing,
    load the corpus, load or fit (and pickle) the featurizer, train via
    ``train_text_model``, then save the weights and the options to
    ``models_dir``.

    Args:
        model_options: Training configuration. It is modified in place:
            ``n_authors``, ``n_venues``, ``n_keyphrases`` and ``n_features``
            are overwritten from the featurizer, and ``dense_dim`` is replaced
            by ``dense_dim_pretrained`` when ``use_pretrained`` is set.
        dataset_type: Dataset identifier passed to ``DatasetPaths``; the value
            ``'oc'`` makes the corpus load from a pickle instead of the
            database.
        models_dir: Output directory for the featurizer pickle, the weight
            files and the options file. Created if it does not exist.
        models_ann_dir: Forwarded unchanged to ``train_text_model``.

    Returns:
        The tuple ``(corpus, featurizer, model_options, citeomatic_model,
        embedding_model)``; ``embedding_model`` may be ``None``.
    """
    # step 1: make the directory
    # exist_ok avoids the check-then-create race when several runs share
    # the same models_dir.
    os.makedirs(models_dir, exist_ok=True)

    # step 2: load the corpus DB
    print("Loading corpus db...")
    dp = DatasetPaths()
    db_file = dp.get_db_path(dataset_type)
    json_file = dp.get_json_path(dataset_type)
    if not os.path.isfile(db_file):
        print(
            "Have to build the database! This may take a while, but should only happen once."
        )
        Corpus.build(db_file, json_file)

    if dataset_type == 'oc':
        corpus = Corpus.load_pkl(dp.get_pkl_path(dataset_type))
    else:
        corpus = Corpus.load(db_file, model_options.train_frac)

    # step 3: load/make the featurizer (once per hyperopt run)
    print("Making feautrizer")
    featurizer_file_prefix = 'pretrained_' if model_options.use_pretrained else 'corpus_fit_'

    featurizer_file = os.path.join(
        models_dir, featurizer_file_prefix + dp.FEATURIZER_FILENAME)

    if os.path.isfile(featurizer_file):
        # Reuse a featurizer pickled earlier into the same directory.
        featurizer = file_util.read_pickle(featurizer_file)
    else:
        featurizer = Featurizer(
            max_features=model_options.max_features,
            max_title_len=model_options.max_title_len,
            max_abstract_len=model_options.max_abstract_len,
            use_pretrained=model_options.use_pretrained,
            min_author_papers=model_options.min_author_papers,
            min_venue_papers=model_options.min_venue_papers,
            min_keyphrase_papers=model_options.min_keyphrase_papers)
        featurizer.fit(corpus,
                       is_featurizer_for_test=model_options.train_for_test_set)
        file_util.write_pickle(featurizer_file, featurizer)

    # update model options after featurization
    model_options.n_authors = featurizer.n_authors
    model_options.n_venues = featurizer.n_venues
    model_options.n_keyphrases = featurizer.n_keyphrases
    model_options.n_features = featurizer.n_features
    if model_options.use_pretrained:
        model_options.dense_dim = model_options.dense_dim_pretrained

    # step 4: train the model
    citeomatic_model, embedding_model = train_text_model(
        corpus,
        featurizer,
        model_options,
        models_ann_dir=models_ann_dir,
        debug=True,
        tensorboard_dir=None)

    # step 5: save the model
    citeomatic_model.save_weights(
        os.path.join(models_dir, dp.CITEOMATIC_WEIGHTS_FILENAME),
        overwrite=True)

    # The embedding model is optional; its weights file is only written
    # when it exists.
    if embedding_model is not None:
        embedding_model.save_weights(
            os.path.join(models_dir, dp.EMBEDDING_WEIGHTS_FILENAME),
            overwrite=True)

    # Persist the (now updated) options alongside the weights.
    file_util.write_json(
        os.path.join(models_dir, dp.OPTIONS_FILENAME),
        model_options.to_json(),
    )

    return corpus, featurizer, model_options, citeomatic_model, embedding_model