Example 1
def predict_coreness(title, abstract):
    """
    Predicts class-wise probabilities given the title and abstract.
    """
    text = title + ' <ENDTITLE> ' + abstract
    categories = ['rejected', 'non_core', 'core']
    try:
        classifier = Classifier(
            data_itos_path=path_for('data_itos'),
            number_of_classes=3,
            cuda_device_id=current_app.config['CLASSIFIER_CUDA_DEVICE_ID'])
    except IOError as error:
        raise IOError('Data ITOS not found.') from error

    try:
        classifier.load_trained_classifier_weights(
            path_for('trained_classifier'))
    except IOError as error:
        raise IOError(
            'Could not load the trained classifier weights.') from error

    class_probabilities = classifier.predict(text)
    assert len(class_probabilities) == 3

    predicted_class = categories[np.argmax(class_probabilities)]
    output_dict = {'prediction': predicted_class}
    output_dict['scores'] = dict(zip(categories, class_probabilities))

    return output_dict
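A minimal usage sketch, stated as an assumption: predict_coreness reads its configuration through current_app, so it has to run inside a Flask application context. The create_app factory and the example texts below are hypothetical, not part of the excerpt.

app = create_app()  # hypothetical application factory
with app.app_context():
    result = predict_coreness(
        title='Measurement of the Higgs boson mass',
        abstract='We report a measurement of the Higgs boson mass ...')
    print(result['prediction'])  # one of 'rejected', 'non_core', 'core'
    print(result['scores'])      # class-wise probabilities summing to ~1.0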
Example 2
def test_predict_coreness(trained_pipeline):
    assert path_for('data_itos').exists()
    assert path_for('trained_classifier').exists()
    output_dict = predict_coreness(title=TEST_TITLE, abstract=TEST_ABSTRACT)

    assert set(output_dict.keys()) == {'prediction', 'scores'}
    assert output_dict['prediction'] in {'rejected', 'non_core', 'core'}
    assert set(
        output_dict['scores'].keys()) == {'rejected', 'non_core', 'core'}
    assert isclose(output_dict['scores']['rejected'] +
                   output_dict['scores']['non_core'] +
                   output_dict['scores']['core'],
                   1.0,
                   abs_tol=1e-2)
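These excerpts omit their import blocks. The standard-library and third-party names they use are unambiguous, so a plausible reconstruction follows, stated as an assumption; project-internal names such as Classifier, LanguageModel, path_for, create_directories, and train come from the package itself and are not repeated here.

import pickle
import shutil
from math import isclose
from pathlib import Path

import numpy as np
import pandas as pd
import pytest
import requests
from flask import current_app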
Example 3
@pytest.fixture(scope='session')  # decorator restored; session scope assumed so training runs once
def trained_pipeline(app, tmp_path_factory):
    # Use the factory's concrete base directory, not the factory object
    # itself, so path_for resolves against a real filesystem path.
    app.config['CLASSIFIER_BASE_PATH'] = tmp_path_factory.getbasetemp()
    create_directories()
    shutil.copy(
        Path(__file__).parent / 'fixtures' / 'inspire_test_data.df',
        path_for('dataframe'))
    train()
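Every example resolves artifact locations through path_for, which these excerpts never define. The error message in Example 4 ties CLASSIFIER_DATAFRAME_PATH to CLASSIFIER_BASE_PATH, which suggests a shape like the following hypothetical sketch; the real key scheme is not confirmed by the excerpts.

def path_for(name):
    # Hypothetical reconstruction, not the project's actual implementation:
    # map an artifact name like 'dataframe' to a config key such as
    # CLASSIFIER_DATAFRAME_PATH and resolve it under CLASSIFIER_BASE_PATH.
    base_path = Path(current_app.config['CLASSIFIER_BASE_PATH'])
    config_key = 'CLASSIFIER_{}_PATH'.format(name.upper())
    return base_path / current_app.config[config_key]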
Example 4
def preprocess_and_save_data():
    """
    Prepares the data for training.
    """
    try:
        split_and_save_data_for_language_model_and_classifier(
            dataframe_path=path_for('dataframe'),
            language_model_data_dir=path_for('language_model_data'),
            classifier_data_dir=path_for('classifier_data'),
            val_fraction=current_app.config[
                'CLASSIFIER_VALIDATION_DATA_FRACTION'])
    except IOError as error:
        raise IOError(
            'Training dataframe not found. Make sure the file is present in the right directory. '
            'Please use the path specified in config.py for CLASSIFIER_DATAFRAME_PATH relative to the '
            'CLASSIFIER_BASE_PATH.') from error

    try:
        generate_and_save_language_model_tokens(
            language_model_data_dir=path_for('language_model_data'))
    except IOError as error:
        raise IOError(
            'Language Model data directory does not exist.') from error

    try:
        map_and_save_tokens_to_ids_for_language_model(
            language_model_data_dir=path_for('language_model_data'),
            data_itos_path=path_for('data_itos'),
            max_vocab_size=current_app.config[
                'CLASSIFIER_MAXIMUM_VOCABULARY_SIZE'],
            minimum_frequency=current_app.config[
                'CLASSIFIER_MINIMUM_WORD_FREQUENCY'])
    except IOError as error:
        raise IOError(
            'Language Model data directory or the data ITOS path does not exist.'
        ) from error

    try:
        generate_and_save_classifier_tokens(
            classifier_data_dir=path_for('classifier_data'))
    except IOError as error:
        raise IOError('Classifier data directory does not exist.') from error

    try:
        map_and_save_tokens_to_ids_for_classifier(
            classifier_data_dir=path_for('classifier_data'),
            data_itos_path=path_for('data_itos'))
    except IOError as error:
        raise IOError(
            'Classifier data directory or the data ITOS does not exist.'
        ) from error
Example 5
def create_directories():
    """Create the project data and model directories"""
    path_for('classifier_data').mkdir(parents=True, exist_ok=True)
    path_for('language_model_data').mkdir(parents=True, exist_ok=True)
    path_for('classifier_model').mkdir(parents=True, exist_ok=True)
    (path_for('language_model') / 'wikitext_103').mkdir(parents=True,
                                                        exist_ok=True)
Example 6
def train_and_save_classifier():
    """
    Trains the classifier on our dataset and saves the weights.
    """
    try:
        classifier = Classifier(
            data_itos_path=path_for('data_itos'),
            number_of_classes=3,
            cuda_device_id=current_app.config['CLASSIFIER_CUDA_DEVICE_ID'])
    except IOError as error:
        raise IOError('Data ITOS not found.') from error

    try:
        classifier.load_training_and_validation_data(
            training_data_ids_path=path_for('classifier_data') /
            'training_token_ids.npy',
            training_data_labels_path=path_for('classifier_data') /
            'training_labels.npy',
            validation_data_ids_path=path_for('classifier_data') /
            'validation_token_ids.npy',
            validation_data_labels_path=path_for('classifier_data') /
            'validation_labels.npy',
            classifier_data_dir=path_for('classifier_data'),
            batch_size=current_app.config['CLASSIFIER_CLASSIFIER_BATCH_SIZE'])
    except IOError as error:
        raise IOError(
            'Training and validation data for the classifier not found.'
        ) from error

    classifier.initialize_learner()

    try:
        classifier.load_finetuned_language_model_weights(
            finetuned_language_model_encoder_path=path_for(
                'finetuned_language_model_encoder'))
    except IOError as error:
        raise IOError(
            'Finetuned Language Model Encoder does not exist.') from error

    try:
        classifier.train(
            trained_classifier_save_path=path_for('trained_classifier'),
            cycle_length=current_app.config[
                'CLASSIFIER_CLASSIFIER_CYCLE_LENGTH'])
    except IOError as error:
        raise IOError('Unable to save the trained classifier.') from error
Example 7
def test_preprocess_and_save_data(app, trained_pipeline):
    dataframe = pd.read_pickle(path_for('dataframe'))

    # Test core/preprocessor:split_and_save_data_for_language_model_and_classifier
    classifier_training_csv = pd.read_csv(
        path_for('classifier_data') / 'training_data.csv')
    assert isclose(len(classifier_training_csv),
                   len(dataframe) *
                   (1 - app.config['CLASSIFIER_VALIDATION_DATA_FRACTION']),
                   abs_tol=1)
    classifier_validation_csv = pd.read_csv(
        path_for('classifier_data') / 'validation_data.csv')
    assert isclose(len(classifier_validation_csv),
                   len(dataframe) *
                   app.config['CLASSIFIER_VALIDATION_DATA_FRACTION'],
                   abs_tol=1)

    language_model_training_csv = pd.read_csv(
        path_for('language_model_data') / 'training_data.csv')
    assert isclose(len(language_model_training_csv),
                   len(dataframe) *
                   (1 - app.config['CLASSIFIER_VALIDATION_DATA_FRACTION']),
                   abs_tol=1)
    language_model_validation_csv = pd.read_csv(
        path_for('language_model_data') / 'validation_data.csv')
    assert isclose(len(language_model_validation_csv),
                   len(dataframe) *
                   app.config['CLASSIFIER_VALIDATION_DATA_FRACTION'],
                   abs_tol=1)

    # Test core/preprocessor:generate_and_save_language_model_tokens
    language_model_training_tokens = np.load(
        path_for('language_model_data') / 'training_tokens.npy')
    assert isclose(len(language_model_training_tokens),
                   len(dataframe) *
                   (1 - app.config['CLASSIFIER_VALIDATION_DATA_FRACTION']),
                   abs_tol=1)
    language_model_validation_tokens = np.load(
        path_for('language_model_data') / 'validation_tokens.npy')
    assert isclose(len(language_model_validation_tokens),
                   len(dataframe) *
                   app.config['CLASSIFIER_VALIDATION_DATA_FRACTION'],
                   abs_tol=1)
    language_model_training_labels = np.load(
        path_for('language_model_data') / 'training_labels.npy')
    assert isclose(len(language_model_training_labels),
                   len(dataframe) *
                   (1 - app.config['CLASSIFIER_VALIDATION_DATA_FRACTION']),
                   abs_tol=1)
    language_model_validation_labels = np.load(
        path_for('language_model_data') / 'validation_labels.npy')
    assert isclose(len(language_model_validation_labels),
                   len(dataframe) *
                   app.config['CLASSIFIER_VALIDATION_DATA_FRACTION'],
                   abs_tol=1)

    # Test core/preprocessor:map_and_save_tokens_to_ids_for_language_model
    with open(path_for('data_itos'), 'rb') as itos_fd:
        data_itos = pickle.load(itos_fd)
    assert len(
        data_itos) == app.config['CLASSIFIER_MAXIMUM_VOCABULARY_SIZE'] + 2

    language_model_training_ids = np.load(
        path_for('language_model_data') / 'training_token_ids.npy')
    assert isclose(len(language_model_training_ids),
                   len(dataframe) *
                   (1 - app.config['CLASSIFIER_VALIDATION_DATA_FRACTION']),
                   abs_tol=1)
    language_model_validation_ids = np.load(
        path_for('language_model_data') / 'validation_token_ids.npy')
    assert isclose(len(language_model_validation_ids),
                   len(dataframe) *
                   app.config['CLASSIFIER_VALIDATION_DATA_FRACTION'],
                   abs_tol=1)

    # Test core/preprocessor:generate_and_save_classifier_tokens
    classifier_training_tokens = np.load(
        path_for('classifier_data') / 'training_tokens.npy')
    assert isclose(len(classifier_training_tokens),
                   len(dataframe) *
                   (1 - app.config['CLASSIFIER_VALIDATION_DATA_FRACTION']),
                   abs_tol=1)
    classifier_validation_tokens = np.load(
        path_for('classifier_data') / 'validation_tokens.npy')
    assert isclose(len(classifier_validation_tokens),
                   len(dataframe) *
                   app.config['CLASSIFIER_VALIDATION_DATA_FRACTION'],
                   abs_tol=1)
    classifier_training_labels = np.load(
        path_for('classifier_data') / 'training_labels.npy')
    assert isclose(len(classifier_training_labels),
                   len(dataframe) *
                   (1 - app.config['CLASSIFIER_VALIDATION_DATA_FRACTION']),
                   abs_tol=1)
    classifier_validation_labels = np.load(
        path_for('classifier_data') / 'validation_labels.npy')
    assert isclose(len(classifier_validation_labels),
                   len(dataframe) *
                   app.config['CLASSIFIER_VALIDATION_DATA_FRACTION'],
                   abs_tol=1)

    # Test core/preprocessor:map_and_save_tokens_to_ids_for_classifier
    classifier_training_ids = np.load(
        path_for('classifier_data') / 'training_token_ids.npy')
    assert isclose(len(classifier_training_ids),
                   len(dataframe) *
                   (1 - app.config['CLASSIFIER_VALIDATION_DATA_FRACTION']),
                   abs_tol=1)
    classifier_validation_ids = np.load(
        path_for('classifier_data') / 'validation_token_ids.npy')
    assert isclose(len(classifier_validation_ids),
                   len(dataframe) *
                   app.config['CLASSIFIER_VALIDATION_DATA_FRACTION'],
                   abs_tol=1)
Example 8
def test_create_directories(trained_pipeline):
    assert path_for('classifier_data').exists()
    assert path_for('language_model_data').exists()
    assert path_for('classifier_model').exists()
    assert (path_for('language_model') / 'wikitext_103').exists()
Example 9
def test_train_and_save_classifier(trained_pipeline):
    assert path_for('trained_classifier').exists()
Example 10
def test_finetune_and_save_language_model(trained_pipeline):
    assert path_for('pretrained_language_model').exists()
    assert path_for('wikitext103_itos').exists()
    assert path_for('finetuned_language_model_encoder').exists()
Example 11
def finetune_and_save_language_model():
    """
    Fine-tunes the language model, pretrained on wikitext103, on our dataset.
    """
    try:
        language_model = LanguageModel(
            training_data_ids_path=path_for('language_model_data') /
            'training_token_ids.npy',
            validation_data_ids_path=path_for('language_model_data') /
            'validation_token_ids.npy',
            language_model_model_dir=path_for('language_model_data'),
            data_itos_path=path_for('data_itos'),
            cuda_device_id=current_app.config['CLASSIFIER_CUDA_DEVICE_ID'],
            batch_size=current_app.config[
                'CLASSIFIER_LANGUAGE_MODEL_BATCH_SIZE'])
    except IOError as error:
        raise IOError(
            'Training files, language model data directory, or data ITOS do not exist.'
        ) from error

    if not path_for('pretrained_language_model').exists():
        wikitext103_language_model_response = requests.get(
            current_app.config['CLASSIFIER_WIKITEXT103_LANGUAGE_MODEL_URL'],
            allow_redirects=True)
        wikitext103_language_model_response.raise_for_status()
        with open(path_for('pretrained_language_model'), 'wb') as fd:
            fd.write(wikitext103_language_model_response.content)
    if not path_for('wikitext103_itos').exists():
        wikitext103_itos_response = requests.get(
            current_app.config['CLASSIFIER_WIKITEXT103_ITOS_URL'],
            allow_redirects=True)
        wikitext103_itos_response.raise_for_status()
        with open(path_for('wikitext103_itos'), 'wb') as fd:
            fd.write(wikitext103_itos_response.content)

    try:
        language_model.load_pretrained_language_model_weights(
            pretrained_language_model_path=path_for(
                'pretrained_language_model'),
            wikitext103_itos_path=path_for('wikitext103_itos'))
    except IOError as error:
        raise IOError(
            'Wikitext103 pretrained language model or Wikitext103 ITOS does not exist.'
        ) from error

    try:
        language_model.train(
            finetuned_language_model_encoder_save_path=path_for(
                'finetuned_language_model_encoder'),
            cycle_length=current_app.config[
                'CLASSIFIER_LANGUAGE_MODEL_CYCLE_LENGTH'])
    except IOError as error:
        raise IOError(
            'Unable to save the finetuned language model. Please check that the language model data directory '
            'exists.') from error
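The trained_pipeline fixture in Example 3 calls train(), which these excerpts never define. Given the stages shown above, a plausible composition follows, stated purely as an assumption.

def train():
    # Assumed stage ordering; the fixture in Example 3 creates the
    # directories separately before calling this function.
    preprocess_and_save_data()
    finetune_and_save_language_model()
    train_and_save_classifier()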