# Shared imports for the snippets in this section. Classifier,
# LanguageModel, path_for, create_directories, train, and the
# core/preprocessor helpers are project-internal; TEST_TITLE and
# TEST_ABSTRACT are test constants defined elsewhere.
import pickle
import shutil
from math import isclose
from pathlib import Path

import numpy as np
import pandas as pd
import pytest
import requests
from flask import current_app


def predict_coreness(title, abstract):
    """Predicts class-wise probabilities given the title and abstract."""
    text = title + ' <ENDTITLE> ' + abstract
    categories = ['rejected', 'non_core', 'core']

    try:
        classifier = Classifier(
            data_itos_path=path_for('data_itos'),
            number_of_classes=3,
            cuda_device_id=current_app.config['CLASSIFIER_CUDA_DEVICE_ID'])
    except IOError as error:
        raise IOError('Data ITOS not found.') from error

    try:
        classifier.load_trained_classifier_weights(
            path_for('trained_classifier'))
    except IOError as error:
        raise IOError(
            'Could not load the trained classifier weights.') from error

    class_probabilities = classifier.predict(text)
    assert len(class_probabilities) == 3

    predicted_class = categories[np.argmax(class_probabilities)]
    output_dict = {'prediction': predicted_class}
    output_dict['scores'] = dict(zip(categories, class_probabilities))

    return output_dict
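# A minimal usage sketch for predict_coreness (not part of the module).
# Assumptions: a trained classifier already exists on disk (see
# train_and_save_classifier below), and `app` is the Flask application
# object, since predict_coreness reads current_app.config.
def example_predict(app):
    with app.app_context():
        result = predict_coreness(
            title='A hypothetical paper title',
            abstract='A hypothetical abstract about particle physics.')
    # result['prediction'] is one of 'rejected', 'non_core', 'core';
    # result['scores'] maps each category to its probability, and the
    # three scores sum to ~1.0 (see test_predict_coreness below).
    return result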
def test_predict_coreness(trained_pipeline):
    assert path_for('data_itos').exists()
    assert path_for('trained_classifier').exists()

    output_dict = predict_coreness(title=TEST_TITLE, abstract=TEST_ABSTRACT)

    assert set(output_dict.keys()) == {'prediction', 'scores'}
    assert output_dict['prediction'] in {'rejected', 'non_core', 'core'}
    assert set(
        output_dict['scores'].keys()) == {'rejected', 'non_core', 'core'}
    assert isclose(
        output_dict['scores']['rejected'] +
        output_dict['scores']['non_core'] +
        output_dict['scores']['core'],
        1.0,
        abs_tol=1e-2)
# Registered as a fixture so the tests below can depend on a fully
# trained pipeline; session scope is assumed so training runs only once.
@pytest.fixture(scope='session')
def trained_pipeline(app, tmp_path_factory):
    # tmp_path_factory is a factory object, not a path; resolve its base
    # directory before storing it in the config.
    app.config['CLASSIFIER_BASE_PATH'] = tmp_path_factory.getbasetemp()
    create_directories()
    shutil.copy(
        Path(__file__).parent / 'fixtures' / 'inspire_test_data.df',
        path_for('dataframe'))
    train()
def preprocess_and_save_data():
    """Prepares the data for training."""
    try:
        split_and_save_data_for_language_model_and_classifier(
            dataframe_path=path_for('dataframe'),
            language_model_data_dir=path_for('language_model_data'),
            classifier_data_dir=path_for('classifier_data'),
            val_fraction=current_app.config[
                'CLASSIFIER_VALIDATION_DATA_FRACTION'])
    except IOError as error:
        raise IOError(
            'Training dataframe not found. Make sure the file is present in '
            'the right directory. Please use the path specified in config.py '
            'for CLASSIFIER_DATAFRAME_PATH relative to the '
            'CLASSIFIER_BASE_PATH.') from error

    try:
        generate_and_save_language_model_tokens(
            language_model_data_dir=path_for('language_model_data'))
    except IOError as error:
        raise IOError(
            'Language Model data directory does not exist.') from error

    try:
        map_and_save_tokens_to_ids_for_language_model(
            language_model_data_dir=path_for('language_model_data'),
            data_itos_path=path_for('data_itos'),
            max_vocab_size=current_app.config[
                'CLASSIFIER_MAXIMUM_VOCABULARY_SIZE'],
            minimum_frequency=current_app.config[
                'CLASSIFIER_MINIMUM_WORD_FREQUENCY'])
    except IOError as error:
        raise IOError(
            'Language Model data directory or the data ITOS path does not '
            'exist.') from error

    try:
        generate_and_save_classifier_tokens(
            classifier_data_dir=path_for('classifier_data'))
    except IOError as error:
        raise IOError('Classifier data directory does not exist.') from error

    try:
        map_and_save_tokens_to_ids_for_classifier(
            classifier_data_dir=path_for('classifier_data'),
            data_itos_path=path_for('data_itos'))
    except IOError as error:
        raise IOError(
            'Classifier data directory or the data ITOS does not exist.'
        ) from error
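# For reference, the on-disk layout this preprocessing step produces, as
# inferred from the assertions in test_preprocess_and_save_data below
# (the filenames come from those tests):
#
#   classifier_data/       training_data.csv, validation_data.csv,
#                          {training,validation}_tokens.npy,
#                          {training,validation}_labels.npy,
#                          {training,validation}_token_ids.npy
#   language_model_data/   the same set of files
#   data ITOS              pickled vocabulary with
#                          CLASSIFIER_MAXIMUM_VOCABULARY_SIZE + 2 entries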
def create_directories():
    """Create the project data and model directories."""
    path_for('classifier_data').mkdir(parents=True, exist_ok=True)
    path_for('language_model_data').mkdir(parents=True, exist_ok=True)
    path_for('classifier_model').mkdir(parents=True, exist_ok=True)
    (path_for('language_model') / 'wikitext_103').mkdir(
        parents=True, exist_ok=True)
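# path_for is used throughout this section but defined elsewhere in the
# project. A minimal sketch of the assumed convention: resolve a logical
# name to a per-name config key relative to CLASSIFIER_BASE_PATH (the
# wording 'CLASSIFIER_DATAFRAME_PATH relative to the CLASSIFIER_BASE_PATH'
# in the error message above suggests this). The real helper may differ.
def path_for(name):
    base_path = Path(current_app.config['CLASSIFIER_BASE_PATH'])
    config_key = 'CLASSIFIER_{}_PATH'.format(name.upper())
    return base_path / current_app.config[config_key]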
def train_and_save_classifier():
    """Trains the classifier on our dataset and saves the weights."""
    try:
        classifier = Classifier(
            data_itos_path=path_for('data_itos'),
            number_of_classes=3,
            cuda_device_id=current_app.config['CLASSIFIER_CUDA_DEVICE_ID'])
    except IOError as error:
        raise IOError('Data ITOS not found.') from error

    try:
        classifier.load_training_and_validation_data(
            training_data_ids_path=path_for('classifier_data') /
            'training_token_ids.npy',
            training_data_labels_path=path_for('classifier_data') /
            'training_labels.npy',
            validation_data_ids_path=path_for('classifier_data') /
            'validation_token_ids.npy',
            validation_data_labels_path=path_for('classifier_data') /
            'validation_labels.npy',
            classifier_data_dir=path_for('classifier_data'),
            batch_size=current_app.config['CLASSIFIER_CLASSIFIER_BATCH_SIZE'])
    except IOError as error:
        raise IOError(
            'Training and Validation data for Classifier not found.'
        ) from error

    classifier.initialize_learner()

    try:
        classifier.load_finetuned_language_model_weights(
            finetuned_language_model_encoder_path=path_for(
                'finetuned_language_model_encoder'))
    except IOError as error:
        raise IOError(
            'Finetuned Language Model Encoder does not exist.') from error

    try:
        classifier.train(
            trained_classifier_save_path=path_for('trained_classifier'),
            cycle_length=current_app.config[
                'CLASSIFIER_CLASSIFIER_CYCLE_LENGTH'])
    except IOError as error:
        raise IOError('Unable to save the trained classifier.') from error
def test_preprocess_and_save_data(app, trained_pipeline):
    dataframe = pd.read_pickle(path_for('dataframe'))
    # Expected split sizes, up to rounding (hence abs_tol=1 below).
    validation_fraction = app.config['CLASSIFIER_VALIDATION_DATA_FRACTION']
    expected_validation_size = len(dataframe) * validation_fraction
    expected_training_size = len(dataframe) * (1 - validation_fraction)

    # Test core/preprocessor:split_and_save_data_for_language_model_and_classifier
    classifier_training_csv = pd.read_csv(
        path_for('classifier_data') / 'training_data.csv')
    assert isclose(len(classifier_training_csv), expected_training_size,
                   abs_tol=1)
    classifier_validation_csv = pd.read_csv(
        path_for('classifier_data') / 'validation_data.csv')
    assert isclose(len(classifier_validation_csv), expected_validation_size,
                   abs_tol=1)
    language_model_training_csv = pd.read_csv(
        path_for('language_model_data') / 'training_data.csv')
    assert isclose(len(language_model_training_csv), expected_training_size,
                   abs_tol=1)
    language_model_validation_csv = pd.read_csv(
        path_for('language_model_data') / 'validation_data.csv')
    assert isclose(len(language_model_validation_csv),
                   expected_validation_size, abs_tol=1)

    # Test core/preprocessor:generate_and_save_language_model_tokens
    language_model_training_tokens = np.load(
        path_for('language_model_data') / 'training_tokens.npy')
    assert isclose(len(language_model_training_tokens),
                   expected_training_size, abs_tol=1)
    language_model_validation_tokens = np.load(
        path_for('language_model_data') / 'validation_tokens.npy')
    assert isclose(len(language_model_validation_tokens),
                   expected_validation_size, abs_tol=1)
    language_model_training_labels = np.load(
        path_for('language_model_data') / 'training_labels.npy')
    assert isclose(len(language_model_training_labels),
                   expected_training_size, abs_tol=1)
    language_model_validation_labels = np.load(
        path_for('language_model_data') / 'validation_labels.npy')
    assert isclose(len(language_model_validation_labels),
                   expected_validation_size, abs_tol=1)

    # Test core/preprocessor:map_and_save_tokens_to_ids_for_language_model
    with open(path_for('data_itos'), 'rb') as itos_file:
        data_itos = pickle.load(itos_file)
    assert len(
        data_itos) == app.config['CLASSIFIER_MAXIMUM_VOCABULARY_SIZE'] + 2
    language_model_training_ids = np.load(
        path_for('language_model_data') / 'training_token_ids.npy')
    assert isclose(len(language_model_training_ids), expected_training_size,
                   abs_tol=1)
    language_model_validation_ids = np.load(
        path_for('language_model_data') / 'validation_token_ids.npy')
    assert isclose(len(language_model_validation_ids),
                   expected_validation_size, abs_tol=1)

    # Test core/preprocessor:generate_and_save_classifier_tokens
    classifier_training_tokens = np.load(
        path_for('classifier_data') / 'training_tokens.npy')
    assert isclose(len(classifier_training_tokens), expected_training_size,
                   abs_tol=1)
    classifier_validation_tokens = np.load(
        path_for('classifier_data') / 'validation_tokens.npy')
    assert isclose(len(classifier_validation_tokens),
                   expected_validation_size, abs_tol=1)
    classifier_training_labels = np.load(
        path_for('classifier_data') / 'training_labels.npy')
    assert isclose(len(classifier_training_labels), expected_training_size,
                   abs_tol=1)
    classifier_validation_labels = np.load(
        path_for('classifier_data') / 'validation_labels.npy')
    assert isclose(len(classifier_validation_labels),
                   expected_validation_size, abs_tol=1)

    # Test core/preprocessor:map_and_save_tokens_to_ids_for_classifier
    classifier_training_ids = np.load(
        path_for('classifier_data') / 'training_token_ids.npy')
    assert isclose(len(classifier_training_ids), expected_training_size,
                   abs_tol=1)
    classifier_validation_ids = np.load(
        path_for('classifier_data') / 'validation_token_ids.npy')
    assert isclose(len(classifier_validation_ids),
                   expected_validation_size, abs_tol=1)
def test_create_directories(trained_pipeline):
    assert path_for('classifier_data').exists()
    assert path_for('language_model_data').exists()
    assert path_for('classifier_model').exists()
    assert (path_for('language_model') / 'wikitext_103').exists()
def test_train_and_save_classifier(trained_pipeline):
    assert path_for('trained_classifier').exists()
def test_finetune_and_save_language_model(trained_pipeline):
    assert path_for('pretrained_language_model').exists()
    assert path_for('wikitext103_itos').exists()
    assert path_for('finetuned_language_model_encoder').exists()
def finetune_and_save_language_model():
    """Finetunes the pretrained (on wikitext103) language model on our dataset."""
    try:
        language_model = LanguageModel(
            training_data_ids_path=path_for('language_model_data') /
            'training_token_ids.npy',
            validation_data_ids_path=path_for('language_model_data') /
            'validation_token_ids.npy',
            language_model_model_dir=path_for('language_model_data'),
            data_itos_path=path_for('data_itos'),
            cuda_device_id=current_app.config['CLASSIFIER_CUDA_DEVICE_ID'],
            batch_size=current_app.config[
                'CLASSIFIER_LANGUAGE_MODEL_BATCH_SIZE'])
    except IOError as error:
        raise IOError(
            'Training files, language model data directory, or data ITOS '
            'do not exist.') from error

    # Download the pretrained wikitext103 weights and ITOS if missing.
    if not path_for('pretrained_language_model').exists():
        wikitext103_language_model_response = requests.get(
            current_app.config['CLASSIFIER_WIKITEXT103_LANGUAGE_MODEL_URL'],
            allow_redirects=True)
        wikitext103_language_model_response.raise_for_status()
        with open(path_for('pretrained_language_model'), 'wb') as fd:
            fd.write(wikitext103_language_model_response.content)

    if not path_for('wikitext103_itos').exists():
        wikitext103_itos_response = requests.get(
            current_app.config['CLASSIFIER_WIKITEXT103_ITOS_URL'],
            allow_redirects=True)
        wikitext103_itos_response.raise_for_status()
        with open(path_for('wikitext103_itos'), 'wb') as fd:
            fd.write(wikitext103_itos_response.content)

    try:
        language_model.load_pretrained_language_model_weights(
            pretrained_language_model_path=path_for(
                'pretrained_language_model'),
            wikitext103_itos_path=path_for('wikitext103_itos'))
    except IOError as error:
        raise IOError(
            'Wikitext103 pretrained language model and Wikitext103 ITOS '
            'do not exist.') from error

    try:
        language_model.train(
            finetuned_language_model_encoder_save_path=path_for(
                'finetuned_language_model_encoder'),
            cycle_length=current_app.config[
                'CLASSIFIER_LANGUAGE_MODEL_CYCLE_LENGTH'])
    except IOError as error:
        raise IOError(
            'Unable to save the finetuned language model. Please check that '
            'the language model data directory exists.') from error
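# A minimal sketch of the train() entry point invoked by the
# trained_pipeline fixture above. Its body is not shown in this section,
# so this composition is an assumption: it chains the pipeline steps
# defined above in their natural order (directories are created by the
# fixture before train() runs).
def train():
    preprocess_and_save_data()          # split, tokenize, and map to ids
    finetune_and_save_language_model()  # adapt the wikitext103 LM to our data
    train_and_save_classifier()         # train and save the final classifier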