def extract():
    """
    Extract necessary data / resources from upstream.

    This method will:

     - Call resources.download_newsgroup() and resources.download_embedding()
       (presumably these fetch the newsgroup corpus and the embedding only
       when missing -- confirm in `resources`)
     - Glob every path two levels below the configured `newsgroup_path` and
       put the matches into a one-column data frame (`document_path`)
     - Shuffle the rows, and keep only the first 100 when the `test_run`
       configuration value is truthy
     - Build the embedding matrix and the word-to-index lookup via
       resources.create_embedding_matrix()
     - Hand the local and global namespaces to lib.archive_dataset_schemas

    :return: observations, embedding_matrix, word_to_index
    :rtype: (pandas.DataFrame, numpy.array, dict) -- the last two types are
        taken from the original notes; they are produced by `resources` and
        are not verifiable from this function
    """

    logging.info('Begin extract')
    logging.info('Performing extract for batch: {}, from newgroup_path: {}'
                 .format(lib.get_batch_name(), lib.get_conf('newsgroup_path')))

    # Download resources

    # Confirm newsgroup data set is downloaded
    resources.download_newsgroup()

    # Confirm that embedding is downloaded
    resources.download_embedding()

    # Extract resources from file system

    # Newsgroup20: Get list of all candidate documents.
    # The '*', '*' pattern matches entries exactly two levels below
    # newsgroup_path (presumably <group>/<document> -- confirm layout).
    glob_pattern = os.path.join(lib.get_conf('newsgroup_path'), '*', '*')
    logging.info('Searching for glob_pattern: {}'.format(glob_pattern))
    document_candidates = glob.glob(glob_pattern)

    # Newsgroup20: Create observations data set (one row per matched path)
    observations = pandas.DataFrame(document_candidates, columns=['document_path'])
    logging.info('Shape of observations data frame created from glob matches: {}'.format(observations.shape))

    # Newsgroup20: Re-order rows.
    # sample(frac=1) returns every row in random order; no random_state is
    # given, so the order (and the test-run subset below) differs per run.
    observations = observations.sample(frac=1)

    # Newsgroup20: Subset number of observations, if it's a test run
    if lib.get_conf('test_run'):
        logging.info('Reducing file size for test run')
        observations = observations.head(100)
        logging.info('Test run number of records: {}'.format(len(observations.index)))

    # Embedding: Load embedding
    embedding_matrix, word_to_index = resources.create_embedding_matrix()
    # NOTE: max() raises ValueError if word_to_index is empty
    logging.info('word_to_index max index: {}'.format(max(word_to_index.values())))

    # Archive schema and return.
    # NOTE(review): the whole local namespace is passed here, so renaming,
    # adding or removing a local variable in this function changes what the
    # archiver receives -- confirm before refactoring.
    lib.archive_dataset_schemas('extract', locals(), globals())
    logging.info('End extract')
    return observations, embedding_matrix, word_to_index
def extract():
    """
    Extract subreddit submissions and the word embedding resources.

    This method will:

     - Call scrape_subreddit with the configured `subreddit` and
       `history_num_days` values to build the observations data set
     - Call resources.download_embedding() and then
       resources.create_embedding_matrix() to obtain the embedding matrix
       and the word-to-index lookup
     - Hand the local and global namespaces to lib.archive_dataset_schemas

    Note the return order: the observations come LAST.

    :return: embedding_matrix, word_to_index, observations
    """
    logging.info('Begin extract')

    # Extract all posts for given subreddit, going back given number of days
    logging.info('Downloading submissions from Reddit')
    observations = scrape_subreddit(lib.get_conf('subreddit'), lib.get_conf('history_num_days'))
    # observations exposes `.index` (presumably a pandas.DataFrame -- confirm
    # against scrape_subreddit)
    logging.info('Found {} submissions'.format(len(observations.index)))

    # Load embedding matrix
    resources.download_embedding()
    embedding_matrix, word_to_index = resources.create_embedding_matrix()
    # NOTE: max() raises ValueError if word_to_index is empty
    logging.info('word_to_index max index: {}'.format(
        max(word_to_index.values())))

    # 'End extract' is logged before the schemas are archived.
    logging.info('End extract')
    # NOTE(review): the whole local namespace is passed here, so renaming,
    # adding or removing a local variable in this function changes what the
    # archiver receives -- confirm before refactoring.
    lib.archive_dataset_schemas('extract', locals(), globals())
    return embedding_matrix, word_to_index, observations
def transform():
    """
    Run the image, audio and text feature transformations for each partition.

    For every one of the 'training', 'test' and 'validation' partitions the
    following are called, in this order:

    Image - lib.transform_images with num_frames=10 (per the original notes:
        converts jpegs to numpy arrays and preprocesses them for the vgg16
        model)
    Audio - lib.transform_audio with n_mfcc=13 (per the original notes: uses
        librosa to extract features and saves a dataframe of them)
    Text - lib.transform_text with the word-to-index lookup returned by
        resources.create_embedding_matrix() (per the original notes:
        tokenizes and converts tokens to embedding indices)

    :return: None
    """
    # Only the lookup is consumed here; the matrix itself is discarded.
    _, word_to_index = resources.create_embedding_matrix()

    for partition in ('training', 'test', 'validation'):
        # Raw jpegs -> arrays
        lib.transform_images(partition=partition, num_frames=10)

        # Raw audio -> feature matrix
        lib.transform_audio(partition=partition, n_mfcc=13)

        # Text -> token indices
        lib.transform_text(partition=partition, word_to_index=word_to_index)
def _load_pickle(path):
    """
    Return the object unpickled from the file at `path`.

    NOTE: pickle can execute arbitrary code while loading; only use this on
    files produced by this project's own pipeline.

    :param path: Path of the pickle file to read
    :return: The unpickled object
    """
    with open(path, 'rb') as file:
        return pickle.load(file)


def _train_image_model():
    """
    Train the image model and save its weights to ../output/image_model.h5.

    :return: None
    """
    # Parameters shared by both generators
    params = {
        'dim': (10, 224, 224),
        'batch_size': 16,
        'n_channels': 3,
        'shuffle': True,
    }

    # Load labels set
    training_labels = _load_pickle(
        '../data/image_data/pickle_files/y_5d_training.pkl')
    test_labels = _load_pickle(
        '../data/image_data/pickle_files/y_5d_test.pkl')

    # Generators. The ID ranges (6000 training / 2000 test) are hard-coded;
    # presumably they match the size of each partition on disk -- confirm.
    training_generator = DataGenerator(partition='training',
                                       list_IDs=range(6000),
                                       labels=training_labels,
                                       **params)
    # The 'test' partition is what is used as validation data during fitting.
    validation_generator = DataGenerator(partition='test',
                                         list_IDs=range(2000),
                                         labels=test_labels,
                                         **params)

    # Create model
    image_model = models.image_lrcn()

    # Train model on data set.
    # NOTE(review): if this is a Keras model, fit_generator is deprecated in
    # newer releases in favour of fit -- confirm against the pinned version.
    image_model.fit_generator(generator=training_generator,
                              validation_data=validation_generator,
                              use_multiprocessing=True,
                              workers=6,
                              epochs=5)

    image_model.save_weights('../output/image_model.h5')


def _train_audio_model():
    """
    Fit the audio model on all audio data and pickle it to
    ../output/audio_model.pkl.

    :return: None
    """
    # Read in audio data
    training_set = pd.read_csv(
        '../data/audio_data/pickle_files/training_df.csv')
    test_set = pd.read_csv('../data/audio_data/pickle_files/test_df.csv')

    # Concat data sets in order to use all data for CV
    all_data = pd.concat((training_set, test_set), axis=0)
    X_all = all_data.drop(['interview_score', 'video_id'], axis=1)
    y_all = all_data['interview_score']

    logging.info('Start training audio model')

    # Create model and fit to data. The attributes read below
    # (best_params_, cv_results_) are those of a fitted search object.
    audio_model = models.audio_rand_forest()
    audio_model.fit(X_all, y_all)
    logging.info(audio_model.best_params_)
    logging.info('Train score with best estimator: {}'.format(
        max(audio_model.cv_results_['mean_train_score'])))
    logging.info('Test score with best estimator: {}'.format(
        max(audio_model.cv_results_['mean_test_score'])))

    # Save to disk
    with open('../output/audio_model.pkl', 'wb') as fid:
        pickle.dump(audio_model, fid)


def _train_text_model():
    """
    Fit the text model, checkpointing the best epoch to
    ../output/text_model.h5.

    :return: None
    """
    # Load in word embeddings; only the matrix is needed for the model.
    embedding_matrix, _ = resources.create_embedding_matrix()

    # Load text data
    X_train = _load_pickle('../data/text_data/pickle_files/X_training.pkl')
    y_train = _load_pickle('../data/text_data/pickle_files/y_training.pkl')
    X_test = _load_pickle('../data/text_data/pickle_files/X_test.pkl')
    y_test = _load_pickle('../data/text_data/pickle_files/y_test.pkl')

    # Create model object and fit
    text_model = models.text_lstm_model(embedding_matrix=embedding_matrix)

    # The model is only written when val_loss improves (save_best_only), so
    # the file on disk is the best epoch, not necessarily the last one.
    filename = '../output/text_model.h5'
    checkpoint = ModelCheckpoint(filename,
                                 monitor='val_loss',
                                 verbose=1,
                                 save_best_only=True,
                                 mode='min')

    text_model.fit(X_train, y_train,
                   batch_size=32, epochs=55,
                   validation_data=(X_test, y_test),
                   callbacks=[checkpoint],
                   shuffle=True)


def model(image=False, audio=False, text=False):
    """
    Train the selected models. With all flags left False, nothing is trained.

    The models are trained in the order image, audio, text; each one writes
    its result under ../output/.

    :param image: Whether or not to train the image model on this run
    :param audio: Whether or not to train the audio model on this run
    :param text: Whether or not to train the text model on this run
    :return: None
    """
    if image:
        _train_image_model()

    if audio:
        _train_audio_model()

    if text:
        _train_text_model()
def score_new_vid():
    """
    Score the videos in the 'score' partition with all models.

    Steps, in order:

     - Extract images, audio and text for the 'score' partition via `lib`
     - Transform those features via `lib` (training=False)
     - Load the image, audio, text and ensemble models from ../output/
     - Load the transformed score data from ../data/
     - Predict with the image, audio and text models
     - Merge the three prediction frames on 'video_ids' (pandas' default
       inner join, so only ids present in all three frames are kept)
     - Predict the final score with the ensemble model and write everything
       to ../output/predictions.csv

    NOTE: the audio and ensemble models are loaded with pickle, which can
    execute arbitrary code; only load files produced by this project.

    :return: None
    """
    logging.info('Begin extraction for scoring partition')

    # Extract features from vids
    lib.extract_images(partition='score', num_frames=10)
    lib.extract_audio(partition='score')
    lib.extract_text(partition='score', training=False)

    logging.info('Begin transformation for scoring partition')

    # Transform features; only the word-to-index lookup is needed here.
    _, word_to_index = resources.create_embedding_matrix()
    lib.transform_images(partition='score', num_frames=10, training=False)
    lib.transform_audio(partition='score', n_mfcc=13, training=False)
    lib.transform_text(partition='score',
                       word_to_index=word_to_index,
                       training=False)

    logging.info('Load models for evaluation of the scoring partition')

    # Load models. Files are opened with `with` so the handles are closed
    # even if unpickling fails.
    image_model = models.image_lrcn()
    image_model.load_weights('../output/image_model.h5')
    with open('../output/audio_model.pkl', 'rb') as file:
        audio_model = pickle.load(file)
    text_model = load_model('../output/text_model.h5')
    with open('../output/ensemble_model.pkl', 'rb') as file:
        ensemble_model = pickle.load(file)

    logging.info('Load transformed data')

    # Load image data
    with open('../data/image_data/pickle_files/vid_ids_5d_score.pkl',
              'rb') as file:
        id_img_score = pickle.load(file)

    # Load audio data
    aud_to_score = pd.read_csv('../data/audio_data/pickle_files/score_df.csv')
    X_aud_score = aud_to_score.drop(['video_id'], axis=1)
    id_aud_score = aud_to_score['video_id']

    # Load text data
    with open('../data/text_data/pickle_files/X_score.pkl', 'rb') as file:
        X_text_score = pickle.load(file)
    with open('../data/text_data/pickle_files/vid_ids_score.pkl',
              'rb') as file:
        id_text_score = pickle.load(file)

    # Load generator. Labels are dummy zeros: only predictions are needed.
    # NOTE(review): the generator is built with partition='training' although
    # this function scores the 'score' partition, and its size comes from the
    # audio ids rather than id_img_score -- both kept as-is; confirm against
    # DataGenerator that this reads the intended files.
    num_videos = len(id_aud_score)
    score_generator = DataGenerator(
        partition='training',
        list_IDs=range(num_videos),
        labels=[0] * num_videos,
        batch_size=num_videos,
        n_channels=3,
        dim=(10, 224, 224),
        shuffle=False)

    logging.info('Predict values with image, text and audio models')

    # Predict values. The image and text models return one row per video;
    # the first element of each row is taken as the prediction.
    img_score_df = pd.DataFrame({
        'img_preds': [row[0] for row in
                      image_model.predict_generator(score_generator)],
        'video_ids': id_img_score
    })

    aud_score_df = pd.DataFrame({
        'aud_preds': audio_model.predict(X_aud_score),
        'video_ids': id_aud_score
    })

    text_score_df = pd.DataFrame({
        'text_preds': [row[0] for row in text_model.predict(X_text_score)],
        'video_ids': id_text_score
    })

    logging.info('Make final predictions')

    # Merge predictions
    score_preds = img_score_df.merge(aud_score_df, on='video_ids')
    score_preds = score_preds.merge(text_score_df, on='video_ids')

    # Make final prediction; the column order is what the ensemble receives.
    X_score = score_preds[['img_preds', 'aud_preds', 'text_preds']]
    score_preds['final_prediction'] = ensemble_model.predict(X_score)

    # Save predictions to disk
    score_preds.to_csv('../output/predictions.csv', index=False)