Example #1
def extract():
    """
    Extract necessary data / resources from upstream. This method will:

     - Validate that the newsgroup data set is available, and read it in
     - Validate that the text embeddings are available, and read them in
     - Validate that the text-to-embedding-index lookup is available, and read it in

    :return: observations, embedding_matrix, word_to_index
    :rtype: (pandas.DataFrame, numpy.ndarray, dict)
    """

    logging.info('Begin extract')
    logging.info('Performing extract for batch: {}, from newsgroup_path: {}'
                 .format(lib.get_batch_name(), lib.get_conf('newsgroup_path')))

    # Download resources

    # Confirm newsgroup data set is downloaded
    resources.download_newsgroup()

    # Confirm that embedding is downloaded
    resources.download_embedding()

    # Extract resources from file system

    # Newsgroup20: Get list of all candidate documents
    glob_pattern = os.path.join(lib.get_conf('newsgroup_path'), '*', '*')
    logging.info('Searching for glob_pattern: {}'.format(glob_pattern))
    document_candidates = glob.glob(glob_pattern)

    # Newsgroup20: Create observations data set
    observations = pandas.DataFrame(document_candidates, columns=['document_path'])
    logging.info('Shape of observations data frame created from glob matches: {}'.format(observations.shape))

    # Newsgroup20: Re-order rows
    observations = observations.sample(frac=1)

    # Newsgroup20: Subset number of observations, if it's a test run
    if lib.get_conf('test_run'):
        logging.info('Reducing file size for test run')
        observations = observations.head(100)
        logging.info('Test run number of records: {}'.format(len(observations.index)))

    # Embedding: Load embedding
    embedding_matrix, word_to_index = resources.create_embedding_matrix()
    logging.info('word_to_index max index: {}'.format(max(word_to_index.values())))

    # Archive schema and return
    lib.archive_dataset_schemas('extract', locals(), globals())
    logging.info('End extract')
    return observations, embedding_matrix, word_to_index
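
resources.create_embedding_matrix is referenced throughout these examples but not shown. Below is a minimal sketch of the contract it appears to satisfy, assuming a GloVe-style text file of whitespace-separated word vectors; the file path and dimension are illustrative, not the project's actual configuration.

import numpy

def create_embedding_matrix(path='glove.6B.300d.txt', dim=300):
    # Hypothetical sketch: build a dense matrix of word vectors plus a
    # word -> row-index lookup, matching the (embedding_matrix, word_to_index)
    # return contract used above. Row 0 is left as zeros for padding, so
    # word indices start at 1.
    vectors = [numpy.zeros(dim)]
    word_to_index = {}
    with open(path, encoding='utf-8') as handle:
        for line in handle:
            parts = line.rstrip().split(' ')
            word_to_index[parts[0]] = len(vectors)
            vectors.append(numpy.asarray(parts[1:], dtype='float32'))
    return numpy.vstack(vectors), word_to_index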
Example #2
def extract():
    """
    Extract necessary data / resources from upstream. This method will:

     - Scrape all submissions from the configured subreddit, going back the
       configured number of days
     - Download the text embeddings, and read them in

    :return: embedding_matrix, word_to_index, observations
    :rtype: (numpy.ndarray, dict, pandas.DataFrame)
    """

    logging.info('Begin extract')

    # Extract all posts for given subreddit, going back given number of days
    logging.info('Downloading submissions from Reddit')
    observations = scrape_subreddit(lib.get_conf('subreddit'),
                                    lib.get_conf('history_num_days'))
    logging.info('Found {} submissions'.format(len(observations.index)))

    # Load embedding matrix
    resources.download_embedding()
    embedding_matrix, word_to_index = resources.create_embedding_matrix()
    logging.info('word_to_index max index: {}'.format(
        max(word_to_index.values())))

    logging.info('End extract')
    lib.archive_dataset_schemas('extract', locals(), globals())
    return embedding_matrix, word_to_index, observations
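
scrape_subreddit is not shown either. A minimal sketch using PRAW, assuming API credentials are configured elsewhere; the credential placeholders and DataFrame column names are illustrative, and the real helper may differ.

import datetime
import pandas
import praw

def scrape_subreddit(subreddit_name, history_num_days):
    # Hypothetical sketch: pull recent submissions and keep those newer
    # than the requested look-back window.
    reddit = praw.Reddit(client_id='...', client_secret='...',
                         user_agent='...')
    cutoff = (datetime.datetime.utcnow()
              - datetime.timedelta(days=history_num_days)).timestamp()
    rows = [{'id': s.id, 'title': s.title, 'selftext': s.selftext,
             'created_utc': s.created_utc}
            for s in reddit.subreddit(subreddit_name).new(limit=None)
            if s.created_utc >= cutoff]
    return pandas.DataFrame(rows)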
Example #3
def transform():
    """
    Transforms all features for the 3 models.
    Image - Convert jpegs to numpy arrays and preprocess for the vgg16 model
    Audio - Use librosa to extract features and save dataframe with all features for each video
    Text - Tokenize, and convert to indices based on the google news 20 word embeddings
    :return:
    """

    embedding_matrix, word_to_index = resources.create_embedding_matrix()

    for partition in ['training', 'test', 'validation']:

        # Transform raw jpegs into numpy arrays
        lib.transform_images(partition=partition, num_frames=10)

        # Transform raw audio to feature matrix
        lib.transform_audio(partition=partition, n_mfcc=13)

        # Transform text to tokens
        lib.transform_text(partition=partition, word_to_index=word_to_index)

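
lib.transform_text is not shown. A minimal sketch of the tokenize-and-index step the docstring describes, assuming whitespace tokenization and zero-padded fixed-length sequences; the helper name and sequence length are illustrative.

import numpy

def texts_to_indices(texts, word_to_index, max_len=100):
    # Hypothetical sketch: whitespace-tokenize each document, map tokens to
    # embedding row indices (dropping out-of-vocabulary tokens), then
    # zero-pad or truncate to a fixed length.
    matrix = numpy.zeros((len(texts), max_len), dtype='int32')
    for row, text in enumerate(texts):
        indices = [word_to_index[token] for token in text.lower().split()
                   if token in word_to_index]
        indices = indices[:max_len]
        matrix[row, :len(indices)] = indices
    return matrix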
Example #4
def model(image=False, audio=False, text=False):
    """
    Train all 3 models
    :param image: Whether or not to train the image model on this run
    :param audio: Whether or not to train the audio model on this run
    :param text: Whether or not to train the text model on this run
    :return:
    """

    if image:

        # Parameters
        params = {
            'dim': (10, 224, 224),
            'batch_size': 16,
            'n_channels': 3,
            'shuffle': True
        }

        # Load labels set
        with open('../data/image_data/pickle_files/y_5d_training.pkl',
                  'rb') as file:
            training_labels = pickle.load(file)
        with open('../data/image_data/pickle_files/y_5d_test.pkl',
                  'rb') as file:
            test_labels = pickle.load(file)

        # Generators
        training_generator = DataGenerator(partition='training',
                                           list_IDs=range(6000),
                                           labels=training_labels,
                                           **params)
        validation_generator = DataGenerator(partition='test',
                                             list_IDs=range(2000),
                                             labels=test_labels,
                                             **params)

        # Create model (named image_model to avoid shadowing this function)
        image_model = models.image_lrcn()

        # Train model on data set
        image_model.fit_generator(generator=training_generator,
                                  validation_data=validation_generator,
                                  use_multiprocessing=True,
                                  workers=6,
                                  epochs=5)

        image_model.save_weights('../output/image_model.h5')

    if audio:

        # Read in audio data
        training_set = pd.read_csv(
            '../data/audio_data/pickle_files/training_df.csv')
        test_set = pd.read_csv('../data/audio_data/pickle_files/test_df.csv')

        # Concat data sets in order to use all data for CV
        all_data = pd.concat((training_set, test_set), axis=0)
        X_all = all_data.drop(['interview_score', 'video_id'], axis=1)
        y_all = all_data['interview_score']

        logging.info('Start training audio model')

        # Create model and fit to data
        audio_model = models.audio_rand_forest()
        audio_model.fit(X_all, y_all)

        logging.info(audio_model.best_params_)
        logging.info('Train score with best estimator: {}'.format(
            max(audio_model.cv_results_['mean_train_score'])))
        logging.info('Test score with best estimator: {}'.format(
            max(audio_model.cv_results_['mean_test_score'])))

        # Save to disk
        with open('../output/audio_model.pkl', 'wb') as fid:
            pickle.dump(audio_model, fid)

    if text:

        # Load in word embeddings
        embedding_matrix, word_to_index = resources.create_embedding_matrix()

        # Load text data
        with open('../data/text_data/pickle_files/X_training.pkl',
                  'rb') as file:
            X_train = pickle.load(file)
        with open('../data/text_data/pickle_files/y_training.pkl',
                  'rb') as file:
            y_train = pickle.load(file)
        with open('../data/text_data/pickle_files/X_test.pkl', 'rb') as file:
            X_test = pickle.load(file)
        with open('../data/text_data/pickle_files/y_test.pkl', 'rb') as file:
            y_test = pickle.load(file)

        # Create model object and fit
        text_model = models.text_lstm_model(embedding_matrix=embedding_matrix)
        filename = '../output/text_model.h5'
        checkpoint = ModelCheckpoint(filename,
                                     monitor='val_loss',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='min')
        text_model.fit(X_train,
                       y_train,
                       batch_size=32,
                       epochs=55,
                       validation_data=(X_test, y_test),
                       callbacks=[checkpoint],
                       shuffle=True)

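
models.audio_rand_forest is not shown, but the use of best_params_ and cv_results_['mean_train_score'] above points to a scikit-learn grid search. A minimal sketch under that assumption; the hyperparameter grid is illustrative. Note that return_train_score=True is required for mean_train_score to appear in cv_results_.

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

def audio_rand_forest():
    # Hypothetical sketch: cross-validated random forest whose fitted object
    # exposes the best_params_ and cv_results_ attributes used above.
    param_grid = {'n_estimators': [100, 300], 'max_depth': [None, 10, 20]}
    return GridSearchCV(RandomForestRegressor(),
                        param_grid,
                        cv=5,
                        return_train_score=True)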
Example #5
def score_new_vid():

    logging.info('Begin extraction for scoring partition')

    # Extract features from vids
    lib.extract_images(partition='score', num_frames=10)
    lib.extract_audio(partition='score')
    lib.extract_text(partition='score', training=False)

    logging.info('Begin transformation for scoring partition')

    # Transform features
    embedding_matrix, word_to_index = resources.create_embedding_matrix()
    lib.transform_images(partition='score', num_frames=10, training=False)
    lib.transform_audio(partition='score', n_mfcc=13, training=False)
    lib.transform_text(partition='score',
                       word_to_index=word_to_index,
                       training=False)

    logging.info('Load models for evaluation of the scoring partition')

    # Load models
    image_model = models.image_lrcn()
    image_model.load_weights('../output/image_model.h5')
    audio_model = pickle.load(open('../output/audio_model.pkl', 'rb'))
    text_model = load_model('../output/text_model.h5')
    ensemble_model = pickle.load(open('../output/ensemble_model.pkl', 'rb'))

    logging.info('Load transformed data')

    # Load image data
    with open('../data/image_data/pickle_files/vid_ids_5d_score.pkl',
              'rb') as file:
        id_img_score = pickle.load(file)

    # Load audio data
    aud_to_score = pd.read_csv('../data/audio_data/pickle_files/score_df.csv')
    X_aud_score = aud_to_score.drop(['video_id'], axis=1)
    id_aud_score = aud_to_score['video_id']

    # Load text data
    with open('../data/text_data/pickle_files/X_score.pkl', 'rb') as file:
        X_text_score = pickle.load(file)
    with open('../data/text_data/pickle_files/vid_ids_score.pkl',
              'rb') as file:
        id_text_score = pickle.load(file)

    # Build generator for the scoring partition (dummy labels, since the true
    # scores are unknown at prediction time)
    score_generator = DataGenerator(
        partition='score',
        list_IDs=range(len(id_aud_score)),
        labels=[0 for i in range(len(id_aud_score))],
        batch_size=len(id_aud_score),
        n_channels=3,
        dim=(10, 224, 224),
        shuffle=False)

    logging.info('Predict values with image, text and audio models')

    # Predict values
    img_score_df = pd.DataFrame({
        'img_preds': [i[0] for i in image_model.predict_generator(score_generator)],
        'video_ids': id_img_score
    })
    aud_score_df = pd.DataFrame({
        'aud_preds': audio_model.predict(X_aud_score),
        'video_ids': id_aud_score
    })
    text_score_df = pd.DataFrame({
        'text_preds': [i[0] for i in text_model.predict(X_text_score)],
        'video_ids': id_text_score
    })

    logging.info('Make final predictions')

    # Merge predictions
    score_preds = img_score_df.merge(aud_score_df, on='video_ids')
    score_preds = score_preds.merge(text_score_df, on='video_ids')

    # Make final prediction
    X_score = score_preds[['img_preds', 'aud_preds', 'text_preds']]
    score_preds['final_prediction'] = ensemble_model.predict(X_score)

    # Save predictions to disk
    score_preds.to_csv('../output/predictions.csv', index=False)

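
The ensemble_model.pkl loaded above is trained elsewhere. A minimal sketch of how such an ensemble might be produced, assuming a simple linear blend of the three per-model prediction columns used in X_score; the helper name is hypothetical.

import pickle

import pandas as pd
from sklearn.linear_model import LinearRegression

def train_ensemble(preds_df, labels):
    # Hypothetical sketch: fit a linear blender on the per-model predictions
    # and persist it where score_new_vid() expects to find it.
    X = preds_df[['img_preds', 'aud_preds', 'text_preds']]
    model = LinearRegression()
    model.fit(X, labels)
    with open('../output/ensemble_model.pkl', 'wb') as fid:
        pickle.dump(model, fid)
    return model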