def main(train_model_window=None):

    ti = time.monotonic()
    # dict to save dataframe metadata which would be later merged in model_metadata dataframe.
    metadata = {}
    # "updated" should always be set to False in this script.
    metadata['updated'] = False
    try:
        listenbrainz_spark.init_spark_session('Create Dataframes')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise

    current_app.logger.info('Fetching listens to create dataframes...')
    to_date, from_date = get_dates_to_train_data(train_model_window)
    partial_listens_df = get_listens_for_training_model_window(
        to_date, from_date, metadata, path.LISTENBRAINZ_DATA_DIRECTORY)
    current_app.logger.info(
        'Listen count from {from_date} to {to_date}: {listens_count}'.format(
            from_date=from_date,
            to_date=to_date,
            listens_count=partial_listens_df.count()))

    current_app.logger.info('Loading mapping from HDFS...')
    df = utils.read_files_from_HDFS(path.MBID_MSID_MAPPING)
    msid_mbid_mapping_df = get_unique_rows_from_mapping(df)
    current_app.logger.info(
        'Number of distinct rows in the mapping: {}'.format(
            msid_mbid_mapping_df.count()))

    current_app.logger.info('Mapping listens...')
    mapped_listens_df = get_mapped_artist_and_recording_mbids(
        partial_listens_df, msid_mbid_mapping_df)
    current_app.logger.info('Listen count after mapping: {}'.format(
        mapped_listens_df.count()))

    current_app.logger.info('Preparing users data and saving to HDFS...')
    users_df = get_users_dataframe(mapped_listens_df, metadata)

    current_app.logger.info('Preparing recordings data and saving to HDFS...')
    recordings_df = get_recordings_df(mapped_listens_df, metadata)

    current_app.logger.info(
        'Preparing listen data dump and playcounts, saving playcounts to HDFS...'
    )
    listens_df = get_listens_df(mapped_listens_df, metadata)

    save_playcounts_df(listens_df, recordings_df, users_df, metadata)

    generate_dataframe_id(metadata)
    save_dataframe_metadata_to_hdfs(metadata)

    current_app.logger.info('Preparing missing MusicBrainz data...')
    missing_musicbrainz_data_itr = get_data_missing_from_musicbrainz(
        partial_listens_df, msid_mbid_mapping_df)

    messages = prepare_messages(missing_musicbrainz_data_itr, from_date,
                                to_date, ti)

    return messages
Example #2
def __init__(self):
    hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
    try:
        listenbrainz_spark.init_spark_session('uploader')
    except SparkSessionNotInitializedException as err:
        logger.error(str(err), exc_info=True)
        sys.exit(-1)
Example #3
def init_dir(rm, recursive, create_dir):
    """ Create directories in HDFS to run the recommendation engine.
    """
    try:
        listenbrainz_spark.init_spark_session('Manage Directories')
    except Py4JJavaError as err:
        logging.error('{}\n{}\nAborting...'.format(str(err),
                                                   err.java_exception))
        sys.exit(-1)

    hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
    if rm:
        try:
            utils.delete_dir(path.RECOMMENDATION_PARENT_DIR)
            utils.delete_dir(path.CHECKPOINT_DIR)
            logging.info('Successfully deleted directories.')
        except HdfsError as err:
            logging.error(
                '{}: Some/all directories are non-empty. Try "--recursive" to delete recursively.'
                .format(type(err).__name__))
            logging.warning(
                'Deleting directory recursively will delete all the recommendation data.'
            )
            sys.exit(-1)

    if recursive:
        try:
            utils.delete_dir(path.RECOMMENDATION_PARENT_DIR, recursive=True)
            utils.delete_dir(path.CHECKPOINT_DIR, recursive=True)
            logging.info('Successfully deleted directories recursively.')
        except HdfsError as err:
            logging.error(
                '{}: An error occurred while deleting directories recursively.\n{}\nAborting...'
                .format(type(err).__name__, str(err)))
            sys.exit(-1)

    if create_dir:
        try:
            logging.info('Creating directory to store dataframes...')
            utils.create_dir(path.DATAFRAME_DIR)

            logging.info('Creating directory to store models...')
            utils.create_dir(path.MODEL_DIR)

            logging.info('Creating directory to store candidate sets...')
            utils.create_dir(path.CANDIDATE_SET_DIR)

            logging.info('Creating directory to store RDD checkpoints...')
            utils.create_dir(path.CHECKPOINT_DIR)

            print('Done!')
        except HdfsError as err:
            logging.error(
                '{}: An error occurred while creating some/all directories.\n{}\nAborting...'
                .format(type(err).__name__, str(err)))
            sys.exit(-1)
Example #4
def main():
    listenbrainz_spark.init_spark_session('artist_popularity')
    mlhd_df_path = config.HDFS_CLUSTER_URI + os.path.join(
        MLHD_DATA_PATH, '*.avro')
    try:
        print('Loading MLHD Dataframe...')
        mlhd_df = listenbrainz_spark.sql_context.read.format('avro').load(
            mlhd_df_path)
        print("Loaded!")
    except AnalysisException as e:
        logger.critical("Error while reading MLHD avro files: %s", str(e))
        raise

    print("Number of rows: %d" % mlhd_df.count())
    try:
        mlhd_df.registerTempTable('mlhd')
    except AnalysisException as e:
        logger.critical("Error while registering dataframe mlhd: %s", str(e))
        raise

    for _ in range(5):
        try:
            print("Running SQL...")
            artist_popularity_df = listenbrainz_spark.sql_context.sql("""
                    SELECT artist_mbid, COUNT(artist_mbid) as cnt
                      FROM mlhd
                  GROUP BY artist_mbid
                  ORDER BY cnt DESC
            """)
            break
        except Py4JJavaError as e:
            logger.error("error while running the query: %s", str(e))
    else:
        logger.critical("Could not run query. Exiting...")
        sys.exit(-1)

    print("number of rows: ", artist_popularity_df.count())
    artist_popularity_df.show()
    print("Saving...")
    file_name = 'mlhd-artist-popularity-%s.csv' % datetime.now().strftime(
        '%Y%m%d-%H%M%S')
    csv_path = config.HDFS_CLUSTER_URI + os.path.join(MLHD_DATA_PATH, 'csv',
                                                      file_name)
    for _ in range(10):
        try:
            artist_popularity_df.write.csv(csv_path)
            break
        except Exception as e:
            logger.error(
                "Couldn't write result to CSV, trying again, error: %s",
                str(e))
    else:
        logger.critical("Could not write results to HDFS, exiting...")
        sys.exit(-1)

    print("Saved to %s!" % csv_path)
Example #5
def main():
    ti = time()
    time_ = defaultdict(dict)
    try:
        listenbrainz_spark.init_spark_session('Recommendations')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    try:
        users_df = utils.read_files_from_HDFS(path.USERS_DATAFRAME_PATH)
        recordings_df = utils.read_files_from_HDFS(path.RECORDINGS_DATAFRAME_PATH)

        top_artists_candidate_set = utils.read_files_from_HDFS(path.TOP_ARTIST_CANDIDATE_SET)
        similar_artists_candidate_set = utils.read_files_from_HDFS(path.SIMILAR_ARTIST_CANDIDATE_SET)
        mapped_listens = utils.read_files_from_HDFS(path.MAPPED_LISTENS)
    except PathNotFoundException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    except FileNotFetchedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    metadata_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'recommendation-metadata.json')
    with open(metadata_file_path, 'r') as f:
        recommendation_metadata = json.load(f)
        best_model_id = recommendation_metadata['best_model_id']
        user_names = recommendation_metadata['user_name']

    best_model_path = path.DATA_DIR + '/' + best_model_id

    current_app.logger.info('Loading model...')
    t0 = time()
    try:
        model = load_model(config.HDFS_CLUSTER_URI + best_model_path)
    except Py4JJavaError as err:
        current_app.logger.error('Unable to load model "{}"\n{}\nAborting...'.format(best_model_id, str(err.java_exception)),
            exc_info=True)
        sys.exit(-1)
    time_['load_model'] = '{:.2f}'.format((time() - t0) / 60)

    # an action must be called to persist data in memory
    recordings_df.count()
    recordings_df.persist()

    t0 = time()
    recommendations = get_recommendations(user_names, recordings_df, model, users_df, top_artists_candidate_set,
        similar_artists_candidate_set, mapped_listens)
    time_['total_recommendation_time'] = '{:.2f}'.format((time() - t0) / 3600)

    # persisted data must be cleared from memory after usage to avoid OOM
    recordings_df.unpersist()

    if SAVE_RECOMMENDATION_HTML:
        get_recommendation_html(recommendations, time_, best_model_id, ti)
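
The comments above note that persist() only marks a dataframe for caching and that cached data must be released once it is no longer needed. A minimal sketch of that cache-then-release pattern, assuming a standalone SparkSession rather than the ListenBrainz helpers:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('persist-sketch').getOrCreate()
df = spark.range(1000)

df.persist()      # lazily mark the dataframe for caching
df.count()        # an action materializes the cache on the executors
# ... reuse df across several jobs without recomputing it ...
df.unpersist()    # release the cached blocks to avoid running out of memory
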
Example #6
def main():
    ti = time()
    # dict to save dataframe metadata which would be later merged in model_metadata dataframe.
    metadata = {}
    # "updated" should always be set to False in this script.
    metadata['updated'] = False
    try:
        listenbrainz_spark.init_spark_session('Create Dataframes')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    # Dataframe containing all columns except artist_mbids and recording_mbid
    to_date, from_date = get_dates_to_train_data()
    partial_listens_df = get_listens_for_training_model_window(
        to_date, from_date, metadata, path.LISTENBRAINZ_DATA_DIRECTORY)

    # Dataframe containing recording msid->mbid and artist msid->mbid mapping.
    recording_artist_mapping_df = utils.read_files_from_HDFS(
        path.MBID_MSID_MAPPING)

    # Dataframe containing all fields that a listen should have, including artist_mbids and recording_mbid.
    complete_listens_df = get_mapped_artist_and_recording_mbids(
        partial_listens_df, recording_artist_mapping_df)

    current_app.logger.info('Preparing users data and saving to HDFS...')
    t0 = time()
    users_df = get_users_dataframe(complete_listens_df, metadata)
    users_df_time = '{:.2f}'.format((time() - t0) / 60)

    current_app.logger.info('Preparing recordings data and saving to HDFS...')
    t0 = time()
    recordings_df = get_recordings_df(complete_listens_df, metadata)
    recordings_df_time = '{:.2f}'.format((time() - t0) / 60)

    current_app.logger.info(
        'Preparing listen data dump and playcounts, saving playcounts to HDFS...'
    )
    t0 = time()
    listens_df = get_listens_df(complete_listens_df, metadata)

    playcounts_df = get_playcounts_df(listens_df, recordings_df, users_df,
                                      metadata)
    playcounts_df_time = '{:.2f}'.format((time() - t0) / 60)
    total_time = '{:.2f}'.format((time() - ti) / 60)

    generate_best_model_id(metadata)
    save_dataframe_metadata_to_HDFS(metadata)

    if SAVE_DATAFRAME_HTML:
        save_dataframe_html(users_df_time, recordings_df_time,
                            playcounts_df_time, total_time)
Example #7
def main(train_model_window=None):

    ti = time.monotonic()
    # dict to save dataframe metadata which would be later merged in model_metadata dataframe.
    metadata = {}
    # "updated" should always be set to False in this script.
    metadata['updated'] = False
    try:
        listenbrainz_spark.init_spark_session('Create Dataframes')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise

    to_date, from_date = get_dates_to_train_data(train_model_window)
    partial_listens_df = get_listens_for_training_model_window(
        to_date, from_date, metadata, path.LISTENBRAINZ_DATA_DIRECTORY)

    # Dataframe containing recording msid->mbid and artist msid->mbid mapping.
    msid_mbid_mapping_df = utils.read_files_from_HDFS(path.MBID_MSID_MAPPING)

    mapped_listens_df = get_mapped_artist_and_recording_mbids(
        partial_listens_df, msid_mbid_mapping_df)

    current_app.logger.info('Preparing users data and saving to HDFS...')
    users_df = get_users_dataframe(mapped_listens_df, metadata)

    current_app.logger.info('Preparing recordings data and saving to HDFS...')
    recordings_df = get_recordings_df(mapped_listens_df, metadata)

    current_app.logger.info(
        'Preparing listen data dump and playcounts, saving playcounts to HDFS...'
    )
    listens_df = get_listens_df(mapped_listens_df, metadata)

    playcounts_df = get_playcounts_df(listens_df, recordings_df, users_df,
                                      metadata)

    generate_dataframe_id(metadata)
    save_dataframe_metadata_to_hdfs(metadata)
    total_time = '{:.2f}'.format((time.monotonic() - ti) / 60)

    message = [{
        'type': 'cf_recording_dataframes',
        'dataframe_upload_time': str(datetime.utcnow()),
        'total_time': total_time,
        'from_date': str(from_date.strftime('%b %Y')),
        'to_date': str(to_date.strftime('%b %Y')),
    }]

    return message
Example #8
def main(max_num_users: int):

    logger.info('Start generating similar user matrix')
    try:
        listenbrainz_spark.init_spark_session('User Similarity')
    except SparkSessionNotInitializedException as err:
        logger.error(str(err), exc_info=True)
        raise

    try:
        playcounts_df = utils.read_files_from_HDFS(
            path.USER_SIMILARITY_PLAYCOUNTS_DATAFRAME)
        users_df = utils.read_files_from_HDFS(
            path.USER_SIMILARITY_USERS_DATAFRAME)
    except PathNotFoundException as err:
        logger.error(str(err), exc_info=True)
        raise
    except FileNotFetchedException as err:
        logger.error(str(err), exc_info=True)
        raise

    vectors_df = get_vectors_df(playcounts_df)

    similarity_matrix = Correlation.corr(
        vectors_df, 'vector', 'pearson').first()['pearson(vector)'].toArray()
    similar_users = threshold_similar_users(similarity_matrix, max_num_users)

    # Due to an unresolved bug in Spark (https://issues.apache.org/jira/browse/SPARK-10925), we cannot join twice on
    # the same dataframe. Hence, we create a modified dataframe with the columns renamed.
    other_users_df = users_df\
        .withColumnRenamed('user_id', 'other_user_id')\
        .withColumnRenamed('user_name', 'other_user_name')

    similar_users_df = listenbrainz_spark.session.createDataFrame(similar_users, ['user_id', 'other_user_id',
        'similarity', 'global_similarity'])\
        .join(users_df, 'user_id', 'inner')\
        .join(other_users_df, 'other_user_id', 'inner')\
        .select('user_name', struct('other_user_name', 'similarity', 'global_similarity').alias('similar_user'))\
        .groupBy('user_name')\
        .agg(collect_list('similar_user').alias('similar_users'))

    logger.info('Finished generating similar user matrix')

    return create_messages(similar_users_df)
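
The comment above mentions SPARK-10925: the same dataframe cannot be joined twice in one query, so a copy with renamed columns is joined instead. A minimal, self-contained sketch of that rename-then-join workaround (the data and column names are illustrative):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('self-join-sketch').getOrCreate()

users = spark.createDataFrame([(1, 'rob'), (2, 'vansika')], ['user_id', 'user_name'])
pairs = spark.createDataFrame([(1, 2, 0.8)], ['user_id', 'other_user_id', 'similarity'])

# join against a renamed copy so the users dataframe can appear on both sides
other_users = users \
    .withColumnRenamed('user_id', 'other_user_id') \
    .withColumnRenamed('user_name', 'other_user_name')

pairs.join(users, 'user_id', 'inner') \
     .join(other_users, 'other_user_id', 'inner') \
     .show()
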
Example #9
def main(recommendation_top_artist_limit=None,
         recommendation_similar_artist_limit=None,
         users=None):

    try:
        listenbrainz_spark.init_spark_session('Recommendations')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise

    try:
        recordings_df = utils.read_files_from_HDFS(
            path.RECORDINGS_DATAFRAME_PATH)
        top_artist_candidate_set_df = utils.read_files_from_HDFS(
            path.TOP_ARTIST_CANDIDATE_SET)
        similar_artist_candidate_set_df = utils.read_files_from_HDFS(
            path.SIMILAR_ARTIST_CANDIDATE_SET)
    except PathNotFoundException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise
    except FileNotFetchedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise

    current_app.logger.info('Loading model...')
    model = load_model()

    # an action must be called to persist data in memory
    recordings_df.count()
    recordings_df.persist()

    params = RecommendationParams(recordings_df, model,
                                  top_artist_candidate_set_df,
                                  similar_artist_candidate_set_df,
                                  recommendation_top_artist_limit,
                                  recommendation_similar_artist_limit)

    messages = get_recommendations_for_all(params, users)
    # persisted data must be cleared from memory after usage to avoid OOM
    recordings_df.unpersist()

    return messages
Example #10
def main():
    ti = time()
    try:
        listenbrainz_spark.init_spark_session('Create_Dataframe')
    except AttributeError as err:
        logging.error(
            'Cannot initialize Spark Session: {} \n {}. Aborting...'.format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error(
            'An error occurred while initializing Spark session: {} \n {}. Aborting...'
            .format(type(err).__name__, str(err)),
            exc_info=True)
        sys.exit(-1)

    df = None
    missing_parquets = []
    for y in range(config.STARTING_YEAR, config.ENDING_YEAR + 1):
        for m in range(config.STARTING_MONTH, config.ENDING_MONTH + 1):
            try:
                month = listenbrainz_spark.sql_context.read.parquet(
                    '{}/data/listenbrainz/{}/{}.parquet'.format(
                        config.HDFS_CLUSTER_URI, y, m))
                df = df.union(month) if df else month
            except AnalysisException as err:
                missing_parquets.append('{}-{}'.format(y, '{:02d}'.format(m)))
                logging.error(
                    'Cannot read parquet files from HDFS: {} \n {}'.format(
                        type(err).__name__, str(err)))
                continue
            except Exception as err:
                logging.error(
                    'An error occurred while fetching "/data/listenbrainz/{}/{}.parquet": {} \n {}. Aborting...'
                    .format(y, m,
                            type(err).__name__, str(err)),
                    exc_info=True)
                sys.exit(-1)

    if not df:
        raise SystemExit("Parquet files from {}-{} to {}-{} are empty".format(
            config.STARTING_YEAR, '{:02d}'.format(config.STARTING_MONTH),
            config.ENDING_YEAR, '{:02d}'.format(config.ENDING_MONTH)))

    logging.info('Registering Dataframe...')
    table = 'df_to_train_{}'.format(
        datetime.strftime(datetime.utcnow(), '%Y_%m_%d'))
    try:
        df.createOrReplaceTempView(table)
    except AnalysisException as err:
        logging.error(
            'Cannot register dataframe: {} \n {}. Aborting...'.format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error(
            'An error occurred while registering dataframe: {} \n {}. Aborting...'
            .format(type(err).__name__, str(err)),
            exc_info=True)
        sys.exit(-1)
    t = '{:.2f}'.format(time() - ti)
    logging.info(
        'Files fetched from HDFS and dataframe registered in {}s'.format(t))

    dest_path = os.path.join(config.HDFS_CLUSTER_URI, 'data', 'listenbrainz',
                             'recommendation-engine', 'dataframes')

    logging.info('Preparing users data and saving to HDFS...')
    try:
        t0 = time()
        users_df = prepare_user_data(table)
        users_df.write.format('parquet').save(dest_path + '/users_df.parquet',
                                              mode='overwrite')
    except QueryExecutionException as err:
        logging.error(
            'Failed to execute users query: {} \n {}. Aborting...'.format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except AnalysisException as err:
        logging.error(
            'Failed to analyse users query plan: {} \n {}. Aborting...'.format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except ParseException as err:
        logging.error(
            "Failed to parse SQL command: {} \n {}. Aborting...".format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error(
            'An error occurred while executing users query: {} \n {}. Aborting'
            .format(type(err).__name__, str(err)),
            exc_info=True)
        sys.exit(-1)
    users_df_time = '{:.2f}'.format((time() - t0) / 60)

    logging.info('Preparing recordings data and saving to HDFS...')
    try:
        t0 = time()
        recordings_df = prepare_recording_data(table)
        recordings_df.write.format('parquet').save(dest_path +
                                                   '/recordings_df.parquet',
                                                   mode='overwrite')
    except QueryExecutionException as err:
        logging.error(
            'Failed to execute recordings query: {} \n {}. Aborting...'.format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except AnalysisException as err:
        logging.error(
            'Failed to analyse recordings query plan: {} \n {}. Aborting...'.
            format(type(err).__name__, str(err)))
        sys.exit(-1)
    except ParseException as err:
        logging.error(
            'Failed to parse SQL command: {} \n {}. Aborting...'.format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error(
            'An error occurred while executing recordings query: {} \n {}. Aborting...'
            .format(type(err).__name__, str(err)),
            exc_info=True)
        sys.exit(-1)
    recordings_df_time = '{:.2f}'.format((time() - t0) / 60)

    logging.info(
        'Preparing listen data dump and playcounts, saving playcounts to HDFS...'
    )
    try:
        t0 = time()
        listens_df = prepare_listen_data(table)
        playcounts_df = get_playcounts_data(listens_df, users_df,
                                            recordings_df)
        playcounts_df.write.format('parquet').save(dest_path +
                                                   '/playcounts_df.parquet',
                                                   mode='overwrite')
    except QueryExecutionException as err:
        logging.error(
            'Failed to execute playcounts query: {} \n {}. Aborting...'.format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except AnalysisException as err:
        logging.error(
            'Failed to analyse playcounts query plan: {} \n {}. Aborting...'.
            format(type(err).__name__, str(err)))
        sys.exit(-1)
    except ParseException as err:
        logging.error(
            'Failed to parse SQL command: {} \n {}. Aborting...'.format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error('An error occurred. {} \n {}. Aborting...'.format(
            type(err).__name__, str(err)),
                      exc_info=True)
        sys.exit(-1)
    playcounts_df_time = '{:.2f}'.format((time() - t0) / 60)

    total_time = '{:.2f}'.format((time() - ti) / 60)
    lb_dump_time_window = ('{}-{}'.format(
        config.STARTING_YEAR,
        '{:02d}'.format(config.STARTING_MONTH)), '{}-{}'.format(
            config.ENDING_YEAR, '{:02d}'.format(config.ENDING_MONTH)))
    date = datetime.utcnow().strftime('%Y-%m-%d')
    queries_html = 'Queries-{}-{}.html'.format(uuid.uuid4(), date)
    context = {
        'users_df_time': users_df_time,
        'recordings_df_time': recordings_df_time,
        'playcounts_df_time': playcounts_df_time,
        'lb_dump_time_window': lb_dump_time_window,
        'missing_parquets': missing_parquets,
        'total_time': total_time
    }
    utils.save_html(queries_html, context, 'queries.html')
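
The loop above accumulates the monthly parquet files with df.union(month). The same accumulation can be written with functools.reduce; a minimal sketch, assuming a plain SparkSession and illustrative paths (the real script derives them from config.HDFS_CLUSTER_URI):

from functools import reduce
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('union-sketch').getOrCreate()

# illustrative paths for two months of listens
paths = ['hdfs://cluster/data/listenbrainz/2019/1.parquet',
         'hdfs://cluster/data/listenbrainz/2019/2.parquet']

monthly_dfs = [spark.read.parquet(p) for p in paths]
listens_df = reduce(lambda left, right: left.union(right), monthly_dfs)
print(listens_df.count())
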
Example #11
def main():
    ti = time()
    try:
        listenbrainz_spark.init_spark_session('Train_Models')
    except AttributeError as err:
        logging.error(
            'Cannot initialize Spark Session: {} \n {}. Aborting...'.format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error(
            'An error occurred while initializing Spark session: {} \n {}. Aborting...'
            .format(type(err).__name__, str(err)),
            exc_info=True)
        sys.exit(-1)

    try:
        path = os.path.join('/', 'data', 'listenbrainz',
                            'recommendation-engine', 'dataframes',
                            'playcounts_df.parquet')
        playcounts_df = listenbrainz_spark.sql_context.read.parquet(
            config.HDFS_CLUSTER_URI + path)
    except AnalysisException as err:
        logging.error('Cannot read parquet file from HDFS: {} \n {}'.format(
            type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error(
            'An error occurred while fetching parquet: {} \n {}. Aborting...'.
            format(type(err).__name__, str(err)),
            exc_info=True)
        sys.exit(-1)
    time_info = {}
    time_info['load_playcounts'] = '{:.2f}'.format((time() - ti) / 60)

    t0 = time()
    training_data, validation_data, test_data = preprocess_data(playcounts_df)
    time_info['preprocessing'] = '{:.2f}'.format((time() - t0) / 60)

    # Rdds that are used in model training iterative process are cached to improve performance.
    # Caching large files may cause Out of Memory exception.
    training_data.persist()
    validation_data.persist()
    num_training = training_data.count()
    num_validation = validation_data.count()
    num_test = test_data.count()
    logging.info('Training models...')

    try:
        t0 = time()
        model, model_metadata, best_model_metadata = train(
            training_data, validation_data, num_validation, config.RANKS,
            config.LAMBDAS, config.ITERATIONS)
        models_training_time = '{:.2f}'.format((time() - t0) / 3600)
    except Py4JJavaError as err:
        logging.error('Unable to train models: {} \n {}. Aborting...'.format(
            type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error(
            'An error occurred while training models: {} \n {}. Aborting...'.
            format(type(err).__name__, str(err)),
            exc_info=True)
        sys.exit(-1)

    training_data.unpersist()
    validation_data.unpersist()

    logging.info('Saving model...')
    try:
        t0 = time()
        path = os.path.join('/', 'data', 'listenbrainz',
                            'recommendation-engine', 'best-model',
                            '{}'.format(best_model_metadata['model_id']))
        model.model.save(listenbrainz_spark.context,
                         config.HDFS_CLUSTER_URI + path)
        time_info['save_model'] = '{:.2f}'.format((time() - t0) / 60)
    except Py4JJavaError as err:
        logging.error("Unable to save model: {} \n {}. Aborting...".format(
            type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error(
            'An error occurred while saving model: {} \n {}. Aborting...'.
            format(type(err).__name__, str(err)),
            exc_info=True)
        sys.exit(-1)

    date = datetime.utcnow().strftime('%Y-%m-%d')
    model_html = 'Model-{}-{}.html'.format(uuid.uuid4(), date)
    context = {
        'time': time_info,
        'num_training': '{:,}'.format(num_training),
        'num_validation': '{:,}'.format(num_validation),
        'num_test': '{:,}'.format(num_test),
        'models': model_metadata,
        'best_model': best_model_metadata,
        'models_training_time': models_training_time,
        'total_time': '{:.2f}'.format((time() - ti) / 3600)
    }

    utils.save_html(model_html, context, 'model.html')
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                        'recommendation-metadata.json')
    with open(path, 'r') as f:
        recommendation_metadata = json.load(f)
        recommendation_metadata['best_model_id'] = best_model_metadata[
            'model_id']

    with open(path, 'w') as f:
        json.dump(recommendation_metadata, f)
Example #12
def main(app_name):
    listenbrainz_spark.init_spark_session(app_name)
    RequestConsumer().run()
Example #13
def main(app_name, archive):
    listenbrainz_spark.init_spark_session(app_name)
    hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
    print('Copying extracted dump to HDFS...')
    copy_to_hdfs(archive)
    print('Done!')
Example #14
def main(app_name):
    listenbrainz_spark.init_spark_session(app_name)
    global rc
    rc = RequestConsumer()
    rc.run()
Example #15
def calculate_dataframes(from_date, to_date, job_type,
                         minimum_listens_threshold):
    if job_type == "recommendation_recording":
        paths = {
            "mapped_listens": path.RECOMMENDATION_RECORDING_MAPPED_LISTENS,
            "playcounts": path.RECOMMENDATION_RECORDING_PLAYCOUNTS_DATAFRAME,
            "recordings": path.RECOMMENDATION_RECORDINGS_DATAFRAME,
            "users": path.RECOMMENDATION_RECORDING_USERS_DATAFRAME,
            "metadata": path.RECOMMENDATION_RECORDING_DATAFRAME_METADATA,
            "prefix": "listenbrainz-dataframe-recording-recommendations"
        }
    elif job_type == "similar_users":
        paths = {
            "mapped_listens": path.USER_SIMILARITY_MAPPED_LISTENS,
            "playcounts": path.USER_SIMILARITY_PLAYCOUNTS_DATAFRAME,
            "recordings": path.USER_SIMILARITY_RECORDINGS_DATAFRAME,
            "users": path.USER_SIMILARITY_USERS_DATAFRAME,
            "metadata": path.USER_SIMILARITY_METADATA_DATAFRAME,
            "prefix": "listenbrainz-dataframe-user-similarity"
        }
    else:
        raise SparkException(
            "Invalid job_type parameter received for creating dataframes: " +
            job_type)

    # dict to save dataframe metadata which would be later merged in model_metadata dataframe.
    metadata = {}
    # "updated" should always be set to False in this script.
    metadata['updated'] = False
    try:
        listenbrainz_spark.init_spark_session('Create Dataframes')
    except SparkSessionNotInitializedException as err:
        logger.error(str(err), exc_info=True)
        raise

    metadata['to_date'] = to_date
    metadata['from_date'] = from_date

    complete_listens_df = get_listens_from_new_dump(from_date, to_date)
    logger.info(
        f'Listen count from {from_date} to {to_date}: {complete_listens_df.count()}'
    )

    logger.info('Discarding listens without mbids...')
    partial_listens_df = complete_listens_df.where(
        col('recording_mbid').isNotNull())
    logger.info(f'Listen count after discarding: {partial_listens_df.count()}')

    logger.info('Thresholding listens...')
    threshold_listens_df = get_threshold_listens_df(partial_listens_df,
                                                    paths["mapped_listens"],
                                                    minimum_listens_threshold)
    logger.info(
        f'Listen count after thresholding: {threshold_listens_df.count()}')

    logger.info('Preparing users data and saving to HDFS...')
    users_df = get_users_dataframe(threshold_listens_df, metadata,
                                   paths["users"])

    logger.info('Preparing recordings data and saving to HDFS...')
    recordings_df = get_recordings_df(threshold_listens_df, metadata,
                                      paths["recordings"])

    logger.info(
        'Preparing listen data dump and playcounts, saving playcounts to HDFS...'
    )
    listens_df = get_listens_df(threshold_listens_df, metadata)

    save_playcounts_df(listens_df, recordings_df, users_df, metadata,
                       paths["playcounts"])

    metadata['dataframe_id'] = get_dataframe_id(paths["prefix"])
    save_dataframe_metadata_to_hdfs(metadata, paths["metadata"])
    return complete_listens_df
Example #16
def main():
    ti = time()
    try:
        listenbrainz_spark.init_spark_session('Candidate_set')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    df = get_listens_for_rec_generation_window()

    if not df:
        current_app.logger.error(
            'Listening history of the past {} days does not exist'.format(
                config.RECOMMENDATION_GENERATION_WINDOW))

    try:
        utils.register_dataframe(df, 'df')
    except ViewNotRegisteredException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    try:
        listens_df = sql.get_listens_for_X_days()
    except SQLException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    try:
        artists_relation_df = utils.read_files_from_HDFS(
            path.SIMILAR_ARTIST_DATAFRAME_PATH)
        recordings_df = utils.read_files_from_HDFS(
            path.RECORDINGS_DATAFRAME_PATH)
        users_df = utils.read_files_from_HDFS(path.USERS_DATAFRAME_PATH)
    except PathNotFoundException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    except FileNotFetchedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    current_app.logger.info('Registering Dataframes...')
    try:
        utils.register_dataframe(listens_df, 'listens_df')
        utils.register_dataframe(recordings_df, 'recording')
        utils.register_dataframe(users_df, 'user')
        utils.register_dataframe(artists_relation_df, 'artists_relation')
    except ViewNotRegisteredException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    current_app.logger.info(
        'Files fetched from HDFS and dataframes registered in {}s'.format(
            '{:.2f}'.format(time() - ti)))

    metadata_file_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        'recommendation-metadata.json')
    with open(metadata_file_path) as f:
        recommendation_metadata = json.load(f)
        user_names = recommendation_metadata['user_name']

    user_data = defaultdict(dict)
    similar_artists_candidate_set_df = None
    top_artists_candidate_set_df = None
    for user_name in user_names:
        ts = time()
        try:
            user_id = get_user_id(user_name)
        except TypeError as err:
            current_app.logger.error(
                '{}: Invalid user name. User "{}" does not exist.'.format(
                    type(err).__name__, user_name))
            continue
        except SQLException as err:
            current_app.logger.error(
                'User id for "{}" cannot be retrieved\n{}'.format(
                    user_name, str(err)),
                exc_info=True)
            continue

        try:
            top_artists_df = sql.get_top_artists(user_name)
            top_artists_df.take(1)[0]
        except IndexError as err:
            current_app.logger.error('{}: {}\nNo top artists found, i.e. "{}" is either a new user or has empty listening history.' \
                ' Candidate sets cannot be generated'.format(type(err).__name__, str(err), user_name))
            continue
        except SQLException as err:
            current_app.logger.error(
                'Top artists cannot be retrieved for "{}"\n{}'.format(
                    user_name, str(err)),
                exc_info=True)
            continue

        try:
            similar_artists_df = get_similar_artists(top_artists_df, user_name)
        except IndexError as err:
            current_app.logger.error(
                '{}\nGenerating recommendations for next user'.format(err))
            continue
        except SQLException as err:
            current_app.logger.error(
                'Candidate sets not generated for "{}"\n{}'.format(
                    user_name, str(err)),
                exc_info=True)
            continue

        try:
            utils.register_dataframe(similar_artists_df, 'similar_artist')
            utils.register_dataframe(top_artists_df, 'top_artist')
        except ViewNotRegisteredException as err:
            current_app.logger.error(str(err), exc_info=True)
            continue

        try:
            top_artists_recording_ids_df = get_top_artists_recording_ids(
                similar_artists_df, user_name, user_id)
        except SQLException as err:
            current_app.logger.error(
                'Candidate sets could not be generated for "{}"\n{}'.format(
                    user_name, str(err)),
                exc_info=True)
            continue
        top_artists_candidate_set_df = top_artists_candidate_set_df.union(top_artists_recording_ids_df) \
            if top_artists_candidate_set_df else top_artists_recording_ids_df

        try:
            similar_artists_recording_ids_df = get_similar_artists_recording_ids(
                similar_artists_df, top_artists_df, user_name, user_id)
        except IndexError as err:
            current_app.logger.error(
                '{}\nGenerating recommendations for next user'.format(err))
            continue
        except SQLException as err:
            current_app.logger.error(
                'Candidate sets could not be generated for "{}"\n{}'.format(
                    user_name, str(err)),
                exc_info=True)
            continue
        similar_artists_candidate_set_df = similar_artists_candidate_set_df.union(similar_artists_recording_ids_df) \
            if similar_artists_candidate_set_df else similar_artists_recording_ids_df

        if SAVE_CANDIDATE_HTML:
            user_data[user_name]['artists'] = get_candidate_html_data(
                similar_artists_df, user_name)
            user_data[user_name]['time'] = '{:.2f}'.format(time() - ts)
        current_app.logger.info(
            'candidate_set generated for \"{}\"'.format(user_name))

    try:
        save_candidate_sets(top_artists_candidate_set_df,
                            similar_artists_candidate_set_df)
    except Py4JJavaError as err:
        current_app.logger.error('{}\nAborting...'.format(
            str(err.java_exception)),
                                 exc_info=True)
        sys.exit(-1)

    if SAVE_CANDIDATE_HTML:
        try:
            save_candidate_html(user_data)
        except SQLException as err:
            current_app.logger.error(
                'Could not save candidate HTML\n{}'.format(str(err)),
                exc_info=True)
            sys.exit(-1)
Example #17
def main(recommendation_top_artist_limit=None,
         recommendation_similar_artist_limit=None,
         users=None):

    try:
        listenbrainz_spark.init_spark_session('Recommendations')
    except SparkSessionNotInitializedException as err:
        logger.error(str(err), exc_info=True)
        raise

    try:
        recordings_df = utils.read_files_from_HDFS(
            path.RECOMMENDATION_RECORDINGS_DATAFRAME)
        top_artist_candidate_set_df = utils.read_files_from_HDFS(
            path.RECOMMENDATION_RECORDING_TOP_ARTIST_CANDIDATE_SET)
        similar_artist_candidate_set_df = utils.read_files_from_HDFS(
            path.RECOMMENDATION_RECORDING_SIMILAR_ARTIST_CANDIDATE_SET)
    except PathNotFoundException as err:
        logger.error(str(err), exc_info=True)
        raise
    except FileNotFetchedException as err:
        logger.error(str(err), exc_info=True)
        raise

    logger.info('Loading model...')
    model = load_model()

    # an action must be called to persist data in memory
    recordings_df.count()
    recordings_df.persist()

    params = RecommendationParams(recordings_df, model,
                                  top_artist_candidate_set_df,
                                  similar_artist_candidate_set_df,
                                  recommendation_top_artist_limit,
                                  recommendation_similar_artist_limit)

    try:
        # timestamp when the script was invoked
        ts_initial = time.monotonic()
        users_df = get_user_name_and_user_id(params, users)
        # Some users are excluded from the top_artist_candidate_set because of the limited data
        # in the mapping. Therefore, active_user_count may or may not be equal to number of users
        # active in the last week. Ideally, top_artist_candidate_set should give the active user count.
        active_user_count = users_df.count()
        users_df.persist()
        logger.info(
            'Took {:.2f}sec to get active user count'.format(time.monotonic() -
                                                             ts_initial))
    except EmptyDataframeExcpetion as err:
        logger.error(str(err), exc_info=True)
        raise

    logger.info('Generating recommendations...')
    ts = time.monotonic()
    top_artist_rec_df, similar_artist_rec_df = get_recommendations_for_all(
        params, users)
    logger.info('Recommendations generated!')
    logger.info(
        'Took {:.2f}sec to generate recommendations for all active users'.
        format(time.monotonic() - ts))

    ts = time.monotonic()
    top_artist_rec_user_count = get_user_count(top_artist_rec_df)
    similar_artist_rec_user_count = get_user_count(similar_artist_rec_df)
    logger.info(
        'Took {:.2f}sec to get top artist and similar artist user count'.
        format(time.monotonic() - ts))

    ts = time.monotonic()
    check_for_ratings_beyond_range(top_artist_rec_df, similar_artist_rec_df)

    top_artist_rec_scaled_df = scale_rating(top_artist_rec_df)
    similar_artist_rec_scaled_df = scale_rating(similar_artist_rec_df)
    logger.info('Took {:.2f}sec to scale the ratings'.format(time.monotonic() -
                                                             ts))

    ts = time.monotonic()
    top_artist_rec_mbid_df = get_recording_mbids(params,
                                                 top_artist_rec_scaled_df,
                                                 users_df)
    similar_artist_rec_mbid_df = get_recording_mbids(
        params, similar_artist_rec_scaled_df, users_df)
    logger.info(
        'Took {:.2f}sec to get mbids corresponding to recording ids'.format(
            time.monotonic() - ts))

    # persisted data must be cleared from memory after usage to avoid OOM
    recordings_df.unpersist()

    total_time = time.monotonic() - ts_initial
    logger.info('Total time: {:.2f}sec'.format(total_time))

    result = create_messages(top_artist_rec_mbid_df,
                             similar_artist_rec_mbid_df, active_user_count,
                             total_time, top_artist_rec_user_count,
                             similar_artist_rec_user_count)

    users_df.unpersist()

    return result
Example #18
def main(recommendation_generation_window=None,
         top_artist_limit=None,
         similar_artist_limit=None,
         users=None,
         html_flag=False):

    time_initial = time.monotonic()
    try:
        listenbrainz_spark.init_spark_session('Candidate_set')
    except SparkSessionNotInitializedException as err:
        logger.error(str(err), exc_info=True)
        raise

    try:
        mapped_listens_df = utils.read_files_from_HDFS(
            path.RECOMMENDATION_RECORDING_MAPPED_LISTENS)
        recordings_df = utils.read_files_from_HDFS(
            path.RECOMMENDATION_RECORDINGS_DATAFRAME)
        users_df = utils.read_files_from_HDFS(
            path.RECOMMENDATION_RECORDING_USERS_DATAFRAME)
        artist_relation_df = utils.read_files_from_HDFS(
            path.SIMILAR_ARTIST_DATAFRAME_PATH)
    except PathNotFoundException as err:
        logger.error(str(err), exc_info=True)
        raise
    except FileNotFetchedException as err:
        logger.error(str(err), exc_info=True)
        raise

    from_date, to_date = get_dates_to_generate_candidate_sets(
        mapped_listens_df, recommendation_generation_window)

    logger.info('Fetching listens to get top artists...')
    mapped_listens_subset = get_listens_to_fetch_top_artists(
        mapped_listens_df, from_date, to_date)

    logger.info('Fetching top artists...')
    top_artist_df = get_top_artists(mapped_listens_subset, top_artist_limit,
                                    users)

    logger.info('Preparing top artists candidate set...')
    top_artist_candidate_set_df, top_artist_candidate_set_df_html = get_top_artist_candidate_set(
        top_artist_df, recordings_df, users_df, mapped_listens_subset)

    logger.info('Fetching similar artists...')
    similar_artist_df, similar_artist_df_html = get_similar_artists(
        top_artist_df, artist_relation_df, similar_artist_limit)

    logger.info('Preparing similar artists candidate set...')
    similar_artist_candidate_set_df, similar_artist_candidate_set_df_html = get_similar_artist_candidate_set(
        similar_artist_df, recordings_df, users_df, mapped_listens_subset)

    logger.info('Saving candidate sets...')
    save_candidate_sets(top_artist_candidate_set_df,
                        similar_artist_candidate_set_df)
    logger.info('Done!')

    # time taken to generate candidate_sets
    total_time = '{:.2f}'.format((time.monotonic() - time_initial) / 60)
    if html_flag:
        user_data = get_candidate_html_data(
            similar_artist_candidate_set_df_html,
            top_artist_candidate_set_df_html, top_artist_df,
            similar_artist_df_html)

        logger.info('Saving HTML...')
        save_candidate_html(user_data, total_time, from_date, to_date)
        logger.info('Done!')

    message = [{
        'type': 'cf_recommendations_recording_candidate_sets',
        'candidate_sets_upload_time': str(datetime.utcnow()),
        'total_time': total_time,
        'from_date': str(from_date),
        'to_date': str(to_date)
    }]

    return message
Example #19
def main():
    ti = time()
    time_ = defaultdict(dict)
    try:
        listenbrainz_spark.init_spark_session('Train Models')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    # Add checkpoint dir to break and save RDD lineage.
    listenbrainz_spark.context.setCheckpointDir(config.HDFS_CLUSTER_URI +
                                                path.CHECKPOINT_DIR)

    try:
        playcounts_df = utils.read_files_from_HDFS(
            path.PLAYCOUNTS_DATAFRAME_PATH)
    except FileNotFetchedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    time_['load_playcounts'] = '{:.2f}'.format((time() - ti) / 60)

    t0 = time()
    training_data, validation_data, test_data = preprocess_data(playcounts_df)
    time_['preprocessing'] = '{:.2f}'.format((time() - t0) / 60)

    # Rdds that are used in model training iterative process are cached to improve performance.
    # Caching large files may cause Out of Memory exception.
    training_data.persist()
    validation_data.persist()

    # An action must be called for persist to evaluate.
    num_training = training_data.count()
    num_validation = validation_data.count()
    num_test = test_data.count()

    current_app.logger.info('Training models...')
    t0 = time()
    model, model_metadata, best_model_metadata = train(
        training_data, validation_data, num_validation, config.RANKS,
        config.LAMBDAS, config.ITERATIONS)
    models_training_time = '{:.2f}'.format((time() - t0) / 3600)

    try:
        best_model_test_rmse = compute_rmse(model.model, test_data, num_test)
    except Py4JJavaError as err:
        current_app.logger.error(
            'Root mean squared error for best model for test data not computed\n{}\nAborting...'
            .format(str(err.java_exception)),
            exc_info=True)
        sys.exit(-1)

    # Cached data must be cleared to avoid OOM.
    training_data.unpersist()
    validation_data.unpersist()

    current_app.logger.info('Saving model...')
    t0 = time()
    model_save_path = os.path.join(path.DATA_DIR,
                                   best_model_metadata['model_id'])
    save_model(model_save_path, best_model_metadata['model_id'], model)
    time_['save_model'] = '{:.2f}'.format((time() - t0) / 60)

    hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
    # Delete checkpoint dir as saved lineages would eat up space, we won't be using them anyway.
    try:
        utils.delete_dir(path.CHECKPOINT_DIR, recursive=True)
    except HDFSDirectoryNotDeletedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    if SAVE_TRAINING_HTML:
        save_training_html(time_, num_training, num_validation, num_test,
                           model_metadata, best_model_metadata, ti,
                           models_training_time)

    # Save best model id to a JSON file
    metadata_file_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        'recommendation-metadata.json')
    with open(metadata_file_path, 'r') as f:
        recommendation_metadata = json.load(f)
        recommendation_metadata['best_model_id'] = best_model_metadata[
            'model_id']
    with open(metadata_file_path, 'w') as f:
        json.dump(recommendation_metadata, f)
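
The setCheckpointDir call above enables RDD checkpointing, which breaks the long lineage built up by iterative ALS training so it does not have to be recomputed from scratch. A minimal sketch of the mechanism with an illustrative checkpoint path, unrelated to the ListenBrainz directory layout:

from pyspark import SparkContext

sc = SparkContext(appName='checkpoint-sketch')
sc.setCheckpointDir('hdfs://cluster/tmp/checkpoint')

rdd = sc.parallelize(range(1000))
for _ in range(10):
    rdd = rdd.map(lambda x: x + 1)   # lineage grows with every transformation
rdd.checkpoint()                     # mark for checkpointing; lineage is cut here
rdd.count()                          # an action materializes the checkpoint on disk
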
Example #20
def main():
    ti = time()
    try:
        listenbrainz_spark.init_spark_session('Candidate_set')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    try:
        mapped_df = utils.read_files_from_HDFS(path.MAPPED_LISTENS)
        recordings_df = utils.read_files_from_HDFS(
            path.RECORDINGS_DATAFRAME_PATH)
        users_df = utils.read_files_from_HDFS(path.USERS_DATAFRAME_PATH)
        artists_relation_df = utils.read_files_from_HDFS(
            path.SIMILAR_ARTIST_DATAFRAME_PATH)
    except PathNotFoundException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    except FileNotFetchedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    listens_df = get_listens_for_rec_generation_window(mapped_df)

    metadata_file_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        'recommendation-metadata.json')
    with open(metadata_file_path) as f:
        recommendation_metadata = json.load(f)
        user_names = recommendation_metadata['user_name']

    user_data = defaultdict(dict)
    similar_artists_candidate_set_df = None
    top_artists_candidate_set_df = None
    for user_name in user_names:
        ts = time()
        try:
            user_id = get_user_id(users_df, user_name)
        except IndexError:
            current_app.logger.error(
                '"{}" is a new or invalid user'.format(user_name))
            continue

        top_artists_df = get_top_artists(listens_df, user_name)

        top_artists_recording_ids_df = get_top_artists_recording_ids(
            top_artists_df, recordings_df, user_id)
        top_artists_candidate_set_df = top_artists_candidate_set_df.union(top_artists_recording_ids_df) \
            if top_artists_candidate_set_df else top_artists_recording_ids_df

        try:
            similar_artists_df = get_similar_artists(top_artists_df,
                                                     artists_relation_df,
                                                     user_name)
        except IndexError:
            continue

        similar_artists_recording_ids_df = get_similar_artists_recording_ids(
            similar_artists_df, recordings_df, user_id)
        similar_artists_candidate_set_df = similar_artists_candidate_set_df.union(similar_artists_recording_ids_df) \
            if similar_artists_candidate_set_df else similar_artists_recording_ids_df

        if SAVE_CANDIDATE_HTML:
            user_data[user_name]['artists'] = get_candidate_html_data(
                similar_artists_df, user_name)
            user_data[user_name]['time'] = '{:.2f}'.format(time() - ts)
        current_app.logger.info(
            'candidate_set generated for \"{}\"'.format(user_name))

    try:
        save_candidate_sets(top_artists_candidate_set_df,
                            similar_artists_candidate_set_df)
    except Py4JJavaError as err:
        current_app.logger.error('{}\nAborting...'.format(
            str(err.java_exception)),
                                 exc_info=True)
        sys.exit(-1)

    if SAVE_CANDIDATE_HTML:
        try:
            save_candidate_html(user_data)
        except SQLException as err:
            current_app.logger.error(
                'Could not save candidate HTML\n{}'.format(str(err)),
                exc_info=True)
            sys.exit(-1)
Example #21
def __init__(self):
    try:
        listenbrainz_spark.init_spark_session('uploader')
    except SparkSessionNotInitializedException as err:
        logger.error(str(err), exc_info=True)
        sys.exit(-1)
Example #22
def main(ranks=None, lambdas=None, iterations=None, alpha=None):

    if ranks is None:
        current_app.logger.critical('model param "ranks" missing')
        raise

    if lambdas is None:
        current_app.logger.critical('model param "lambdas" missing')
        raise

    if iterations is None:
        current_app.logger.critical('model param "iterations" missing')
        raise

    if alpha is None:
        current_app.logger.critical('model param "alpha" missing')
        raise

    ti = time.monotonic()
    time_ = defaultdict(dict)
    try:
        listenbrainz_spark.init_spark_session('Train Models')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise

    # Add checkpoint dir to break and save RDD lineage.
    listenbrainz_spark.context.setCheckpointDir(config.HDFS_CLUSTER_URI +
                                                path.CHECKPOINT_DIR)

    try:
        playcounts_df = utils.read_files_from_HDFS(
            path.PLAYCOUNTS_DATAFRAME_PATH)
        dataframe_metadata_df = utils.read_files_from_HDFS(
            path.DATAFRAME_METADATA)
    except PathNotFoundException as err:
        current_app.logger.error(
            '{}\nConsider running create_dataframes.py'.format(str(err)),
            exc_info=True)
        raise
    except FileNotFetchedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise

    time_['load_playcounts'] = '{:.2f}'.format((time.monotonic() - ti) / 60)

    t0 = time.monotonic()
    training_data, validation_data, test_data = preprocess_data(playcounts_df)
    time_['preprocessing'] = '{:.2f}'.format((time.monotonic() - t0) / 60)

    # An action must be called for persist to evaluate.
    num_training = training_data.count()
    num_validation = validation_data.count()
    num_test = test_data.count()

    t0 = time.monotonic()
    best_model, model_metadata = get_best_model(training_data, validation_data,
                                                num_validation, ranks, lambdas,
                                                iterations, alpha)
    models_training_time = '{:.2f}'.format((time.monotonic() - t0) / 3600)

    best_model_metadata = get_best_model_metadata(best_model)
    current_app.logger.info(
        "Calculating test RMSE for best model with model id: {}".format(
            best_model.model_id))
    best_model_metadata['test_rmse'] = compute_rmse(best_model.model,
                                                    test_data, num_test,
                                                    best_model.model_id)
    current_app.logger.info("Test RMSE calculated!")

    best_model_metadata['training_data_count'] = num_training
    best_model_metadata['validation_data_count'] = num_validation
    best_model_metadata['test_data_count'] = num_test
    best_model_metadata['dataframe_id'] = get_latest_dataframe_id(
        dataframe_metadata_df)

    hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
    t0 = time.monotonic()
    save_model(best_model.model_id, best_model.model)
    time_['save_model'] = '{:.2f}'.format((time.monotonic() - t0) / 60)

    save_model_metadata_to_hdfs(best_model_metadata)
    # Delete checkpoint dir as saved lineages would eat up space, we won't be using them anyway.
    try:
        utils.delete_dir(path.CHECKPOINT_DIR, recursive=True)
    except HDFSDirectoryNotDeletedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise

    if SAVE_TRAINING_HTML:
        current_app.logger.info('Saving HTML...')
        save_training_html(time_, num_training, num_validation, num_test,
                           model_metadata, best_model_metadata, ti,
                           models_training_time)
        current_app.logger.info('Done!')

    message = [{
        'type': 'cf_recording_model',
        'model_upload_time': str(datetime.utcnow()),
        'total_time': '{:.2f}'.format(time.monotonic() - ti),
    }]

    return message
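
A minimal sketch of the hyperparameter grid search that get_best_model() is assumed to perform, using pyspark.mllib ALS with implicit feedback. It assumes preprocess_data() returns RDDs of pyspark.mllib Rating objects and that validation RMSE drives model selection; Model, compute_validation_rmse and get_best_model_sketch are illustrative names, not the project's actual helpers, and the per-model metadata bookkeeping is omitted.

from collections import namedtuple
import itertools
import math

from pyspark.mllib.recommendation import ALS

Model = namedtuple('Model', 'model model_id')


def compute_validation_rmse(model, validation_data, num_validation):
    # Score every (user, product) pair in the validation set and compare the
    # predicted rating with the observed one.
    predictions = model.predictAll(
        validation_data.map(lambda r: (r.user, r.product))
    ).map(lambda r: ((r.user, r.product), r.rating))
    observed = validation_data.map(lambda r: ((r.user, r.product), r.rating))
    squared_error = observed.join(predictions) \
                            .map(lambda pair: (pair[1][0] - pair[1][1]) ** 2) \
                            .sum()
    return math.sqrt(squared_error / num_validation)


def get_best_model_sketch(training_data, validation_data, num_validation,
                          ranks, lambdas, iterations, alpha):
    best = None
    best_rmse = float('inf')
    for rank, lmbda, iteration in itertools.product(ranks, lambdas, iterations):
        # Train one implicit-feedback ALS model per hyperparameter combination.
        model = ALS.trainImplicit(training_data, rank, iterations=iteration,
                                  lambda_=lmbda, alpha=alpha)
        rmse = compute_validation_rmse(model, validation_data, num_validation)
        if rmse < best_rmse:
            best_rmse = rmse
            best = Model(model=model,
                         model_id='{}-{}-{}'.format(rank, lmbda, iteration))
    return best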
Example #23
def main():
    ti = time()
    # dict to save dataframe metadata which would be later merged in model_metadata dataframe.
    metadata = {}
    # "updated" should always be set to False in this script.
    metadata['updated'] = False
    try:
        listenbrainz_spark.init_spark_session('Create Dataframes')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    df = get_listens_for_training_model_window(metadata)

    if not df:
        current_app.logger.error(
            'Parquet files containing listening history of past {} days missing from HDFS'
            .format(config.TRAIN_MODEL_WINDOW))
        sys.exit(-1)

    current_app.logger.info('Registering Dataframe...')
    table = 'df_to_train_{}'.format(
        datetime.strftime(datetime.utcnow(), '%Y_%m_%d'))
    try:
        utils.register_dataframe(df, table)
    except ViewNotRegisteredException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    current_app.logger.info(
        'Files fetched from HDFS and dataframe registered in {}s'.format(
            '{:.2f}'.format(time() - ti)))

    current_app.logger.info('Preparing users data and saving to HDFS...')
    t0 = time()
    try:
        users_df = sql.prepare_user_data(table)
    except SQLException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    metadata['users_count'] = users_df.count()

    try:
        utils.save_parquet(users_df, path.USERS_DATAFRAME_PATH)
    except FileNotSavedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    users_df_time = '{:.2f}'.format((time() - t0) / 60)

    current_app.logger.info('Preparing recordings data and saving to HDFS...')
    t0 = time()
    try:
        recordings_df = sql.prepare_recording_data(table)
    except SQLException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    metadata['recordings_count'] = recordings_df.count()

    try:
        utils.save_parquet(recordings_df, path.RECORDINGS_DATAFRAME_PATH)
    except FileNotSavedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    recordings_df_time = '{:.2f}'.format((time() - t0) / 60)

    current_app.logger.info(
        'Preparing listen data dump and playcounts, saving playcounts to HDFS...'
    )
    t0 = time()
    try:
        listens_df = sql.prepare_listen_data(table)
    except SQLException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    metadata['listens_count'] = listens_df.count()

    try:
        utils.register_dataframe(listens_df, 'listen')
        utils.register_dataframe(users_df, 'user')
        utils.register_dataframe(recordings_df, 'recording')
    except ViewNotRegisteredException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    try:
        playcounts_df = sql.get_playcounts_data()
    except SQLException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    metadata['playcounts_count'] = playcounts_df.count()

    try:
        utils.save_parquet(playcounts_df, path.PLAYCOUNTS_DATAFRAME_PATH)
    except FileNotSavedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    playcounts_df_time = '{:.2f}'.format((time() - t0) / 60)
    total_time = '{:.2f}'.format((time() - ti) / 60)

    generate_best_model_id(metadata)
    save_dataframe_metadata_to_HDFS(metadata)

    if SAVE_DATAFRAME_HTML:
        save_dataframe_html(users_df_time, recordings_df_time,
                            playcounts_df_time, total_time)
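
Since this example only registers the 'listen', 'user' and 'recording' views before calling sql.get_playcounts_data(), here is a hedged sketch of the kind of aggregation that call presumably runs; the join columns (user_name, recording_msid) and the listenbrainz_spark.sql_context handle are assumptions based on the surrounding examples, not the project's actual query.

def get_playcounts_data_sketch():
    # Hypothetical playcounts aggregation: count how many times each user
    # listened to each recording, joining the temp views registered above.
    query = """
        SELECT user.user_id
             , recording.recording_id
             , count(*) AS count
          FROM listen
          JOIN user
            ON listen.user_name = user.user_name
          JOIN recording
            ON listen.recording_msid = recording.recording_msid
      GROUP BY user.user_id
             , recording.recording_id
    """
    return listenbrainz_spark.sql_context.sql(query)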
Example #24
def main(train_model_window, job_type, minimum_listens_threshold=0):

    if job_type == "recommendation_recording":
        paths = {
            "mapped_listens": path.RECOMMENDATION_RECORDING_MAPPED_LISTENS,
            "playcounts": path.RECOMMENDATION_RECORDING_PLAYCOUNTS_DATAFRAME,
            "recordings": path.RECOMMENDATION_RECORDINGS_DATAFRAME,
            "users": path.RECOMMENDATION_RECORDING_USERS_DATAFRAME,
            "metadata": path.RECOMMENDATION_RECORDING_DATAFRAME_METADATA,
            "prefix": "listenbrainz-dataframe-recording-recommendations"
        }
    elif job_type == "similar_users":
        paths = {
            "mapped_listens": path.USER_SIMILARITY_MAPPED_LISTENS,
            "playcounts": path.USER_SIMILARITY_PLAYCOUNTS_DATAFRAME,
            "recordings": path.USER_SIMILARITY_RECORDINGS_DATAFRAME,
            "users": path.USER_SIMILARITY_USERS_DATAFRAME,
            "metadata": path.USER_SIMILARITY_METADATA_DATAFRAME,
            "prefix": "listenbrainz-dataframe-user-similarity"
        }
    else:
        raise SparkException("Invalid job_type parameter received for creating dataframes: " + job_type)

    ti = time.monotonic()
    # dict to save dataframe metadata which would be later merged in model_metadata dataframe.
    metadata = {}
    # "updated" should always be set to False in this script.
    metadata['updated'] = False
    try:
        listenbrainz_spark.init_spark_session('Create Dataframes')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise

    current_app.logger.info('Fetching listens to create dataframes...')
    to_date, from_date = get_dates_to_train_data(train_model_window)

    metadata['to_date'] = to_date
    metadata['from_date'] = from_date

    partial_listens_df = get_listens_for_training_model_window(to_date, from_date, path.LISTENBRAINZ_DATA_DIRECTORY)
    current_app.logger.info('Listen count from {from_date} to {to_date}: {listens_count}'
                            .format(from_date=from_date, to_date=to_date, listens_count=partial_listens_df.count()))

    current_app.logger.info('Loading mapping from HDFS...')
    df = utils.read_files_from_HDFS(path.MBID_MSID_MAPPING)
    msid_mbid_mapping_df = mapping_utils.get_unique_rows_from_mapping(df)
    current_app.logger.info('Number of distinct rows in the mapping: {}'.format(msid_mbid_mapping_df.count()))

    current_app.logger.info('Mapping listens...')
    mapped_listens_df = get_mapped_artist_and_recording_mbids(partial_listens_df, msid_mbid_mapping_df,
                                                              paths["mapped_listens"])
    current_app.logger.info('Listen count after mapping: {}'.format(mapped_listens_df.count()))

    current_app.logger.info('Preparing users data and saving to HDFS...')
    users_df = get_users_dataframe(mapped_listens_df, metadata, paths["users"])

    current_app.logger.info('Preparing recordings data and saving to HDFS...')
    recordings_df = get_recordings_df(mapped_listens_df, metadata, paths["recordings"])

    current_app.logger.info('Preparing listen data dump and playcounts, saving playcounts to HDFS...')
    listens_df = get_listens_df(mapped_listens_df, metadata)

    save_playcounts_df(listens_df, recordings_df, users_df, minimum_listens_threshold, metadata, paths["playcounts"])

    metadata['dataframe_id'] = get_dataframe_id(paths["prefix"])
    save_dataframe_metadata_to_hdfs(metadata, paths["metadata"])

    current_app.logger.info('Preparing missing MusicBrainz data...')
    missing_musicbrainz_data_itr = get_data_missing_from_musicbrainz(partial_listens_df, msid_mbid_mapping_df)

    messages = prepare_messages(missing_musicbrainz_data_itr, from_date, to_date, ti)

    return messages
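
For reference, how this entry point might be invoked for the two supported job types; the 7-day window and the threshold of 5 listens are arbitrary illustrative values, not the project's configured defaults.

# Dataframes for recording recommendations; minimum_listens_threshold is
# forwarded to save_playcounts_df (values here are illustrative).
messages = main(train_model_window=7, job_type="recommendation_recording",
                minimum_listens_threshold=5)

# Dataframes for the user-similarity job over the same listen window.
messages = main(train_model_window=7, job_type="similar_users")

# Any other job_type fails fast with SparkException before Spark is initialized.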
Example #25
def main():
    ti = time()
    try:
        listenbrainz_spark.init_spark_session('Recommendations')
    except AttributeError as err:
        logging.error(
            'Cannot initialize Spark Session: {} \n {}. Aborting...'.format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error(
            'An error occurred while initializing Spark session: {} \n {}. Aborting...'
            .format(type(err).__name__, str(err)),
            exc_info=True)
        sys.exit(-1)

    try:
        path = os.path.join(config.HDFS_CLUSTER_URI, 'data', 'listenbrainz',
                            'recommendation-engine', 'dataframes')
        playcounts_df = listenbrainz_spark.sql_context.read.parquet(
            path + '/playcounts_df.parquet')
        users_df = listenbrainz_spark.sql_context.read.parquet(
            path + '/users_df.parquet')
        recordings_df = listenbrainz_spark.sql_context.read.parquet(
            path + '/recordings_df.parquet')
    except AnalysisException as err:
        logging.error('Cannot read parquet files from HDFS: {} \n {}'.format(
            type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error(
            'An error occurred while fetching parquet files: {} \n {}. Aborting...'.
            format(type(err).__name__, str(err)),
            exc_info=True)
        sys.exit(-1)

    time_info = defaultdict(dict)
    time_info['dataframes'] = '{:.2f}'.format((time() - ti) / 60)
    try:
        users_df.createOrReplaceTempView('user')
        playcounts_df.createOrReplaceTempView('playcount')
    except AnalysisException as err:
        logging.error(
            'Cannot register dataframes: {} \n {}. Aborting...'.format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error(
            'An error occurred while registering dataframes: {} \n {}. Aborting...'
            .format(type(err).__name__, str(err)),
            exc_info=True)
        sys.exit(-1)

    t0 = time()
    all_recordings = recordings_df.select('recording_id')
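    # Cache the candidate recording ids; recommend_user() below scores this same
    # set once per user, and it is unpersisted after the loop.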
    all_recordings.persist()
    all_recordings_count = '{:,}'.format(all_recordings.count())
    time_info['all_recordings'] = '{:.2f}'.format((time() - t0) / 60)

    path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                        'recommendation-metadata.json')
    with open(path, 'r') as f:
        recommendation_metadata = json.load(f)
        best_model_id = recommendation_metadata['best_model_id']

    best_model_path = os.path.join('/', 'data', 'listenbrainz',
                                   'recommendation-engine', 'best-model',
                                   '{}'.format(best_model_id))

    logging.info('Loading model...')
    try:
        t0 = time()
        model = load_model(config.HDFS_CLUSTER_URI + best_model_path)
        time_info['load_model'] = '{:.2f}'.format((time() - t0) / 60)
    except Py4JJavaError as err:
        logging.error('Unable to load model: {} \n {}. Aborting...'.format(
            type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error(
            'An error occurred while loading model: {} \n {}. Aborting...'.format(
                type(err).__name__, str(err)),
            exc_info=True)
        sys.exit(-1)

    path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                        'recommendation-metadata.json')
    ts = time()
    with open(path) as f:
        recommendation_metadata = json.load(f)
        recommendations = defaultdict(dict)
        for user_name in recommendation_metadata['user_name']:
            try:
                t0 = time()
                user_recommendations = recommend_user(user_name, model,
                                                      all_recordings,
                                                      recordings_df)
                user_recommendations['total-time'] = '{:.2f}'.format(
                    (time() - t0) / 60)
                logging.info(
                    'Recommendations for "{}" generated'.format(user_name))
                recommendations[user_name] = user_recommendations
            except TypeError as err:
                logging.error(
                    '{}: Invalid user name. User "{}" does not exist.'.format(
                        type(err).__name__, user_name))
            except Exception as err:
                logging.error(
                    'Recommendations for "{}" not generated.\n{}'.format(
                        user_name, str(err)),
                    exc_info=True)
    time_info['total_recommendation_time'] = '{:.2f}'.format(
        (time() - ts) / 3600)

    all_recordings.unpersist()

    date = datetime.utcnow().strftime('%Y-%m-%d')
    recommendation_html = 'Recommendation-{}-{}.html'.format(
        uuid.uuid4(), date)
    column = ('Track Name', 'Recording MSID', 'Artist Name', 'Artist MSID',
              'Release Name', 'Release MSID')
    context = {
        'recommendations': recommendations,
        'column': column,
        'total_time': '{:.2f}'.format((time() - ti) / 3600),
        'time': time_info,
        'best_model': best_model_id,
        'all_recordings_count': all_recordings_count,
    }
    utils.save_html(recommendation_html, context, 'recommend.html')
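
A minimal sketch of what recommend_user() might do with the loaded model: pair one user with every candidate recording, score the pairs with MatrixFactorizationModel.predictAll, and resolve the best-scored recording ids back to metadata for the HTML report. get_user_id() and the top-10 limit are illustrative assumptions, not the project's actual implementation.

def recommend_user_sketch(user_name, model, all_recordings, recordings_df, limit=10):
    user_id = get_user_id(user_name)  # hypothetical lookup against the 'user' view
    # Pair the user with every candidate recording id and let the model score them.
    candidates = all_recordings.rdd.map(lambda row: (user_id, row.recording_id))
    scored = model.predictAll(candidates) \
                  .sortBy(lambda rating: rating.rating, ascending=False) \
                  .take(limit)
    recommended_ids = [rating.product for rating in scored]
    # Resolve the recommended recording ids back to track/artist metadata.
    return recordings_df.filter(recordings_df.recording_id.isin(recommended_ids)) \
                        .collect()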