def save_training_html(time_, num_training, num_validation, num_test, model_metadata, best_model_metadata, ti, models_training_time):
    """ Render the model-training report and save it as an HTML file.

    Args:
        time_ (dict): Dictionary containing execution time information.
        num_training (int): Number of elements/rows in training_data.
        num_validation (int): Number of elements/rows in validation_data.
        num_test (int): Number of elements/rows in test_data.
        model_metadata (dict): Models information such as model id, error etc.
        best_model_metadata (dict): Best model information such as model id, error etc.
        ti (float): Value of the monotonic clock when the script was run.
        models_training_time (str): Time taken to train all the models.
    """
    # A UUID plus the current UTC date keeps report file names unique across runs.
    today = datetime.utcnow().strftime('%Y-%m-%d')
    report_name = 'Model-{}-{}.html'.format(uuid.uuid4(), today)

    # Overall wall-clock time since `ti`, reported in hours.
    elapsed_hours = '{:.2f}'.format((time.monotonic() - ti) / 3600)

    context = {
        'time': time_,
        'num_training': '{:,}'.format(num_training),
        'num_validation': '{:,}'.format(num_validation),
        'num_test': '{:,}'.format(num_test),
        'models': model_metadata,
        'best_model': best_model_metadata,
        'models_training_time': models_training_time,
        'total_time': elapsed_hours,
    }
    save_html(report_name, context, 'model.html')
def save_candidate_html(user_data, total_time, from_date, to_date):
    """ Save user candidate-set data to an HTML file.

    Args:
        user_data (dict): Top and similar artists associated to users.
        total_time (str): Time taken to generate candidate sets.
        from_date: Start of the listen window the candidate sets cover.
        to_date: End of the listen window the candidate sets cover.
    """
    # Unique report name: random UUID plus the current UTC date.
    today = datetime.utcnow().strftime('%Y-%m-%d')
    report_name = 'Candidate-{}-{}.html'.format(uuid.uuid4(), today)

    save_html(report_name, {
        'user_data': user_data,
        'total_time': total_time,
        'from_date': from_date,
        'to_date': to_date,
    }, 'candidate.html')
def save_dataframe_html(users_df_time, recordings_df_time, playcounts_df_time, total_time):
    """ Render the dataframe-generation report and save it as an HTML file.

    Args:
        users_df_time (str): Time taken to prepare and save users dataframe.
        recordings_df_time (str): Time taken to prepare and save recordings dataframe.
        playcounts_df_time (str): Time taken to prepare and save playcounts dataframe.
        total_time (str): Time taken to execute the script.
    """
    # Unique report name: random UUID plus the current UTC date.
    today = datetime.utcnow().strftime('%Y-%m-%d')
    report_name = 'Queries-{}-{}.html'.format(uuid.uuid4(), today)

    timings = {
        'users_df_time': users_df_time,
        'recordings_df_time': recordings_df_time,
        'playcounts_df_time': playcounts_df_time,
        'total_time': total_time,
    }
    save_html(report_name, timings, 'queries.html')
def save_candidate_html(user_data):
    """ Save user data to an HTML file.

    Args:
        user_data (dict): Dictionary can be depicted as:
            {
                'user_name 1': {
                    'artists': {
                        'top_artists 1': ['similar_artist 1', ..., 'similar_artist x'],
                        ...
                        'top_artists y': ['similar_artist 1', ..., 'similar_artist x'],
                    },
                    'time': 'xxx',
                },
            }
    """
    # Unique report name: random UUID plus the current UTC date.
    today = datetime.utcnow().strftime('%Y-%m-%d')
    report_name = 'Candidate-{}-{}.html'.format(uuid.uuid4(), today)

    save_html(report_name, {'user_data': user_data}, 'candidate.html')
def get_recommendation_html(recommendations, time_, best_model_id, ti):
    """ Render the recommendations report and save it as an HTML file.

    Args:
        recommendations (dict): Per-user recommendation payload, e.g.:
            {
                'user_name 1': {
                    'time': 'xx.xx',
                    'top_artists_recordings': [ (..7 fields..), ... ],
                    'similar_artists_recordings': [ (..7 fields..), ... ],
                }
            }
        time_ (dict): Execution time information, e.g. {'load_model': '3.09', ...}.
        best_model_id (str): Id of the model used for generating recommendations.
        ti (float): Seconds since epoch when the script was run.
    """
    # Unique report name: random UUID plus the current UTC date.
    today = datetime.utcnow().strftime('%Y-%m-%d')
    report_name = 'Recommendation-{}-{}.html'.format(uuid.uuid4(), today)

    # Table headers rendered by the template.
    # NOTE(review): the docstring example shows 7-element recording tuples but only
    # 6 column headers are listed here — confirm against the template/recommend_user.
    column = ('Track Name', 'Recording MSID', 'Artist Name', 'Artist MSID',
              'Release Name', 'Release MSID')

    context = {
        'recommendations': recommendations,
        'column': column,
        'total_time': '{:.2f}'.format((time() - ti) / 3600),
        'time': time_,
        'best_model': best_model_id,
    }
    save_html(report_name, context, 'recommend.html')
def main():
    """Create and persist the recommendation-engine dataframes.

    Reads monthly listen parquet dumps from HDFS, registers them as a temp
    view, builds the users / recordings / playcounts dataframes and writes
    them back to HDFS, then saves an HTML timing report.
    """
    # Script start time; used for the total-runtime figure in the report.
    ti = time()
    try:
        listenbrainz_spark.init_spark_session('Create_Dataframe')
    except AttributeError as err:
        logging.error(
            'Cannot initialize Spark Session: {} \n {}. Aborting...'.format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error(
            'An error occurred while initializing Spark session: {} \n {}. Aborting...'
            .format(type(err).__name__, str(err)), exc_info=True)
        sys.exit(-1)

    # Union of every monthly parquet that could be read; None until the first
    # month loads successfully.
    df = None
    # 'YYYY-MM' labels of months whose parquet was missing/unreadable; shown
    # in the final report.
    missing_parquets = []
    # NOTE(review): the nested ranges apply STARTING_MONTH..ENDING_MONTH to
    # *every* year. For a window crossing a year boundary (e.g. 2018-10 to
    # 2019-03) this reads the wrong months — or nothing at all, since
    # range(10, 4) is empty. Confirm the intended config semantics.
    for y in range(config.STARTING_YEAR, config.ENDING_YEAR + 1):
        for m in range(config.STARTING_MONTH, config.ENDING_MONTH + 1):
            try:
                month = listenbrainz_spark.sql_context.read.parquet(
                    '{}/data/listenbrainz/{}/{}.parquet'.format(
                        config.HDFS_CLUSTER_URI, y, m))
                # First month becomes the base dataframe; later months union in.
                df = df.union(month) if df else month
            except AnalysisException as err:
                # Missing month is non-fatal: record it and keep going.
                missing_parquets.append('{}-{}'.format(y, '{:02d}'.format(m)))
                logging.error(
                    'Cannot read parquet files from HDFS: {} \n {}'.format(
                        type(err).__name__, str(err)))
                continue
            except Exception as err:
                logging.error(
                    'An error occured while fetching \"/data/listenbrainz/{}/{}.parquet\": {} \n {}. Aborting...'
                    .format(y, m, type(err).__name__, str(err)), exc_info=True)
                sys.exit(-1)

    # No month loaded at all — nothing to build dataframes from.
    if not df:
        raise SystemExit("Parquet files from {}-{} to {}-{} are empty".format(
            config.STARTING_YEAR, '{:02d}'.format(config.STARTING_MONTH),
            config.ENDING_YEAR, '{:02d}'.format(config.ENDING_MONTH)))

    logging.info('Registering Dataframe...')
    # Date-stamped temp-view name used by the prepare_* query helpers below.
    table = 'df_to_train_{}'.format(
        datetime.strftime(datetime.utcnow(), '%Y_%m_%d'))
    try:
        df.createOrReplaceTempView(table)
    except AnalysisException as err:
        logging.error(
            'Cannot register dataframe: {} \n {}. Aborting...'.format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error(
            'An error occured while registering dataframe: {} \n {}. Aborting...'
            .format(type(err).__name__, str(err)), exc_info=True)
        sys.exit(-1)

    t = '{:.2f}'.format(time() - ti)
    # NOTE(review): "fectched" typo in this log message; left untouched here
    # since runtime strings must not change in a doc-only pass.
    logging.info(
        'Files fectched from HDFS and dataframe registered in {}s'.format(t))

    # All three derived dataframes land under this HDFS directory.
    dest_path = os.path.join(config.HDFS_CLUSTER_URI, 'data', 'listenbrainz',
                             'recommendation-engine', 'dataframes')

    logging.info('Preparing users data and saving to HDFS...')
    try:
        t0 = time()
        users_df = prepare_user_data(table)
        users_df.write.format('parquet').save(dest_path + '/users_df.parquet',
                                              mode='overwrite')
    except QueryExecutionException as err:
        logging.error(
            'Failed to execute users query: {} \n {}. Aborting...'.format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except AnalysisException as err:
        logging.error(
            'Failed to analyse users query plan: {} \n {}. Aborting...'.format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except ParseException as err:
        logging.error(
            "Failed to parse SQL command: {} \n {}. Aborting...".format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error(
            'An error occurred while executing users query: {} \n {}. Aborting'
            .format(type(err).__name__, str(err)), exc_info=True)
        sys.exit(-1)
    # Per-stage timings are minutes formatted to 2 decimal places.
    users_df_time = '{:.2f}'.format((time() - t0) / 60)

    logging.info('Preparing recordings data and saving to HDFS...')
    try:
        t0 = time()
        recordings_df = prepare_recording_data(table)
        recordings_df.write.format('parquet').save(dest_path +
                                                   '/recordings_df.parquet',
                                                   mode='overwrite')
    except QueryExecutionException as err:
        logging.error(
            'Failed to execute recordings query: {} \n {}. Aborting...'.format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except AnalysisException as err:
        logging.error(
            'Failed to analyse recordings query plan: {} \n {}. Aborting...'.
            format(type(err).__name__, str(err)))
        sys.exit(-1)
    except ParseException as err:
        logging.error(
            'Failed to parse SQL command: {} \n {}. Aborting...'.format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error(
            'An error occurred while executing recordings query: {} \n {}. Aborting...'
            .format(type(err).__name__, str(err)), exc_info=True)
        sys.exit(-1)
    recordings_df_time = '{:.2f}'.format((time() - t0) / 60)

    logging.info(
        'Preparing listen data dump and playcounts, saving playcounts to HDFS...'
    )
    try:
        t0 = time()
        listens_df = prepare_listen_data(table)
        # Playcounts join listens with the users and recordings built above.
        playcounts_df = get_playcounts_data(listens_df, users_df,
                                            recordings_df)
        playcounts_df.write.format('parquet').save(dest_path +
                                                   '/playcounts_df.parquet',
                                                   mode='overwrite')
    except QueryExecutionException as err:
        logging.error(
            'Failed to execute playcounts query: {} \n {}. Aborting...'.format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except AnalysisException as err:
        logging.error(
            'Failed to analyse playcounts query plan: {} \n {}. Aborting...'.
            format(type(err).__name__, str(err)))
        sys.exit(-1)
    except ParseException as err:
        logging.error(
            'Failed to parse SQL command: {} \n {}. Aborting...'.format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error('An error occurred. {} \n {}. Aborting...'.format(
            type(err).__name__, str(err)), exc_info=True)
        sys.exit(-1)
    playcounts_df_time = '{:.2f}'.format((time() - t0) / 60)

    total_time = '{:.2f}'.format((time() - ti) / 60)
    # ('YYYY-MM', 'YYYY-MM') window of the listen dump, for the report.
    lb_dump_time_window = ('{}-{}'.format(
        config.STARTING_YEAR, '{:02d}'.format(config.STARTING_MONTH)),
                           '{}-{}'.format(
                               config.ENDING_YEAR,
                               '{:02d}'.format(config.ENDING_MONTH)))
    date = datetime.utcnow().strftime('%Y-%m-%d')
    queries_html = 'Queries-{}-{}.html'.format(uuid.uuid4(), date)
    context = {
        'users_df_time': users_df_time,
        'recordings_df_time': recordings_df_time,
        'playcounts_df_time': playcounts_df_time,
        'lb_dump_time_window': lb_dump_time_window,
        'missing_parquets': missing_parquets,
        'total_time': total_time
    }
    utils.save_html(queries_html, context, 'queries.html')
def main():
    """Train recommendation models and persist the best one.

    Loads the playcounts dataframe from HDFS, preprocesses it into
    training/validation/test RDDs, trains ALS models over the configured
    hyper-parameter grid, saves the best model back to HDFS, writes an HTML
    training report, and records the best model id in
    recommendation-metadata.json for the recommendation script.

    Fix: `exc_info=True` was previously passed into `str.format(...)` (where
    extra keyword arguments are silently ignored) in the training and
    model-saving `except Exception` handlers, so tracebacks were never
    logged. It now goes to `logging.error(...)` as intended.
    """
    # Script start time; used for the total-runtime figure in the report.
    ti = time()
    try:
        listenbrainz_spark.init_spark_session('Train_Models')
    except AttributeError as err:
        logging.error(
            'Cannot initialize Spark Session: {} \n {}. Aborting...'.format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error(
            'An error occurred while initializing Spark session: {} \n {}. Aborting...'
            .format(type(err).__name__, str(err)), exc_info=True)
        sys.exit(-1)

    try:
        path = os.path.join('/', 'data', 'listenbrainz',
                            'recommendation-engine', 'dataframes',
                            'playcounts_df.parquet')
        playcounts_df = listenbrainz_spark.sql_context.read.parquet(
            config.HDFS_CLUSTER_URI + path)
    except AnalysisException as err:
        logging.error('Cannot read parquet file from HDFS: {} \n {}'.format(
            type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error(
            'An error occured while fetching parquet: {} \n {}. Aborting...'.
            format(type(err).__name__, str(err)), exc_info=True)
        sys.exit(-1)

    # Per-stage timings (minutes, except training/total in hours) for the report.
    time_info = {}
    time_info['load_playcounts'] = '{:.2f}'.format((time() - ti) / 60)

    t0 = time()
    training_data, validation_data, test_data = preprocess_data(playcounts_df)
    time_info['preprocessing'] = '{:.2f}'.format((time() - t0) / 60)

    # RDDs used in the iterative model-training process are cached to improve
    # performance. Caching large files may cause an Out of Memory exception.
    training_data.persist()
    validation_data.persist()

    num_training = training_data.count()
    num_validation = validation_data.count()
    num_test = test_data.count()

    logging.info('Training models...')
    try:
        t0 = time()
        model, model_metadata, best_model_metadata = train(
            training_data, validation_data, num_validation, config.RANKS,
            config.LAMBDAS, config.ITERATIONS)
        models_training_time = '{:.2f}'.format((time() - t0) / 3600)
    except Py4JJavaError as err:
        logging.error('Unable to train models: {} \n {}. Aborting...'.format(
            type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        # exc_info belongs to logging.error, not str.format.
        logging.error(
            'An error occured while training models: {} \n {}. Aborting...'.
            format(type(err).__name__, str(err)), exc_info=True)
        sys.exit(-1)

    # Training finished: release the cached RDDs.
    training_data.unpersist()
    validation_data.unpersist()

    logging.info('Saving model...')
    try:
        t0 = time()
        path = os.path.join('/', 'data', 'listenbrainz',
                            'recommendation-engine', 'best-model',
                            '{}'.format(best_model_metadata['model_id']))
        model.model.save(listenbrainz_spark.context,
                         config.HDFS_CLUSTER_URI + path)
        time_info['save_model'] = '{:.2f}'.format((time() - t0) / 60)
    except Py4JJavaError as err:
        logging.error("Unable to save model: {} \n {}. Aborting...".format(
            type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        # exc_info belongs to logging.error, not str.format.
        logging.error(
            'An error occured while saving model: {} \n {}. Aborting...'.
            format(type(err).__name__, str(err)), exc_info=True)
        sys.exit(-1)

    # Write the HTML training report.
    date = datetime.utcnow().strftime('%Y-%m-%d')
    model_html = 'Model-{}-{}.html'.format(uuid.uuid4(), date)
    context = {
        'time': time_info,
        'num_training': '{:,}'.format(num_training),
        'num_validation': '{:,}'.format(num_validation),
        'num_test': '{:,}'.format(num_test),
        'models': model_metadata,
        'best_model': best_model_metadata,
        'models_training_time': models_training_time,
        'total_time': '{:.2f}'.format((time() - ti) / 3600)
    }
    utils.save_html(model_html, context, 'model.html')

    # Record the best model id so the recommendation script can load it.
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                        'recommendation-metadata.json')
    with open(path, 'r') as f:
        recommendation_metadata = json.load(f)
        recommendation_metadata['best_model_id'] = best_model_metadata[
            'model_id']
    with open(path, 'w') as f:
        json.dump(recommendation_metadata, f)
def main():
    """Generate per-user recommendations with the current best model.

    Loads the playcounts/users/recordings dataframes from HDFS, loads the
    best model recorded in recommendation-metadata.json, generates
    recommendations for every configured user, and saves an HTML report.

    Fix: `exc_info=True` was previously passed into `str.format(...)` (where
    extra keyword arguments are silently ignored) in the model-loading
    `except Exception` handler, so the traceback was never logged. It now
    goes to `logging.error(...)` as intended.
    """
    # Script start time; used for the total-runtime figure in the report.
    ti = time()
    try:
        listenbrainz_spark.init_spark_session('Recommendations')
    except AttributeError as err:
        logging.error(
            'Cannot initialize Spark Session: {} \n {}. Aborting...'.format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error(
            'An error occurred while initializing Spark session: {} \n {}. Aborting...'
            .format(type(err).__name__, str(err)), exc_info=True)
        sys.exit(-1)

    try:
        path = os.path.join(config.HDFS_CLUSTER_URI, 'data', 'listenbrainz',
                            'recommendation-engine', 'dataframes')
        playcounts_df = listenbrainz_spark.sql_context.read.parquet(
            path + '/playcounts_df.parquet')
        users_df = listenbrainz_spark.sql_context.read.parquet(
            path + '/users_df.parquet')
        recordings_df = listenbrainz_spark.sql_context.read.parquet(
            path + '/recordings_df.parquet')
    except AnalysisException as err:
        logging.error('Cannot read parquet files from HDFS: {} \n {}'.format(
            type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error(
            'An error occured while fetching parquets: {} \n {}. Aborting...'.
            format(type(err).__name__, str(err)), exc_info=True)
        sys.exit(-1)

    # Per-stage timings for the report.
    # NOTE(review): values stored here are formatted strings, so the
    # defaultdict(dict) default factory looks unnecessary — but the template
    # may rely on missing keys resolving to {}, so it is kept as-is.
    time_info = defaultdict(dict)
    time_info['dataframes'] = '{:.2f}'.format((time() - ti) / 60)

    try:
        users_df.createOrReplaceTempView('user')
        playcounts_df.createOrReplaceTempView('playcount')
    except AnalysisException as err:
        logging.error(
            'Cannot register dataframes: {} \n {}. Aborting...'.format(
                type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        logging.error(
            'An error occured while registering dataframes: {} \n {}. Aborting...'
            .format(type(err).__name__, str(err)), exc_info=True)
        sys.exit(-1)

    # Cache the candidate recording ids: they are reused for every user.
    t0 = time()
    all_recordings = recordings_df.select('recording_id')
    all_recordings.persist()
    all_recordings_count = '{:,}'.format(all_recordings.count())
    time_info['all_recordings'] = '{:.2f}'.format((time() - t0) / 60)

    # Read the best model id recorded by the training script.
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                        'recommendation-metadata.json')
    with open(path, 'r') as f:
        recommendation_metadata = json.load(f)
        best_model_id = recommendation_metadata['best_model_id']
    best_model_path = os.path.join('/', 'data', 'listenbrainz',
                                   'recommendation-engine', 'best-model',
                                   '{}'.format(best_model_id))

    logging.info('Loading model...')
    try:
        t0 = time()
        model = load_model(config.HDFS_CLUSTER_URI + best_model_path)
        time_info['load_model'] = '{:.2f}'.format((time() - t0) / 60)
    except Py4JJavaError as err:
        logging.error('Unable to load model: {} \n {}. Aborting...'.format(
            type(err).__name__, str(err)))
        sys.exit(-1)
    except Exception as err:
        # exc_info belongs to logging.error, not str.format.
        logging.error(
            'An error occured while loading model: {} \n {}. Aborting...'.
            format(type(err).__name__, str(err)), exc_info=True)
        sys.exit(-1)

    # NOTE(review): the metadata file is re-read here even though it was just
    # loaded above; kept as-is to preserve behavior if the file changes
    # between the two reads.
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                        'recommendation-metadata.json')
    ts = time()
    with open(path) as f:
        recommendation_metadata = json.load(f)
    recommendations = defaultdict(dict)
    for user_name in recommendation_metadata['user_name']:
        try:
            t0 = time()
            user_recommendations = recommend_user(user_name, model,
                                                  all_recordings,
                                                  recordings_df)
            user_recommendations['total-time'] = '{:.2f}'.format(
                (time() - t0) / 60)
            logging.info(
                'Recommendations for "{}" generated'.format(user_name))
            recommendations[user_name] = user_recommendations
        except TypeError as err:
            # recommend_user signals an unknown user via TypeError; skip them.
            logging.error('{}: Invalid user name. User "{}" does not exist.'.format(
                type(err).__name__, user_name))
        except Exception as err:
            # One user failing must not abort the whole run.
            logging.error(
                'Recommendations for "{}" not generated.\n{}'.format(
                    user_name, str(err)), exc_info=True)
    time_info['total_recommendation_time'] = '{:.2f}'.format(
        (time() - ts) / 3600)

    all_recordings.unpersist()

    # Write the HTML recommendations report.
    date = datetime.utcnow().strftime('%Y-%m-%d')
    recommendation_html = 'Recommendation-{}-{}.html'.format(
        uuid.uuid4(), date)
    column = ('Track Name', 'Recording MSID', 'Artist Name', 'Artist MSID',
              'Release Name', 'Release MSID')
    context = {
        'recommendations': recommendations,
        'column': column,
        'total_time': '{:.2f}'.format((time() - ti) / 3600),
        'time': time_info,
        'best_model': best_model_id,
        'all_recordings_count': all_recordings_count,
    }
    utils.save_html(recommendation_html, context, 'recommend.html')