def __init__(self):
    """Connect to HDFS and start the 'uploader' Spark session.

    Exits the process with status -1 when the Spark session cannot be
    initialized, after logging the failure.
    """
    hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
    try:
        listenbrainz_spark.init_spark_session('uploader')
    except SparkSessionNotInitializedException as err:
        logger.error(str(err), exc_info=True)
        sys.exit(-1)
def setUpClass(cls):
    """Bootstrap a uniquely-named Spark test session, HDFS client and app context."""
    session_name = 'spark-test-run-{}'.format(str(uuid.uuid4()))
    listenbrainz_spark.init_test_session(session_name)
    hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
    cls.app = utils.create_app()
    cls.app_context = cls.app.app_context()
    cls.date = datetime(2019, 1, 21)
    cls.app_context.push()
def main(mlhd_dir):
    """Upload every ``.avro`` file found in *mlhd_dir* to the MLHD path in HDFS."""
    hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
    avro_files = [name for name in os.listdir(mlhd_dir) if name.endswith('.avro')]
    for mlhd_file in avro_files:
        print('Uploading ', mlhd_file)
        hdfs_connection.client.upload(
            hdfs_path=os.path.join(MLHD_DATA_PATH, mlhd_file),
            local_path=os.path.join(mlhd_dir, mlhd_file),
        )
        print('Done')
def init_dir(rm, recursive, create_dir):
    """ Create directories in HDFS to run the recommendation engine.

    Args:
        rm (bool): delete the recommendation parent and checkpoint directories
            (non-recursive; fails on non-empty directories).
        recursive (bool): delete those directories recursively, removing all
            recommendation data.
        create_dir (bool): create the dataframe, model, candidate-set and
            checkpoint directories.

    Exits the process with status -1 on any Spark or HDFS error.
    """
    try:
        listenbrainz_spark.init_spark_session('Manage Directories')
    except Py4JJavaError as err:
        logging.error('{}\n{}\nAborting...'.format(str(err), err.java_exception))
        sys.exit(-1)

    hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)

    if rm:
        try:
            utils.delete_dir(path.RECOMMENDATION_PARENT_DIR)
            utils.delete_dir(path.CHECKPOINT_DIR)
            logging.info('Successfully deleted directories.')
        except HdfsError as err:
            # Non-recursive delete fails on non-empty dirs; point the operator
            # at the --recursive flag instead of silently retrying.
            logging.error(
                '{}: Some/all directories are non-empty. Try "--recursive" to delete recursively.'
                .format(type(err).__name__))
            logging.warning(
                'Deleting directory recursively will delete all the recommendation data.'
            )
            sys.exit(-1)

    if recursive:
        try:
            utils.delete_dir(path.RECOMMENDATION_PARENT_DIR, recursive=True)
            utils.delete_dir(path.CHECKPOINT_DIR, recursive=True)
            logging.info('Successfully deleted directories recursively.')
        except HdfsError as err:
            logging.error(
                '{}: An error occurred while deleting directories recursively.\n{}\nAborting...'
                .format(type(err).__name__, str(err)))
            sys.exit(-1)

    if create_dir:
        try:
            logging.info('Creating directory to store dataframes...')
            utils.create_dir(path.DATAFRAME_DIR)

            logging.info('Creating directory to store models...')
            utils.create_dir(path.MODEL_DIR)

            logging.info('Creating directory to store candidate sets...')
            utils.create_dir(path.CANDIDATE_SET_DIR)

            logging.info('Creating directory to store RDD checkpoints...')
            utils.create_dir(path.CHECKPOINT_DIR)

            print('Done!')
        except HdfsError as err:
            # Fixed typo ("occured") and aligned wording with the rm-branch
            # message ("some/all" rather than "some/more").
            logging.error(
                '{}: An error occurred while creating some/all directories.\n{}\nAborting...'
                .format(type(err).__name__, str(err)))
            sys.exit(-1)
def get_result(self, request):
    """Resolve and execute the query named in *request*, returning its result.

    Returns None whenever the request is malformed, the query name is unknown,
    or the handler itself raises; every failure path is logged instead of
    propagating an exception to the consumer loop.
    """
    try:
        query = request['query']
        params = request.get('params', {})
    except Exception:
        logger.error('Bad query sent to spark request consumer: %s',
                     json.dumps(request), exc_info=True)
        return None

    logger.info('Query: %s', query)
    logger.info('Params: %s', str(params))

    try:
        query_handler = listenbrainz_spark.query_map.get_query_handler(query)
    except KeyError:
        logger.error("Bad query sent to spark request consumer: %s",
                     query, exc_info=True)
        return None
    except Exception as e:
        logger.error("Error while mapping query to function: %s",
                     str(e), exc_info=True)
        return None

    try:
        # initialize connection to HDFS, the request consumer is a long running process
        # so we try to create a connection everytime before executing a query to avoid
        # affecting subsequent queries in case there's an intermittent connection issue
        hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
        return query_handler(**params)
    except TypeError as e:
        logger.error(
            "TypeError in the query handler for query '%s', maybe bad params. Error: %s",
            query, str(e), exc_info=True)
        return None
    except Exception as e:
        logger.error("Error in the query handler for query '%s': %s",
                     query, str(e), exc_info=True)
        return None
def main(ranks=None, lambdas=None, iterations=None, alpha=None):
    """Train CF recommendation models, save the best one and return a message.

    Args:
        ranks: model ranks to try during training.
        lambdas: regularization parameters to try.
        iterations: iteration counts to try.
        alpha: confidence scaling parameter.

    Raises:
        ValueError: if any of the four model params is missing.

    Returns:
        A single-element list with a 'cf_recording_model' message dict.
    """
    # Validate all hyperparameters uniformly. The previous code logged the
    # missing "ranks" param but never raised (execution continued with
    # ranks=None), and the other branches used a bare `raise` outside an
    # except block, which surfaces as a confusing
    # "RuntimeError: No active exception to re-raise".
    model_params = {
        'ranks': ranks,
        'lambdas': lambdas,
        'iterations': iterations,
        'alpha': alpha,
    }
    for param_name, param_value in model_params.items():
        if param_value is None:
            current_app.logger.critical('model param "{}" missing'.format(param_name))
            raise ValueError('model param "{}" missing'.format(param_name))

    ti = time.monotonic()
    time_ = defaultdict(dict)

    try:
        listenbrainz_spark.init_spark_session('Train Models')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise

    # Add checkpoint dir to break and save RDD lineage.
    listenbrainz_spark.context.setCheckpointDir(config.HDFS_CLUSTER_URI + path.CHECKPOINT_DIR)

    try:
        playcounts_df = utils.read_files_from_HDFS(path.PLAYCOUNTS_DATAFRAME_PATH)
        dataframe_metadata_df = utils.read_files_from_HDFS(path.DATAFRAME_METADATA)
    except PathNotFoundException as err:
        current_app.logger.error(
            '{}\nConsider running create_dataframes.py'.format(str(err)), exc_info=True)
        raise
    except FileNotFetchedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise

    time_['load_playcounts'] = '{:.2f}'.format((time.monotonic() - ti) / 60)

    t0 = time.monotonic()
    training_data, validation_data, test_data = preprocess_data(playcounts_df)
    time_['preprocessing'] = '{:.2f}'.format((time.monotonic() - t0) / 60)

    # An action must be called for persist to evaluate.
    num_training = training_data.count()
    num_validation = validation_data.count()
    num_test = test_data.count()

    t0 = time.monotonic()
    best_model, model_metadata = get_best_model(training_data, validation_data,
                                                num_validation, ranks, lambdas,
                                                iterations, alpha)
    models_training_time = '{:.2f}'.format((time.monotonic() - t0) / 3600)

    best_model_metadata = get_best_model_metadata(best_model)
    current_app.logger.info(
        "Calculating test RMSE for best model with model id: {}".format(best_model.model_id))
    best_model_metadata['test_rmse'] = compute_rmse(best_model.model, test_data,
                                                    num_test, best_model.model_id)
    current_app.logger.info("Test RMSE calculated!")

    best_model_metadata['training_data_count'] = num_training
    best_model_metadata['validation_data_count'] = num_validation
    best_model_metadata['test_data_count'] = num_test
    best_model_metadata['dataframe_id'] = get_latest_dataframe_id(dataframe_metadata_df)

    hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
    t0 = time.monotonic()
    save_model(best_model.model_id, best_model.model)
    time_['save_model'] = '{:.2f}'.format((time.monotonic() - t0) / 60)

    save_model_metadata_to_hdfs(best_model_metadata)

    # Delete checkpoint dir as saved lineages would eat up space, we won't be using them anyway.
    try:
        utils.delete_dir(path.CHECKPOINT_DIR, recursive=True)
    except HDFSDirectoryNotDeletedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise

    if SAVE_TRAINING_HTML:
        current_app.logger.info('Saving HTML...')
        save_training_html(time_, num_training, num_validation, num_test,
                           model_metadata, best_model_metadata, ti, models_training_time)
        current_app.logger.info('Done!')

    message = [{
        'type': 'cf_recording_model',
        'model_upload_time': str(datetime.utcnow()),
        'total_time': '{:.2f}'.format(time.monotonic() - ti),
    }]

    return message
def setUpClass(cls):
    """Create a throwaway Spark test session and an HDFS connection."""
    session_name = 'spark-test-run-{}'.format(str(uuid.uuid4()))
    listenbrainz_spark.init_test_session(session_name)
    hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
    cls.date = datetime(2019, 1, 21)
def main():
    """Train recommendation models with config-defined hyperparameters, save the
    best model, and record its id in recommendation-metadata.json.

    Exits the process with status -1 on Spark/HDFS errors at any stage.
    """
    ti = time()
    time_ = defaultdict(dict)
    try:
        listenbrainz_spark.init_spark_session('Train Models')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    # Add checkpoint dir to break and save RDD lineage.
    listenbrainz_spark.context.setCheckpointDir(config.HDFS_CLUSTER_URI + path.CHECKPOINT_DIR)

    try:
        playcounts_df = utils.read_files_from_HDFS(path.PLAYCOUNTS_DATAFRAME_PATH)
    except FileNotFetchedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    # Elapsed times are recorded in minutes (hours for training) as strings.
    time_['load_playcounts'] = '{:.2f}'.format((time() - ti) / 60)

    t0 = time()
    training_data, validation_data, test_data = preprocess_data(playcounts_df)
    time_['preprocessing'] = '{:.2f}'.format((time() - t0) / 60)

    # Rdds that are used in model training iterative process are cached to improve performance.
    # Caching large files may cause Out of Memory exception.
    training_data.persist()
    validation_data.persist()

    # An action must be called for persist to evaluate.
    num_training = training_data.count()
    num_validation = validation_data.count()
    num_test = test_data.count()

    current_app.logger.info('Training models...')
    t0 = time()
    model, model_metadata, best_model_metadata = train(training_data, validation_data,
        num_validation, config.RANKS, config.LAMBDAS, config.ITERATIONS)
    time_['models_training_time'] = models_training_time = '{:.2f}'.format((time() - t0) / 3600) if False else '{:.2f}'.format((time() - t0) / 3600)
    try:
        # NOTE(review): best_model_test_rmse is assigned but never read below —
        # verify whether the test RMSE should be stored in best_model_metadata.
        best_model_test_rmse = compute_rmse(model.model, test_data, num_test)
    except Py4JJavaError as err:
        current_app.logger.error(
            'Root mean squared error for best model for test data not computed\n{}\nAborting...'
            .format(str(err.java_exception)), exc_info=True)
        sys.exit(-1)

    # Cached data must be cleared to avoid OOM.
    training_data.unpersist()
    validation_data.unpersist()

    current_app.logger.info('Saving model...')
    t0 = time()
    model_save_path = os.path.join(path.DATA_DIR, best_model_metadata['model_id'])
    save_model(model_save_path, best_model_metadata['model_id'], model)
    time_['save_model'] = '{:.2f}'.format((time() - t0) / 60)

    hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
    # Delete checkpoint dir as saved lineages would eat up space, we won't be using them anyway.
    try:
        utils.delete_dir(path.CHECKPOINT_DIR, recursive=True)
    except HDFSDirectoryNotDeletedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    if SAVE_TRAINING_HTML:
        save_training_html(time_, num_training, num_validation, num_test, model_metadata,
            best_model_metadata, ti, models_training_time)

    # Save best model id to a JSON file
    metadata_file_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'recommendation-metadata.json')
    with open(metadata_file_path, 'r') as f:
        recommendation_metadata = json.load(f)
        recommendation_metadata['best_model_id'] = best_model_metadata['model_id']
    with open(metadata_file_path, 'w') as f:
        json.dump(recommendation_metadata, f)
def setUpClass(cls) -> None:
    """Start a Spark test session, an HDFS client and the data uploader."""
    session_name = 'spark-test-run-{}'.format(uuid.uuid4())
    listenbrainz_spark.init_test_session(session_name)
    hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
    cls.uploader = ListenbrainzDataUploader()
def main(app_name, archive):
    """Initialise Spark and HDFS, then copy the extracted dump *archive* into HDFS."""
    listenbrainz_spark.init_spark_session(app_name)
    hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)

    print('Copying extracted dump to HDFS...')
    copy_to_hdfs(archive)
    print('Done!')