Example #1
 def __init__(self):
     hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
     try:
         listenbrainz_spark.init_spark_session('uploader')
     except SparkSessionNotInitializedException as err:
         logger.error(str(err), exc_info=True)
         sys.exit(-1)
Example #2
 def setUpClass(cls):
     listenbrainz_spark.init_test_session('spark-test-run-{}'.format(str(uuid.uuid4())))
     hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
     cls.app = utils.create_app()
     cls.app_context = cls.app.app_context()
     cls.date = datetime(2019, 1, 21)
     cls.app_context.push()
Example #3
def main(mlhd_dir):
    hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
    for mlhd_file in os.listdir(mlhd_dir):
        if mlhd_file.endswith('.avro'):
            print('Uploading ', mlhd_file)
            hdfs_connection.client.upload(
                hdfs_path=os.path.join(MLHD_DATA_PATH, mlhd_file),
                local_path=os.path.join(mlhd_dir, mlhd_file))
            print('Done')
Example #4
def init_dir(rm, recursive, create_dir):
    """ Create directories in HDFS to run the recommendation engine.
    """
    try:
        listenbrainz_spark.init_spark_session('Manage Directories')
    except Py4JJavaError as err:
        logging.error('{}\n{}\nAborting...'.format(str(err),
                                                   err.java_exception))
        sys.exit(-1)

    hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
    if rm:
        try:
            utils.delete_dir(path.RECOMMENDATION_PARENT_DIR)
            utils.delete_dir(path.CHECKPOINT_DIR)
            logging.info('Successfully deleted directories.')
        except HdfsError as err:
            logging.error(
                '{}: Some/all directories are non-empty. Try "--recursive" to delete recursively.'
                .format(type(err).__name__))
            logging.warning(
                'Deleting directory recursively will delete all the recommendation data.'
            )
            sys.exit(-1)

    if recursive:
        try:
            utils.delete_dir(path.RECOMMENDATION_PARENT_DIR, recursive=True)
            utils.delete_dir(path.CHECKPOINT_DIR, recursive=True)
            logging.info('Successfully deleted directories recursively.')
        except HdfsError as err:
            logging.error(
                '{}: An error occurred while deleting directories recursively.\n{}\nAborting...'
                .format(type(err).__name__, str(err)))
            sys.exit(-1)

    if create_dir:
        try:
            logging.info('Creating directory to store dataframes...')
            utils.create_dir(path.DATAFRAME_DIR)

            logging.info('Creating directory to store models...')
            utils.create_dir(path.MODEL_DIR)

            logging.info('Creating directory to store candidate sets...')
            utils.create_dir(path.CANDIDATE_SET_DIR)

            logging.info('Creating directory to store RDD checkpoints...')
            utils.create_dir(path.CHECKPOINT_DIR)

            print('Done!')
        except HdfsError as err:
            logging.error(
                '{}: An error occurred while creating some/all directories.\n{}\nAborting...'
                .format(type(err).__name__, str(err)))
            sys.exit(-1)
Example #5
    def get_result(self, request):
        try:
            query = request['query']
            params = request.get('params', {})
        except Exception:
            logger.error('Bad query sent to spark request consumer: %s',
                         json.dumps(request),
                         exc_info=True)
            return None

        logger.info('Query: %s', query)
        logger.info('Params: %s', str(params))

        try:
            query_handler = listenbrainz_spark.query_map.get_query_handler(
                query)
        except KeyError:
            logger.error("Bad query sent to spark request consumer: %s",
                         query,
                         exc_info=True)
            return None
        except Exception as e:
            logger.error("Error while mapping query to function: %s",
                         str(e),
                         exc_info=True)
            return None

        try:
            # initialize connection to HDFS; the request consumer is a long-running process,
            # so we create the connection every time before executing a query to avoid
            # affecting subsequent queries in case there's an intermittent connection issue
            hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
            return query_handler(**params)
        except TypeError as e:
            logger.error(
                "TypeError in the query handler for query '%s', maybe bad params. Error: %s",
                query,
                str(e),
                exc_info=True)
            return None
        except Exception as e:
            logger.error("Error in the query handler for query '%s': %s",
                         query,
                         str(e),
                         exc_info=True)
            return None
Example #6
def main(ranks=None, lambdas=None, iterations=None, alpha=None):

    if ranks is None:
        current_app.logger.critical('model param "ranks" missing')
        raise ValueError('model param "ranks" missing')

    if lambdas is None:
        current_app.logger.critical('model param "lambdas" missing')
        raise ValueError('model param "lambdas" missing')

    if iterations is None:
        current_app.logger.critical('model param "iterations" missing')
        raise ValueError('model param "iterations" missing')

    if alpha is None:
        current_app.logger.critical('model param "alpha" missing')
        raise ValueError('model param "alpha" missing')

    ti = time.monotonic()
    time_ = defaultdict(dict)
    try:
        listenbrainz_spark.init_spark_session('Train Models')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise

    # Add checkpoint dir to break and save RDD lineage.
    listenbrainz_spark.context.setCheckpointDir(config.HDFS_CLUSTER_URI +
                                                path.CHECKPOINT_DIR)

    try:
        playcounts_df = utils.read_files_from_HDFS(
            path.PLAYCOUNTS_DATAFRAME_PATH)
        dataframe_metadata_df = utils.read_files_from_HDFS(
            path.DATAFRAME_METADATA)
    except PathNotFoundException as err:
        current_app.logger.error(
            '{}\nConsider running create_dataframes.py'.format(str(err)),
            exc_info=True)
        raise
    except FileNotFetchedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise

    time_['load_playcounts'] = '{:.2f}'.format((time.monotonic() - ti) / 60)

    t0 = time.monotonic()
    training_data, validation_data, test_data = preprocess_data(playcounts_df)
    time_['preprocessing'] = '{:.2f}'.format((time.monotonic() - t0) / 60)

    # An action must be called for persist() to take effect.
    num_training = training_data.count()
    num_validation = validation_data.count()
    num_test = test_data.count()

    t0 = time.monotonic()
    best_model, model_metadata = get_best_model(training_data, validation_data,
                                                num_validation, ranks, lambdas,
                                                iterations, alpha)
    models_training_time = '{:.2f}'.format((time.monotonic() - t0) / 3600)

    best_model_metadata = get_best_model_metadata(best_model)
    current_app.logger.info(
        "Calculating test RMSE for best model with model id: {}".format(
            best_model.model_id))
    best_model_metadata['test_rmse'] = compute_rmse(best_model.model,
                                                    test_data, num_test,
                                                    best_model.model_id)
    current_app.logger.info("Test RMSE calculated!")

    best_model_metadata['training_data_count'] = num_training
    best_model_metadata['validation_data_count'] = num_validation
    best_model_metadata['test_data_count'] = num_test
    best_model_metadata['dataframe_id'] = get_latest_dataframe_id(
        dataframe_metadata_df)

    hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
    t0 = time.monotonic()
    save_model(best_model.model_id, best_model.model)
    time_['save_model'] = '{:.2f}'.format((time.monotonic() - t0) / 60)

    save_model_metadata_to_hdfs(best_model_metadata)
    # Delete the checkpoint dir as saved lineages would eat up space; we won't be using them anyway.
    try:
        utils.delete_dir(path.CHECKPOINT_DIR, recursive=True)
    except HDFSDirectoryNotDeletedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise

    if SAVE_TRAINING_HTML:
        current_app.logger.info('Saving HTML...')
        save_training_html(time_, num_training, num_validation, num_test,
                           model_metadata, best_model_metadata, ti,
                           models_training_time)
        current_app.logger.info('Done!')

    message = [{
        'type': 'cf_recording_model',
        'model_upload_time': str(datetime.utcnow()),
        'total_time': '{:.2f}'.format(time.monotonic() - ti),
    }]

    return message
Example #7
 def setUpClass(cls):
     listenbrainz_spark.init_test_session('spark-test-run-{}'.format(
         str(uuid.uuid4())))
     hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
     cls.date = datetime(2019, 1, 21)
Example #8
def main():
    ti = time()
    time_ = defaultdict(dict)
    try:
        listenbrainz_spark.init_spark_session('Train Models')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    # Add checkpoint dir to break and save RDD lineage.
    listenbrainz_spark.context.setCheckpointDir(config.HDFS_CLUSTER_URI +
                                                path.CHECKPOINT_DIR)

    try:
        playcounts_df = utils.read_files_from_HDFS(
            path.PLAYCOUNTS_DATAFRAME_PATH)
    except FileNotFetchedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    time_['load_playcounts'] = '{:.2f}'.format((time() - ti) / 60)

    t0 = time()
    training_data, validation_data, test_data = preprocess_data(playcounts_df)
    time_['preprocessing'] = '{:.2f}'.format((time() - t0) / 60)

    # RDDs used in the iterative model training process are cached to improve performance.
    # Caching large datasets may cause an Out of Memory exception.
    training_data.persist()
    validation_data.persist()

    # An action must be called for persist() to take effect.
    num_training = training_data.count()
    num_validation = validation_data.count()
    num_test = test_data.count()

    current_app.logger.info('Training models...')
    t0 = time()
    model, model_metadata, best_model_metadata = train(
        training_data, validation_data, num_validation, config.RANKS,
        config.LAMBDAS, config.ITERATIONS)
    models_training_time = '{:.2f}'.format((time() - t0) / 3600)

    try:
        best_model_test_rmse = compute_rmse(model.model, test_data, num_test)
    except Py4JJavaError as err:
        current_app.logger.error(
            'Could not compute root mean squared error of the best model on the test data\n{}\nAborting...'
            .format(str(err.java_exception)),
            exc_info=True)
        sys.exit(-1)

    # Cached data must be cleared to avoid OOM.
    training_data.unpersist()
    validation_data.unpersist()

    current_app.logger.info('Saving model...')
    t0 = time()
    model_save_path = os.path.join(path.DATA_DIR,
                                   best_model_metadata['model_id'])
    save_model(model_save_path, best_model_metadata['model_id'], model)
    time_['save_model'] = '{:.2f}'.format((time() - t0) / 60)

    hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
    # Delete the checkpoint dir as saved lineages would eat up space; we won't be using them anyway.
    try:
        utils.delete_dir(path.CHECKPOINT_DIR, recursive=True)
    except HDFSDirectoryNotDeletedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    if SAVE_TRAINING_HTML:
        save_training_html(time_, num_training, num_validation, num_test,
                           model_metadata, best_model_metadata, ti,
                           models_training_time)

    # Save best model id to a JSON file
    metadata_file_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        'recommendation-metadata.json')
    with open(metadata_file_path, 'r') as f:
        recommendation_metadata = json.load(f)
        recommendation_metadata['best_model_id'] = best_model_metadata[
            'model_id']
    with open(metadata_file_path, 'w') as f:
        json.dump(recommendation_metadata, f)
Example #9
 def setUpClass(cls) -> None:
     listenbrainz_spark.init_test_session(f"spark-test-run-{uuid.uuid4()}")
     hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
     cls.uploader = ListenbrainzDataUploader()
Example #10
def main(app_name, archive):
    listenbrainz_spark.init_spark_session(app_name)
    hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
    print('Copying extracted dump to HDFS...')
    copy_to_hdfs(archive)
    print('Done!')
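
Taken together, the examples follow one recurring pattern: initialize a Spark session, call hdfs_connection.init_hdfs(config.HDFS_HTTP_URI) to set up the HDFS client, and then read from or write to HDFS through hdfs_connection.client. The sketch below distills that pattern into a single helper. It is illustrative only: the function name upload_file, the app name 'upload-example', and the import paths are assumptions inferred from the surrounding examples rather than code taken from them.

import sys
import logging

import listenbrainz_spark
from listenbrainz_spark import config, hdfs_connection
from listenbrainz_spark.exceptions import SparkSessionNotInitializedException

logger = logging.getLogger(__name__)


def upload_file(local_path, hdfs_path):
    """ Illustrative sketch: initialize Spark and HDFS, then upload one file.
        Module paths in the imports are assumed from the examples above. """
    try:
        listenbrainz_spark.init_spark_session('upload-example')
    except SparkSessionNotInitializedException as err:
        logger.error(str(err), exc_info=True)
        sys.exit(-1)

    # Initialize the HDFS connection right before use, as the examples above do.
    hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
    hdfs_connection.client.upload(hdfs_path=hdfs_path, local_path=local_path)
    print('Done!')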