Example no. 1
    def tearDown(self):
        """ Delete the parquet file stored to ensure that the tests are independant. """
        path_found = path_exists(self.path_)
        if path_found:
            delete_dir(self.path_, recursive=True)

        return super().tearDown()
Example no. 2
def delete_model():
    """ Delete model.
        Note: At any point in time, only one model is in HDFS
    """
    dir_exists = utils.path_exists(path.DATA_DIR)
    if dir_exists:
        utils.delete_dir(path.DATA_DIR, recursive=True)
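The utils.path_exists and utils.delete_dir helpers used throughout these examples are not shown here. The sketch below is a minimal, assumed implementation on top of the HdfsCLI client library; the WebHDFS URI and the wrapper names are illustrative, taken from the calling code rather than from the project's actual utils module (the examples themselves initialise their client via hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)).

from hdfs import InsecureClient

# Assumed WebHDFS endpoint; adjust to your cluster.
client = InsecureClient('http://localhost:9870')

def path_exists(path):
    """ Return True if the given HDFS path exists. """
    # status() with strict=False returns None instead of raising when the path is missing.
    return client.status(path, strict=False) is not None

def delete_dir(path, recursive=False):
    """ Delete an HDFS directory; non-empty directories require recursive=True. """
    return client.delete(path, recursive=recursive)

def create_dir(path):
    """ Create an HDFS directory, including any missing parents. """
    client.makedirs(path)

def rename(source, destination):
    """ Move or rename an HDFS path. """
    client.rename(source, destination)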
Example no. 3
    def test_get_latest_full_dump_file_missing(self):
        """ Test to ensure 'None' is returned if metadata file is missing. """
        path_found = path_exists(self.path_)
        if path_found:
            delete_dir(self.path_, recursive=True)

        self.assertIsNone(import_utils.get_latest_full_dump())
Example no. 4
    def test_search_dump_file_missing(self):
        """ Test to ensure 'False' is returned if metadata file is missing. """
        path_found = path_exists(self.path_)
        if path_found:
            delete_dir(self.path_, recursive=True)

        self.assertFalse(
            import_utils.search_dump(1, "full", datetime.fromtimestamp(1)))
Example no. 5
    def test_path_exists(self):
        path_ = '/tests/test'
        utils.create_dir(path_)

        status = utils.path_exists(path_)
        self.assertTrue(status)
        utils.delete_dir(path_)
        status = utils.path_exists(path_)
        self.assertFalse(status)
Example no. 6
    def test_insert_dump_data_file_missing(self):
        """ Test to ensure a file is created if it is missing. """
        path_found = path_exists(self.path_)
        if path_found:
            delete_dir(self.path_, recursive=True)

        self.assertFalse(import_utils.search_dump(1, "full", datetime.fromtimestamp(1)))
        import_utils.insert_dump_data(1, "full", datetime.fromtimestamp(1))
        self.assertTrue(import_utils.search_dump(1, "full", datetime.fromtimestamp(1)))
Example no. 7
    def test_rename(self):
        utils.create_dir(self.path_)
        test_exists = utils.path_exists(self.path_)
        self.assertTrue(test_exists)
        utils.rename(self.path_, self.temp_path_)
        test_exists = utils.path_exists(self.path_)
        self.assertFalse(test_exists)
        temp_exists = utils.path_exists(self.temp_path_)
        self.assertTrue(temp_exists)
        utils.delete_dir(self.temp_path_)
Example no. 8
    def test_get_latest_full_dump_no_full(self):
        """ Test to ensure 'None' is returned if not full import has been made. """
        # Remove full dump entries from parquet
        import_meta_df = read_files_from_HDFS(self.path_)
        result = import_meta_df.filter(import_meta_df.dump_type != "full")

        # We have to save the dataframe as a different file and move it as the df itself is read from the file
        save_parquet(result, '/temp.parquet')
        delete_dir(self.path_, recursive=True)
        rename('/temp.parquet', self.path_)

        self.assertIsNone(import_utils.get_latest_full_dump())
Example no. 9
    def test_upload_archive(self):
        archive_path = self.create_test_tar()
        pxz = self.uploader.get_pxz_output(archive_path)
        tmp_dump_dir = tempfile.mkdtemp()

        with tarfile.open(fileobj=pxz.stdout, mode='r|') as tar:
            self.uploader.upload_archive(tmp_dump_dir, tar, '/artist_relations.parquet',
                                         schema.artist_relation_schema, self.uploader.process_json)

        df = utils.read_files_from_HDFS('/artist_relations.parquet')
        self.assertEqual(df.count(), 1)

        status = utils.path_exists(tmp_dump_dir)
        self.assertFalse(status)

        utils.delete_dir('/artist_relations.parquet', recursive=True)
Example no. 10
def init_dir(rm, recursive, create_dir):
    """ Create directories in HDFS to run the recommendation engine.
    """
    try:
        listenbrainz_spark.init_spark_session('Manage Directories')
    except Py4JJavaError as err:
        logging.error('{}\n{}\nAborting...'.format(str(err),
                                                   err.java_exception))
        sys.exit(-1)

    hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
    if rm:
        try:
            utils.delete_dir(path.RECOMMENDATION_PARENT_DIR)
            utils.delete_dir(path.CHECKPOINT_DIR)
            logging.info('Successfully deleted directories.')
        except HdfsError as err:
            logging.error(
                '{}: Some/all directories are non-empty. Try "--recursive" to delete recursively.'
                .format(type(err).__name__))
            logging.warning(
                'Deleting directory recursively will delete all the recommendation data.'
            )
            sys.exit(-1)

    if recursive:
        try:
            utils.delete_dir(path.RECOMMENDATION_PARENT_DIR, recursive=True)
            utils.delete_dir(path.CHECKPOINT_DIR, recursive=True)
            logging.info('Successfully deleted directories recursively.')
        except HdfsError as err:
            logging.error(
                '{}: An error occurred while deleting directories recursively.\n{}\nAborting...'
                .format(type(err).__name__, str(err)))
            sys.exit(-1)

    if create_dir:
        try:
            logging.info('Creating directory to store dataframes...')
            utils.create_dir(path.DATAFRAME_DIR)

            logging.info('Creating directory to store models...')
            utils.create_dir(path.MODEL_DIR)

            logging.info('Creating directory to store candidate sets...')
            utils.create_dir(path.CANDIDATE_SET_DIR)

            logging.info('Creating directory to store RDD checkpoints...')
            utils.create_dir(path.CHECKPOINT_DIR)

            print('Done!')
        except HdfsError as err:
            logging.error(
                '{}: An error occurred while creating some/all directories.\n{}\nAborting...'
                .format(type(err).__name__, str(err)))
            sys.exit(-1)
Example no. 11
    def extract_and_upload_archive(self,
                                   archive,
                                   local_dir,
                                   hdfs_dir,
                                   cleanup_on_failure=True):
        """
        Extract the archive and upload it to the given hdfs directory.
        Args:
            archive: path to the tar archive to be uploaded
            local_dir: path to local dir to be used for extraction
            hdfs_dir: path to hdfs dir where contents of tar should be uploaded
            cleanup_on_failure: whether to delete local and hdfs directories
                if error occurs during extraction
        """
        total_files = 0
        total_time = 0.0
        with tarfile.open(archive, mode='r') as tar:
            for member in tar:
                if member.isfile() and member.name.endswith(".parquet"):
                    logger.info(f"Uploading {member.name}...")
                    t0 = time.monotonic()

                    try:
                        tar.extract(member, path=local_dir)
                    except tarfile.TarError as err:
                        if cleanup_on_failure:
                            if utils.path_exists(hdfs_dir):
                                utils.delete_dir(hdfs_dir, recursive=True)
                            shutil.rmtree(local_dir, ignore_errors=True)
                        raise DumpInvalidException(
                            f"{type(err).__name__} while extracting {member.name}, aborting import"
                        )

                    hdfs_path = os.path.join(hdfs_dir, member.name)
                    local_path = os.path.join(local_dir, member.name)
                    utils.upload_to_HDFS(hdfs_path, local_path)

                    time_taken = time.monotonic() - t0
                    total_files += 1
                    total_time += time_taken
                    logger.info(
                        f"Done! Current file processed in {time_taken:.2f} sec"
                    )
        logger.info(
            f"Done! Total files processed {total_files}. Average time taken: {total_time / total_files:.2f}"
        )
Example no. 12
    def test_upload_archive(self):
        archive_path = self.create_test_tar()
        pxz = ListenbrainzHDFSUploader().get_pxz_output(archive_path)
        tmp_dump_dir = tempfile.mkdtemp()

        with tarfile.open(fileobj=pxz.stdout, mode='r|') as tar:
            ListenbrainzHDFSUploader().upload_archive(tmp_dump_dir, tar, '/test', schema.listen_schema,
                                                      ListenbrainzDataUploader().process_json_listens)

        walk = utils.hdfs_walk('/test', depth=1)
        dirs = next(walk)[1]
        self.assertEqual(len(dirs), 1)

        df = utils.read_files_from_HDFS('/test/2020/1.parquet')
        self.assertEqual(df.count(), 1)

        status = utils.path_exists(tmp_dump_dir)
        self.assertFalse(status)

        utils.delete_dir('/test', recursive=True)
Example no. 13
    def upload_new_listens_incremental_dump(self, archive: str):
        """ Upload new format parquet listens of an incremental
         dump to HDFS.
            Args:
                archive: path to parquet listens dump to be uploaded
        """
        # upload parquet file to temporary path so that we can
        # read it in spark in next step
        hdfs_path = self.upload_archive_to_temp(archive)

        # read the parquet file from the temporary path and append
        # it to incremental.parquet for permanent storage
        read_files_from_HDFS(hdfs_path) \
            .repartition(1) \
            .write \
            .mode("append") \
            .parquet(INCREMENTAL_DUMPS_SAVE_PATH)

        # delete parquet from hdfs temporary path
        utils.delete_dir(hdfs_path, recursive=True)
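read_files_from_HDFS, used in several of these examples, presumably wraps Spark's parquet reader. Below is a minimal sketch under that assumption; it is not the project's actual implementation, and it assumes a SparkSession has already been created (the examples do this via listenbrainz_spark.init_spark_session).

from pyspark.sql import SparkSession

def read_files_from_HDFS(path):
    """ Read the parquet file(s) at the given HDFS path into a Spark DataFrame. """
    # getActiveSession() assumes an already-initialised SparkSession (Spark 3.x).
    session = SparkSession.getActiveSession()
    return session.read.parquet(path)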
Example no. 14
def insert_dump_data(dump_id: int, dump_type: str, imported_at: datetime):
    """ Insert information about dump imported """
    import_meta_df = None
    try:
        import_meta_df = read_files_from_HDFS(IMPORT_METADATA)
    except PathNotFoundException:
        current_app.logger.info("Import metadata file not found, creating...")

    data = create_dataframe(Row(dump_id, dump_type, imported_at), schema=import_metadata_schema)
    if import_meta_df:
        result = import_meta_df \
            .filter(f"dump_id != '{dump_id}' OR dump_type != '{dump_type}'") \
            .union(data)
    else:
        result = data

    # We have to save the dataframe as a different file and move it as the df itself is read from the file
    save_parquet(result, "/temp.parquet")
    if path_exists(IMPORT_METADATA):
        delete_dir(IMPORT_METADATA, recursive=True)
    rename("/temp.parquet", IMPORT_METADATA)
Example no. 15
    def upload_archive(self, tmp_dump_dir, tar, dest_path, schema, callback=None, force=False):
        """ Upload data dump to HDFS.

            Args:
                tmp_dump_dir (str): Path to temporary directory to upload JSON.
                tar: Uncompressed tar object.
                dest_path (str): HDFS path to upload data dump.
                schema: Schema of parquet to be uploaded.
                callback: Function to process JSON files.
                force: If True deletes dir at dest_path
        """
        if callback is None:
            raise NotImplementedError('Callback to process JSON missing. Aborting...')

        if force:
            current_app.logger.info('Removing {} from HDFS...'.format(dest_path))
            utils.delete_dir(dest_path, recursive=True)
            current_app.logger.info('Done!')

        file_count = 0
        total_time = 0.0
        for member in tar:
            if member.isfile() and self._is_json_file(member.name):
                current_app.logger.info('Loading {}...'.format(member.name))
                t = time.time()
                tar.extract(member)
                tmp_hdfs_path = os.path.join(tmp_dump_dir, member.name)
                utils.upload_to_HDFS(tmp_hdfs_path, member.name)
                callback(member.name, dest_path, tmp_hdfs_path, schema)
                utils.delete_dir(tmp_hdfs_path, recursive=True)
                os.remove(member.name)
                file_count += 1
                time_taken = time.time() - t
                current_app.logger.info("Done! Processed {} files. Current file done in {:.2f} sec".format(
                    file_count, time_taken))
                total_time += time_taken
                average_time = total_time / file_count
                current_app.logger.info("Total time: {:.2f}, average time: {:.2f}".format(total_time, average_time))
        utils.delete_dir(tmp_dump_dir, recursive=True)
        shutil.rmtree(tmp_dump_dir)
Example no. 16
def main(ranks=None, lambdas=None, iterations=None, alpha=None):

    if ranks is None:
        current_app.logger.critical('model param "ranks" missing')
        raise

    if lambdas is None:
        current_app.logger.critical('model param "lambdas" missing')
        raise

    if iterations is None:
        current_app.logger.critical('model param "iterations" missing')
        raise

    if alpha is None:
        current_app.logger.critical('model param "alpha" missing')
        raise

    ti = time.monotonic()
    time_ = defaultdict(dict)
    try:
        listenbrainz_spark.init_spark_session('Train Models')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise

    # Add checkpoint dir to break and save RDD lineage.
    listenbrainz_spark.context.setCheckpointDir(config.HDFS_CLUSTER_URI +
                                                path.CHECKPOINT_DIR)

    try:
        playcounts_df = utils.read_files_from_HDFS(
            path.PLAYCOUNTS_DATAFRAME_PATH)
        dataframe_metadata_df = utils.read_files_from_HDFS(
            path.DATAFRAME_METADATA)
    except PathNotFoundException as err:
        current_app.logger.error(
            '{}\nConsider running create_dataframes.py'.format(str(err)),
            exc_info=True)
        raise
    except FileNotFetchedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise

    time_['load_playcounts'] = '{:.2f}'.format((time.monotonic() - ti) / 60)

    t0 = time.monotonic()
    training_data, validation_data, test_data = preprocess_data(playcounts_df)
    time_['preprocessing'] = '{:.2f}'.format((time.monotonic() - t0) / 60)

    # An action must be called for persist to evaluate.
    num_training = training_data.count()
    num_validation = validation_data.count()
    num_test = test_data.count()

    t0 = time.monotonic()
    best_model, model_metadata = get_best_model(training_data, validation_data,
                                                num_validation, ranks, lambdas,
                                                iterations, alpha)
    models_training_time = '{:.2f}'.format((time.monotonic() - t0) / 3600)

    best_model_metadata = get_best_model_metadata(best_model)
    current_app.logger.info(
        "Calculating test RMSE for best model with model id: {}".format(
            best_model.model_id))
    best_model_metadata['test_rmse'] = compute_rmse(best_model.model,
                                                    test_data, num_test,
                                                    best_model.model_id)
    current_app.logger.info("Test RMSE calculated!")

    best_model_metadata['training_data_count'] = num_training
    best_model_metadata['validation_data_count'] = num_validation
    best_model_metadata['test_data_count'] = num_test
    best_model_metadata['dataframe_id'] = get_latest_dataframe_id(
        dataframe_metadata_df)

    hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
    t0 = time.monotonic()
    save_model(best_model.model_id, best_model.model)
    time_['save_model'] = '{:.2f}'.format((time.monotonic() - t0) / 60)

    save_model_metadata_to_hdfs(best_model_metadata)
    # Delete checkpoint dir as saved lineages would eat up space, we won't be using them anyway.
    try:
        utils.delete_dir(path.CHECKPOINT_DIR, recursive=True)
    except HDFSDirectoryNotDeletedException as err:
        current_app.logger.error(str(err), exc_info=True)
        raise

    if SAVE_TRAINING_HTML:
        current_app.logger.info('Saving HTML...')
        save_training_html(time_, num_training, num_validation, num_test,
                           model_metadata, best_model_metadata, ti,
                           models_training_time)
        current_app.logger.info('Done!')

    message = [{
        'type': 'cf_recording_model',
        'model_upload_time': str(datetime.utcnow()),
        'total_time': '{:.2f}'.format(time.monotonic() - ti),
    }]

    return message
Example no. 17
    def tearDown(self):
        path_found = utils.path_exists(self.path_)
        if path_found:
            utils.delete_dir(self.path_, recursive=True)
Example no. 18
def delete_uploaded_listens():
    if utils.path_exists(LISTENBRAINZ_NEW_DATA_DIRECTORY):
        utils.delete_dir(LISTENBRAINZ_NEW_DATA_DIRECTORY, recursive=True)
Example no. 19
    def delete_dir(cls):
        walk = utils.hdfs_walk('/', depth=1)
        # dirs in '/'
        dirs = next(walk)[1]
        for directory in dirs:
            utils.delete_dir(os.path.join('/', directory), recursive=True)
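utils.hdfs_walk, used above with depth=1 to list the directories directly under '/', is another helper not shown in these snippets. HdfsCLI's client exposes a walk() with os.walk-like semantics, so a thin wrapper might look like the following (an assumption, reusing the client object from the earlier sketch, not the project's actual code):

def hdfs_walk(path, depth=0):
    """ Walk an HDFS directory tree, yielding (path, dirnames, filenames) tuples like os.walk.

    depth=0 walks the whole tree; depth=1 stops at the immediate children of path.
    """
    return client.walk(path, depth=depth)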
Example no. 20
def main():
    ti = time()
    time_ = defaultdict(dict)
    try:
        listenbrainz_spark.init_spark_session('Train Models')
    except SparkSessionNotInitializedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    # Add checkpoint dir to break and save RDD lineage.
    listenbrainz_spark.context.setCheckpointDir(config.HDFS_CLUSTER_URI +
                                                path.CHECKPOINT_DIR)

    try:
        playcounts_df = utils.read_files_from_HDFS(
            path.PLAYCOUNTS_DATAFRAME_PATH)
    except FileNotFetchedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
    time_['load_playcounts'] = '{:.2f}'.format((time() - ti) / 60)

    t0 = time()
    training_data, validation_data, test_data = preprocess_data(playcounts_df)
    time_['preprocessing'] = '{:.2f}'.format((time() - t0) / 60)

    # Rdds that are used in model training iterative process are cached to improve performance.
    # Caching large files may cause Out of Memory exception.
    training_data.persist()
    validation_data.persist()

    # An action must be called for persist to evaluate.
    num_training = training_data.count()
    num_validation = validation_data.count()
    num_test = test_data.count()

    current_app.logger.info('Training models...')
    t0 = time()
    model, model_metadata, best_model_metadata = train(
        training_data, validation_data, num_validation, config.RANKS,
        config.LAMBDAS, config.ITERATIONS)
    models_training_time = '{:.2f}'.format((time() - t0) / 3600)

    try:
        best_model_test_rmse = compute_rmse(model.model, test_data, num_test)
    except Py4JJavaError as err:
        current_app.logger.error(
            'Root mean squared error for best model for test data not computed\n{}\nAborting...'
            .format(str(err.java_exception)),
            exc_info=True)
        sys.exit(-1)

    # Cached data must be cleared to avoid OOM.
    training_data.unpersist()
    validation_data.unpersist()

    current_app.logger.info('Saving model...')
    t0 = time()
    model_save_path = os.path.join(path.DATA_DIR,
                                   best_model_metadata['model_id'])
    save_model(model_save_path, best_model_metadata['model_id'], model)
    time_['save_model'] = '{:.2f}'.format((time() - t0) / 60)

    hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
    # Delete checkpoint dir as saved lineages would eat up space, we won't be using them anyway.
    try:
        utils.delete_dir(path.CHECKPOINT_DIR, recursive=True)
    except HDFSDirectoryNotDeletedException as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    if SAVE_TRAINING_HTML:
        save_training_html(time_, num_training, num_validation, num_test,
                           model_metadata, best_model_metadata, ti,
                           models_training_time)

    # Save best model id to a JSON file
    metadata_file_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        'recommendation-metadata.json')
    with open(metadata_file_path, 'r') as f:
        recommendation_metadata = json.load(f)
        recommendation_metadata['best_model_id'] = best_model_metadata[
            'model_id']
    with open(metadata_file_path, 'w') as f:
        json.dump(recommendation_metadata, f)
Example no. 21
    def test_delete_dir(self):
        utils.create_dir(self.path_)
        utils.delete_dir(self.path_)
        status = utils.path_exists(self.path_)
        self.assertFalse(status)
Example no. 22
    def tearDown(self):
        if utils.path_exists(self.path_):
            utils.delete_dir(self.path_, recursive=True)

        if utils.path_exists(self.temp_path_):
            utils.delete_dir(self.temp_path_, recursive=True)
Example no. 23
    def upload_archive(self,
                       tmp_dump_dir,
                       tar,
                       dest_path,
                       schema,
                       callback=None,
                       overwrite=False):
        """ Upload data dump to HDFS.

            Args:
                tmp_dump_dir (str): Path to temporary directory to upload JSON.
                tar: Uncompressed tar object.
                dest_path (str): HDFS path to upload data dump.
                schema: Schema of parquet to be uploaded.
                callback: Function to process JSON files.
                overwrite: If True deletes dir at dest_path
        """
        if callback is None:
            raise NotImplementedError(
                'Callback to process JSON missing. Aborting...')

        # Delete TEMP_DIR_PATH if it exists
        if utils.path_exists(TEMP_DIR_PATH):
            utils.delete_dir(TEMP_DIR_PATH, recursive=True)

        # Copy data from dest_path to TEMP_DIR_PATH to be merged with new data
        if not overwrite and utils.path_exists(dest_path):
            t0 = time.monotonic()
            logger.info("Copying old listens into '{}'".format(TEMP_DIR_PATH))
            utils.copy(dest_path, TEMP_DIR_PATH, overwrite=True)
            logger.info("Done! Time taken: {:.2f}".format(time.monotonic() -
                                                          t0))

        logger.info("Uploading listens to temporary directory in HDFS...")
        total_files = 0
        total_time = 0.0
        for member in tar:
            if member.isfile() and self._is_json_file(member.name):
                logger.info("Uploading {}...".format(member.name))
                t0 = time.monotonic()

                try:
                    tar.extract(member)
                except TarError as err:
                    # Cleanup
                    if utils.path_exists(TEMP_DIR_PATH):
                        utils.delete_dir(TEMP_DIR_PATH, recursive=True)
                    if utils.path_exists(tmp_dump_dir):
                        utils.delete_dir(tmp_dump_dir, recursive=True)
                    raise DumpInvalidException(
                        "{} while extracting {}, aborting import".format(
                            type(err).__name__, member.name))

                tmp_hdfs_path = os.path.join(tmp_dump_dir, member.name)
                utils.upload_to_HDFS(tmp_hdfs_path, member.name)
                callback(member.name, TEMP_DIR_PATH, tmp_hdfs_path,
                         not overwrite, schema)
                utils.delete_dir(tmp_hdfs_path, recursive=True)
                os.remove(member.name)
                time_taken = time.monotonic() - t0
                total_files += 1
                total_time += time_taken
                logger.info(
                    "Done! Current file processed in {:.2f} sec".format(
                        time_taken))
        logger.info(
            "Done! Total files processed {}. Average time taken: {:.2f}".
            format(total_files, total_time / total_files))

        # Delete dest_path if present
        if utils.path_exists(dest_path):
            logger.info('Removing {} from HDFS...'.format(dest_path))
            utils.delete_dir(dest_path, recursive=True)
            logger.info('Done!')

        logger.info("Moving the processed files to {}".format(dest_path))
        t0 = time.monotonic()

        # Check if parent directory exists, if not create a directory
        dest_path_parent = pathlib.Path(dest_path).parent
        if not utils.path_exists(dest_path_parent):
            utils.create_dir(dest_path_parent)

        utils.rename(TEMP_DIR_PATH, dest_path)
        utils.logger.info("Done! Time taken: {:.2f}".format(time.monotonic() -
                                                            t0))

        # Cleanup
        utils.delete_dir(tmp_dump_dir, recursive=True)