Example #1
    def test_rename(self):
        utils.create_dir(self.path_)
        test_exists = utils.path_exists(self.path_)
        self.assertTrue(test_exists)
        utils.rename(self.path_, '/temp')
        test_exists = utils.path_exists(self.path_)
        self.assertFalse(test_exists)
        temp_exists = utils.path_exists('/temp')
        self.assertTrue(temp_exists)
        utils.delete_dir('/temp')
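
This test assumes a small `utils` module that wraps an HDFS client. A minimal sketch of what those helpers might look like, built on the `hdfs` Python package; the client setup, the NameNode URL, and the function bodies here are assumptions, not the project's actual implementation:

from hdfs import InsecureClient

# Hypothetical client setup; the real project wires this up via its own
# connection module and configuration.
client = InsecureClient('http://localhost:9870')

def create_dir(path):
    # makedirs() creates intermediate directories as needed.
    client.makedirs(path)

def path_exists(path):
    # status() with strict=False returns None instead of raising when the
    # path does not exist.
    return client.status(path, strict=False) is not None

def rename(source, dest):
    client.rename(source, dest)

def delete_dir(path, recursive=False):
    client.delete(path, recursive=recursive)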
Example #2
def init_dir(rm, recursive, create_dir):
    """ Create directories in HDFS to run the recommendation engine.
    """
    try:
        listenbrainz_spark.init_spark_session('Manage Directories')
    except Py4JJavaError as err:
        logging.error('{}\n{}\nAborting...'.format(str(err),
                                                   err.java_exception))
        sys.exit(-1)

    hdfs_connection.init_hdfs(config.HDFS_HTTP_URI)
    if rm:
        try:
            utils.delete_dir(path.RECOMMENDATION_PARENT_DIR)
            utils.delete_dir(path.CHECKPOINT_DIR)
            logging.info('Successfully deleted directories.')
        except HdfsError as err:
            logging.error(
                '{}: Some/all directories are non-empty. Try "--recursive" to delete recursively.'
                .format(type(err).__name__))
            logging.warning(
                'Deleting directory recursively will delete all the recommendation data.'
            )
            sys.exit(-1)

    if recursive:
        try:
            utils.delete_dir(path.RECOMMENDATION_PARENT_DIR, recursive=True)
            utils.delete_dir(path.CHECKPOINT_DIR, recursive=True)
            logging.info('Successfully deleted directories recursively.')
        except HdfsError as err:
            logging.error(
                '{}: An error occurred while deleting directories recursively.\n{}\nAborting...'
                .format(type(err).__name__, str(err)))
            sys.exit(-1)

    if create_dir:
        try:
            logging.info('Creating directory to store dataframes...')
            utils.create_dir(path.DATAFRAME_DIR)

            logging.info('Creating directory to store models...')
            utils.create_dir(path.MODEL_DIR)

            logging.info('Creating directory to store candidate sets...')
            utils.create_dir(path.CANDIDATE_SET_DIR)

            logging.info('Creating directory to store RDD checkpoints...')
            utils.create_dir(path.CHECKPOINT_DIR)

            logging.info('Done!')
        except HdfsError as err:
            logging.error(
                '{}: An error occurred while creating some/all directories.\n{}\nAborting...'
                .format(type(err).__name__, str(err)))
            sys.exit(-1)
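
A hedged sketch of how `init_dir` might be invoked from the command line; the flag names mirror the `--recursive` hint in the error message above, but the argparse wrapper itself is an assumption:

import argparse

# Hypothetical CLI entry point for init_dir; flag names are illustrative.
parser = argparse.ArgumentParser(
    description='Manage recommendation engine directories in HDFS.')
parser.add_argument('--rm', action='store_true',
                    help='Delete existing directories.')
parser.add_argument('--recursive', action='store_true',
                    help='Delete directories recursively.')
parser.add_argument('--create-dir', action='store_true',
                    help='Create the directory layout.')
args = parser.parse_args()

init_dir(rm=args.rm, recursive=args.recursive, create_dir=args.create_dir)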
Example #3
    def test_copy(self):
        # Test directories
        utils.create_dir(self.path_)
        utils.create_dir(os.path.join(self.path_, "a"))
        utils.create_dir(os.path.join(self.path_, "b"))

        # DataFrames to create parquets
        df_a = utils.create_dataframe([Row(column1=1, column2=2)], schema=None)
        df_b = utils.create_dataframe([Row(column1=3, column2=4)], schema=None)
        df_c = utils.create_dataframe([Row(column1=5, column2=6)], schema=None)

        # Save DataFrames in respective directories
        utils.save_parquet(df_a, os.path.join(self.path_, "a", "df_a.parquet"))
        utils.save_parquet(df_b, os.path.join(self.path_, "b", "df_b.parquet"))
        utils.save_parquet(df_c, os.path.join(self.path_, "df_c.parquet"))

        utils.copy(self.path_, self.temp_path_, overwrite=True)

        # Read copied DataFrame
        cp_df_a = utils.read_files_from_HDFS(os.path.join(self.temp_path_, "a", "df_a.parquet"))
        cp_df_b = utils.read_files_from_HDFS(os.path.join(self.temp_path_, "b", "df_b.parquet"))
        cp_df_c = utils.read_files_from_HDFS(os.path.join(self.temp_path_, "df_c.parquet"))

        # Check that the original and copied DataFrames match
        self.assertListEqual(df_a.rdd.map(list).collect(), cp_df_a.rdd.map(list).collect())
        self.assertListEqual(df_b.rdd.map(list).collect(), cp_df_b.rdd.map(list).collect())
        self.assertListEqual(df_c.rdd.map(list).collect(), cp_df_c.rdd.map(list).collect())
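
This test leans on three more helpers: `create_dataframe`, `save_parquet`, and `read_files_from_HDFS`. A minimal sketch of plausible PySpark-backed implementations; the SparkSession setup and the function bodies are assumptions:

from pyspark.sql import SparkSession

# Hypothetical session; the real module initializes Spark elsewhere.
spark = SparkSession.builder.appName('hdfs-utils-sketch').getOrCreate()

def create_dataframe(rows, schema):
    # With schema=None, Spark infers column types from the Row objects.
    return spark.createDataFrame(rows, schema=schema)

def save_parquet(df, path):
    # Overwrite keeps repeated test runs idempotent.
    df.write.mode('overwrite').parquet(path)

def read_files_from_HDFS(path):
    return spark.read.parquet(path)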
Example #4
    def test_path_exists(self):
        utils.create_dir(self.path_)
        status = utils.path_exists(self.path_)
        self.assertTrue(status)
Example #5
    def test_delete_dir(self):
        utils.create_dir(self.path_)
        utils.delete_dir(self.path_)
        status = utils.path_exists(self.path_)
        self.assertFalse(status)
Example #6
    def upload_archive(self,
                       tmp_dump_dir,
                       tar,
                       dest_path,
                       schema,
                       callback=None,
                       overwrite=False):
        """ Upload data dump to HDFS.

            Args:
                tmp_dump_dir (str): Path to temporary directory to upload JSON.
                tar: Uncompressed tar object.
                dest_path (str): HDFS path to upload data dump.
                schema: Schema of parquet to be uploaded.
                callback: Function to process JSON files.
                overwrite: If True, deletes the directory at dest_path before uploading.
        """
        if callback is None:
            raise NotImplementedError(
                'Callback to process JSON missing. Aborting...')

        # Delete TEMP_DIR_PATH if it exists
        if utils.path_exists(TEMP_DIR_PATH):
            utils.delete_dir(TEMP_DIR_PATH, recursive=True)

        # Copy data from dest_path to TEMP_DIR_PATH to be merged with new data
        if not overwrite and utils.path_exists(dest_path):
            t0 = time.monotonic()
            logger.info("Copying old listens into '{}'".format(TEMP_DIR_PATH))
            utils.copy(dest_path, TEMP_DIR_PATH, overwrite=True)
            logger.info("Done! Time taken: {:.2f}".format(time.monotonic() -
                                                          t0))

        logger.info("Uploading listens to temporary directory in HDFS...")
        total_files = 0
        total_time = 0.0
        for member in tar:
            if member.isfile() and self._is_json_file(member.name):
                logger.info("Uploading {}...".format(member.name))
                t0 = time.monotonic()

                try:
                    tar.extract(member)
                except TarError as err:
                    # Cleanup
                    if utils.path_exists(TEMP_DIR_PATH):
                        utils.delete_dir(TEMP_DIR_PATH, recursive=True)
                    if utils.path_exists(tmp_dump_dir):
                        utils.delete_dir(tmp_dump_dir, recursive=True)
                    raise DumpInvalidException(
                        "{} while extracting {}, aborting import".format(
                            type(err).__name__, member.name))

                tmp_hdfs_path = os.path.join(tmp_dump_dir, member.name)
                utils.upload_to_HDFS(tmp_hdfs_path, member.name)
                callback(member.name, TEMP_DIR_PATH, tmp_hdfs_path,
                         not overwrite, schema)
                utils.delete_dir(tmp_hdfs_path, recursive=True)
                os.remove(member.name)
                time_taken = time.monotonic() - t0
                total_files += 1
                total_time += time_taken
                logger.info(
                    "Done! Current file processed in {:.2f} sec".format(
                        time_taken))
        if total_files:
            # Guard: the tar may contain no JSON files at all.
            logger.info(
                "Done! Total files processed {}. Average time taken: {:.2f}".format(
                    total_files, total_time / total_files))

        # Delete dest_path if present
        if utils.path_exists(dest_path):
            logger.info('Removing {} from HDFS...'.format(dest_path))
            utils.delete_dir(dest_path, recursive=True)
            logger.info('Done!')

        logger.info("Moving the processed files to {}".format(dest_path))
        t0 = time.monotonic()

        # Check if the parent directory exists; if not, create it
        dest_path_parent = pathlib.Path(dest_path).parent
        if not utils.path_exists(dest_path_parent):
            utils.create_dir(dest_path_parent)

        utils.rename(TEMP_DIR_PATH, dest_path)
        logger.info("Done! Time taken: {:.2f}".format(time.monotonic() - t0))

        # Cleanup
        utils.delete_dir(tmp_dump_dir, recursive=True)
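
`upload_archive` delegates the actual processing to `callback`, which is invoked as `callback(member.name, TEMP_DIR_PATH, tmp_hdfs_path, not overwrite, schema)`. A hedged sketch of what such a callback could look like, reusing the `spark` session from the earlier sketch; the name `process_json` and the parquet layout are assumptions:

import os

def process_json(filename, dest_path, tmp_hdfs_path, append, schema):
    # Hypothetical callback: read the uploaded JSON with the given schema and
    # write it as parquet under dest_path; append when merging with old data.
    df = spark.read.json(tmp_hdfs_path, schema=schema)
    mode = 'append' if append else 'overwrite'
    target = os.path.join(dest_path, filename.replace('.json', '.parquet'))
    df.write.mode(mode).parquet(target)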
Example #7
    def test_create_dir(self):
        path_ = '/tests/test'
        utils.create_dir(path_)
        status = utils.path_exists(path_)
        self.assertTrue(status)
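
The test methods throughout these examples reference `self.path_` and `self.temp_path_` without defining them, so they presumably come from a shared fixture. A minimal sketch of such a base class; the class name and the scratch paths are assumptions:

import unittest

class HDFSTestCase(unittest.TestCase):
    # Hypothetical fixture: give each test scratch paths and clean up after it.
    def setUp(self):
        self.path_ = '/test'
        self.temp_path_ = '/temp_test'

    def tearDown(self):
        for p in (self.path_, self.temp_path_, '/temp', '/tests'):
            if utils.path_exists(p):
                utils.delete_dir(p, recursive=True)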