Example #1
def import_newest_incremental_dump_handler():
    imported_dumps = []
    latest_full_dump = utils.get_latest_full_dump()
    if latest_full_dump is None:
        # If no prior full dump is present, just import the latest incremental dump
        imported_dumps.append(
            import_dump_to_hdfs('incremental', overwrite=False))
        current_app.logger.warning(
            "No previous full dump found, importing latest incremental dump",
            exc_info=True)
    else:
        # Import all missing dumps from last full dump import
        dump_id = latest_full_dump["dump_id"] + 1
        imported_at = latest_full_dump["imported_at"]
        while True:
            if not utils.search_dump(dump_id, 'incremental', imported_at):
                try:
                    imported_dumps.append(
                        import_dump_to_hdfs('incremental', False, dump_id))
                except DumpNotFoundException:
                    break
                except Exception as e:
                    # Exit if any other error occurs during import
                    current_app.logger.error(
                        f"Error while importing incremental dump with ID {dump_id}: {e}",
                        exc_info=True)
                    break
            dump_id += 1
    return [{
        'type': 'import_incremental_dump',
        'imported_dump': imported_dumps,
        'time': str(datetime.utcnow()),
    }]
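The loop in the else branch has no explicit upper bound: it walks IDs upward from the last full dump and relies on import_dump_to_hdfs raising DumpNotFoundException for the first incremental dump that does not exist yet. A minimal sketch of how that contract could be exercised with mocks ("handler_module", the test name, and the "dump-8"/"dump-9" return values are hypothetical, not the project's real layout):

from datetime import datetime
from unittest import mock

@mock.patch("handler_module.import_dump_to_hdfs")
@mock.patch("handler_module.utils")
def test_imports_until_dump_not_found(mock_utils, mock_import):
    # A full dump with ID 7 was imported; IDs 8 and 9 import fine, ID 10 is missing
    mock_utils.get_latest_full_dump.return_value = {
        "dump_id": 7,
        "imported_at": datetime.fromtimestamp(7),
    }
    mock_utils.search_dump.return_value = False  # nothing imported since the full dump
    mock_import.side_effect = ["dump-8", "dump-9", DumpNotFoundException()]

    result = import_newest_incremental_dump_handler()
    assert result[0]["imported_dump"] == ["dump-8", "dump-9"]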
Example #2
    def test_get_latest_full_dump_present(self):
        """ Test to ensure the correct dump is returned if a full dump has been imported. """
        self.assertDictEqual(import_utils.get_latest_full_dump(), {
            "dump_id": 7,
            "dump_type": "full",
            "imported_at": datetime.fromtimestamp(7)
        })
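Examples #2 and #3, together with #5 below, pin down the contract of get_latest_full_dump: read an import-metadata parquet file from HDFS, return the newest row with dump_type == "full" as a dict, and return None when the file is missing or contains no full-dump rows. A sketch of an implementation consistent with those tests (the IMPORT_METADATA path constant and PathNotFoundException are assumptions):

from pyspark.sql.functions import col

def get_latest_full_dump():
    """Return metadata of the most recently imported full dump, or None."""
    try:
        import_meta_df = read_files_from_HDFS(IMPORT_METADATA)  # assumed metadata path
    except PathNotFoundException:  # assumed: raised when the parquet file is absent
        return None
    rows = import_meta_df \
        .filter(import_meta_df.dump_type == "full") \
        .sort(col("imported_at").desc()) \
        .take(1)
    return rows[0].asDict() if rows else None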
Example #3
    def test_get_latest_full_dump_file_missing(self):
        """ Test to ensure 'None' is returned if metadata file is missing. """
        path_found = path_exists(self.path_)
        if path_found:
            delete_dir(self.path_, recursive=True)

        self.assertIsNone(import_utils.get_latest_full_dump())
Example #4
def import_newest_incremental_dump_handler():
    errors = []
    imported_dumps = []
    latest_full_dump = utils.get_latest_full_dump()
    if latest_full_dump is None:
        # If no prior full dump is present, just import the latest incremental dump
        imported_dumps.append(import_incremental_dump_to_hdfs(dump_id=None))

        error_msg = "No previous full dump found, importing latest incremental dump"
        errors.append(error_msg)
        logger.warning(error_msg, exc_info=True)
    else:
        # Import all missing dumps from last full dump import
        start_id = latest_full_dump["dump_id"] + 1
        imported_at = latest_full_dump["imported_at"]
        end_id = ListenbrainzDataDownloader().get_latest_dump_id(DumpType.INCREMENTAL) + 1

        for dump_id in range(start_id, end_id, 1):
            if not utils.search_dump(dump_id, DumpType.INCREMENTAL, imported_at):
                try:
                    imported_dumps.append(import_incremental_dump_to_hdfs(dump_id))
                except Exception as e:
                    # Skip current dump if any error occurs during import
                    error_msg = f"Error while importing incremental dump with ID {dump_id}: {e}"
                    errors.append(error_msg)
                    logger.error(error_msg, exc_info=True)
                    continue
            request_consumer.rc.ping()
    return [{
        'type': 'import_incremental_dump',
        'imported_dump': imported_dumps,
        'errors': errors,
        'time': str(datetime.utcnow()),
    }]
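Example #4 is a later revision of Example #1: the unbounded while loop becomes a for loop bounded by the newest dump ID reported by ListenbrainzDataDownloader, a failing dump is skipped with continue instead of aborting the whole run, and every problem is also surfaced in an errors field of the response. A sketch of what a partially failed run might return (all values invented for illustration):

[{
    'type': 'import_incremental_dump',
    'imported_dump': ['incremental-dump-8', 'incremental-dump-10'],
    'errors': ["Error while importing incremental dump with ID 9: ..."],
    'time': '2021-01-01 00:00:00.000000',
}]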
Example #5
    def test_get_latest_full_dump_no_full(self):
        """ Test to ensure 'None' is returned if no full import has been made. """
        # Remove full dump entries from parquet
        import_meta_df = read_files_from_HDFS(self.path_)
        result = import_meta_df.filter(import_meta_df.dump_type != "full")

        # Save the filtered dataframe to a temporary file and move it into place,
        # because the dataframe itself is lazily read from the file being replaced
        save_parquet(result, '/temp.parquet')
        delete_dir(self.path_, recursive=True)
        rename('/temp.parquet', self.path_)

        self.assertIsNone(import_utils.get_latest_full_dump())
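The save/delete/rename sequence in Example #5 is the usual way to overwrite a parquet path while a dataframe is still lazily reading from it; writing back to self.path_ directly could fail or corrupt the output, since Spark only reads the source files on demand. The pattern in isolation, using the same helpers as the test above:

filtered = read_files_from_HDFS(path).filter('dump_type != "full"')
save_parquet(filtered, '/temp.parquet')  # write the result to a scratch location first
delete_dir(path, recursive=True)         # remove the original files
rename('/temp.parquet', path)            # move the new data into place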