Example n. 1
0
def import_incremental_dump_to_hdfs(dump_id: int = None) -> str:
    """ Import an incremental listens dump into HDFS.

    If dump_id is given, that particular incremental dump is imported,
    otherwise the latest available incremental dump is used.

    Notes:
        All incremental dumps are stored together in incremental.parquet
        inside the listens directory.
    Args:
        dump_id: id of the incremental dump to be imported
    Returns:
        the name of the imported dump
    """
    with tempfile.TemporaryDirectory() as download_dir:
        downloader = ListenbrainzDataDownloader()
        src, dump_name, dump_id = downloader.download_listens(
            directory=download_dir,
            dump_type=DumpType.INCREMENTAL,
            listens_dump_id=dump_id
        )
        # FIXME: constructing ListenbrainzDataUploader implicitly creates a
        # spark session, which is non-obvious; make the session setup
        # explicit in the future.
        ListenbrainzDataUploader().upload_new_listens_incremental_dump(src)
    utils.insert_dump_data(dump_id, DumpType.INCREMENTAL, datetime.utcnow())
    return dump_name
Example n. 2
0
 def test_insert_dump_data(self):
     """ Data inserted via insert_dump_data should be found by search_dump. """
     inserted_at = datetime.fromtimestamp(9)
     import_utils.insert_dump_data(9, DumpType.FULL, inserted_at)
     found = import_utils.search_dump(9, DumpType.FULL, inserted_at)
     self.assertTrue(found)
Example n. 3
0
def import_dump_to_hdfs(dump_type, overwrite, dump_id=None):
    """ Download a listens dump and upload it to HDFS.

    Args:
        dump_type: the kind of dump to import; anything other than the
            string 'incremental' is treated as 'full'
        overwrite: passed through to upload_listens to control whether
            existing data is replaced
        dump_id: id of the dump to import; the latest dump is used
            when omitted
    Returns:
        the name of the imported dump
    """
    # TemporaryDirectory guarantees cleanup even when download/upload raises;
    # the previous mkdtemp + rmtree pair leaked the directory on error.
    with tempfile.TemporaryDirectory() as temp_dir:
        # normalize dump_type to one of the two supported string values
        dump_type = 'incremental' if dump_type == 'incremental' else 'full'
        src, dump_name, dump_id = ListenbrainzDataDownloader().download_listens(
            directory=temp_dir, dump_type=dump_type, listens_dump_id=dump_id)
        ListenbrainzDataUploader().upload_listens(src, overwrite=overwrite)
    utils.insert_dump_data(dump_id, dump_type, datetime.utcnow())
    return dump_name
    def test_insert_dump_data_file_missing(self):
        """ Test to ensure a file is created if it is missing. """
        # start from a clean slate: remove the metadata dir if it exists
        if path_exists(self.path_):
            delete_dir(self.path_, recursive=True)

        when = datetime.fromtimestamp(1)
        self.assertFalse(import_utils.search_dump(1, "full", when))
        import_utils.insert_dump_data(1, "full", when)
        self.assertTrue(import_utils.search_dump(1, "full", when))
Example n. 5
0
 def test_insert_dump_data_update_date(self):
     """ Test to ensure date is updated if entry already exists. """
     updated = datetime.fromtimestamp(9)
     # entry must not exist with the new date before the insert
     self.assertFalse(
         import_utils.search_dump(7, "incremental", updated))
     import_utils.insert_dump_data(7, "incremental", updated)
     self.assertTrue(
         import_utils.search_dump(7, "incremental", updated))
Example n. 6
0
 def test_insert_dump_data_update_date(self):
     """ Test to ensure date is updated if entry already exists. """
     updated = datetime.fromtimestamp(9)
     # entry must not exist with the new date before the insert
     self.assertFalse(
         import_utils.search_dump(7, DumpType.INCREMENTAL, updated))
     import_utils.insert_dump_data(7, DumpType.INCREMENTAL, updated)
     self.assertTrue(
         import_utils.search_dump(7, DumpType.INCREMENTAL, updated))
     # unrelated entry for dump 2 should still be present
     # (presumably inserted by fixture setup — verify against the TestCase)
     self.assertTrue(
         import_utils.search_dump(2, DumpType.INCREMENTAL,
                                  datetime.fromtimestamp(2)))
Example n. 7
0
def import_full_dump_to_hdfs(dump_id: int = None) -> str:
    """ Import a full listens dump into HDFS.

    If dump_id is given, that particular full dump is imported, otherwise
    the latest available full dump is used.

    Notes:
        Deletes all the existing listens and uploads listens from new dump.
    Args:
        dump_id: id of the full dump to be imported
    Returns:
        the name of the imported dump
    """
    with tempfile.TemporaryDirectory() as download_dir:
        downloader = ListenbrainzDataDownloader()
        src, dump_name, dump_id = downloader.download_listens(
            directory=download_dir,
            dump_type=DumpType.FULL,
            listens_dump_id=dump_id
        )
        ListenbrainzDataUploader().upload_new_listens_full_dump(src)
    utils.insert_dump_data(dump_id, DumpType.FULL, datetime.utcnow())
    return dump_name