def create(location, threads):
    """ Create a ListenBrainz data dump which includes a private dump, a statistics
        dump and a dump of the actual listens from InfluxDB

        Args:
            location (str): path to the directory where the dump should be made
            threads (int): the number of threads to be used for compression
    """
    app = create_app()
    with app.app_context():
        ls = init_influx_connection(current_app.logger, {
            'REDIS_HOST': current_app.config['REDIS_HOST'],
            'REDIS_PORT': current_app.config['REDIS_PORT'],
            'REDIS_NAMESPACE': current_app.config['REDIS_NAMESPACE'],
            'INFLUX_HOST': current_app.config['INFLUX_HOST'],
            'INFLUX_PORT': current_app.config['INFLUX_PORT'],
            'INFLUX_DB_NAME': current_app.config['INFLUX_DB_NAME'],
        })
        time_now = datetime.today()
        dump_path = os.path.join(location, 'listenbrainz-dump-{time}'.format(
            time=time_now.strftime('%Y%m%d-%H%M%S')))
        create_path(dump_path)
        db_dump.dump_postgres_db(dump_path, time_now, threads)
        ls.dump_listens(dump_path, time_now, threads)
        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True)
            return
        current_app.logger.info('Dumps created and hashes written at %s' % dump_path)
def create(location, threads): """ Create a ListenBrainz data dump which includes a private dump, a statistics dump and a dump of the actual listens from InfluxDB Args: location (str): path to the directory where the dump should be made threads (int): the number of threads to be used while compression """ db.init_db_connection(config.SQLALCHEMY_DATABASE_URI) ls = init_influx_connection(log, { 'REDIS_HOST': config.REDIS_HOST, 'REDIS_PORT': config.REDIS_PORT, 'REDIS_NAMESPACE': config.REDIS_NAMESPACE, 'INFLUX_HOST': config.INFLUX_HOST, 'INFLUX_PORT': config.INFLUX_PORT, 'INFLUX_DB_NAME': config.INFLUX_DB_NAME, }) time_now = datetime.today() dump_path = os.path.join(location, 'listenbrainz-dump-{time}'.format(time=time_now.strftime('%Y%m%d-%H%M%S'))) create_path(dump_path) db_dump.dump_postgres_db(dump_path, time_now, threads) ls.dump_listens(dump_path, time_now, threads) try: write_hashes(dump_path) except IOError as e: log.error('Unable to create hash files! Error: %s', str(e)) return log.info('Dumps created and hashes written at %s' % dump_path)
def create_incremental(location, threads, dump_id): app = create_app() with app.app_context(): from listenbrainz.webserver.influx_connection import _influx as ls if dump_id is None: end_time = datetime.now() dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s'))) else: dump_entry = db_dump.get_dump_entry(dump_id) if dump_entry is None: current_app.logger.error("No dump with ID %d found, exiting!", dump_id) sys.exit(-1) end_time = dump_entry['created'] prev_dump_entry = db_dump.get_dump_entry(dump_id - 1) if prev_dump_entry is None: # incremental dumps must have a previous dump in the series current_app.logger.error("Invalid dump ID %d, could not find previous dump", dump_id) sys.exit(-1) start_time = prev_dump_entry['created'] current_app.logger.info("Dumping data from %s to %s", start_time, end_time) dump_path = os.path.join(location, 'listenbrainz-dump-{dump_id}-{time}-incremental'.format(dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S'))) create_path(dump_path) ls.dump_listens(dump_path, dump_id=dump_id, start_time=start_time, end_time=end_time, threads=threads, spark_format=False) ls.dump_listens(dump_path, dump_id=dump_id, start_time=start_time, end_time=end_time, threads=threads, spark_format=True) try: write_hashes(dump_path) except IOError as e: current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True) return current_app.logger.info('Dumps created and hashes written at %s' % dump_path)
def create_spark_dump(location, threads): with create_app().app_context(): ls = init_influx_connection( current_app.logger, { 'REDIS_HOST': current_app.config['REDIS_HOST'], 'REDIS_PORT': current_app.config['REDIS_PORT'], 'REDIS_NAMESPACE': current_app.config['REDIS_NAMESPACE'], 'INFLUX_HOST': current_app.config['INFLUX_HOST'], 'INFLUX_PORT': current_app.config['INFLUX_PORT'], 'INFLUX_DB_NAME': current_app.config['INFLUX_DB_NAME'], }) time_now = datetime.today() dump_path = os.path.join( location, 'listenbrainz-spark-dump-{time}'.format( time=time_now.strftime('%Y%m%d-%H%M%S'))) create_path(dump_path) ls.dump_listens(dump_path, time_now, threads, spark_format=True) try: write_hashes(dump_path) except IOError as e: current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True) return current_app.logger.info('Dump created and hash written at %s', dump_path)
def write_listens_to_dump(self, listens_path, users, tar, archive_name, start_time, end_time): """ Write listens into the ListenBrainz dump. Args: listens_path (str): the path where listens should be kept before adding to the archive users (List[dict]): a list of all users tar (TarFile obj): the tar obj to which listens should be added archive_name (str): the name of the archive start_time and end_time: the range of time for which listens are to be dumped """ dump_complete = False next_user_id = 0 index = {} while not dump_complete: file_uuid = str(uuid.uuid4()) file_name = file_uuid + '.listens' # directory structure of the form "/%s/%02s/%s.listens" % (uuid[0], uuid[0:2], uuid) file_directory = os.path.join(file_name[0], file_name[0:2]) tmp_directory = os.path.join(listens_path, file_directory) create_path(tmp_directory) tmp_file_path = os.path.join(tmp_directory, file_name) archive_file_path = os.path.join(archive_name, 'listens', file_directory, file_name) with open(tmp_file_path, 'w') as f: file_done = False while next_user_id < len(users): if f.tell() > DUMP_FILE_SIZE_LIMIT: file_done = True break username = users[next_user_id]['musicbrainz_id'] offset = f.tell() size = self.dump_user(username=username, fileobj=f, start_time=start_time, end_time=end_time) index[username] = { 'file_name': file_uuid, 'offset': offset, 'size': size, } next_user_id += 1 self.log.info("%d users done. Total: %d", next_user_id, len(users)) if file_done: tar.add(tmp_file_path, arcname=archive_file_path) os.remove(tmp_file_path) continue if next_user_id == len(users): if not file_done: # if this was the last user and file hasn't been added, add it tar.add(tmp_file_path, arcname=archive_file_path) os.remove(tmp_file_path) dump_complete = True break return index
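# The index returned by write_listens_to_dump is what ends up in the dump's
# index.json: for every user it records the UUID-named file plus an offset and
# size, so a reader can pull one user's listens out of an extracted dump
# without scanning every file. The sketch below is illustrative only and
# assumes an already-extracted dump directory and that offset/size are byte
# counts; dump_root and username here are hypothetical values.
import json
import os


def read_listens_for_user(dump_root, username):
    """Return the raw JSON lines dumped for one user, located via index.json."""
    with open(os.path.join(dump_root, 'index.json')) as f:
        index = json.load(f)

    entry = index[username]
    file_name = entry['file_name']
    # files are laid out as listens/<uuid[0]>/<uuid[0:2]>/<uuid>.listens
    listens_file = os.path.join(dump_root, 'listens', file_name[0],
                                file_name[0:2], file_name + '.listens')
    with open(listens_file, 'rb') as f:
        f.seek(entry['offset'])
        return f.read(entry['size']).decode('utf-8')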
def dump_listens(self, location, dump_id, start_time=datetime.utcfromtimestamp(0),
                 end_time=None, threads=DUMP_DEFAULT_THREAD_COUNT):
    """ Dumps all listens in the ListenStore into a .tar.xz archive.

        Files are created with UUIDs as names. Each file can contain listens for
        a number of users. An index.json file is used to save which file contains
        the listens of which users.

        This creates an incremental dump if start_time is specified (with range
        start_time to end_time), otherwise it creates a full dump with all listens.

        Args:
            location: the directory where the listens dump archive should be created
            dump_id (int): the ID of the dump in the dump sequence
            start_time and end_time (datetime): the time range for which listens should be dumped
                start_time defaults to utc 0 (meaning a full dump) and end_time defaults to the current time
            threads (int): the number of threads to use for compression

        Returns:
            the path to the dump archive
    """
    if end_time is None:
        end_time = datetime.now()

    self.log.info('Beginning dump of listens from TimescaleDB...')
    full_dump = bool(start_time == datetime.utcfromtimestamp(0))
    archive_name = 'listenbrainz-listens-dump-{dump_id}-{time}'.format(
        dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S'))
    if full_dump:
        archive_name = '{}-full'.format(archive_name)
    else:
        archive_name = '{}-incremental'.format(archive_name)
    archive_path = os.path.join(location, '{filename}.tar.xz'.format(filename=archive_name))
    with open(archive_path, 'w') as archive:
        pxz_command = ['pxz', '--compress', '-T{threads}'.format(threads=threads)]
        pxz = subprocess.Popen(pxz_command, stdin=subprocess.PIPE, stdout=archive)
        with tarfile.open(fileobj=pxz.stdin, mode='w|') as tar:
            temp_dir = os.path.join(self.dump_temp_dir_root, str(uuid.uuid4()))
            create_path(temp_dir)
            self.write_dump_metadata(archive_name, start_time, end_time, temp_dir, tar, full_dump)

            listens_path = os.path.join(temp_dir, 'listens')
            self.write_listens(listens_path, tar, archive_name, start_time, end_time)

            # remove the temporary directory
            shutil.rmtree(temp_dir)

        pxz.stdin.close()
        pxz.wait()

    self.log.info('ListenBrainz listen dump done!')
    self.log.info('Dump present at %s!', archive_path)
    return archive_path
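# All of the .tar.xz dump writers in this module share one pattern: a tarfile
# opened in streaming mode ('w|') writes straight into the stdin of an external
# compressor process, so the uncompressed archive never touches disk. Below is
# a minimal, self-contained sketch of that pattern, assuming an `xz` binary on
# PATH (the production code shells out to `pxz`); write_compressed_tar and its
# arguments are hypothetical names used only for illustration.
import subprocess
import tarfile


def write_compressed_tar(archive_path, files, threads=4):
    """Stream (path, arcname) pairs into an xz-compressed tar at archive_path."""
    with open(archive_path, 'w') as archive:
        xz = subprocess.Popen(['xz', '--compress', '-T{}'.format(threads)],
                              stdin=subprocess.PIPE, stdout=archive)
        with tarfile.open(fileobj=xz.stdin, mode='w|') as tar:
            for path, arcname in files:
                tar.add(path, arcname=arcname)
        xz.stdin.close()
        xz.wait()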
def create_full(location, threads, dump_id, last_dump_id): """ Create a ListenBrainz data dump which includes a private dump, a statistics dump and a dump of the actual listens from InfluxDB Args: location (str): path to the directory where the dump should be made threads (int): the number of threads to be used while compression dump_id (int): the ID of the ListenBrainz data dump last_dump_id (bool): flag indicating whether to create a full dump from the last entry in the dump table """ app = create_app() with app.app_context(): from listenbrainz.webserver.influx_connection import _influx as ls if last_dump_id: all_dumps = db_dump.get_dump_entries() if len(all_dumps) == 0: current_app.logger.error( "Cannot create full dump with last dump's ID, no dump exists!" ) sys.exit(-1) dump_id = all_dumps[0]['id'] if dump_id is None: end_time = datetime.now() dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s'))) else: dump_entry = db_dump.get_dump_entry(dump_id) if dump_entry is None: current_app.logger.error("No dump with ID %d found", dump_id) sys.exit(-1) end_time = dump_entry['created'] dump_path = os.path.join( location, 'listenbrainz-dump-{dump_id}-{time}-full'.format( dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S'))) create_path(dump_path) db_dump.dump_postgres_db(dump_path, end_time, threads) ls.dump_listens(dump_path, dump_id=dump_id, end_time=end_time, threads=threads, spark_format=False) ls.dump_listens(dump_path, dump_id=dump_id, end_time=end_time, threads=threads, spark_format=True) try: write_hashes(dump_path) except IOError as e: current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True) return current_app.logger.info('Dumps created and hashes written at %s' % dump_path)
def create_spark_dump(location, threads): with create_app().app_context(): from listenbrainz.webserver.influx_connection import _influx as ls time_now = datetime.today() dump_path = os.path.join(location, 'listenbrainz-spark-dump-{time}'.format(time=time_now.strftime('%Y%m%d-%H%M%S'))) create_path(dump_path) ls.dump_listens(dump_path, time_now, threads, spark_format=True) try: write_hashes(dump_path) except IOError as e: current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True) return current_app.logger.info('Dump created and hash written at %s', dump_path)
def test_cleanup_dumps(self):
    create_path(os.path.join(self.tempdir, 'listenbrainz-dump-20180312-000001'))
    create_path(os.path.join(self.tempdir, 'listenbrainz-dump-20180312-000002'))
    create_path(os.path.join(self.tempdir, 'listenbrainz-dump-20180312-000003'))
    create_path(os.path.join(self.tempdir, 'listenbrainz-dump-20180312-000004'))
    create_path(os.path.join(self.tempdir, 'not-a-dump'))

    dump_manager._cleanup_dumps(self.tempdir)

    newdirs = os.listdir(self.tempdir)
    self.assertNotIn('listenbrainz-dump-20180312-000001', newdirs)
    self.assertNotIn('listenbrainz-dump-20180312-000002', newdirs)
    self.assertIn('listenbrainz-dump-20180312-000003', newdirs)
    self.assertIn('listenbrainz-dump-20180312-000004', newdirs)
    self.assertIn('not-a-dump', newdirs)
def write_spark_listens_to_disk(self, unwritten_listens, temp_dir): for year in unwritten_listens: for month in unwritten_listens[year]: if year < 2002: directory = temp_dir filename = os.path.join(directory, 'invalid.json') else: directory = os.path.join(temp_dir, str(year)) filename = os.path.join(directory, '{}.json'.format(str(month))) create_path(directory) with open(filename, 'a') as f: f.write('\n'.join([ ujson.dumps(listen) for listen in unwritten_listens[year][month] ])) f.write('\n')
def write_incremental_listens_to_disk(self, listens, temp_dir): """ Write all spark listens in year/month dir format to disk. Args: listens : the listens to be written into the disk temp_dir: the dir into which listens should be written """ for year in listens: for month in listens[year]: if year < 2002: directory = temp_dir filename = os.path.join(directory, 'invalid.json') else: directory = os.path.join(temp_dir, str(year)) filename = os.path.join(directory, '{}.json'.format(str(month))) create_path(directory) with open(filename, 'a') as f: f.write('\n'.join([ujson.dumps(listen) for listen in listens[year][month]])) f.write('\n')
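# write_spark_listens_to_disk / write_incremental_listens_to_disk expect their
# input pre-bucketed as {year: {month: [listen, ...]}}. A minimal sketch of how
# a flat stream of listens could be bucketed that way; it assumes each listen
# dict carries a unix 'listened_at' timestamp, which is an assumption made for
# illustration rather than the project's actual internal representation.
from collections import defaultdict
from datetime import datetime, timezone


def bucket_listens_by_year_and_month(listens):
    """Group listen dicts into the {year: {month: [listen, ...]}} shape."""
    buckets = defaultdict(lambda: defaultdict(list))
    for listen in listens:
        dt = datetime.fromtimestamp(listen['listened_at'], tz=timezone.utc)
        buckets[dt.year][dt.month].append(listen)
    return buckets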
def create_feedback(location, threads): """ Create a spark formatted dump of user/recommendation feedback data. Args: location (str): path to the directory where the dump should be made threads (int): the number of threads to be used while compression """ app = create_app() with app.app_context(): end_time = datetime.now() ts = end_time.strftime('%Y%m%d-%H%M%S') dump_name = 'listenbrainz-feedback-{time}-full'.format(time=ts) dump_path = os.path.join(location, dump_name) create_path(dump_path) db_dump.dump_feedback_for_spark(dump_path, end_time, threads) try: write_hashes(dump_path) except IOError as e: current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True) sys.exit(-1) try: if not sanity_check_dumps(dump_path, 3): sys.exit(-1) except OSError as e: sys.exit(-1) # if in production, send an email to interested people for observability send_dump_creation_notification(dump_name, 'feedback') # Write the DUMP_ID file so that the FTP sync scripts can be more robust with open(os.path.join(dump_path, "DUMP_ID.txt"), "w") as f: f.write("%s 0 feedback\n" % (end_time.strftime('%Y%m%d-%H%M%S'))) current_app.logger.info( 'Feedback dump created and hashes written at %s' % dump_path) sys.exit(0)
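# write_hashes and sanity_check_dumps are called throughout these dump creation
# functions but their bodies are not shown here. The sketches below are
# plausible reconstructions based only on how they are used (an .md5 and a
# .sha256 written next to each archive, giving three files per dump, and an
# expected file count checked afterwards); they are assumptions, not the
# project's actual implementations.
import hashlib
import os


def _file_digest(path, algorithm):
    digest = hashlib.new(algorithm)
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b''):
            digest.update(chunk)
    return digest.hexdigest()


def write_hashes(location):
    """Write <file>.md5 and <file>.sha256 next to every file in location."""
    for entry in os.listdir(location):
        path = os.path.join(location, entry)
        if not os.path.isfile(path):
            continue
        for algorithm in ('md5', 'sha256'):
            with open('{}.{}'.format(path, algorithm), 'w') as hash_file:
                hash_file.write('{}  {}\n'.format(_file_digest(path, algorithm), entry))


def sanity_check_dumps(location, expected_count):
    """Check that the dump directory contains the expected number of files."""
    files = [f for f in os.listdir(location)
             if os.path.isfile(os.path.join(location, f))]
    return len(files) == expected_count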
def write_listens_full(self, listens_path, users, dump_time): dump_complete = False next_user_id = 0 index = {} while not dump_complete: file_name = str(uuid.uuid4()) # directory structure of the form "/%s/%02s/%s.listens" % (uuid[0], uuid[0:2], uuid) directory = os.path.join(listens_path, file_name[0], file_name[0:2]) create_path(directory) file_path = os.path.join(directory, '{uuid}.listens'.format(uuid=file_name)) with open(file_path, 'w') as f: file_done = False while next_user_id < len(users): if f.tell() > DUMP_FILE_SIZE_LIMIT: file_done = True break username = users[next_user_id]['musicbrainz_id'] offset = f.tell() size = self.dump_user(username=username, fileobj=f, dump_time=dump_time) index[username] = { 'file_name': file_name, 'offset': offset, 'size': size, } next_user_id += 1 self.log.info("%d users done. Total: %d", next_user_id, len(users)) if file_done: continue if next_user_id == len(users): dump_complete = True break return index
def create_incremental(location, threads, dump_id): app = create_app() with app.app_context(): from listenbrainz.webserver.timescale_connection import _ts as ls if dump_id is None: end_time = datetime.now() dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s'))) else: dump_entry = db_dump.get_dump_entry(dump_id) if dump_entry is None: current_app.logger.error("No dump with ID %d found, exiting!", dump_id) sys.exit(-1) end_time = dump_entry['created'] prev_dump_entry = db_dump.get_dump_entry(dump_id - 1) if prev_dump_entry is None: # incremental dumps must have a previous dump in the series current_app.logger.error("Invalid dump ID %d, could not find previous dump", dump_id) sys.exit(-1) start_time = prev_dump_entry['created'] current_app.logger.info("Dumping data from %s to %s", start_time, end_time) dump_name = 'listenbrainz-dump-{dump_id}-{time}-incremental'.format(dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S')) dump_path = os.path.join(location, dump_name) create_path(dump_path) listens_dump_file = ls.dump_listens(dump_path, dump_id=dump_id, start_time=start_time, end_time=end_time, threads=threads) spark_dump_file = 'listenbrainz-listens-dump-{dump_id}-{time}-spark-incremental.tar.xz'.format(dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S')) spark_dump_path = os.path.join(location, dump_path, spark_dump_file) transmogrify_dump_file_to_spark_import_format(listens_dump_file, spark_dump_path, threads) try: write_hashes(dump_path) except IOError as e: current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True) return # if in production, send an email to interested people for observability send_dump_creation_notification(dump_name, 'incremental') current_app.logger.info('Dumps created and hashes written at %s' % dump_path)
def _create_dump(location, dump_type, tables, dump_time, threads=DUMP_DEFAULT_THREAD_COUNT): """ Creates a dump of the provided tables at the location passed Arguments: location: the path where the dump should be created dump_type: the type of data dump being made - private or public tables: a dict containing the names of the tables to be dumped as keys and the columns to be dumped as values dump_time: the time at which the dump process was started threads: the maximum number of threads to use for compression Returns: the path to the archive file created """ archive_name = 'listenbrainz-{dump_type}-dump-{time}'.format( dump_type=dump_type, time=dump_time.strftime('%Y%m%d-%H%M%S')) archive_path = os.path.join( location, '{archive_name}.tar.xz'.format(archive_name=archive_name, )) with open(archive_path, 'w') as archive: pxz_command = [ 'pxz', '--compress', '-T{threads}'.format(threads=threads) ] pxz = subprocess.Popen(pxz_command, stdin=subprocess.PIPE, stdout=archive) with tarfile.open(fileobj=pxz.stdin, mode='w|') as tar: temp_dir = tempfile.mkdtemp() try: schema_seq_path = os.path.join(temp_dir, "SCHEMA_SEQUENCE") with open(schema_seq_path, "w") as f: f.write(str(db.SCHEMA_VERSION)) tar.add(schema_seq_path, arcname=os.path.join(archive_name, "SCHEMA_SEQUENCE")) timestamp_path = os.path.join(temp_dir, "TIMESTAMP") with open(timestamp_path, "w") as f: f.write(dump_time.isoformat(" ")) tar.add(timestamp_path, arcname=os.path.join(archive_name, "TIMESTAMP")) tar.add(DUMP_LICENSE_FILE_PATH, arcname=os.path.join(archive_name, "COPYING")) except IOError as e: current_app.logger.error( 'IOError while adding dump metadata: %s', str(e), exc_info=True) raise except Exception as e: current_app.logger.error( 'Exception while adding dump metadata: %s', str(e), exc_info=True) raise archive_tables_dir = os.path.join(temp_dir, 'lbdump', 'lbdump') create_path(archive_tables_dir) with db.engine.connect() as connection: if dump_type == "feedback": dump_user_feedback(connection, location=archive_tables_dir) else: with connection.begin() as transaction: cursor = connection.connection.cursor() for table in tables: try: copy_table( cursor=cursor, location=archive_tables_dir, columns=','.join(tables[table]), table_name=table, ) except IOError as e: current_app.logger.error( 'IOError while copying table %s', table, exc_info=True) raise except Exception as e: current_app.logger.error( 'Error while copying table %s: %s', table, str(e), exc_info=True) raise transaction.rollback() tar.add(archive_tables_dir, arcname=os.path.join(archive_name, 'lbdump'.format(dump_type))) shutil.rmtree(temp_dir) pxz.stdin.close() pxz.wait() return archive_path
def dump_listens(self, location, dump_time=datetime.today(), threads=DUMP_DEFAULT_THREAD_COUNT):
    """ Dumps all listens in the ListenStore into a .tar.xz archive.

        Files are created with UUIDs as names. Each file can contain listens for
        a number of users. An index.json file is used to save which file contains
        the listens of which users.

        Args:
            location: the directory where the listens dump archive should be created
            dump_time (datetime): the time at which the data dump was started
            threads (int): the number of threads to use for compression

        Returns:
            the path to the dump archive
    """
    self.log.info('Beginning dump of listens from InfluxDB...')
    self.log.info('Getting list of users whose listens are to be dumped...')
    users = db_user.get_all_users(columns=['id', 'musicbrainz_id'])
    self.log.info('Total number of users: %d', len(users))

    archive_name = 'listenbrainz-listens-dump-{time}'.format(time=dump_time.strftime('%Y%m%d-%H%M%S'))
    archive_path = os.path.join(location, '{filename}.tar.xz'.format(filename=archive_name))
    with open(archive_path, 'w') as archive:
        pxz_command = ['pxz', '--compress', '-T{threads}'.format(threads=threads)]
        pxz = subprocess.Popen(pxz_command, stdin=subprocess.PIPE, stdout=archive)
        with tarfile.open(fileobj=pxz.stdin, mode='w|') as tar:
            temp_dir = tempfile.mkdtemp()
            try:
                # add timestamp
                timestamp_path = os.path.join(temp_dir, 'TIMESTAMP')
                with open(timestamp_path, 'w') as f:
                    f.write(dump_time.isoformat(' '))
                tar.add(timestamp_path, arcname=os.path.join(archive_name, 'TIMESTAMP'))

                # add schema version
                schema_version_path = os.path.join(temp_dir, 'SCHEMA_SEQUENCE')
                with open(schema_version_path, 'w') as f:
                    f.write(str(LISTENS_DUMP_SCHEMA_VERSION))
                tar.add(schema_version_path, arcname=os.path.join(archive_name, 'SCHEMA_SEQUENCE'))

                # add copyright notice
                tar.add(DUMP_LICENSE_FILE_PATH, arcname=os.path.join(archive_name, 'COPYING'))
            except IOError as e:
                self.log.critical('IOError while writing metadata dump files: %s', str(e), exc_info=True)
                raise
            except Exception as e:
                self.log.error('Exception while adding dump metadata: %s', str(e), exc_info=True)
                raise

            listens_path = os.path.join(temp_dir, 'listens')
            dump_complete = False
            next_user_id = 0
            index = {}
            while not dump_complete:
                file_name = str(uuid.uuid4())
                # directory structure of the form "/%s/%02s/%s.listens" % (uuid[0], uuid[0:2], uuid)
                directory = os.path.join(listens_path, file_name[0], file_name[0:2])
                create_path(directory)
                file_path = os.path.join(directory, '{uuid}.listens'.format(uuid=file_name))
                with open(file_path, 'w') as f:
                    file_done = False
                    while next_user_id < len(users):
                        if f.tell() > DUMP_FILE_SIZE_LIMIT:
                            file_done = True
                            break

                        username = users[next_user_id]['musicbrainz_id']
                        offset = f.tell()
                        size = self.dump_user(username=username, fileobj=f, dump_time=dump_time)
                        index[username] = {
                            'file_name': file_name,
                            'offset': offset,
                            'size': size,
                        }
                        next_user_id += 1

                if file_done:
                    continue

                if next_user_id == len(users):
                    dump_complete = True
                    break

            # add the listens directory to the archive
            self.log.info('Got all listens, adding them to the archive...')
            tar.add(listens_path, arcname=os.path.join(archive_name, 'listens'))

            # add index.json file to the archive
            try:
                index_path = os.path.join(temp_dir, 'index.json')
                with open(index_path, 'w') as f:
                    f.write(ujson.dumps(index))
                tar.add(index_path, arcname=os.path.join(archive_name, 'index.json'))
            except IOError as e:
                self.log.critical('IOError while writing index.json to archive: %s', str(e), exc_info=True)
                raise
            except Exception as e:
                self.log.error('Exception while adding index file to archive: %s', str(e), exc_info=True)
                raise

            # remove the temporary directory
            shutil.rmtree(temp_dir)

        pxz.stdin.close()
        pxz.wait()

    self.log.info('ListenBrainz listen dump done!')
    self.log.info('Dump present at %s!', archive_path)
    return archive_path
def create_full(location, threads, dump_id, do_listen_dump: bool, do_spark_dump: bool, do_db_dump: bool): """ Create a ListenBrainz data dump which includes a private dump, a statistics dump and a dump of the actual listens from the listenstore. Args: location (str): path to the directory where the dump should be made threads (int): the number of threads to be used while compression dump_id (int): the ID of the ListenBrainz data dump do_listen_dump: If True, make a listens dump do_spark_dump: If True, make a spark listens dump do_db_dump: If True, make a public/private postgres/timescale dump """ app = create_app() with app.app_context(): ls = DumpListenStore(app) if dump_id is None: end_time = datetime.now() dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s'))) else: dump_entry = db_dump.get_dump_entry(dump_id) if dump_entry is None: current_app.logger.error("No dump with ID %d found", dump_id) sys.exit(-1) end_time = dump_entry['created'] ts = end_time.strftime('%Y%m%d-%H%M%S') dump_name = 'listenbrainz-dump-{dump_id}-{time}-full'.format( dump_id=dump_id, time=ts) dump_path = os.path.join(location, dump_name) create_path(dump_path) expected_num_dumps = 0 if do_db_dump: db_dump.dump_postgres_db(dump_path, end_time, threads) expected_num_dumps += 4 if do_listen_dump: ls.dump_listens(dump_path, dump_id=dump_id, end_time=end_time, threads=threads) expected_num_dumps += 1 if do_spark_dump: ls.dump_listens_for_spark(dump_path, dump_id=dump_id, dump_type="full", end_time=end_time) expected_num_dumps += 1 try: write_hashes(dump_path) except IOError as e: current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True) sys.exit(-1) try: # 6 types of dumps, archive, md5, sha256 for each expected_num_dump_files = expected_num_dumps * 3 if not sanity_check_dumps(dump_path, expected_num_dump_files): return sys.exit(-1) except OSError: sys.exit(-1) current_app.logger.info('Dumps created and hashes written at %s' % dump_path) # Write the DUMP_ID file so that the FTP sync scripts can be more robust with open(os.path.join(dump_path, "DUMP_ID.txt"), "w") as f: f.write("%s %s full\n" % (ts, dump_id)) # if in production, send an email to interested people for observability send_dump_creation_notification(dump_name, 'fullexport') sys.exit(0)
def dump_listens(self, location, dump_time=datetime.today(), threads=DUMP_DEFAULT_THREAD_COUNT):
    """ Dumps all listens in the ListenStore into a .tar.xz archive.

        Files are created with UUIDs as names. Each file can contain listens for
        a number of users. An index.json file is used to save which file contains
        the listens of which users.

        Args:
            location: the directory where the listens dump archive should be created
            dump_time (datetime): the time at which the data dump was started
            threads (int): the number of threads to use for compression

        Returns:
            the path to the dump archive
    """
    self.log.info('Beginning dump of listens from InfluxDB...')
    self.log.info('Getting list of users whose listens are to be dumped...')
    users = db_user.get_all_users(columns=['id', 'musicbrainz_id'])
    self.log.info('Total number of users: %d', len(users))

    archive_name = 'listenbrainz-listens-dump-{time}'.format(
        time=dump_time.strftime('%Y%m%d-%H%M%S'))
    archive_path = os.path.join(
        location, '{filename}.tar.xz'.format(filename=archive_name))
    with open(archive_path, 'w') as archive:
        pxz_command = [
            'pxz', '--compress', '-T{threads}'.format(threads=threads)
        ]
        pxz = subprocess.Popen(pxz_command, stdin=subprocess.PIPE, stdout=archive)
        with tarfile.open(fileobj=pxz.stdin, mode='w|') as tar:
            temp_dir = tempfile.mkdtemp()
            try:
                # add timestamp
                timestamp_path = os.path.join(temp_dir, 'TIMESTAMP')
                with open(timestamp_path, 'w') as f:
                    f.write(dump_time.isoformat(' '))
                tar.add(timestamp_path, arcname=os.path.join(archive_name, 'TIMESTAMP'))

                # add schema version
                schema_version_path = os.path.join(temp_dir, 'SCHEMA_SEQUENCE')
                with open(schema_version_path, 'w') as f:
                    f.write(str(LISTENS_DUMP_SCHEMA_VERSION))
                tar.add(schema_version_path, arcname=os.path.join(archive_name, 'SCHEMA_SEQUENCE'))

                # add copyright notice
                tar.add(DUMP_LICENSE_FILE_PATH, arcname=os.path.join(archive_name, 'COPYING'))
            except IOError as e:
                log_ioerrors(self.log, e)
                raise
            except Exception as e:
                self.log.error('Exception while adding dump metadata: %s', str(e))
                raise

            listens_path = os.path.join(temp_dir, 'listens')
            dump_complete = False
            next_user_id = 0
            index = {}
            while not dump_complete:
                file_name = str(uuid.uuid4())
                # directory structure of the form "/%s/%02s/%s.listens" % (uuid[0], uuid[0:2], uuid)
                directory = os.path.join(listens_path, file_name[0], file_name[0:2])
                create_path(directory)
                file_path = os.path.join(
                    directory, '{uuid}.listens'.format(uuid=file_name))
                with open(file_path, 'w') as f:
                    file_done = False
                    while next_user_id < len(users):
                        if f.tell() > DUMP_FILE_SIZE_LIMIT:
                            file_done = True
                            break

                        username = users[next_user_id]['musicbrainz_id']
                        offset = f.tell()
                        size = self.dump_user(username=username, fileobj=f, dump_time=dump_time)
                        index[username] = {
                            'file_name': file_name,
                            'offset': offset,
                            'size': size,
                        }
                        next_user_id += 1

                if file_done:
                    continue

                if next_user_id == len(users):
                    dump_complete = True
                    break

            # add the listens directory to the archive
            self.log.info('Got all listens, adding them to the archive...')
            tar.add(listens_path, arcname=os.path.join(archive_name, 'listens'))

            # add index.json file to the archive
            try:
                index_path = os.path.join(temp_dir, 'index.json')
                with open(index_path, 'w') as f:
                    f.write(ujson.dumps(index))
                tar.add(index_path, arcname=os.path.join(archive_name, 'index.json'))
            except IOError as e:
                log_ioerrors(self.log, e)
                raise
            except Exception as e:
                self.log.error('Exception while adding index file to archive: %s', str(e))
                raise

            # remove the temporary directory
            shutil.rmtree(temp_dir)

        pxz.stdin.close()

    self.log.info('ListenBrainz listen dump done!')
    self.log.info('Dump present at %s!', archive_path)
    return archive_path
def create_incremental(location, threads, dump_id): app = create_app() with app.app_context(): ls = DumpListenStore(app) if dump_id is None: end_time = datetime.now() dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s'))) else: dump_entry = db_dump.get_dump_entry(dump_id) if dump_entry is None: current_app.logger.error("No dump with ID %d found, exiting!", dump_id) sys.exit(-1) end_time = dump_entry['created'] prev_dump_entry = db_dump.get_dump_entry(dump_id - 1) if prev_dump_entry is None: # incremental dumps must have a previous dump in the series current_app.logger.error( "Invalid dump ID %d, could not find previous dump", dump_id) sys.exit(-1) start_time = prev_dump_entry['created'] current_app.logger.info("Dumping data from %s to %s", start_time, end_time) dump_name = 'listenbrainz-dump-{dump_id}-{time}-incremental'.format( dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S')) dump_path = os.path.join(location, dump_name) create_path(dump_path) ls.dump_listens(dump_path, dump_id=dump_id, start_time=start_time, end_time=end_time, threads=threads) ls.dump_listens_for_spark(dump_path, dump_id=dump_id, dump_type="incremental", start_time=start_time, end_time=end_time) try: write_hashes(dump_path) except IOError as e: current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True) sys.exit(-1) try: if not sanity_check_dumps(dump_path, 6): return sys.exit(-1) except OSError as e: sys.exit(-1) # if in production, send an email to interested people for observability send_dump_creation_notification(dump_name, 'incremental') # Write the DUMP_ID file so that the FTP sync scripts can be more robust with open(os.path.join(dump_path, "DUMP_ID.txt"), "w") as f: f.write("%s %s incremental\n" % (end_time.strftime('%Y%m%d-%H%M%S'), dump_id)) current_app.logger.info('Dumps created and hashes written at %s' % dump_path) sys.exit(0)
def create_full(location, threads, dump_id, last_dump_id): """ Create a ListenBrainz data dump which includes a private dump, a statistics dump and a dump of the actual listens from the listenstore Args: location (str): path to the directory where the dump should be made threads (int): the number of threads to be used while compression dump_id (int): the ID of the ListenBrainz data dump last_dump_id (bool): flag indicating whether to create a full dump from the last entry in the dump table """ app = create_app() with app.app_context(): from listenbrainz.webserver.timescale_connection import _ts as ls if last_dump_id: all_dumps = db_dump.get_dump_entries() if len(all_dumps) == 0: current_app.logger.error( "Cannot create full dump with last dump's ID, no dump exists!" ) sys.exit(-1) dump_id = all_dumps[0]['id'] if dump_id is None: end_time = datetime.now() dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s'))) else: dump_entry = db_dump.get_dump_entry(dump_id) if dump_entry is None: current_app.logger.error("No dump with ID %d found", dump_id) sys.exit(-1) end_time = dump_entry['created'] ts = end_time.strftime('%Y%m%d-%H%M%S') dump_name = 'listenbrainz-dump-{dump_id}-{time}-full'.format( dump_id=dump_id, time=ts) dump_path = os.path.join(location, dump_name) create_path(dump_path) db_dump.dump_postgres_db(dump_path, end_time, threads) listens_dump_file = ls.dump_listens(dump_path, dump_id=dump_id, end_time=end_time, threads=threads) spark_dump_file = 'listenbrainz-listens-dump-{dump_id}-{time}-spark-full.tar.xz'.format( dump_id=dump_id, time=ts) spark_dump_path = os.path.join(location, dump_path, spark_dump_file) transmogrify_dump_file_to_spark_import_format(listens_dump_file, spark_dump_path, threads) try: write_hashes(dump_path) except IOError as e: current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True) sys.exit(-1) try: if not sanity_check_dumps(dump_path, 12): return sys.exit(-1) except OSError as e: sys.exit(-1) # if in production, send an email to interested people for observability send_dump_creation_notification(dump_name, 'fullexport') current_app.logger.info('Dumps created and hashes written at %s' % dump_path) # Write the DUMP_ID file so that the FTP sync scripts can be more robust with open(os.path.join(dump_path, "DUMP_ID.txt"), "w") as f: f.write("%s %s full\n" % (ts, dump_id)) sys.exit(0)
def test_cleanup_dumps(self): create_path( os.path.join(self.tempdir, 'listenbrainz-dump-1-20180312-000001-full')) create_path( os.path.join(self.tempdir, 'listenbrainz-dump-2-20180312-000002-full')) create_path( os.path.join(self.tempdir, 'listenbrainz-dump-3-20180312-000003-full')) create_path( os.path.join(self.tempdir, 'listenbrainz-dump-4-20180312-000004-full')) create_path( os.path.join(self.tempdir, 'listenbrainz-dump-1-20180312-000001-incremental')) create_path( os.path.join(self.tempdir, 'listenbrainz-dump-2-20180312-000002-incremental')) create_path( os.path.join(self.tempdir, 'listenbrainz-dump-3-20180312-000003-incremental')) create_path( os.path.join(self.tempdir, 'listenbrainz-dump-4-20180312-000004-incremental')) create_path( os.path.join(self.tempdir, 'listenbrainz-dump-5-20180312-000005-incremental')) create_path( os.path.join(self.tempdir, 'listenbrainz-dump-6-20180312-000006-incremental')) create_path( os.path.join(self.tempdir, 'listenbrainz-dump-7-20180312-000007-incremental')) create_path( os.path.join(self.tempdir, 'listenbrainz-dump-99-20200124-000007-incremental')) create_path( os.path.join(self.tempdir, 'listenbrainz-dump-100-20200124-000008-incremental')) create_path(os.path.join(self.tempdir, 'not-a-dump')) dump_manager._cleanup_dumps(self.tempdir) newdirs = os.listdir(self.tempdir) self.assertNotIn('listenbrainz-dump-1-20180312-000001-full', newdirs) self.assertNotIn('listenbrainz-dump-2-20180312-000002-full', newdirs) self.assertIn('listenbrainz-dump-3-20180312-000003-full', newdirs) self.assertIn('listenbrainz-dump-4-20180312-000004-full', newdirs) self.assertNotIn('listenbrainz-dump-1-20180312-000001-incremental', newdirs) self.assertNotIn('listenbrainz-dump-2-20180312-000002-incremental', newdirs) self.assertNotIn('listenbrainz-dump-3-20180312-000003-incremental', newdirs) self.assertIn('listenbrainz-dump-4-20180312-000004-incremental', newdirs) self.assertIn('listenbrainz-dump-5-20180312-000005-incremental', newdirs) self.assertIn('listenbrainz-dump-6-20180312-000006-incremental', newdirs) self.assertIn('listenbrainz-dump-7-20180312-000007-incremental', newdirs) self.assertIn('listenbrainz-dump-99-20200124-000007-incremental', newdirs) self.assertIn('listenbrainz-dump-100-20200124-000008-incremental', newdirs) self.assertIn('not-a-dump', newdirs)
def dump_listens_for_spark( self, location, dump_id: int, dump_type: str, start_time: datetime = datetime.utcfromtimestamp( DATA_START_YEAR_IN_SECONDS), end_time: datetime = None): """ Dumps all listens in the ListenStore into spark parquet files in a .tar archive. Listens are dumped into files ideally no larger than 128MB, sorted from oldest to newest. Files are named #####.parguet with monotonically increasing integers starting with 0. This creates an incremental dump if start_time is specified (with range start_time to end_time), otherwise it creates a full dump with all listens. Args: location: the directory where the listens dump archive should be created dump_id: the ID of the dump in the dump sequence dump_type: type of dump, full or incremental start_time: the start of the time range for which listens should be dumped. defaults to utc 0 (meaning a full dump) end_time: the end of time range for which listens should be dumped. defaults to the current time Returns: the path to the dump archive """ if end_time is None: end_time = datetime.now() self.log.info('Beginning spark dump of listens from TimescaleDB...') full_dump = bool(start_time == datetime.utcfromtimestamp( DATA_START_YEAR_IN_SECONDS)) archive_name = 'listenbrainz-spark-dump-{dump_id}-{time}'.format( dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S')) if full_dump: archive_name = '{}-full'.format(archive_name) else: archive_name = '{}-incremental'.format(archive_name) archive_path = os.path.join( location, '{filename}.tar'.format(filename=archive_name)) parquet_index = 0 with tarfile.open(archive_path, "w") as tar: temp_dir = os.path.join(self.dump_temp_dir_root, str(uuid.uuid4())) create_path(temp_dir) self.write_dump_metadata(archive_name, start_time, end_time, temp_dir, tar, full_dump) for year in range(start_time.year, end_time.year + 1): if year == start_time.year: start = start_time else: start = datetime(year=year, day=1, month=1) if year == end_time.year: end = end_time else: end = datetime(year=year + 1, day=1, month=1) self.log.info("dump %s to %s" % (start.strftime("%Y-%m-%d %H:%M:%S"), end.strftime("%Y-%m-%d %H:%M:%S"))) # This try block is here in an effort to expose bugs that occur during testing # Without it sometimes test pass and sometimes they give totally unrelated errors. # Keeping this block should help with future testing... try: parquet_index = self.write_parquet_files( archive_name, temp_dir, tar, dump_type, start, end, parquet_index) except Exception as err: self.log.info("likely test failure: " + str(err)) raise shutil.rmtree(temp_dir) self.log.info('ListenBrainz spark listen dump done!') self.log.info('Dump present at %s!', archive_path) return archive_path
def _create_dump(location, dump_type, tables, dump_time, threads=DUMP_DEFAULT_THREAD_COUNT): """ Creates a dump of the provided tables at the location passed Arguments: location: the path where the dump should be created dump_type: the type of data dump being made - private or public tables: a dict containing the names of the tables to be dumped as keys and the columns to be dumped as values dump_time: the time at which the dump process was started threads: the maximum number of threads to use for compression Returns: the path to the archive file created """ archive_name = 'listenbrainz-{dump_type}-dump-{time}'.format( dump_type=dump_type, time=dump_time.strftime('%Y%m%d-%H%M%S') ) archive_path = os.path.join(location, '{archive_name}.tar.xz'.format( archive_name=archive_name, )) with open(archive_path, 'w') as archive: pxz_command = ['pxz', '--compress', '-T{threads}'.format(threads=threads)] pxz = subprocess.Popen(pxz_command, stdin=subprocess.PIPE, stdout=archive) with tarfile.open(fileobj=pxz.stdin, mode='w|') as tar: temp_dir = tempfile.mkdtemp() try: schema_seq_path = os.path.join(temp_dir, "SCHEMA_SEQUENCE") with open(schema_seq_path, "w") as f: f.write(str(db.SCHEMA_VERSION)) tar.add(schema_seq_path, arcname=os.path.join(archive_name, "SCHEMA_SEQUENCE")) timestamp_path = os.path.join(temp_dir, "TIMESTAMP") with open(timestamp_path, "w") as f: f.write(dump_time.isoformat(" ")) tar.add(timestamp_path, arcname=os.path.join(archive_name, "TIMESTAMP")) tar.add(DUMP_LICENSE_FILE_PATH, arcname=os.path.join(archive_name, "COPYING")) except IOError as e: current_app.logger.error('IOError while adding dump metadata: %s', str(e), exc_info=True) raise except Exception as e: current_app.logger.error('Exception while adding dump metadata: %s', str(e), exc_info=True) raise archive_tables_dir = os.path.join(temp_dir, 'lbdump', 'lbdump') create_path(archive_tables_dir) with db.engine.connect() as connection: with connection.begin() as transaction: cursor = connection.connection.cursor() for table in tables: try: copy_table( cursor=cursor, location=archive_tables_dir, columns=','.join(tables[table]), table_name=table, ) except IOError as e: current_app.logger.error('IOError while copying table %s', table, exc_info=True) raise except Exception as e: current_app.logger.error('Error while copying table %s: %s', table, str(e), exc_info=True) raise transaction.rollback() tar.add(archive_tables_dir, arcname=os.path.join(archive_name, 'lbdump'.format(dump_type))) shutil.rmtree(temp_dir) pxz.stdin.close() pxz.wait() return archive_path
def test_cleanup_dumps(self):
    create_path(os.path.join(self.tempdir, 'listenbrainz-dump-1-20180312-000001-full'))
    create_path(os.path.join(self.tempdir, 'listenbrainz-dump-2-20180312-000002-full'))
    create_path(os.path.join(self.tempdir, 'listenbrainz-dump-3-20180312-000003-full'))
    create_path(os.path.join(self.tempdir, 'listenbrainz-dump-4-20180312-000004-full'))
    for i in range(1, 50):
        create_path(os.path.join(
            self.tempdir,
            'listenbrainz-dump-%d-20180312-%06d-incremental' % (i, i)))
    create_path(os.path.join(self.tempdir, 'listenbrainz-dump-99-20200124-000007-incremental'))
    create_path(os.path.join(self.tempdir, 'listenbrainz-dump-100-20200124-000008-incremental'))
    create_path(os.path.join(self.tempdir, 'listenbrainz-feedback-20180312-000001-full'))
    create_path(os.path.join(self.tempdir, 'listenbrainz-feedback-20180312-000002-full'))
    create_path(os.path.join(self.tempdir, 'listenbrainz-feedback-20180312-000003-full'))
    create_path(os.path.join(self.tempdir, 'listenbrainz-feedback-20180312-000004-full'))
    create_path(os.path.join(self.tempdir, 'not-a-dump'))

    dump_manager._cleanup_dumps(self.tempdir)

    newdirs = os.listdir(self.tempdir)
    self.assertNotIn('listenbrainz-dump-1-20180312-000001-full', newdirs)
    self.assertNotIn('listenbrainz-dump-2-20180312-000002-full', newdirs)
    self.assertIn('listenbrainz-dump-3-20180312-000003-full', newdirs)
    self.assertIn('listenbrainz-dump-4-20180312-000004-full', newdirs)

    self.assertNotIn('listenbrainz-dump-1-20180312-000001-incremental', newdirs)
    self.assertNotIn('listenbrainz-dump-2-20180312-000002-incremental', newdirs)
    self.assertNotIn('listenbrainz-dump-3-20180312-000003-incremental', newdirs)
    self.assertNotIn('listenbrainz-dump-21-20180312-000021-incremental', newdirs)
    for i in range(22, 50):
        self.assertIn('listenbrainz-dump-%d-20180312-%06d-incremental' % (i, i), newdirs)
    self.assertIn('listenbrainz-dump-99-20200124-000007-incremental', newdirs)
    self.assertIn('listenbrainz-dump-100-20200124-000008-incremental', newdirs)

    self.assertNotIn('listenbrainz-feedback-20180312-000001-full', newdirs)
    self.assertNotIn('listenbrainz-feedback-20180312-000002-full', newdirs)
    self.assertIn('listenbrainz-feedback-20180312-000003-full', newdirs)
    self.assertIn('listenbrainz-feedback-20180312-000004-full', newdirs)

    self.assertIn('not-a-dump', newdirs)
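# The tests above pin down the behaviour expected of dump_manager._cleanup_dumps:
# unrelated directories are left alone, only the two newest full dumps and two
# newest feedback dumps survive, and roughly the thirty newest incremental dumps
# are kept in the larger test. The sketch below is an illustrative reconstruction
# of such a retention policy; the keep-counts and name parsing are inferred from
# the tests, not taken from the real implementation.
import os
import re
import shutil


def _cleanup_dumps(location, keep_full=2, keep_incremental=30, keep_feedback=2):
    patterns = {
        'full': re.compile(r'^listenbrainz-dump-\d+-\d+-\d+-full$'),
        'incremental': re.compile(r'^listenbrainz-dump-\d+-\d+-\d+-incremental$'),
        'feedback': re.compile(r'^listenbrainz-feedback-\d+-\d+-full$'),
    }
    keep = {'full': keep_full, 'incremental': keep_incremental, 'feedback': keep_feedback}

    for kind, pattern in patterns.items():
        # sort full/incremental dumps by dump id, feedback dumps by timestamp in the name
        dumps = sorted(
            (d for d in os.listdir(location) if pattern.match(d)),
            key=lambda name: int(name.split('-')[2]) if kind != 'feedback' else name,
            reverse=True,
        )
        for stale in dumps[keep[kind]:]:
            shutil.rmtree(os.path.join(location, stale))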
def dump_listens(self, location, dump_time=datetime.today(), threads=None):
    """ Fetches listens of each user from her measurement and dumps them into a file.
        These files are compressed into an archive.

        Args:
            location: the directory where the listens dump archive should be created
            dump_time (datetime): the time at which the data dump was started
            threads (int): the number of threads to use for compression

        Returns:
            the path to the dump archive
    """
    self.log.info('Beginning dump of listens from InfluxDB...')
    self.log.info('Getting list of users whose listens are to be dumped...')
    users = db_user.get_all_users()
    self.log.info('Total number of users: %d', len(users))

    archive_name = 'listenbrainz-listens-dump-{time}'.format(
        time=dump_time.strftime('%Y%m%d-%H%M%S'))
    archive_path = os.path.join(
        location, '{filename}.tar.xz'.format(filename=archive_name))
    with open(archive_path, 'w') as archive:
        pxz_command = ['pxz', '--compress']
        if threads is not None:
            pxz_command.append('-T {threads}'.format(threads=threads))
        pxz = subprocess.Popen(pxz_command, stdin=subprocess.PIPE, stdout=archive)
        with tarfile.open(fileobj=pxz.stdin, mode='w|') as tar:
            temp_dir = tempfile.mkdtemp()
            try:
                # add timestamp
                timestamp_path = os.path.join(temp_dir, 'TIMESTAMP')
                with open(timestamp_path, 'w') as f:
                    f.write(dump_time.isoformat(' '))
                tar.add(timestamp_path, arcname=os.path.join(archive_name, 'TIMESTAMP'))

                # add schema version
                schema_version_path = os.path.join(temp_dir, 'SCHEMA_SEQUENCE')
                with open(schema_version_path, 'w') as f:
                    f.write(str(LISTENS_DUMP_SCHEMA_VERSION))
                tar.add(schema_version_path, arcname=os.path.join(archive_name, 'SCHEMA_SEQUENCE'))

                # add copyright notice
                tar.add(DUMP_LICENSE_FILE_PATH, arcname=os.path.join(archive_name, 'COPYING'))
            except IOError as e:
                log_ioerrors(self.log, e)
                raise
            except Exception as e:
                self.log.error('Exception while adding dump metadata: %s', str(e))
                raise

            listens_path = os.path.join(temp_dir, 'listens')
            create_path(listens_path)

            # get listens from all measurements and write them to files in
            # a temporary dir before adding them to the archive
            for user in users:
                username = user['musicbrainz_id']
                offset = 0
                user_listens_file = '{username}.listens'.format(username=username)
                user_listens_path = os.path.join(listens_path, user_listens_file)
                with open(user_listens_path, 'w') as f:
                    # Get this user's listens in chunks
                    while True:
                        # loop until we get this chunk of listens
                        while True:
                            try:
                                result = self.influx.query("""
                                    SELECT * FROM {measurement}
                                    WHERE time <= {timestamp}
                                    ORDER BY time DESC
                                    LIMIT {limit}
                                    OFFSET {offset}
                                """.format(
                                    measurement=get_escaped_measurement_name(username),
                                    timestamp=get_influx_query_timestamp(dump_time.strftime('%s')),
                                    limit=DUMP_CHUNK_SIZE,
                                    offset=offset,
                                ))
                                break
                            except Exception as e:
                                self.log.error('Error while getting listens for user %s', user['musicbrainz_id'])
                                self.log.error(str(e))
                                time.sleep(3)

                        rows = list(result.get_points(get_measurement_name(username)))
                        if not rows:
                            break

                        for row in rows:
                            listen = Listen.from_influx(row).to_api()
                            try:
                                f.write(ujson.dumps(listen))
                                f.write('\n')
                            except IOError as e:
                                log_ioerrors(self.log, e)
                                raise
                            except Exception as e:
                                self.log.error('Exception while creating json for user: %s', user['musicbrainz_id'])
                                self.log.error(str(e))
                                raise

                        offset += DUMP_CHUNK_SIZE

            # add the listens directory to the archive
            self.log.info('Got all listens, adding them to the archive...')
            tar.add(listens_path, arcname=os.path.join(archive_name, 'listens'))

            # remove the temporary directory
            shutil.rmtree(temp_dir)

        pxz.stdin.close()

    self.log.info('ListenBrainz listen dump done!')
    self.log.info('Dump present at %s!', archive_path)
    return archive_path