Example #1
def create_full(location, threads, dump_id, last_dump_id):
    """ Create a ListenBrainz data dump which includes a private dump, a statistics dump
        and a dump of the actual listens from the listenstore

        Args:
            location (str): path to the directory where the dump should be made
            threads (int): the number of threads to use for compression
            dump_id (int): the ID of the ListenBrainz data dump
            last_dump_id (bool): flag indicating whether to create a full dump from the last entry in the dump table
    """
    app = create_app()
    with app.app_context():
        from listenbrainz.webserver.timescale_connection import _ts as ls
        if last_dump_id:
            all_dumps = db_dump.get_dump_entries()
            if len(all_dumps) == 0:
                current_app.logger.error("Cannot create full dump with last dump's ID, no dump exists!")
                sys.exit(-1)
            dump_id = all_dumps[0]['id']

        if dump_id is None:
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found", dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        dump_name = 'listenbrainz-dump-{dump_id}-{time}-full'.format(dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S'))
        dump_path = os.path.join(location, dump_name)
        create_path(dump_path)
        db_dump.dump_postgres_db(dump_path, end_time, threads)

        listens_dump_file = ls.dump_listens(dump_path, dump_id=dump_id, end_time=end_time, threads=threads)
        spark_dump_file = 'listenbrainz-listens-dump-{dump_id}-{time}-spark-full.tar.xz'.format(
            dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S'))
        # dump_path already includes location, so join against it directly
        spark_dump_path = os.path.join(dump_path, spark_dump_file)
        transmogrify_dump_file_to_spark_import_format(listens_dump_file, spark_dump_path, threads)

        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True)
            return

        # if in production, send an email to interested people for observability
        send_dump_creation_notification(dump_name, 'fullexport')

        current_app.logger.info('Dumps created and hashes written at %s' % dump_path)
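
A minimal usage sketch for the function above. The dump location and thread count are illustrative values, and it assumes the surrounding dump_manager module provides the imports used in the function (create_app, db_dump, write_hashes, and so on):

if __name__ == '__main__':
    # dump_id=None adds a fresh dump entry timestamped "now";
    # last_dump_id=False means we do not reuse the most recent entry in the dump table
    create_full(location='/tmp/listenbrainz-dumps', threads=4, dump_id=None, last_dump_id=False)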
Example #2
def create_incremental(location, threads, dump_id):
    """ Create an incremental ListenBrainz data dump of the listens submitted since the
        previous dump in the series

        Args:
            location (str): path to the directory where the dump should be made
            threads (int): the number of threads to use for compression
            dump_id (int): the ID of the ListenBrainz data dump
    """
    app = create_app()
    with app.app_context():
        from listenbrainz.webserver.timescale_connection import _ts as ls
        if dump_id is None:
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found, exiting!", dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        prev_dump_entry = db_dump.get_dump_entry(dump_id - 1)
        if prev_dump_entry is None: # incremental dumps must have a previous dump in the series
            current_app.logger.error("Invalid dump ID %d, could not find previous dump", dump_id)
            sys.exit(-1)
        start_time = prev_dump_entry['created']
        current_app.logger.info("Dumping data from %s to %s", start_time, end_time)

        dump_name = 'listenbrainz-dump-{dump_id}-{time}-incremental'.format(dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S'))
        dump_path = os.path.join(location, dump_name)
        create_path(dump_path)
        listens_dump_file = ls.dump_listens(dump_path, dump_id=dump_id, start_time=start_time, end_time=end_time, threads=threads)
        spark_dump_file = 'listenbrainz-listens-dump-{dump_id}-{time}-spark-incremental.tar.xz'.format(
            dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S'))
        # dump_path already includes location, so join against it directly
        spark_dump_path = os.path.join(dump_path, spark_dump_file)
        transmogrify_dump_file_to_spark_import_format(listens_dump_file, spark_dump_path, threads)
        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True)
            return

        # if in production, send an email to interested people for observability
        send_dump_creation_notification(dump_name, 'incremental')

        current_app.logger.info('Dumps created and hashes written at %s' % dump_path)
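
Similarly, a hypothetical call to the incremental variant, again assuming the module's imports and configuration are in place:

if __name__ == '__main__':
    # dump_id=None creates a new dump entry; the previous entry's 'created'
    # timestamp becomes start_time, so a prior dump must already exist
    create_incremental(location='/tmp/listenbrainz-dumps', threads=4, dump_id=None)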
Example #3
def create_full(location, threads, dump_id):
    """ Create a ListenBrainz data dump which includes a private dump, a statistics dump
        and a dump of the actual listens from the listenstore

        Args:
            location (str): path to the directory where the dump should be made
            threads (int): the number of threads to use for compression
            dump_id (int): the ID of the ListenBrainz data dump
    """
    app = create_app()
    with app.app_context():
        from listenbrainz.webserver.timescale_connection import _ts as ls
        if dump_id is None:
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found", dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        ts = end_time.strftime('%Y%m%d-%H%M%S')
        dump_name = 'listenbrainz-dump-{dump_id}-{time}-full'.format(
            dump_id=dump_id, time=ts)
        dump_path = os.path.join(location, dump_name)
        create_path(dump_path)

        db_dump.dump_postgres_db(dump_path, end_time, threads)
        ls.dump_listens(dump_path,
                        dump_id=dump_id,
                        end_time=end_time,
                        threads=threads)
        ls.dump_listens_for_spark(dump_path,
                                  dump_id=dump_id,
                                  end_time=end_time)

        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s',
                                     str(e),
                                     exc_info=True)
            sys.exit(-1)

        try:
            if not sanity_check_dumps(dump_path, 12):
                sys.exit(-1)
        except OSError as e:
            current_app.logger.error('Sanity check of the dumps failed! Error: %s',
                                     str(e),
                                     exc_info=True)
            sys.exit(-1)

        current_app.logger.info('Dumps created and hashes written at %s' %
                                dump_path)

        # Write the DUMP_ID file so that the FTP sync scripts can be more robust
        with open(os.path.join(dump_path, "DUMP_ID.txt"), "w") as f:
            f.write("%s %s full\n" % (ts, dump_id))

        # if in production, send an email to interested people for observability
        send_dump_creation_notification(dump_name, 'fullexport')

        sys.exit(0)
Example #4
def create_incremental(location, threads, dump_id):
    """ Create an incremental ListenBrainz data dump of the listens submitted since the
        previous dump in the series

        Args:
            location (str): path to the directory where the dump should be made
            threads (int): the number of threads to use for compression
            dump_id (int): the ID of the ListenBrainz data dump
    """
    app = create_app()
    with app.app_context():
        from listenbrainz.webserver.timescale_connection import _ts as ls
        if dump_id is None:
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found, exiting!",
                                         dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        prev_dump_entry = db_dump.get_dump_entry(dump_id - 1)
        if prev_dump_entry is None:  # incremental dumps must have a previous dump in the series
            current_app.logger.error(
                "Invalid dump ID %d, could not find previous dump", dump_id)
            sys.exit(-1)
        start_time = prev_dump_entry['created']
        current_app.logger.info("Dumping data from %s to %s", start_time,
                                end_time)

        dump_name = 'listenbrainz-dump-{dump_id}-{time}-incremental'.format(
            dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S'))
        dump_path = os.path.join(location, dump_name)
        create_path(dump_path)

        ls.dump_listens(dump_path,
                        dump_id=dump_id,
                        start_time=start_time,
                        end_time=end_time,
                        threads=threads)
        ls.dump_listens_for_spark(dump_path,
                                  dump_id=dump_id,
                                  start_time=start_time,
                                  end_time=end_time)

        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s',
                                     str(e),
                                     exc_info=True)
            sys.exit(-1)

        try:
            if not sanity_check_dumps(dump_path, 6):
                sys.exit(-1)
        except OSError as e:
            current_app.logger.error('Sanity check of the dumps failed! Error: %s',
                                     str(e),
                                     exc_info=True)
            sys.exit(-1)

        # if in production, send an email to interested people for observability
        send_dump_creation_notification(dump_name, 'incremental')

        # Write the DUMP_ID file so that the FTP sync scripts can be more robust
        with open(os.path.join(dump_path, "DUMP_ID.txt"), "w") as f:
            f.write("%s %s incremental\n" %
                    (end_time.strftime('%Y%m%d-%H%M%S'), dump_id))

        current_app.logger.info('Dumps created and hashes written at %s' %
                                dump_path)
        sys.exit(0)