def create(location, threads):
    """ Create a ListenBrainz data dump which includes a private dump, a statistics dump
        and a dump of the actual listens from InfluxDB

        Args:
            location (str): path to the directory where the dump should be made
            threads (int): the number of threads to be used during compression
    """
    db.init_db_connection(config.SQLALCHEMY_DATABASE_URI)
    ls = init_influx_connection(log,  {
        'REDIS_HOST': config.REDIS_HOST,
        'REDIS_PORT': config.REDIS_PORT,
        'REDIS_NAMESPACE': config.REDIS_NAMESPACE,
        'INFLUX_HOST': config.INFLUX_HOST,
        'INFLUX_PORT': config.INFLUX_PORT,
        'INFLUX_DB_NAME': config.INFLUX_DB_NAME,
    })
    time_now = datetime.today()
    dump_path = os.path.join(location, 'listenbrainz-dump-{time}'.format(time=time_now.strftime('%Y%m%d-%H%M%S')))
    create_path(dump_path)
    db_dump.dump_postgres_db(dump_path, time_now, threads)
    ls.dump_listens(dump_path, time_now, threads)
    try:
        write_hashes(dump_path)
    except IOError as e:
        log.error('Unable to create hash files! Error: %s', str(e))
        return
    log.info('Dumps created and hashes written at %s' % dump_path)
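The write_hashes helper used above is not shown in this example. A minimal, hypothetical sketch (assuming it only needs to write checksum files next to each archive in the dump directory, not the project's actual implementation):
import hashlib
import os


def write_hashes_sketch(location):
    """Hypothetical sketch: write <file>.md5 and <file>.sha256 for every file in the dump directory."""
    for entry in os.listdir(location):
        full_path = os.path.join(location, entry)
        if not os.path.isfile(full_path):
            continue
        for algorithm in ('md5', 'sha256'):
            hasher = hashlib.new(algorithm)
            with open(full_path, 'rb') as f:
                for chunk in iter(lambda: f.read(8192), b''):
                    hasher.update(chunk)
            # write "<hexdigest>  <filename>" in the usual md5sum/sha256sum format
            with open('{}.{}'.format(full_path, algorithm), 'w') as hash_file:
                hash_file.write('{}  {}\n'.format(hasher.hexdigest(), entry))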
Example 2
def create(location, threads):
    """ Create a ListenBrainz data dump which includes a private dump, a statistics dump
        and a dump of the actual listens from InfluxDB

        Args:
            location (str): path to the directory where the dump should be made
            threads (int): the number of threads to be used during compression
    """
    app = create_app()
    with app.app_context():
        ls = init_influx_connection(
            current_app.logger, {
                'REDIS_HOST': current_app.config['REDIS_HOST'],
                'REDIS_PORT': current_app.config['REDIS_PORT'],
                'REDIS_NAMESPACE': current_app.config['REDIS_NAMESPACE'],
                'INFLUX_HOST': current_app.config['INFLUX_HOST'],
                'INFLUX_PORT': current_app.config['INFLUX_PORT'],
                'INFLUX_DB_NAME': current_app.config['INFLUX_DB_NAME'],
            })
        time_now = datetime.today()
        dump_path = os.path.join(
            location, 'listenbrainz-dump-{time}'.format(
                time=time_now.strftime('%Y%m%d-%H%M%S')))
        create_path(dump_path)
        db_dump.dump_postgres_db(dump_path, time_now, threads)
        ls.dump_listens(dump_path, time_now, threads)
        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s',
                                     str(e),
                                     exc_info=True)
            return
        current_app.logger.info('Dumps created and hashes written at %s' %
                                dump_path)
def create(location, threads):
    """ Create a ListenBrainz data dump which includes a private dump, a statistics dump
        and a dump of the actual listens from InfluxDB

        Args:
            location (str): path to the directory where the dump should be made
            threads (int): the number of threads to be used during compression
    """
    app = create_app()
    with app.app_context():
        ls = init_influx_connection(current_app.logger,  {
            'REDIS_HOST': current_app.config['REDIS_HOST'],
            'REDIS_PORT': current_app.config['REDIS_PORT'],
            'REDIS_NAMESPACE': current_app.config['REDIS_NAMESPACE'],
            'INFLUX_HOST': current_app.config['INFLUX_HOST'],
            'INFLUX_PORT': current_app.config['INFLUX_PORT'],
            'INFLUX_DB_NAME': current_app.config['INFLUX_DB_NAME'],
        })
        time_now = datetime.today()
        dump_path = os.path.join(location, 'listenbrainz-dump-{time}'.format(time=time_now.strftime('%Y%m%d-%H%M%S')))
        create_path(dump_path)
        db_dump.dump_postgres_db(dump_path, time_now, threads)
        ls.dump_listens(dump_path, time_now, threads)
        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True)
            return
        current_app.logger.info('Dumps created and hashes written at %s' % dump_path)
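create_path is also only called, never defined, in these examples. A minimal sketch, assuming it simply has to guarantee that the dump directory exists:
import os


def create_path_sketch(path):
    """Hypothetical sketch: create the dump directory, tolerating an existing one."""
    os.makedirs(path, exist_ok=True)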
def create_full(location, threads, dump_id, last_dump_id):
    """ Create a ListenBrainz data dump which includes a private dump, a statistics dump
        and a dump of the actual listens from InfluxDB

        Args:
            location (str): path to the directory where the dump should be made
            threads (int): the number of threads to be used during compression
            dump_id (int): the ID of the ListenBrainz data dump
            last_dump_id (bool): flag indicating whether to create a full dump from the last entry in the dump table
    """
    app = create_app()
    with app.app_context():
        from listenbrainz.webserver.influx_connection import _influx as ls
        if last_dump_id:
            all_dumps = db_dump.get_dump_entries()
            if len(all_dumps) == 0:
                current_app.logger.error(
                    "Cannot create full dump with last dump's ID, no dump exists!"
                )
                sys.exit(-1)
            dump_id = all_dumps[0]['id']

        if dump_id is None:
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found", dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        dump_path = os.path.join(
            location, 'listenbrainz-dump-{dump_id}-{time}-full'.format(
                dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S')))
        create_path(dump_path)
        db_dump.dump_postgres_db(dump_path, end_time, threads)
        ls.dump_listens(dump_path,
                        dump_id=dump_id,
                        end_time=end_time,
                        threads=threads,
                        spark_format=False)
        ls.dump_listens(dump_path,
                        dump_id=dump_id,
                        end_time=end_time,
                        threads=threads,
                        spark_format=True)
        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s',
                                     str(e),
                                     exc_info=True)
            return
        current_app.logger.info('Dumps created and hashes written at %s' %
                                dump_path)
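For illustration, the format string used above yields directory names like the following (the dump ID and timestamp are hypothetical values):
from datetime import datetime

end_time = datetime(2019, 7, 1, 0, 0, 0)  # hypothetical dump creation time
dump_id = 42                              # hypothetical dump ID
dump_name = 'listenbrainz-dump-{dump_id}-{time}-full'.format(
    dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S'))
# dump_name == 'listenbrainz-dump-42-20190701-000000-full'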
Example 5
    def test_import_postgres_db(self):

        # create a user
        db_user.create(1, 'test_user')
        user_count = db_user.get_user_count()
        self.assertEqual(user_count, 1)

        # do a db dump and reset the db
        private_dump, public_dump = db_dump.dump_postgres_db(self.tempdir)
        self.reset_db()
        user_count = db_user.get_user_count()
        self.assertEqual(user_count, 0)

        # import the dump
        db_dump.import_postgres_dump(private_dump, public_dump)
        user_count = db_user.get_user_count()
        self.assertEqual(user_count, 1)

        # reset again, and use more threads to import
        self.reset_db()
        user_count = db_user.get_user_count()
        self.assertEqual(user_count, 0)

        db_dump.import_postgres_dump(private_dump, public_dump, threads=2)
        user_count = db_user.get_user_count()
        self.assertEqual(user_count, 1)
Example 6
    def test_import_postgres_db(self):

        # create a user
        with self.app.app_context():
            one_id = db_user.create(1, 'test_user')
            user_count = db_user.get_user_count()
            self.assertEqual(user_count, 1)

            # do a db dump and reset the db
            private_dump, private_ts_dump, public_dump, public_ts_dump = db_dump.dump_postgres_db(
                self.tempdir)
            self.reset_db()
            user_count = db_user.get_user_count()
            self.assertEqual(user_count, 0)

            # import the dump
            db_dump.import_postgres_dump(private_dump, None, public_dump, None)
            user_count = db_user.get_user_count()
            self.assertEqual(user_count, 1)

            # reset again, and use more threads to import
            self.reset_db()
            user_count = db_user.get_user_count()
            self.assertEqual(user_count, 0)

            db_dump.import_postgres_dump(private_dump,
                                         None,
                                         public_dump,
                                         None,
                                         threads=2)
            user_count = db_user.get_user_count()
            self.assertEqual(user_count, 1)
            two_id = db_user.create(2, 'vnskprk')
            self.assertGreater(two_id, one_id)
Example 7
    def test_dump_recording_feedback(self):

        # create a user
        with self.app.app_context():
            one_id = db_user.create(1, 'test_user')
            user_count = db_user.get_user_count()
            self.assertEqual(user_count, 1)

            # insert a feedback record
            feedback = Feedback(
                user_id=one_id,
                recording_msid="d23f4719-9212-49f0-ad08-ddbfbfc50d6f",
                score=1)
            db_feedback.insert(feedback)

            # do a db dump and reset the db
            private_dump, private_ts_dump, public_dump, public_ts_dump = db_dump.dump_postgres_db(
                self.tempdir)
            self.reset_db()
            user_count = db_user.get_user_count()
            self.assertEqual(user_count, 0)
            self.assertEqual(
                db_feedback.get_feedback_count_for_user(user_id=one_id), 0)

            # import the dump and check the records are inserted
            db_dump.import_postgres_dump(private_dump, None, public_dump, None)
            user_count = db_user.get_user_count()
            self.assertEqual(user_count, 1)

            dumped_feedback = db_feedback.get_feedback_for_user(user_id=one_id,
                                                                limit=1,
                                                                offset=0)
            self.assertEqual(len(dumped_feedback), 1)
            self.assertEqual(dumped_feedback[0].user_id, feedback.user_id)
            self.assertEqual(dumped_feedback[0].recording_msid,
                             feedback.recording_msid)
            self.assertEqual(dumped_feedback[0].score, feedback.score)

            # reset again, and use more threads to import
            self.reset_db()
            user_count = db_user.get_user_count()
            self.assertEqual(user_count, 0)
            dumped_feedback = []

            db_dump.import_postgres_dump(private_dump,
                                         None,
                                         public_dump,
                                         None,
                                         threads=2)
            user_count = db_user.get_user_count()
            self.assertEqual(user_count, 1)

            dumped_feedback = db_feedback.get_feedback_for_user(user_id=one_id,
                                                                limit=1,
                                                                offset=0)
            self.assertEqual(len(dumped_feedback), 1)
            self.assertEqual(dumped_feedback[0].user_id, feedback.user_id)
            self.assertEqual(dumped_feedback[0].recording_msid,
                             feedback.recording_msid)
            self.assertEqual(dumped_feedback[0].score, feedback.score)
    def test_import_postgres_db(self):

        # create a user
        db_user.create('test_user')
        user_count = db_user.get_user_count()
        self.assertEqual(user_count, 1)

        # do a db dump and reset the db
        location = db_dump.dump_postgres_db(self.tempdir)
        self.reset_db()
        user_count = db_user.get_user_count()
        self.assertEqual(user_count, 0)

        # import the dump
        db_dump.import_postgres_dump(location)
        user_count = db_user.get_user_count()
        self.assertEqual(user_count, 1)

        # reset again, and use more threads to import
        self.reset_db()
        user_count = db_user.get_user_count()
        self.assertEqual(user_count, 0)

        db_dump.import_postgres_dump(location, threads=2)
        user_count = db_user.get_user_count()
        self.assertEqual(user_count, 1)
    def test_dump_postgres_db_table_entries(self):
        db_user.create('test_user')
        timestamp = datetime.today()
        location = db_dump.dump_postgres_db(self.tempdir, dump_time=timestamp)
        dump_entries = db_dump.get_dump_entries()
        self.assertEqual(len(dump_entries), 1)
        self.assertEqual(dump_entries[0]['created'].strftime('%s'), timestamp.strftime('%s'))
Example 10
    def test_import_postgres_db(self):

        # create a user
        with self.app.app_context():
            one_id = db_user.create(1, 'test_user')
            user_count = db_user.get_user_count()
            self.assertEqual(user_count, 1)

            # do a db dump and reset the db
            private_dump, public_dump = db_dump.dump_postgres_db(self.tempdir)
            self.reset_db()
            user_count = db_user.get_user_count()
            self.assertEqual(user_count, 0)

            # import the dump
            db_dump.import_postgres_dump(private_dump, public_dump)
            user_count = db_user.get_user_count()
            self.assertEqual(user_count, 1)

            # reset again, and use more threads to import
            self.reset_db()
            user_count = db_user.get_user_count()
            self.assertEqual(user_count, 0)

            db_dump.import_postgres_dump(private_dump, public_dump, threads=2)
            user_count = db_user.get_user_count()
            self.assertEqual(user_count, 1)
            two_id = db_user.create(2, 'vnskprk')
            self.assertGreater(two_id, one_id)
Example 11
    def test_dump_postgres_db_table_entries(self):
        with self.app.app_context():
            db_user.create(1, 'test_user')
            timestamp = datetime.today()
            location = db_dump.dump_postgres_db(self.tempdir, dump_time=timestamp)
            dump_entries = db_dump.get_dump_entries()
            self.assertEqual(len(dump_entries), 1)
            self.assertEqual(dump_entries[0]['created'].strftime('%s'), timestamp.strftime('%s'))
Example 12
def create_full(location, threads, dump_id, last_dump_id):
    """ Create a ListenBrainz data dump which includes a private dump, a statistics dump
        and a dump of the actual listens from the listenstore

        Args:
            location (str): path to the directory where the dump should be made
            threads (int): the number of threads to be used during compression
            dump_id (int): the ID of the ListenBrainz data dump
            last_dump_id (bool): flag indicating whether to create a full dump from the last entry in the dump table
    """
    app = create_app()
    with app.app_context():
        from listenbrainz.webserver.timescale_connection import _ts as ls
        if last_dump_id:
            all_dumps = db_dump.get_dump_entries()
            if len(all_dumps) == 0:
                current_app.logger.error(
                    "Cannot create full dump with last dump's ID, no dump exists!"
                )
                sys.exit(-1)
            dump_id = all_dumps[0]['id']

        if dump_id is None:
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found", dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        ts = end_time.strftime('%Y%m%d-%H%M%S')
        dump_name = 'listenbrainz-dump-{dump_id}-{time}-full'.format(
            dump_id=dump_id, time=ts)
        dump_path = os.path.join(location, dump_name)
        create_path(dump_path)
        db_dump.dump_postgres_db(dump_path, end_time, threads)

        listens_dump_file = ls.dump_listens(dump_path,
                                            dump_id=dump_id,
                                            end_time=end_time,
                                            threads=threads)
        spark_dump_file = 'listenbrainz-listens-dump-{dump_id}-{time}-spark-full.tar.xz'.format(
            dump_id=dump_id, time=ts)
        # dump_path already includes the location prefix
        spark_dump_path = os.path.join(dump_path, spark_dump_file)
        transmogrify_dump_file_to_spark_import_format(listens_dump_file,
                                                      spark_dump_path, threads)

        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s',
                                     str(e),
                                     exc_info=True)
            sys.exit(-1)

        try:
            if not sanity_check_dumps(dump_path, 12):
                return sys.exit(-1)
        except OSError:
            sys.exit(-1)

        # if in production, send an email to interested people for observability
        send_dump_creation_notification(dump_name, 'fullexport')

        current_app.logger.info('Dumps created and hashes written at %s' %
                                dump_path)

        # Write the DUMP_ID file so that the FTP sync scripts can be more robust
        with open(os.path.join(dump_path, "DUMP_ID.txt"), "w") as f:
            f.write("%s %s full\n" % (ts, dump_id))

        sys.exit(0)
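sanity_check_dumps is only referenced here, not defined. A rough, hypothetical sketch of such a check, assuming it merely verifies that the dump directory contains the expected number of non-empty files:
import os


def sanity_check_dumps_sketch(dump_path, expected_count):
    """Hypothetical sketch: check that the dump directory holds the expected
    number of non-empty files (each archive plus its .md5 and .sha256 files)."""
    files = [name for name in os.listdir(dump_path)
             if os.path.isfile(os.path.join(dump_path, name))]
    if len(files) != expected_count:
        return False
    return all(os.path.getsize(os.path.join(dump_path, name)) > 0 for name in files)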
def create_full(location, threads, dump_id, do_listen_dump: bool,
                do_spark_dump: bool, do_db_dump: bool):
    """ Create a ListenBrainz data dump which includes a private dump, a statistics dump
        and a dump of the actual listens from the listenstore.

        Args:
            location (str): path to the directory where the dump should be made
            threads (int): the number of threads to be used during compression
            dump_id (int): the ID of the ListenBrainz data dump
            do_listen_dump: If True, make a listens dump
            do_spark_dump: If True, make a spark listens dump
            do_db_dump: If True, make a public/private postgres/timescale dump
    """
    app = create_app()
    with app.app_context():
        ls = DumpListenStore(app)
        if dump_id is None:
            end_time = datetime.now()
            dump_id = db_dump.add_dump_entry(int(end_time.strftime('%s')))
        else:
            dump_entry = db_dump.get_dump_entry(dump_id)
            if dump_entry is None:
                current_app.logger.error("No dump with ID %d found", dump_id)
                sys.exit(-1)
            end_time = dump_entry['created']

        ts = end_time.strftime('%Y%m%d-%H%M%S')
        dump_name = 'listenbrainz-dump-{dump_id}-{time}-full'.format(
            dump_id=dump_id, time=ts)
        dump_path = os.path.join(location, dump_name)
        create_path(dump_path)

        expected_num_dumps = 0
        if do_db_dump:
            db_dump.dump_postgres_db(dump_path, end_time, threads)
            expected_num_dumps += 4
        if do_listen_dump:
            ls.dump_listens(dump_path,
                            dump_id=dump_id,
                            end_time=end_time,
                            threads=threads)
            expected_num_dumps += 1
        if do_spark_dump:
            ls.dump_listens_for_spark(dump_path,
                                      dump_id=dump_id,
                                      dump_type="full",
                                      end_time=end_time)
            expected_num_dumps += 1

        try:
            write_hashes(dump_path)
        except IOError as e:
            current_app.logger.error('Unable to create hash files! Error: %s',
                                     str(e),
                                     exc_info=True)
            sys.exit(-1)

        try:
            # each dump produces 3 files: the archive plus its md5 and sha256 hashes
            expected_num_dump_files = expected_num_dumps * 3
            if not sanity_check_dumps(dump_path, expected_num_dump_files):
                return sys.exit(-1)
        except OSError:
            sys.exit(-1)

        current_app.logger.info('Dumps created and hashes written at %s' %
                                dump_path)

        # Write the DUMP_ID file so that the FTP sync scripts can be more robust
        with open(os.path.join(dump_path, "DUMP_ID.txt"), "w") as f:
            f.write("%s %s full\n" % (ts, dump_id))

        # if in production, send an email to interested people for observability
        send_dump_creation_notification(dump_name, 'fullexport')

        sys.exit(0)
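A hypothetical invocation of this variant (the argument values are illustrative only); note that the function calls sys.exit() when it finishes, so it is intended to be run as a standalone command rather than imported and called from other code:
create_full('/data/listenbrainz-dumps', threads=4, dump_id=None,
            do_listen_dump=True, do_spark_dump=True, do_db_dump=True)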