def test_get_all_users(self):
    """ Tests that get_all_users returns ALL users in the db """
    users = db_user.get_all_users()
    self.assertEqual(len(users), 0)

    db_user.create(8, 'user1')
    users = db_user.get_all_users()
    self.assertEqual(len(users), 1)

    db_user.create(9, 'user2')
    users = db_user.get_all_users()
    self.assertEqual(len(users), 2)
def push_users_to_queue(channel, force=False):
    """ Get users from the db whose stats haven't been calculated and push
    them into the queue.

    Args:
        channel: the RabbitMQ channel in which we should publish the user data
        force (bool): push all users into the queue, not just those whose
            stats haven't been calculated yet
    """
    logger.info('pushing users to stats calculation queue...')
    if force:
        users = db_user.get_all_users()
    else:
        users = db_user.get_users_with_uncalculated_stats()
    for user in users:
        data = {
            'type': 'user',
            'id': user['id'],
            'musicbrainz_id': user['musicbrainz_id'],
        }
        channel.basic_publish(
            exchange=config.BIGQUERY_EXCHANGE,
            routing_key='',
            body=ujson.dumps(data),
            properties=pika.BasicProperties(delivery_mode=2),  # delivery_mode=2 makes messages persistent
        )
    logger.info('pushed %d users!', len(users))
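# A minimal usage sketch, not from the source: one plausible way to wire
# push_users_to_queue to RabbitMQ with pika. The connection parameters, the
# exchange declaration, and the helper name are assumptions; only
# config.BIGQUERY_EXCHANGE and push_users_to_queue itself come from the code above.
import pika

def run_stats_push(force=False):  # hypothetical helper
    connection = pika.BlockingConnection(
        pika.ConnectionParameters(host='rabbitmq'))  # host is an assumption
    channel = connection.channel()
    # the exchange type is an assumption; the code above only names the exchange
    channel.exchange_declare(exchange=config.BIGQUERY_EXCHANGE,
                             exchange_type='fanout', durable=True)
    push_users_to_queue(channel, force=force)
    connection.close()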
def find_users(self):
    with self.app.app_context():
        self.ls = init_influx_connection(current_app.logger, {
            'REDIS_HOST': current_app.config['REDIS_HOST'],
            'REDIS_PORT': current_app.config['REDIS_PORT'],
            'REDIS_NAMESPACE': current_app.config['REDIS_NAMESPACE'],
            'INFLUX_HOST': current_app.config['INFLUX_HOST'],
            'INFLUX_PORT': current_app.config['INFLUX_PORT'],
            'INFLUX_DB_NAME': current_app.config['INFLUX_DB_NAME'],
        })

        for _ in range(CONNECTION_RETRY_COUNT):
            try:
                users = db_user.get_all_users()
                break
            except DatabaseError as e:
                current_app.logger.error('Error while getting users list: %s', str(e), exc_info=True)
                time.sleep(1)
        else:
            # the retry loop never broke out, so every attempt failed
            current_app.logger.critical("Cannot connect to PostgreSQL, exiting...")
            raise DatabaseError("Cannot connect to PostgreSQL, exiting")

        return [user['musicbrainz_id'] for user in users if self.condition(user['musicbrainz_id'])]
def test_get_all_users_columns(self):
    """ Tests that get_all_users only returns those columns which are asked for """

    # check that all columns of the user table are present
    # if columns is not specified
    users = db_user.get_all_users()
    for user in users:
        for column in db_user.USER_GET_COLUMNS:
            self.assertIn(column, user)

    # check that only id is present if columns = ['id']
    users = db_user.get_all_users(columns=['id'])
    for user in users:
        self.assertIn('id', user)
        for column in db_user.USER_GET_COLUMNS:
            if column != 'id':
                self.assertNotIn(column, user)
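# A hedged sketch, not the actual db_user implementation: a get_all_users that
# would satisfy the column-selection behaviour the test above exercises. The
# column list shown is an illustrative subset, and the simplified signature
# (an explicit connection instead of a module-level engine) is an assumption.
import sqlalchemy

USER_GET_COLUMNS = ['id', 'created', 'musicbrainz_id', 'auth_token']  # illustrative subset

def get_all_users(connection, columns=None):
    if columns is None:
        columns = USER_GET_COLUMNS
    # columns come from the fixed whitelist above, so string interpolation is safe here
    result = connection.execute(sqlalchemy.text("""
        SELECT {columns}
          FROM "user"
      ORDER BY id
    """.format(columns=', '.join(columns))))
    return [dict(row) for row in result.fetchall()]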
def import_musicbrainz_rows(musicbrainz_db_uri, dry_run=True, delete=False):
    musicbrainz_db.init_db_engine(musicbrainz_db_uri)
    db.init_db_connection(app.config['SQLALCHEMY_DATABASE_URI'])
    import_count = 0
    already_imported = 0
    not_found = 0
    deleted = 0
    if not dry_run:
        update_row_ids_for_exceptions()
    users = db_user.get_all_users()
    with musicbrainz_db.engine.connect() as mb_connection:
        with db.engine.connect() as connection:
            for user in users:
                if user.get('musicbrainz_row_id') is not None:
                    already_imported += 1
                    continue
                name = user['musicbrainz_id']
                result = mb_connection.execute(sqlalchemy.text("""
                    SELECT id
                      FROM editor
                     WHERE LOWER(name) = LOWER(:name)
                """), {
                    'name': name,
                })
                musicbrainz_row_id = None
                if result.rowcount > 0:
                    musicbrainz_row_id = result.fetchone()['id']
                    import_count += 1
                else:
                    print('No user with specified username in the MusicBrainz db: %s' % name)
                    if delete:
                        print('Deleting user %s' % name)
                        try:
                            delete_user(user)
                            deleted += 1  # assumed fix: 'deleted' is reported below but was never incremented
                        except NotFound:
                            print('User %s not found in LB...' % name)
                    not_found += 1
                    continue
                if not dry_run:
                    connection.execute(sqlalchemy.text("""
                        UPDATE "user"
                           SET musicbrainz_row_id = :musicbrainz_row_id
                         WHERE id = :id
                    """), {
                        'musicbrainz_row_id': musicbrainz_row_id,
                        'id': user['id'],
                    })
                    print('Inserted row_id %d for user %s' % (musicbrainz_row_id, name))

    print('Total number of ListenBrainz users: %d' % len(users))
    print('Total number of ListenBrainz users with already imported row ids: %d' % already_imported)
    print('Total number of ListenBrainz users whose row ids can be imported: %d' % import_count)
    print('Total number of ListenBrainz users not found in MusicBrainz: %d' % not_found)
    print('Total number of ListenBrainz users deleted from MusicBrainz: %d' % deleted)
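# A usage sketch, not from the source: how import_musicbrainz_rows might be
# invoked. The MusicBrainz database URI is a placeholder assumption.
MB_DB_URI = 'postgresql://musicbrainz@localhost:5432/musicbrainz_db'

import_musicbrainz_rows(MB_DB_URI)                               # dry run: report only
import_musicbrainz_rows(MB_DB_URI, dry_run=False)                # actually write the row ids
import_musicbrainz_rows(MB_DB_URI, dry_run=False, delete=True)   # also delete users missing from MB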
def test_import_dump_many_users(self):
    for i in range(50):
        db_user.create('user%d' % i)

    temp_dir = tempfile.mkdtemp()
    dump_location = self.logstore.dump_listens(
        location=temp_dir,
    )
    sleep(1)
    self.assertTrue(os.path.isfile(dump_location))
    self.reset_influx_db()

    done = self.logstore.import_listens_dump(dump_location)
    sleep(1)
    self.assertEqual(done, len(db_user.get_all_users()))
def test_import_dump_many_users(self):
    for i in range(2, 52):
        db_user.create(i, 'user%d' % i)

    temp_dir = tempfile.mkdtemp()
    dump_location = self.logstore.dump_listens(
        location=temp_dir,
    )
    sleep(1)
    self.assertTrue(os.path.isfile(dump_location))
    self.reset_influx_db()

    done = self.logstore.import_listens_dump(dump_location)
    sleep(1)
    self.assertEqual(done, len(db_user.get_all_users()))
def test_import_dump_many_users(self):
    for i in range(2, 52):
        db_user.create(i, 'user%d' % i)

    temp_dir = tempfile.mkdtemp()
    dump_location = self.logstore.dump_listens(
        location=temp_dir,
        dump_id=1,
        end_time=datetime.now(),
    )
    sleep(1)
    self.assertTrue(os.path.isfile(dump_location))
    self.reset_influx_db()

    done = self.logstore.import_listens_dump(dump_location)
    sleep(1)
    self.assertEqual(done, len(db_user.get_all_users()))
    shutil.rmtree(temp_dir)
def dump_listens(self, location, dump_time=datetime.today(),
                 threads=DUMP_DEFAULT_THREAD_COUNT, spark_format=False):
    """ Dumps all listens in the ListenStore into a .tar.xz archive.

    Files are created with UUIDs as names. Each file can contain listens for
    a number of users. An index.json file is used to save which file contains
    the listens of which users.

    Args:
        location: the directory where the listens dump archive should be created
        dump_time (datetime): the time at which the data dump was started
        threads (int): the number of threads to use for compression
        spark_format (bool): dump files in Apache Spark friendly format if True,
            else in the normal dump format

    Returns:
        the path to the dump archive
    """
    self.log.info('Beginning dump of listens from InfluxDB...')
    self.log.info('Getting list of users whose listens are to be dumped...')
    users = db_user.get_all_users(columns=['id', 'musicbrainz_id'])
    self.log.info('Total number of users: %d', len(users))

    archive_name = 'listenbrainz-listens-dump-{time}'.format(time=dump_time.strftime('%Y%m%d-%H%M%S'))
    if spark_format:
        archive_name = '{}-spark'.format(archive_name)
    archive_path = os.path.join(location, '{filename}.tar.xz'.format(filename=archive_name))

    with open(archive_path, 'w') as archive:
        pxz_command = ['pxz', '--compress', '-T{threads}'.format(threads=threads)]
        pxz = subprocess.Popen(pxz_command, stdin=subprocess.PIPE, stdout=archive)

        with tarfile.open(fileobj=pxz.stdin, mode='w|') as tar:
            temp_dir = tempfile.mkdtemp()
            self.write_dump_metadata(archive_name, dump_time, temp_dir, tar)

            listens_path = os.path.join(temp_dir, 'listens')
            if spark_format:
                self.write_listens_for_spark(listens_path, users, dump_time)
            else:
                index = self.write_listens_full(listens_path, users, dump_time)
                self.write_dump_index_file(index, temp_dir, tar, archive_name)

            # add the listens directory to the archive
            self.log.info('Got all listens, adding them to the archive...')
            tar.add(listens_path, arcname=os.path.join(archive_name, 'listens'))

            # remove the temporary directory
            shutil.rmtree(temp_dir)

        pxz.stdin.close()
        pxz.wait()

    self.log.info('ListenBrainz listen dump done!')
    self.log.info('Dump present at %s!', archive_path)
    return archive_path
def dump_listens(self, location, dump_time=datetime.today(),
                 threads=DUMP_DEFAULT_THREAD_COUNT):
    """ Dumps all listens in the ListenStore into a .tar.xz archive.

    Files are created with UUIDs as names. Each file can contain listens for
    a number of users. An index.json file is used to save which file contains
    the listens of which users.

    Args:
        location: the directory where the listens dump archive should be created
        dump_time (datetime): the time at which the data dump was started
        threads (int): the number of threads to use for compression

    Returns:
        the path to the dump archive
    """
    self.log.info('Beginning dump of listens from InfluxDB...')
    self.log.info('Getting list of users whose listens are to be dumped...')
    users = db_user.get_all_users(columns=['id', 'musicbrainz_id'])
    self.log.info('Total number of users: %d', len(users))

    archive_name = 'listenbrainz-listens-dump-{time}'.format(time=dump_time.strftime('%Y%m%d-%H%M%S'))
    archive_path = os.path.join(location, '{filename}.tar.xz'.format(filename=archive_name))

    with open(archive_path, 'w') as archive:
        pxz_command = ['pxz', '--compress', '-T{threads}'.format(threads=threads)]
        pxz = subprocess.Popen(pxz_command, stdin=subprocess.PIPE, stdout=archive)

        with tarfile.open(fileobj=pxz.stdin, mode='w|') as tar:
            temp_dir = tempfile.mkdtemp()

            try:
                # add timestamp
                timestamp_path = os.path.join(temp_dir, 'TIMESTAMP')
                with open(timestamp_path, 'w') as f:
                    f.write(dump_time.isoformat(' '))
                tar.add(timestamp_path, arcname=os.path.join(archive_name, 'TIMESTAMP'))

                # add schema version
                schema_version_path = os.path.join(temp_dir, 'SCHEMA_SEQUENCE')
                with open(schema_version_path, 'w') as f:
                    f.write(str(LISTENS_DUMP_SCHEMA_VERSION))
                tar.add(schema_version_path, arcname=os.path.join(archive_name, 'SCHEMA_SEQUENCE'))

                # add copyright notice
                tar.add(DUMP_LICENSE_FILE_PATH, arcname=os.path.join(archive_name, 'COPYING'))
            except IOError as e:
                log_ioerrors(self.log, e)
                raise
            except Exception as e:
                self.log.error('Exception while adding dump metadata: %s', str(e))
                raise

            listens_path = os.path.join(temp_dir, 'listens')
            dump_complete = False
            next_user_id = 0
            index = {}
            while not dump_complete:
                file_name = str(uuid.uuid4())
                # directory structure of the form "/%s/%02s/%s.listens" % (uuid[0], uuid[0:2], uuid)
                directory = os.path.join(listens_path, file_name[0], file_name[0:2])
                create_path(directory)
                file_path = os.path.join(directory, '{uuid}.listens'.format(uuid=file_name))
                with open(file_path, 'w') as f:
                    file_done = False
                    while next_user_id < len(users):
                        if f.tell() > DUMP_FILE_SIZE_LIMIT:
                            file_done = True
                            break
                        username = users[next_user_id]['musicbrainz_id']
                        offset = f.tell()
                        size = self.dump_user(username=username, fileobj=f, dump_time=dump_time)
                        index[username] = {
                            'file_name': file_name,
                            'offset': offset,
                            'size': size,
                        }
                        next_user_id += 1

                if file_done:
                    continue

                if next_user_id == len(users):
                    dump_complete = True
                    break

            # add the listens directory to the archive
            self.log.info('Got all listens, adding them to the archive...')
            tar.add(listens_path, arcname=os.path.join(archive_name, 'listens'))

            # add index.json file to the archive
            try:
                index_path = os.path.join(temp_dir, 'index.json')
                with open(index_path, 'w') as f:
                    f.write(ujson.dumps(index))
                tar.add(index_path, arcname=os.path.join(archive_name, 'index.json'))
            except IOError as e:
                log_ioerrors(self.log, e)
                raise
            except Exception as e:
                self.log.error('Exception while adding index file to archive: %s', str(e))
                raise

            # remove the temporary directory
            shutil.rmtree(temp_dir)

        pxz.stdin.close()
        pxz.wait()  # assumed fix (not in this version): wait for pxz to finish writing the archive before returning

    self.log.info('ListenBrainz listen dump done!')
    self.log.info('Dump present at %s!', archive_path)
    return archive_path
def dump_listens(self, location, dump_time=datetime.today(),
                 threads=DUMP_DEFAULT_THREAD_COUNT):
    """ Dumps all listens in the ListenStore into a .tar.xz archive.

    Files are created with UUIDs as names. Each file can contain listens for
    a number of users. An index.json file is used to save which file contains
    the listens of which users.

    Args:
        location: the directory where the listens dump archive should be created
        dump_time (datetime): the time at which the data dump was started
        threads (int): the number of threads to use for compression

    Returns:
        the path to the dump archive
    """
    self.log.info('Beginning dump of listens from InfluxDB...')
    self.log.info('Getting list of users whose listens are to be dumped...')
    users = db_user.get_all_users(columns=['id', 'musicbrainz_id'])
    self.log.info('Total number of users: %d', len(users))

    archive_name = 'listenbrainz-listens-dump-{time}'.format(time=dump_time.strftime('%Y%m%d-%H%M%S'))
    archive_path = os.path.join(location, '{filename}.tar.xz'.format(filename=archive_name))

    with open(archive_path, 'w') as archive:
        pxz_command = ['pxz', '--compress', '-T{threads}'.format(threads=threads)]
        pxz = subprocess.Popen(pxz_command, stdin=subprocess.PIPE, stdout=archive)

        with tarfile.open(fileobj=pxz.stdin, mode='w|') as tar:
            temp_dir = tempfile.mkdtemp()

            try:
                # add timestamp
                timestamp_path = os.path.join(temp_dir, 'TIMESTAMP')
                with open(timestamp_path, 'w') as f:
                    f.write(dump_time.isoformat(' '))
                tar.add(timestamp_path, arcname=os.path.join(archive_name, 'TIMESTAMP'))

                # add schema version
                schema_version_path = os.path.join(temp_dir, 'SCHEMA_SEQUENCE')
                with open(schema_version_path, 'w') as f:
                    f.write(str(LISTENS_DUMP_SCHEMA_VERSION))
                tar.add(schema_version_path, arcname=os.path.join(archive_name, 'SCHEMA_SEQUENCE'))

                # add copyright notice
                tar.add(DUMP_LICENSE_FILE_PATH, arcname=os.path.join(archive_name, 'COPYING'))
            except IOError as e:
                self.log.critical('IOError while writing metadata dump files: %s', str(e), exc_info=True)
                raise
            except Exception as e:
                self.log.error('Exception while adding dump metadata: %s', str(e), exc_info=True)
                raise

            listens_path = os.path.join(temp_dir, 'listens')
            dump_complete = False
            next_user_id = 0
            index = {}
            while not dump_complete:
                file_name = str(uuid.uuid4())
                # directory structure of the form "/%s/%02s/%s.listens" % (uuid[0], uuid[0:2], uuid)
                directory = os.path.join(listens_path, file_name[0], file_name[0:2])
                create_path(directory)
                file_path = os.path.join(directory, '{uuid}.listens'.format(uuid=file_name))
                with open(file_path, 'w') as f:
                    file_done = False
                    while next_user_id < len(users):
                        if f.tell() > DUMP_FILE_SIZE_LIMIT:
                            file_done = True
                            break
                        username = users[next_user_id]['musicbrainz_id']
                        offset = f.tell()
                        size = self.dump_user(username=username, fileobj=f, dump_time=dump_time)
                        index[username] = {
                            'file_name': file_name,
                            'offset': offset,
                            'size': size,
                        }
                        next_user_id += 1

                if file_done:
                    continue

                if next_user_id == len(users):
                    dump_complete = True
                    break

            # add the listens directory to the archive
            self.log.info('Got all listens, adding them to the archive...')
            tar.add(listens_path, arcname=os.path.join(archive_name, 'listens'))

            # add index.json file to the archive
            try:
                index_path = os.path.join(temp_dir, 'index.json')
                with open(index_path, 'w') as f:
                    f.write(ujson.dumps(index))
                tar.add(index_path, arcname=os.path.join(archive_name, 'index.json'))
            except IOError as e:
                self.log.critical('IOError while writing index.json to archive: %s', str(e), exc_info=True)
                raise
            except Exception as e:
                self.log.error('Exception while adding index file to archive: %s', str(e), exc_info=True)
                raise

            # remove the temporary directory
            shutil.rmtree(temp_dir)

        pxz.stdin.close()
        pxz.wait()

    self.log.info('ListenBrainz listen dump done!')
    self.log.info('Dump present at %s!', archive_path)
    return archive_path
def dump_listens(self, location, dump_id, start_time=datetime.utcfromtimestamp(0),
                 end_time=None, threads=DUMP_DEFAULT_THREAD_COUNT, spark_format=False):
    """ Dumps all listens in the ListenStore into a .tar.xz archive.

    Files are created with UUIDs as names. Each file can contain listens for
    a number of users. An index.json file is used to save which file contains
    the listens of which users.

    This creates an incremental dump if start_time is specified (with range
    start_time to end_time), otherwise it creates a full dump with all listens.

    Args:
        location: the directory where the listens dump archive should be created
        dump_id (int): the ID of the dump in the dump sequence
        start_time and end_time (datetime): the time range for which listens should be dumped;
            start_time defaults to utc 0 (meaning a full dump) and end_time defaults to the current time
        threads (int): the number of threads to use for compression
        spark_format (bool): dump files in Apache Spark friendly format if True,
            else in the normal dump format

    Returns:
        the path to the dump archive
    """
    if end_time is None:
        end_time = datetime.now()

    self.log.info('Beginning dump of listens from InfluxDB...')
    self.log.info('Getting list of users whose listens are to be dumped...')
    users = db_user.get_all_users(columns=['id', 'musicbrainz_id'], created_before=end_time)
    self.log.info('Total number of users: %d', len(users))

    if start_time == datetime.utcfromtimestamp(0):
        full_dump = True
    else:
        full_dump = False

    archive_name = 'listenbrainz-listens-dump-{dump_id}-{time}'.format(
        dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S'))
    if spark_format:
        archive_name = '{}-spark'.format(archive_name)
    if full_dump:
        archive_name = '{}-full'.format(archive_name)
    else:
        archive_name = '{}-incremental'.format(archive_name)
    archive_path = os.path.join(location, '{filename}.tar.xz'.format(filename=archive_name))

    with open(archive_path, 'w') as archive:
        pxz_command = ['pxz', '--compress', '-T{threads}'.format(threads=threads)]
        pxz = subprocess.Popen(pxz_command, stdin=subprocess.PIPE, stdout=archive)

        with tarfile.open(fileobj=pxz.stdin, mode='w|') as tar:
            temp_dir = os.path.join(self.dump_temp_dir_root, str(uuid.uuid4()))
            create_path(temp_dir)
            self.write_dump_metadata(archive_name, start_time, end_time, temp_dir, tar, full_dump)

            listens_path = os.path.join(temp_dir, 'listens')
            if spark_format:
                self.write_listens_for_spark(listens_path, users, start_time, end_time)
                tar.add(listens_path, arcname=os.path.join(archive_name, 'listens'))
            else:
                index = self.write_listens_to_dump(listens_path, users, tar, archive_name, start_time, end_time)
                self.write_dump_index_file(index, temp_dir, tar, archive_name)

            # remove the temporary directory
            shutil.rmtree(temp_dir)

        pxz.stdin.close()
        pxz.wait()

    self.log.info('ListenBrainz listen dump done!')
    self.log.info('Dump present at %s!', archive_path)
    return archive_path
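# A usage sketch, not from the source: how the full/incremental split in the
# dump_listens version above might be driven. The listenstore object, dump ids,
# and output directory are assumptions; the argument names match the signature above.
from datetime import datetime, timedelta

now = datetime.now()
# full dump: start_time stays at the utc-0 default, so the archive gets the -full suffix
full_path = listenstore.dump_listens('/tmp/dumps', dump_id=42, end_time=now)
# incremental dump: only listens from the last day, so the archive gets the -incremental suffix
inc_path = listenstore.dump_listens('/tmp/dumps', dump_id=43,
                                    start_time=now - timedelta(days=1), end_time=now)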
def dump_listens(self, location, dump_time=datetime.today(), threads=None):
    """ Fetches listens of each user from their measurement and dumps them into a file.
    These files are compressed into an archive.

    Args:
        location: the directory where the listens dump archive should be created
        dump_time (datetime): the time at which the data dump was started
        threads (int): the number of threads to use for compression

    Returns:
        the path to the dump archive
    """
    self.log.info('Beginning dump of listens from InfluxDB...')
    self.log.info('Getting list of users whose listens are to be dumped...')
    users = db_user.get_all_users()
    self.log.info('Total number of users: %d', len(users))

    archive_name = 'listenbrainz-listens-dump-{time}'.format(time=dump_time.strftime('%Y%m%d-%H%M%S'))
    archive_path = os.path.join(location, '{filename}.tar.xz'.format(filename=archive_name))

    with open(archive_path, 'w') as archive:
        pxz_command = ['pxz', '--compress']
        if threads is not None:
            pxz_command.append('-T {threads}'.format(threads=threads))
        pxz = subprocess.Popen(pxz_command, stdin=subprocess.PIPE, stdout=archive)

        with tarfile.open(fileobj=pxz.stdin, mode='w|') as tar:
            temp_dir = tempfile.mkdtemp()

            try:
                # add timestamp
                timestamp_path = os.path.join(temp_dir, 'TIMESTAMP')
                with open(timestamp_path, 'w') as f:
                    f.write(dump_time.isoformat(' '))
                tar.add(timestamp_path, arcname=os.path.join(archive_name, 'TIMESTAMP'))

                # add schema version
                schema_version_path = os.path.join(temp_dir, 'SCHEMA_SEQUENCE')
                with open(schema_version_path, 'w') as f:
                    f.write(str(LISTENS_DUMP_SCHEMA_VERSION))
                tar.add(schema_version_path, arcname=os.path.join(archive_name, 'SCHEMA_SEQUENCE'))

                # add copyright notice
                tar.add(DUMP_LICENSE_FILE_PATH, arcname=os.path.join(archive_name, 'COPYING'))
            except IOError as e:
                log_ioerrors(self.log, e)
                raise
            except Exception as e:
                self.log.error('Exception while adding dump metadata: %s', str(e))
                raise

            listens_path = os.path.join(temp_dir, 'listens')
            create_path(listens_path)

            # get listens from all measurements and write them to files in
            # a temporary dir before adding them to the archive
            for user in users:
                username = user['musicbrainz_id']
                offset = 0
                user_listens_file = '{username}.listens'.format(username=username)
                user_listens_path = os.path.join(listens_path, user_listens_file)
                with open(user_listens_path, 'w') as f:
                    # Get this user's listens in chunks
                    while True:
                        # loop until we get this chunk of listens
                        while True:
                            try:
                                result = self.influx.query("""
                                    SELECT *
                                      FROM {measurement}
                                     WHERE time <= {timestamp}
                                  ORDER BY time DESC
                                     LIMIT {limit}
                                    OFFSET {offset}
                                """.format(
                                    measurement=get_escaped_measurement_name(username),
                                    timestamp=get_influx_query_timestamp(dump_time.strftime('%s')),
                                    limit=DUMP_CHUNK_SIZE,
                                    offset=offset,
                                ))
                                break
                            except Exception as e:
                                self.log.error('Error while getting listens for user %s', user['musicbrainz_id'])
                                self.log.error(str(e))
                                time.sleep(3)

                        rows = list(result.get_points(get_measurement_name(username)))
                        if not rows:
                            break

                        for row in rows:
                            listen = Listen.from_influx(row).to_api()
                            try:
                                f.write(ujson.dumps(listen))
                                f.write('\n')
                            except IOError as e:
                                log_ioerrors(self.log, e)
                                raise
                            except Exception as e:
                                self.log.error('Exception while creating json for user: %s', user['musicbrainz_id'])
                                self.log.error(str(e))
                                raise

                        offset += DUMP_CHUNK_SIZE

            # add the listens directory to the archive
            self.log.info('Got all listens, adding them to the archive...')
            tar.add(listens_path, arcname=os.path.join(archive_name, 'listens'))

            # remove the temporary directory
            shutil.rmtree(temp_dir)

        pxz.stdin.close()
        pxz.wait()  # assumed fix (not in this version): wait for pxz to finish writing the archive before returning

    self.log.info('ListenBrainz listen dump done!')
    self.log.info('Dump present at %s!', archive_path)
    return archive_path