def test_get_all_users(self):
    """ Tests that get_all_users returns ALL users in the db """
    users = db_user.get_all_users()
    self.assertEqual(len(users), 0)

    db_user.create(8, 'user1')
    users = db_user.get_all_users()
    self.assertEqual(len(users), 1)

    db_user.create(9, 'user2')
    users = db_user.get_all_users()
    self.assertEqual(len(users), 2)
def push_users_to_queue(channel, force=False):
    """ Get users from the db whose stats haven't been calculated and push
    them into the queue.

    Args:
        channel: the RabbitMQ channel in which we should publish the user data
        force (bool): push all users into the queue, not just those whose
            stats haven't been calculated yet
    """
    logger.info('pushing users to stats calculation queue...')
    if force:
        users = db_user.get_all_users()
    else:
        users = db_user.get_users_with_uncalculated_stats()
    for user in users:
        data = {
            'type': 'user',
            'id': user['id'],
            'musicbrainz_id': user['musicbrainz_id'],
        }
        channel.basic_publish(
            exchange=config.BIGQUERY_EXCHANGE,
            routing_key='',
            body=ujson.dumps(data),
            properties=pika.BasicProperties(delivery_mode=2),  # delivery_mode=2 makes messages persistent
        )
    logger.info('pushed %d users!', len(users))
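# A minimal usage sketch, not from the source: one plausible way to wire
# push_users_to_queue to RabbitMQ with pika. The connection parameters, the
# exchange declaration, and the helper name are assumptions; only
# config.BIGQUERY_EXCHANGE and push_users_to_queue itself come from the code above.
import pika

def run_stats_push(force=False):  # hypothetical helper
    connection = pika.BlockingConnection(
        pika.ConnectionParameters(host='rabbitmq'))  # host is an assumption
    channel = connection.channel()
    # the exchange type is an assumption; the code above only names the exchange
    channel.exchange_declare(exchange=config.BIGQUERY_EXCHANGE,
                             exchange_type='fanout', durable=True)
    push_users_to_queue(channel, force=force)
    connection.close()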
def find_users(self):
    with self.app.app_context():
        self.ls = init_influx_connection(current_app.logger, {
            'REDIS_HOST': current_app.config['REDIS_HOST'],
            'REDIS_PORT': current_app.config['REDIS_PORT'],
            'REDIS_NAMESPACE': current_app.config['REDIS_NAMESPACE'],
            'INFLUX_HOST': current_app.config['INFLUX_HOST'],
            'INFLUX_PORT': current_app.config['INFLUX_PORT'],
            'INFLUX_DB_NAME': current_app.config['INFLUX_DB_NAME'],
        })

        for _ in range(CONNECTION_RETRY_COUNT):
            try:
                users = db_user.get_all_users()
                break
            except DatabaseError as e:
                current_app.logger.error('Error while getting users list: %s', str(e), exc_info=True)
                time.sleep(1)
        else:
            # the retry loop never broke out, so every attempt failed
            current_app.logger.critical("Cannot connect to PostgreSQL, exiting...")
            raise DatabaseError("Cannot connect to PostgreSQL, exiting")

        return [user['musicbrainz_id'] for user in users if self.condition(user['musicbrainz_id'])]
def test_get_all_users_columns(self):
    """ Tests that get_all_users only returns those columns which are asked for """

    # check that all columns of the user table are present
    # if columns is not specified
    users = db_user.get_all_users()
    for user in users:
        for column in db_user.USER_GET_COLUMNS:
            self.assertIn(column, user)

    # check that only id is present if columns = ['id']
    users = db_user.get_all_users(columns=['id'])
    for user in users:
        self.assertIn('id', user)
        for column in db_user.USER_GET_COLUMNS:
            if column != 'id':
                self.assertNotIn(column, user)
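# A hedged sketch, not the actual db_user implementation: a get_all_users that
# would satisfy the column-selection behaviour the test above exercises. The
# column list shown is an illustrative subset, and the simplified signature
# (an explicit connection instead of a module-level engine) is an assumption.
import sqlalchemy

USER_GET_COLUMNS = ['id', 'created', 'musicbrainz_id', 'auth_token']  # illustrative subset

def get_all_users(connection, columns=None):
    if columns is None:
        columns = USER_GET_COLUMNS
    # columns come from the fixed whitelist above, so string interpolation is safe here
    result = connection.execute(sqlalchemy.text("""
        SELECT {columns}
          FROM "user"
      ORDER BY id
    """.format(columns=', '.join(columns))))
    return [dict(row) for row in result.fetchall()]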
def import_musicbrainz_rows(musicbrainz_db_uri, dry_run=True, delete=False):
    musicbrainz_db.init_db_engine(musicbrainz_db_uri)
    db.init_db_connection(app.config['SQLALCHEMY_DATABASE_URI'])
    import_count = 0
    already_imported = 0
    not_found = 0
    deleted = 0
    if not dry_run:
        update_row_ids_for_exceptions()
    users = db_user.get_all_users()
    with musicbrainz_db.engine.connect() as mb_connection:
        with db.engine.connect() as connection:
            for user in users:
                if user.get('musicbrainz_row_id') is not None:
                    already_imported += 1
                    continue
                name = user['musicbrainz_id']
                result = mb_connection.execute(sqlalchemy.text("""
                    SELECT id
                      FROM editor
                     WHERE LOWER(name) = LOWER(:name)
                """), {
                    'name': name,
                })
                musicbrainz_row_id = None
                if result.rowcount > 0:
                    musicbrainz_row_id = result.fetchone()['id']
                    import_count += 1
                else:
                    print('No user with specified username in the MusicBrainz db: %s' % name)
                    if delete:
                        print('Deleting user %s' % name)
                        try:
                            delete_user(user)
                            deleted += 1  # assumed fix: 'deleted' is reported below but was never incremented
                        except NotFound:
                            print('User %s not found in LB...' % name)
                    not_found += 1
                    continue
                if not dry_run:
                    connection.execute(sqlalchemy.text("""
                        UPDATE "user"
                           SET musicbrainz_row_id = :musicbrainz_row_id
                         WHERE id = :id
                    """), {
                        'musicbrainz_row_id': musicbrainz_row_id,
                        'id': user['id'],
                    })
                    print('Inserted row_id %d for user %s' % (musicbrainz_row_id, name))

    print('Total number of ListenBrainz users: %d' % len(users))
    print('Total number of ListenBrainz users with already imported row ids: %d' % already_imported)
    print('Total number of ListenBrainz users whose row ids can be imported: %d' % import_count)
    print('Total number of ListenBrainz users not found in MusicBrainz: %d' % not_found)
    print('Total number of ListenBrainz users deleted from MusicBrainz: %d' % deleted)
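# A usage sketch, not from the source: how import_musicbrainz_rows might be
# invoked. The MusicBrainz database URI is a placeholder assumption.
MB_DB_URI = 'postgresql://musicbrainz@localhost:5432/musicbrainz_db'

import_musicbrainz_rows(MB_DB_URI)                               # dry run: report only
import_musicbrainz_rows(MB_DB_URI, dry_run=False)                # actually write the row ids
import_musicbrainz_rows(MB_DB_URI, dry_run=False, delete=True)   # also delete users missing from MB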
def test_import_dump_many_users(self):
    for i in range(50):
        db_user.create('user%d' % i)

    temp_dir = tempfile.mkdtemp()
    dump_location = self.logstore.dump_listens(
        location=temp_dir,
    )
    sleep(1)
    self.assertTrue(os.path.isfile(dump_location))
    self.reset_influx_db()

    done = self.logstore.import_listens_dump(dump_location)
    sleep(1)
    self.assertEqual(done, len(db_user.get_all_users()))
def test_import_dump_many_users(self):
    for i in range(2, 52):
        db_user.create(i, 'user%d' % i)

    temp_dir = tempfile.mkdtemp()
    dump_location = self.logstore.dump_listens(
        location=temp_dir,
    )
    sleep(1)
    self.assertTrue(os.path.isfile(dump_location))
    self.reset_influx_db()

    done = self.logstore.import_listens_dump(dump_location)
    sleep(1)
    self.assertEqual(done, len(db_user.get_all_users()))
def test_import_dump_many_users(self):
    for i in range(2, 52):
        db_user.create(i, 'user%d' % i)

    temp_dir = tempfile.mkdtemp()
    dump_location = self.logstore.dump_listens(
        location=temp_dir,
        dump_id=1,
        end_time=datetime.now(),
    )
    sleep(1)
    self.assertTrue(os.path.isfile(dump_location))
    self.reset_influx_db()

    done = self.logstore.import_listens_dump(dump_location)
    sleep(1)
    self.assertEqual(done, len(db_user.get_all_users()))
    shutil.rmtree(temp_dir)
def dump_listens(self, location, dump_time=datetime.today(),
                 threads=DUMP_DEFAULT_THREAD_COUNT, spark_format=False):
    """ Dumps all listens in the ListenStore into a .tar.xz archive.

    Files are created with UUIDs as names. Each file can contain listens for
    a number of users. An index.json file is used to save which file contains
    the listens of which users.

    Args:
        location: the directory where the listens dump archive should be created
        dump_time (datetime): the time at which the data dump was started
        threads (int): the number of threads to use for compression
        spark_format (bool): dump files in Apache Spark friendly format if True,
            else in the normal dump format

    Returns:
        the path to the dump archive
    """
    self.log.info('Beginning dump of listens from InfluxDB...')
    self.log.info('Getting list of users whose listens are to be dumped...')
    users = db_user.get_all_users(columns=['id', 'musicbrainz_id'])
    self.log.info('Total number of users: %d', len(users))

    archive_name = 'listenbrainz-listens-dump-{time}'.format(time=dump_time.strftime('%Y%m%d-%H%M%S'))
    if spark_format:
        archive_name = '{}-spark'.format(archive_name)
    archive_path = os.path.join(location, '{filename}.tar.xz'.format(filename=archive_name))

    with open(archive_path, 'w') as archive:
        pxz_command = ['pxz', '--compress', '-T{threads}'.format(threads=threads)]
        pxz = subprocess.Popen(pxz_command, stdin=subprocess.PIPE, stdout=archive)

        with tarfile.open(fileobj=pxz.stdin, mode='w|') as tar:
            temp_dir = tempfile.mkdtemp()
            self.write_dump_metadata(archive_name, dump_time, temp_dir, tar)

            listens_path = os.path.join(temp_dir, 'listens')
            if spark_format:
                self.write_listens_for_spark(listens_path, users, dump_time)
            else:
                index = self.write_listens_full(listens_path, users, dump_time)
                self.write_dump_index_file(index, temp_dir, tar, archive_name)

            # add the listens directory to the archive
            self.log.info('Got all listens, adding them to the archive...')
            tar.add(listens_path, arcname=os.path.join(archive_name, 'listens'))

            # remove the temporary directory
            shutil.rmtree(temp_dir)

        pxz.stdin.close()
        pxz.wait()

    self.log.info('ListenBrainz listen dump done!')
    self.log.info('Dump present at %s!', archive_path)
    return archive_path
def dump_listens(self, location, dump_time=datetime.today(),
                 threads=DUMP_DEFAULT_THREAD_COUNT):
    """ Dumps all listens in the ListenStore into a .tar.xz archive.

    Files are created with UUIDs as names. Each file can contain listens for
    a number of users. An index.json file is used to save which file contains
    the listens of which users.

    Args:
        location: the directory where the listens dump archive should be created
        dump_time (datetime): the time at which the data dump was started
        threads (int): the number of threads to use for compression

    Returns:
        the path to the dump archive
    """
    self.log.info('Beginning dump of listens from InfluxDB...')
    self.log.info('Getting list of users whose listens are to be dumped...')
    users = db_user.get_all_users(columns=['id', 'musicbrainz_id'])
    self.log.info('Total number of users: %d', len(users))

    archive_name = 'listenbrainz-listens-dump-{time}'.format(time=dump_time.strftime('%Y%m%d-%H%M%S'))
    archive_path = os.path.join(location, '{filename}.tar.xz'.format(filename=archive_name))

    with open(archive_path, 'w') as archive:
        pxz_command = ['pxz', '--compress', '-T{threads}'.format(threads=threads)]
        pxz = subprocess.Popen(pxz_command, stdin=subprocess.PIPE, stdout=archive)

        with tarfile.open(fileobj=pxz.stdin, mode='w|') as tar:
            temp_dir = tempfile.mkdtemp()

            try:
                # add timestamp
                timestamp_path = os.path.join(temp_dir, 'TIMESTAMP')
                with open(timestamp_path, 'w') as f:
                    f.write(dump_time.isoformat(' '))
                tar.add(timestamp_path, arcname=os.path.join(archive_name, 'TIMESTAMP'))

                # add schema version
                schema_version_path = os.path.join(temp_dir, 'SCHEMA_SEQUENCE')
                with open(schema_version_path, 'w') as f:
                    f.write(str(LISTENS_DUMP_SCHEMA_VERSION))
                tar.add(schema_version_path, arcname=os.path.join(archive_name, 'SCHEMA_SEQUENCE'))

                # add copyright notice
                tar.add(DUMP_LICENSE_FILE_PATH, arcname=os.path.join(archive_name, 'COPYING'))
            except IOError as e:
                log_ioerrors(self.log, e)
                raise
            except Exception as e:
                self.log.error('Exception while adding dump metadata: %s', str(e))
                raise

            listens_path = os.path.join(temp_dir, 'listens')
            dump_complete = False
            next_user_id = 0
            index = {}
            while not dump_complete:
                file_name = str(uuid.uuid4())
                # directory structure of the form "/%s/%02s/%s.listens" % (uuid[0], uuid[0:2], uuid)
                directory = os.path.join(listens_path, file_name[0], file_name[0:2])
                create_path(directory)
                file_path = os.path.join(directory, '{uuid}.listens'.format(uuid=file_name))
                with open(file_path, 'w') as f:
                    file_done = False
                    while next_user_id < len(users):
                        if f.tell() > DUMP_FILE_SIZE_LIMIT:
                            file_done = True
                            break
                        username = users[next_user_id]['musicbrainz_id']
                        offset = f.tell()
                        size = self.dump_user(username=username, fileobj=f, dump_time=dump_time)
                        index[username] = {
                            'file_name': file_name,
                            'offset': offset,
                            'size': size,
                        }
                        next_user_id += 1

                if file_done:
                    continue

                if next_user_id == len(users):
                    dump_complete = True
                    break

            # add the listens directory to the archive
            self.log.info('Got all listens, adding them to the archive...')
            tar.add(listens_path, arcname=os.path.join(archive_name, 'listens'))

            # add index.json file to the archive
            try:
                index_path = os.path.join(temp_dir, 'index.json')
                with open(index_path, 'w') as f:
                    f.write(ujson.dumps(index))
                tar.add(index_path, arcname=os.path.join(archive_name, 'index.json'))
            except IOError as e:
                log_ioerrors(self.log, e)
                raise
            except Exception as e:
                self.log.error('Exception while adding index file to archive: %s', str(e))
                raise

            # remove the temporary directory
            shutil.rmtree(temp_dir)

        pxz.stdin.close()
        pxz.wait()  # assumed fix (not in this version): wait for pxz to finish writing the archive before returning

    self.log.info('ListenBrainz listen dump done!')
    self.log.info('Dump present at %s!', archive_path)
    return archive_path
def dump_listens(self, location, dump_time=datetime.today(),
                 threads=DUMP_DEFAULT_THREAD_COUNT):
    """ Dumps all listens in the ListenStore into a .tar.xz archive.

    Files are created with UUIDs as names. Each file can contain listens for
    a number of users. An index.json file is used to save which file contains
    the listens of which users.

    Args:
        location: the directory where the listens dump archive should be created
        dump_time (datetime): the time at which the data dump was started
        threads (int): the number of threads to use for compression

    Returns:
        the path to the dump archive
    """
    self.log.info('Beginning dump of listens from InfluxDB...')
    self.log.info('Getting list of users whose listens are to be dumped...')
    users = db_user.get_all_users(columns=['id', 'musicbrainz_id'])
    self.log.info('Total number of users: %d', len(users))

    archive_name = 'listenbrainz-listens-dump-{time}'.format(time=dump_time.strftime('%Y%m%d-%H%M%S'))
    archive_path = os.path.join(location, '{filename}.tar.xz'.format(filename=archive_name))

    with open(archive_path, 'w') as archive:
        pxz_command = ['pxz', '--compress', '-T{threads}'.format(threads=threads)]
        pxz = subprocess.Popen(pxz_command, stdin=subprocess.PIPE, stdout=archive)

        with tarfile.open(fileobj=pxz.stdin, mode='w|') as tar:
            temp_dir = tempfile.mkdtemp()

            try:
                # add timestamp
                timestamp_path = os.path.join(temp_dir, 'TIMESTAMP')
                with open(timestamp_path, 'w') as f:
                    f.write(dump_time.isoformat(' '))
                tar.add(timestamp_path, arcname=os.path.join(archive_name, 'TIMESTAMP'))

                # add schema version
                schema_version_path = os.path.join(temp_dir, 'SCHEMA_SEQUENCE')
                with open(schema_version_path, 'w') as f:
                    f.write(str(LISTENS_DUMP_SCHEMA_VERSION))
                tar.add(schema_version_path, arcname=os.path.join(archive_name, 'SCHEMA_SEQUENCE'))

                # add copyright notice
                tar.add(DUMP_LICENSE_FILE_PATH, arcname=os.path.join(archive_name, 'COPYING'))
            except IOError as e:
                self.log.critical('IOError while writing metadata dump files: %s', str(e), exc_info=True)
                raise
            except Exception as e:
                self.log.error('Exception while adding dump metadata: %s', str(e), exc_info=True)
                raise

            listens_path = os.path.join(temp_dir, 'listens')
            dump_complete = False
            next_user_id = 0
            index = {}
            while not dump_complete:
                file_name = str(uuid.uuid4())
                # directory structure of the form "/%s/%02s/%s.listens" % (uuid[0], uuid[0:2], uuid)
                directory = os.path.join(listens_path, file_name[0], file_name[0:2])
                create_path(directory)
                file_path = os.path.join(directory, '{uuid}.listens'.format(uuid=file_name))
                with open(file_path, 'w') as f:
                    file_done = False
                    while next_user_id < len(users):
                        if f.tell() > DUMP_FILE_SIZE_LIMIT:
                            file_done = True
                            break
                        username = users[next_user_id]['musicbrainz_id']
                        offset = f.tell()
                        size = self.dump_user(username=username, fileobj=f, dump_time=dump_time)
                        index[username] = {
                            'file_name': file_name,
                            'offset': offset,
                            'size': size,
                        }
                        next_user_id += 1

                if file_done:
                    continue

                if next_user_id == len(users):
                    dump_complete = True
                    break

            # add the listens directory to the archive
            self.log.info('Got all listens, adding them to the archive...')
            tar.add(listens_path, arcname=os.path.join(archive_name, 'listens'))

            # add index.json file to the archive
            try:
                index_path = os.path.join(temp_dir, 'index.json')
                with open(index_path, 'w') as f:
                    f.write(ujson.dumps(index))
                tar.add(index_path, arcname=os.path.join(archive_name, 'index.json'))
            except IOError as e:
                self.log.critical('IOError while writing index.json to archive: %s', str(e), exc_info=True)
                raise
            except Exception as e:
                self.log.error('Exception while adding index file to archive: %s', str(e), exc_info=True)
                raise

            # remove the temporary directory
            shutil.rmtree(temp_dir)

        pxz.stdin.close()
        pxz.wait()

    self.log.info('ListenBrainz listen dump done!')
    self.log.info('Dump present at %s!', archive_path)
    return archive_path
def dump_listens(self, location, dump_id, start_time=datetime.utcfromtimestamp(0),
                 end_time=None, threads=DUMP_DEFAULT_THREAD_COUNT, spark_format=False):
    """ Dumps all listens in the ListenStore into a .tar.xz archive.

    Files are created with UUIDs as names. Each file can contain listens for
    a number of users. An index.json file is used to save which file contains
    the listens of which users.

    This creates an incremental dump if start_time is specified (with range
    start_time to end_time), otherwise it creates a full dump with all listens.

    Args:
        location: the directory where the listens dump archive should be created
        dump_id (int): the ID of the dump in the dump sequence
        start_time and end_time (datetime): the time range for which listens should be dumped;
            start_time defaults to utc 0 (meaning a full dump) and end_time defaults to the current time
        threads (int): the number of threads to use for compression
        spark_format (bool): dump files in Apache Spark friendly format if True,
            else in the normal dump format

    Returns:
        the path to the dump archive
    """
    if end_time is None:
        end_time = datetime.now()

    self.log.info('Beginning dump of listens from InfluxDB...')
    self.log.info('Getting list of users whose listens are to be dumped...')
    users = db_user.get_all_users(columns=['id', 'musicbrainz_id'], created_before=end_time)
    self.log.info('Total number of users: %d', len(users))

    if start_time == datetime.utcfromtimestamp(0):
        full_dump = True
    else:
        full_dump = False

    archive_name = 'listenbrainz-listens-dump-{dump_id}-{time}'.format(
        dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S'))
    if spark_format:
        archive_name = '{}-spark'.format(archive_name)
    if full_dump:
        archive_name = '{}-full'.format(archive_name)
    else:
        archive_name = '{}-incremental'.format(archive_name)
    archive_path = os.path.join(location, '{filename}.tar.xz'.format(filename=archive_name))

    with open(archive_path, 'w') as archive:
        pxz_command = ['pxz', '--compress', '-T{threads}'.format(threads=threads)]
        pxz = subprocess.Popen(pxz_command, stdin=subprocess.PIPE, stdout=archive)

        with tarfile.open(fileobj=pxz.stdin, mode='w|') as tar:
            temp_dir = os.path.join(self.dump_temp_dir_root, str(uuid.uuid4()))
            create_path(temp_dir)
            self.write_dump_metadata(archive_name, start_time, end_time, temp_dir, tar, full_dump)

            listens_path = os.path.join(temp_dir, 'listens')
            if spark_format:
                self.write_listens_for_spark(listens_path, users, start_time, end_time)
                tar.add(listens_path, arcname=os.path.join(archive_name, 'listens'))
            else:
                index = self.write_listens_to_dump(listens_path, users, tar, archive_name, start_time, end_time)
                self.write_dump_index_file(index, temp_dir, tar, archive_name)

            # remove the temporary directory
            shutil.rmtree(temp_dir)

        pxz.stdin.close()
        pxz.wait()

    self.log.info('ListenBrainz listen dump done!')
    self.log.info('Dump present at %s!', archive_path)
    return archive_path
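# A usage sketch, not from the source: how the full/incremental split in the
# dump_listens version above might be driven. The listenstore object, dump ids,
# and output directory are assumptions; the argument names match the signature above.
from datetime import datetime, timedelta

now = datetime.now()
# full dump: start_time stays at the utc-0 default, so the archive gets the -full suffix
full_path = listenstore.dump_listens('/tmp/dumps', dump_id=42, end_time=now)
# incremental dump: only listens from the last day, so the archive gets the -incremental suffix
inc_path = listenstore.dump_listens('/tmp/dumps', dump_id=43,
                                    start_time=now - timedelta(days=1), end_time=now)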
def dump_listens(self, location, dump_time=datetime.today(), threads=None):
    """ Fetches listens of each user from their measurement and dumps them into a file.
    These files are compressed into an archive.

    Args:
        location: the directory where the listens dump archive should be created
        dump_time (datetime): the time at which the data dump was started
        threads (int): the number of threads to use for compression

    Returns:
        the path to the dump archive
    """
    self.log.info('Beginning dump of listens from InfluxDB...')
    self.log.info('Getting list of users whose listens are to be dumped...')
    users = db_user.get_all_users()
    self.log.info('Total number of users: %d', len(users))

    archive_name = 'listenbrainz-listens-dump-{time}'.format(time=dump_time.strftime('%Y%m%d-%H%M%S'))
    archive_path = os.path.join(location, '{filename}.tar.xz'.format(filename=archive_name))

    with open(archive_path, 'w') as archive:
        pxz_command = ['pxz', '--compress']
        if threads is not None:
            pxz_command.append('-T {threads}'.format(threads=threads))
        pxz = subprocess.Popen(pxz_command, stdin=subprocess.PIPE, stdout=archive)

        with tarfile.open(fileobj=pxz.stdin, mode='w|') as tar:
            temp_dir = tempfile.mkdtemp()

            try:
                # add timestamp
                timestamp_path = os.path.join(temp_dir, 'TIMESTAMP')
                with open(timestamp_path, 'w') as f:
                    f.write(dump_time.isoformat(' '))
                tar.add(timestamp_path, arcname=os.path.join(archive_name, 'TIMESTAMP'))

                # add schema version
                schema_version_path = os.path.join(temp_dir, 'SCHEMA_SEQUENCE')
                with open(schema_version_path, 'w') as f:
                    f.write(str(LISTENS_DUMP_SCHEMA_VERSION))
                tar.add(schema_version_path, arcname=os.path.join(archive_name, 'SCHEMA_SEQUENCE'))

                # add copyright notice
                tar.add(DUMP_LICENSE_FILE_PATH, arcname=os.path.join(archive_name, 'COPYING'))
            except IOError as e:
                log_ioerrors(self.log, e)
                raise
            except Exception as e:
                self.log.error('Exception while adding dump metadata: %s', str(e))
                raise

            listens_path = os.path.join(temp_dir, 'listens')
            create_path(listens_path)

            # get listens from all measurements and write them to files in
            # a temporary dir before adding them to the archive
            for user in users:
                username = user['musicbrainz_id']
                offset = 0
                user_listens_file = '{username}.listens'.format(username=username)
                user_listens_path = os.path.join(listens_path, user_listens_file)
                with open(user_listens_path, 'w') as f:
                    # Get this user's listens in chunks
                    while True:
                        # loop until we get this chunk of listens
                        while True:
                            try:
                                result = self.influx.query("""
                                    SELECT *
                                      FROM {measurement}
                                     WHERE time <= {timestamp}
                                  ORDER BY time DESC
                                     LIMIT {limit}
                                    OFFSET {offset}
                                """.format(
                                    measurement=get_escaped_measurement_name(username),
                                    timestamp=get_influx_query_timestamp(dump_time.strftime('%s')),
                                    limit=DUMP_CHUNK_SIZE,
                                    offset=offset,
                                ))
                                break
                            except Exception as e:
                                self.log.error('Error while getting listens for user %s', user['musicbrainz_id'])
                                self.log.error(str(e))
                                time.sleep(3)

                        rows = list(result.get_points(get_measurement_name(username)))
                        if not rows:
                            break

                        for row in rows:
                            listen = Listen.from_influx(row).to_api()
                            try:
                                f.write(ujson.dumps(listen))
                                f.write('\n')
                            except IOError as e:
                                log_ioerrors(self.log, e)
                                raise
                            except Exception as e:
                                self.log.error('Exception while creating json for user: %s', user['musicbrainz_id'])
                                self.log.error(str(e))
                                raise

                        offset += DUMP_CHUNK_SIZE

            # add the listens directory to the archive
            self.log.info('Got all listens, adding them to the archive...')
            tar.add(listens_path, arcname=os.path.join(archive_name, 'listens'))

            # remove the temporary directory
            shutil.rmtree(temp_dir)

        pxz.stdin.close()
        pxz.wait()  # assumed fix (not in this version): wait for pxz to finish writing the archive before returning

    self.log.info('ListenBrainz listen dump done!')
    self.log.info('Dump present at %s!', archive_path)
    return archive_path