Example #1
    def test_from_json_null_values(self):
        data = {
            "listened_at": 1618353413,
            "track_metadata": {
                "additional_info": {
                    "recording_mbid": "99e087e1-5649-4e8c-b84f-eea05b8e143a",
                    "release_mbid": "4b6ca48c-f7db-439d-ba57-6104b5fec61e",
                    "artist_mbid": "e1564e98-978b-4947-8698-f6fd6f8b0181\u0000\ufeff9ad10546-b081-4cc8-a487-3d2eece82d9e\u0000\ufeff5245e5cd-4408-4d9e-a037-c71a53edce83",
                    "artist_msid": "392f2883-724f-4c63-b155-81a7cc89a499",
                    "release_msid": "632207f8-150f-4342-99ad-0fd5a6687e63",
                },
                "artist_name": "Fort Minor Feat. Holly Brook & Jonah Matranga",
                "track_name": "some name",
            },
        }
        with self.assertRaises(ValueError):
            Listen.from_json(data)
Example #2
    def test_from_json(self):
        json_row = {"track_metadata": {"additional_info": {}}}

        json_row.update({'listened_at': 123456})
        listen = Listen.from_json(json_row)

        self.assertEqual(listen.timestamp, json_row['listened_at'])

        del json_row['listened_at']
        json_row.update({'playing_now': True})
        listen = Listen.from_json(json_row)

        self.assertEqual(listen.timestamp, None)
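
Read together, the two tests above pin down the minimal payload shape that Listen.from_json accepts: a track_metadata dict (here with an empty additional_info), plus either a listened_at value that becomes listen.timestamp or a playing_now flag that leaves the timestamp unset. A standalone sketch of the same calls; the import path is an assumption based on these tests and may differ in your checkout:

from listenbrainz.listen import Listen  # assumed import path

payload = {"listened_at": 123456, "track_metadata": {"additional_info": {}}}
listen = Listen.from_json(payload)
print(listen.timestamp)  # 123456, as asserted in test_from_json

now_playing = {"playing_now": True, "track_metadata": {"additional_info": {}}}
print(Listen.from_json(now_playing).timestamp)  # None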
Example #3
    def callback(self, ch, method, properties, body):

        listens = ujson.loads(body)

        msb_listens = []
        for chunk in chunked(listens, MAX_ITEMS_PER_MESSYBRAINZ_LOOKUP):
            msb_listens.extend(self.messybrainz_lookup(chunk))

        submit = []
        for listen in msb_listens:
            try:
                submit.append(Listen.from_json(listen))
            except ValueError:
                pass

        ret = self.insert_to_listenstore(submit)

        # If there is an error, we do not ack the message so that rabbitmq redelivers it later.
        if ret == LISTEN_INSERT_ERROR_SENTINEL:
            return ret

        while True:
            try:
                self.incoming_ch.basic_ack(delivery_tag=method.delivery_tag)
                break
            except pika.exceptions.ConnectionClosed:
                self.connect_to_rabbitmq()

        return ret
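
The callback above depends on a chunked helper to split the decoded listens into batches of at most MAX_ITEMS_PER_MESSYBRAINZ_LOOKUP before calling messybrainz_lookup. That helper is not shown in the example; a minimal sketch of the behaviour it is assumed to have (equivalent to more_itertools.chunked):

def chunked(items, size):
    # Yield consecutive lists of at most `size` items (assumed behaviour).
    batch = []
    for item in items:
        batch.append(item)
        if len(batch) == size:
            yield batch
            batch = []
    if batch:
        yield batch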
Example #4
    def get_playing_now(self, user_id):
        """ Return the current playing song of the user """
        data = self.redis.get('playing_now' + ':' + str(user_id))
        if not data:
            return None
        data = ujson.loads(data)
        data.update({'listened_at': MIN_ID + 1})
        return Listen.from_json(data)
Example #5
    def get_recent_listens(self, max=RECENT_LISTENS_MAX):
        """
            Get the max number of most recent listens
        """
        recent = []
        for listen in cache._r.zrevrange(cache._prep_key(self.RECENT_LISTENS_KEY), 0, max - 1):
            recent.append(Listen.from_json(ujson.loads(listen)))

        return recent
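
get_recent_listens only returns something useful if listens were written to the same sorted set with their timestamp as the score, so that ZREVRANGE yields newest-first. A hypothetical sketch of that write side using redis-py directly; the key name, trimming policy and max_items value are assumptions, not taken from the examples:

import ujson
import redis

r = redis.Redis()
RECENT_LISTENS_KEY = 'recent_listens'  # assumed key name

def store_recent_listen(listen_dict, max_items=100):
    # Score by listened_at so ZREVRANGE returns the newest listens first.
    r.zadd(RECENT_LISTENS_KEY, {ujson.dumps(listen_dict): listen_dict['listened_at']})
    # Keep only the newest max_items entries (assumed trimming policy).
    r.zremrangebyrank(RECENT_LISTENS_KEY, 0, -(max_items + 1))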
Example #6
    def import_listens_dump(self, archive_path, threads=None):
        """ Imports listens into InfluxDB from a ListenBrainz listens dump .tar.xz archive.

        Args:
            archive_path (str): the path to the listens dump .tar.xz archive to be imported
            threads (int): the number of threads to be used for decompression (defaults to 1)
        """

        self.log.info('Beginning import of listens from dump %s...',
                      archive_path)

        pxz_command = ['pxz', '--decompress', '--stdout', archive_path]
        if threads is not None:
            pxz_command.append('-T {threads}'.format(threads=threads))
        pxz = subprocess.Popen(pxz_command, stdout=subprocess.PIPE)

        with tarfile.open(fileobj=pxz.stdout, mode='r|') as tar:
            for member in tar:
                file_name = member.name.split('/')[-1]

                if file_name == 'SCHEMA_SEQUENCE':
                    self.log.info(
                        'Checking if schema version of dump matches...')
                    schema_seq = int(tar.extractfile(member).read().strip())
                    if schema_seq != LISTENS_DUMP_SCHEMA_VERSION:
                        raise SchemaMismatchException(
                            'Incorrect schema version! Expected: %d, got: %d.'
                            'Please ensure that the data dump version matches the code version'
                            'in order to import the data.' %
                            (LISTENS_DUMP_SCHEMA_VERSION, schema_seq))

                elif file_name.endswith('.listens'):

                    # remove .listens from the filename to get the username
                    user_name = file_name[:-8]
                    self.log.info('Importing user %s', user_name)
                    listens = []
                    listen_count = 0

                    # iterate through files and keep writing listens in chunks
                    for listen in tar.extractfile(member):
                        influx_listen = Listen.from_json(
                            ujson.loads(listen)).to_influx(quote(user_name))
                        listens.append(influx_listen)
                        listen_count += 1

                        if listen_count > DUMP_CHUNK_SIZE:
                            self.write_points_to_db(listens)
                            listen_count = 0
                            listens = []

                    # if some listens are left, write them to db
                    if listen_count > 0:
                        self.write_points_to_db(listens)

        self.log.info('Import of listens from dump %s done!', archive_path)
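
The importer shells out to pxz for (optionally multi-threaded) decompression and streams the output into tarfile with mode 'r|', which only allows forward reads. If pxz is not available, the standard library can do the same single-threaded streaming read on its own; a small sketch, with the archive path made up for illustration:

import tarfile

# Single-threaded alternative to the pxz pipe: tarfile decompresses .tar.xz
# itself, and 'r|xz' keeps the streaming, non-seekable access pattern used above.
with tarfile.open('listenbrainz-dump.tar.xz', mode='r|xz') as tar:
    for member in tar:
        if member.name.endswith('.listens'):
            print(member.name)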
Example #7
    def get_recent_listens(self, max=RECENT_LISTENS_MAX):
        """
            Get the max number of most recent listens
        """
        recent = []
        for listen in self.redis.zrevrange(self.ns + self.RECENT_LISTENS_KEY,
                                           0, max - 1):
            recent.append(Listen.from_json(ujson.loads(listen)))

        return recent
Example #8
    def send_listens(self, event_name, message):
        listens = json.loads(message.body.decode("utf-8"))
        for data in listens:
            if event_name == "playing_now":
                listen = NowPlayingListen(user_id=data["user_id"],
                                          user_name=data["user_name"],
                                          data=data["track_metadata"])
            else:
                data["track_metadata"] = data["data"]
                del data["data"]
                listen = Listen.from_json(data)
            self.socketio.emit(event_name,
                               json.dumps(listen.to_api()),
                               to=listen.user_name)
        message.ack()
Example #9
    def get_playing_now(self, user_id):
        """ Return the current playing song of the user

            Arguments:
                user_id (int): the id of the user in the db

            Returns:
                Listen object which is the currently playing song of the user

        """
        data = cache.get(self.PLAYING_NOW_KEY + str(user_id))
        if not data:
            return None
        data = ujson.loads(data)
        data.update({'playing_now': True})
        return Listen.from_json(data)
Example #10
    def get_playing_now(self, user_id):
        """ Return the current playing song of the user

            Arguments:
                user_id (int): the id of the user in the db

            Returns:
                Listen object which is the currently playing song of the user

        """
        data = self.redis.get('playing_now:{}'.format(user_id))
        if not data:
            return None
        data = ujson.loads(data)
        data.update({'playing_now': True})
        return Listen.from_json(data)
Example #11
    def get_playing_now(self, user_id):
        """ Return the current playing song of the user

            Arguments:
                user_id (int): the id of the user in the db

            Returns:
                Listen object which is the currently playing song of the user

        """
        data = self.redis.get('playing_now:{}'.format(user_id))
        if not data:
            return None
        data = ujson.loads(data)
        data.update({'listened_at': MIN_ID+1})
        return Listen.from_json(data)
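
These playing-now getters all read a key that something else must have written; none of the examples show that producer. A hypothetical sketch of it with redis-py, reusing the 'playing_now:<user_id>' key format from the examples; the expiry value is an assumed policy so stale entries disappear on their own:

import ujson

def put_playing_now(redis_client, user_id, track_metadata, expire_seconds=600):
    data = {'track_metadata': track_metadata}
    # setex stores the value with a TTL; expire_seconds is an assumption.
    redis_client.setex('playing_now:{}'.format(user_id), expire_seconds, ujson.dumps(data))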
Example #12
    def callback(self, ch, method, properties, body):

        listens = ujson.loads(body)
        non_null_listens = []

        for listen in listens:
            try:
                check_recursively_for_nulls(listen)
            except ValueError:
                # temporary to make sure fix is working
                current_app.logger.error(
                    "Found null byte in listen. Skipping!", exc_info=True)
                continue
            non_null_listens.append(listen)

        msb_listens = []
        for chunk in chunked(non_null_listens,
                             MAX_ITEMS_PER_MESSYBRAINZ_LOOKUP):
            msb_listens.extend(self.messybrainz_lookup(chunk))

        submit = []
        for listen in msb_listens:
            try:
                submit.append(Listen.from_json(listen))
            except ValueError:
                pass

        ret = self.insert_to_listenstore(submit)

        # If there is an error, we do not ack the message so that rabbitmq redelivers it later.
        if ret == LISTEN_INSERT_ERROR_SENTINEL:
            return ret

        while True:
            try:
                self.incoming_ch.basic_ack(delivery_tag=method.delivery_tag)
                break
            except pika.exceptions.ConnectionClosed:
                self.connect_to_rabbitmq()

        return ret
Example #13
    def callback(self, ch, method, properties, body):

        listens = ujson.loads(body)

        submit = []
        for listen in listens:
            try:
                submit.append(Listen.from_json(listen))
            except ValueError:
                pass

        ret = self.insert_to_listenstore(submit)
        if not ret:
            return ret

        while True:
            try:
                self.incoming_ch.basic_ack(delivery_tag=method.delivery_tag)
                break
            except pika.exceptions.ConnectionClosed:
                self.connect_to_rabbitmq()

        return ret
Example #14
    def write(self, listen_dicts):
        submit = []
        unique = []
        duplicate_count = 0
        unique_count = 0

        # Partition the listens on the basis of user names
        # and then store the time range for each user
        users = {}
        for listen in listen_dicts:

            t = int(listen['listened_at'])
            user_name = listen['user_name']

            if user_name not in users:
                users[user_name] = {
                    'min_time': t,
                    'max_time': t,
                    'listens': [listen],
                }
                continue

            if t > users[user_name]['max_time']:
                users[user_name]['max_time'] = t

            if t < users[user_name]['min_time']:
                users[user_name]['min_time'] = t

            users[user_name]['listens'].append(listen)

        # get listens in the time range for each user and
        # remove duplicates on the basis of timestamps
        for user_name in users:

            # get the range of time that we need to get from influx for
            # deduplication of listens
            min_time = users[user_name]['min_time']
            max_time = users[user_name]['max_time']

            # querying for artist name here, since a field must be included in the query.
            query = """SELECT time, artist_name
                         FROM %s
                        WHERE time >= %s
                          AND time <= %s
                    """ % (get_escaped_measurement_name(user_name),
                           get_influx_query_timestamp(min_time),
                           get_influx_query_timestamp(max_time))

            while True:
                try:
                    results = self.influx.query(query)
                    break
                except Exception as e:
                    self.log.error("Cannot query influx: %s" % str(e))
                    sleep(3)

            # collect all the timestamps for this given time range.
            timestamps = {}
            for result in results.get_points(
                    measurement=get_measurement_name(user_name)):
                timestamps[convert_to_unix_timestamp(result['time'])] = 1

            for listen in users[user_name]['listens']:
                # Check if this listen is already present in Influx DB and if it is
                # mark current listen as duplicate
                t = int(listen['listened_at'])
                if t in timestamps:
                    duplicate_count += 1
                    continue
                else:
                    unique_count += 1
                    submit.append(Listen.from_json(listen))
                    unique.append(listen)
                    timestamps[t] = 1

        t0 = time()
        submitted_count = self.insert_to_listenstore(submit)
        self.time += time() - t0

        self.log.error("dups: %d, unique: %d, submitted: %d" %
                       (duplicate_count, unique_count, submitted_count))
        if not unique_count:
            return True

        while True:
            try:
                self.unique_ch.basic_publish(exchange='unique',
                                             routing_key='',
                                             body=ujson.dumps(unique),
                                             properties=pika.BasicProperties(
                                                 delivery_mode=2, ))
                break
            except pika.exceptions.ConnectionClosed:
                self.connect_to_rabbitmq()

        return True
Example #15
    def write(self, listen_dicts):
        submit = []
        unique = []
        duplicate_count = 0
        unique_count = 0

        # Partition the listens on the basis of user names
        # and then store the time range for each user
        users = {}
        for listen in listen_dicts:

            t = int(listen['listened_at'])
            user_name = listen['user_name']

            if user_name not in users:
                users[user_name] = {
                    'min_time': t,
                    'max_time': t,
                    'listens': [listen],
                }
                continue

            if t > users[user_name]['max_time']:
                users[user_name]['max_time'] = t

            if t < users[user_name]['min_time']:
                users[user_name]['min_time'] = t

            users[user_name]['listens'].append(listen)

        # get listens in the time range for each user and
        # remove duplicates on the basis of timestamps
        for user_name in users:

            # get the range of time that we need to get from influx for
            # deduplication of listens
            min_time = users[user_name]['min_time']
            max_time = users[user_name]['max_time']

            query = """SELECT time, recording_msid
                         FROM %s
                        WHERE time >= %s
                          AND time <= %s
                    """ % (get_escaped_measurement_name(user_name), get_influx_query_timestamp(min_time), get_influx_query_timestamp(max_time))

            while True:
                try:
                    results = self.influx.query(query)
                    break
                except Exception as e:
                    self.log.error("Cannot query influx: %s" % str(e))
                    sleep(3)

            # collect all the timestamps for this given time range.

            timestamps = defaultdict(list) # dict of list of listens indexed by timestamp
            for result in results.get_points(measurement=get_measurement_name(user_name)):
                timestamps[convert_to_unix_timestamp(result['time'])].append(result)

            for listen in users[user_name]['listens']:
                # Check if a listen with the same timestamp and recording msid is already present in
                # Influx DB and if it is, mark current listen as duplicate
                t = int(listen['listened_at'])
                recording_msid = listen['recording_msid']
                dup = False

                if t in timestamps:
                    for row in timestamps[t]:
                        if row['recording_msid'] == recording_msid:
                            duplicate_count += 1
                            dup = True
                            break
                    else:
                        # if there are listens with the same timestamp but different
                        # metadata, we add a tag specifically for making sure that
                        # influxdb doesn't drop one of the listens. This value
                        # is monotonically increasing and defaults to 0
                        listen['dedup_tag'] = len(timestamps[t])

                if not dup:
                    unique_count += 1
                    submit.append(Listen.from_json(listen))
                    unique.append(listen)
                    timestamps[t].append({
                        'time': convert_timestamp_to_influx_row_format(t),
                        'recording_msid': recording_msid
                    })

        t0 = time()
        submitted_count = self.insert_to_listenstore(submit)
        self.time += time() - t0

        self.log.error("dups: %d, unique: %d, submitted: %d" % (duplicate_count, unique_count, submitted_count))
        if not unique_count:
            return True

        while True:
            try:
                self.unique_ch.basic_publish(
                    exchange=self.config.UNIQUE_EXCHANGE,
                    routing_key='',
                    body=ujson.dumps(unique),
                    properties=pika.BasicProperties(delivery_mode = 2,),
                )
                break
            except pika.exceptions.ConnectionClosed:
                self.connect_to_rabbitmq()

        return True
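
The change from Example #14 to Example #15 is in the duplicate test: a listen is a duplicate only when both its timestamp and its recording_msid match a row already in InfluxDB; listens that merely share a timestamp get a dedup_tag so InfluxDB does not overwrite one point with the other. A minimal sketch of that decision in isolation, with made-up data and names mirroring the example:

from collections import defaultdict

def is_duplicate(listen, timestamps):
    # timestamps maps a unix timestamp to the rows already stored for it.
    existing = timestamps.get(int(listen['listened_at']), [])
    for row in existing:
        if row['recording_msid'] == listen['recording_msid']:
            return True
    if existing:
        # Same timestamp, different recording: tag it so InfluxDB keeps both points.
        listen['dedup_tag'] = len(existing)
    return False

timestamps = defaultdict(list)
timestamps[100].append({'recording_msid': 'aaa'})
print(is_duplicate({'listened_at': 100, 'recording_msid': 'aaa'}, timestamps))  # True
print(is_duplicate({'listened_at': 100, 'recording_msid': 'bbb'}, timestamps))  # False, dedup_tag becomes 1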
Example #16
    def import_listens_dump(self,
                            archive_path,
                            threads=DUMP_DEFAULT_THREAD_COUNT):
        """ Imports listens into TimescaleDB from a ListenBrainz listens dump .tar.xz archive.

        Args:
            archive_path (str): the path to the listens dump .tar.xz archive to be imported
            threads (int): the number of threads to be used for decompression
                           (defaults to DUMP_DEFAULT_THREAD_COUNT)

        Returns:
            int: the number of users for whom listens have been imported
        """

        self.log.info('Beginning import of listens from dump %s...',
                      archive_path)

        # construct the pxz command to decompress the archive
        pxz_command = [
            'pxz', '--decompress', '--stdout', archive_path,
            '-T{threads}'.format(threads=threads)
        ]
        pxz = subprocess.Popen(pxz_command, stdout=subprocess.PIPE)

        schema_checked = False
        total_imported = 0
        with tarfile.open(fileobj=pxz.stdout, mode='r|') as tar:
            listens = []
            for member in tar:
                if member.name.endswith('SCHEMA_SEQUENCE'):
                    self.log.info(
                        'Checking if schema version of dump matches...')
                    schema_seq = int(
                        tar.extractfile(member).read().strip() or '-1')
                    if schema_seq != LISTENS_DUMP_SCHEMA_VERSION:
                        raise SchemaMismatchException(
                            'Incorrect schema version! Expected: %d, got: %d.'
                            'Please ensure that the data dump version matches the code version'
                            'in order to import the data.' %
                            (LISTENS_DUMP_SCHEMA_VERSION, schema_seq))
                    schema_checked = True

                if member.name.endswith(".listens"):
                    if not schema_checked:
                        raise SchemaMismatchException(
                            "SCHEMA_SEQUENCE file missing from listen dump.")

                    with tar.extractfile(
                            member
                    ) as tarf:  # tarf, really? That's the name you're going with? Yep.
                        while True:
                            line = tarf.readline()
                            if not line:
                                break

                            listen = Listen.from_json(ujson.loads(line))
                            listens.append(listen)

                            if len(listens) > DUMP_CHUNK_SIZE:
                                total_imported += len(listens)
                                self.insert(listens)
                                listens = []

            if len(listens) > 0:
                total_imported += len(listens)
                self.insert(listens)

        if not schema_checked:
            raise SchemaMismatchException(
                "SCHEMA_SEQUENCE file missing from listen dump.")

        self.log.info('Import of listens from dump %s done!', archive_path)
        pxz.stdout.close()

        return total_imported
Example #17
    def import_listens_dump(self, archive_path, threads=DUMP_DEFAULT_THREAD_COUNT):
        """ Imports listens into InfluxDB from a ListenBrainz listens dump .tar.xz archive.

        Args:
            archive_path (str): the path to the listens dump .tar.xz archive to be imported
            threads (int): the number of threads to be used for decompression
                           (defaults to DUMP_DEFAULT_THREAD_COUNT)

        Returns:
            int: the number of users for whom listens have been imported
        """

        self.log.info('Beginning import of listens from dump %s...', archive_path)

        # construct the pxz command to decompress the archive
        pxz_command = ['pxz', '--decompress', '--stdout', archive_path, '-T{threads}'.format(threads=threads)]

        # run the command once to ensure schema version is correct
        # and load the index
        pxz = subprocess.Popen(pxz_command, stdout=subprocess.PIPE)

        index = None
        with tarfile.open(fileobj=pxz.stdout, mode='r|') as tar:
            schema_check_done = False
            index_loaded = False
            for member in tar:
                file_name = member.name.split('/')[-1]
                if file_name == 'SCHEMA_SEQUENCE':
                    self.log.info('Checking if schema version of dump matches...')
                    schema_seq = int(tar.extractfile(member).read().strip())
                    if schema_seq != LISTENS_DUMP_SCHEMA_VERSION:
                        raise SchemaMismatchException('Incorrect schema version! Expected: %d, got: %d.'
                                        'Please ensure that the data dump version matches the code version'
                                        'in order to import the data.'
                                        % (LISTENS_DUMP_SCHEMA_VERSION, schema_seq))
                    schema_check_done = True

                elif file_name == 'index.json':
                    with tar.extractfile(member) as f:
                        index = ujson.load(f)
                    index_loaded = True

                if schema_check_done and index_loaded:
                    self.log.info('Schema version matched and index.json loaded!')
                    self.log.info('Starting import of listens...')
                    break
            else:
                raise SchemaMismatchException('Metadata files missing in dump, please ensure that the dump file is valid.')


        # close pxz command and start over again, this time with the aim of importing all listens
        pxz.stdout.close()

        file_contents = defaultdict(list)
        for user, info in index.items():
            file_contents[info['file_name']].append({
                'user_name': user,
                'offset': info['offset'],
                'size': info['size'],
            })

        for file_name in file_contents:
            file_contents[file_name] = sorted(file_contents[file_name], key=lambda x: x['offset'])

        pxz = subprocess.Popen(pxz_command, stdout=subprocess.PIPE)

        users_done = 0
        with tarfile.open(fileobj=pxz.stdout, mode='r|') as tar:
            for member in tar:
                file_name = member.name.split('/')[-1]
                if file_name.endswith('.listens'):

                    file_name = file_name[:-8]
                    with tar.extractfile(member) as f:
                        for user in file_contents[file_name]:
                            self.log.info('Importing user %s...', user['user_name'])
                            assert(f.tell() == user['offset'])
                            bytes_read = 0
                            listens = []
                            while bytes_read < user['size']:
                                line = f.readline()
                                bytes_read += len(line)
                                listen = Listen.from_json(ujson.loads(line)).to_influx(quote(user['user_name']))
                                listens.append(listen)

                                if len(listens) > DUMP_CHUNK_SIZE:
                                    self.write_points_to_db(listens)
                                    listens = []

                            if len(listens) > 0:
                                self.write_points_to_db(listens)

                            self.log.info('Import of user %s done!', user['user_name'])
                            users_done += 1

        self.log.info('Import of listens from dump %s done!', archive_path)
        pxz.stdout.close()
        return users_done
Example #18
    def import_listens_dump(self,
                            archive_path,
                            threads=DUMP_DEFAULT_THREAD_COUNT):
        """ Imports listens into InfluxDB from a ListenBrainz listens dump .tar.xz archive.

        Args:
            archive_path (str): the path to the listens dump .tar.xz archive to be imported
            threads (int): the number of threads to be used for decompression
                           (defaults to DUMP_DEFAULT_THREAD_COUNT)

        Returns:
            int: the number of users for whom listens have been imported
        """

        self.log.info('Beginning import of listens from dump %s...',
                      archive_path)

        # construct the pxz command to decompress the archive
        pxz_command = [
            'pxz', '--decompress', '--stdout', archive_path,
            '-T{threads}'.format(threads=threads)
        ]

        # run the command once to ensure schema version is correct
        # and load the index
        pxz = subprocess.Popen(pxz_command, stdout=subprocess.PIPE)

        index = None
        with tarfile.open(fileobj=pxz.stdout, mode='r|') as tar:
            schema_check_done = False
            index_loaded = False
            for member in tar:
                file_name = member.name.split('/')[-1]
                if file_name == 'SCHEMA_SEQUENCE':
                    self.log.info(
                        'Checking if schema version of dump matches...')
                    schema_seq = int(tar.extractfile(member).read().strip())
                    if schema_seq != LISTENS_DUMP_SCHEMA_VERSION:
                        raise SchemaMismatchException(
                            'Incorrect schema version! Expected: %d, got: %d.'
                            'Please ensure that the data dump version matches the code version'
                            'in order to import the data.' %
                            (LISTENS_DUMP_SCHEMA_VERSION, schema_seq))
                    schema_check_done = True

                elif file_name == 'index.json':
                    with tar.extractfile(member) as f:
                        index = ujson.load(f)
                    index_loaded = True

                if schema_check_done and index_loaded:
                    self.log.info(
                        'Schema version matched and index.json loaded!')
                    self.log.info('Starting import of listens...')
                    break
            else:
                raise SchemaMismatchException(
                    'Metadata files missing in dump, please ensure that the dump file is valid.'
                )

        # close pxz command and start over again, this time with the aim of importing all listens
        pxz.stdout.close()

        file_contents = defaultdict(list)
        for user, info in index.items():
            file_contents[info['file_name']].append({
                'user_name': user,
                'offset': info['offset'],
                'size': info['size'],
            })

        for file_name in file_contents:
            file_contents[file_name] = sorted(file_contents[file_name],
                                              key=lambda x: x['offset'])

        pxz = subprocess.Popen(pxz_command, stdout=subprocess.PIPE)

        users_done = 0
        with tarfile.open(fileobj=pxz.stdout, mode='r|') as tar:
            for member in tar:
                file_name = member.name.split('/')[-1]
                if file_name.endswith('.listens'):

                    file_name = file_name[:-8]
                    with tar.extractfile(member) as f:
                        for user in file_contents[file_name]:
                            self.log.info('Importing user %s...',
                                          user['user_name'])
                            assert (f.tell() == user['offset'])
                            bytes_read = 0
                            listens = []
                            while bytes_read < user['size']:
                                line = f.readline()
                                bytes_read += len(line)
                                listen = Listen.from_json(
                                    ujson.loads(line)).to_influx(
                                        quote(user['user_name']))
                                listens.append(listen)

                                if len(listens) > DUMP_CHUNK_SIZE:
                                    self.write_points_to_db(listens)
                                    listens = []

                            if len(listens) > 0:
                                self.write_points_to_db(listens)

                            self.log.info('Import of user %s done!',
                                          user['user_name'])
                            users_done += 1

        self.log.info('Import of listens from dump %s done!', archive_path)
        pxz.stdout.close()
        return users_done
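
Examples #17 and #18 make two passes over the dump: the first pass validates SCHEMA_SEQUENCE and loads index.json, and the second seeks through each .listens file using the per-user offsets and sizes recorded in that index. The code only shows how the index is consumed; a hypothetical fragment of what index.json is assumed to look like, matching the keys the importer reads (file_name, offset, size):

# Shape assumed from the importer above; the user names, file names and values are made up.
index = {
    'rob': {'file_name': '0', 'offset': 0, 'size': 58000},
    'lucifer': {'file_name': '0', 'offset': 58000, 'size': 12345},
}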