def test_to_influx(self):
        listen = Listen(
            timestamp=int(time.time()),
            user_name='testuser',
            artist_msid=uuid.uuid4(),
            recording_msid=uuid.uuid4(),
            dedup_tag=3,
            data={
                'artist_name': 'Radiohead',
                'track_name': 'True Love Waits',
                'additional_info': {
                    'release_type': ["ALBUM", "REMIX"],
                }
            }
        )

        data = listen.to_influx(quote(listen.user_name))

        # Make sure every value that we don't explicitly support is a string
        for key in data['fields']:
            if key not in Listen.SUPPORTED_KEYS and key not in Listen.PRIVATE_KEYS:
                self.assertIsInstance(data['fields'][key], str)

        # Check values
        self.assertEqual(data['measurement'], quote(listen.user_name))
        self.assertEqual(data['time'], listen.ts_since_epoch)
        self.assertEqual(data['tags']['dedup_tag'], listen.dedup_tag)
        self.assertEqual(data['fields']['user_name'], listen.user_name)
        self.assertEqual(data['fields']['artist_msid'], listen.artist_msid)
        self.assertEqual(data['fields']['recording_msid'], listen.recording_msid)
        self.assertEqual(data['fields']['track_name'], listen.data['track_name'])
        self.assertEqual(data['fields']['artist_name'], listen.data['artist_name'])

        self.assertIn('inserted_timestamp', data['fields'])
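
For reference, a hand-written sketch of the point shape this test inspects (placeholder values, not actual output; any key outside SUPPORTED_KEYS, such as 'release_type', should arrive serialized as a string):

# Illustrative only -- every value below is a placeholder, not real output.
expected_shape = {
    'measurement': 'testuser',                 # quoted user name
    'time': 1496856185,                        # seconds since epoch
    'tags': {'dedup_tag': 3},
    'fields': {
        'user_name': 'testuser',
        'artist_msid': '...uuid...',
        'recording_msid': '...uuid...',
        'artist_name': 'Radiohead',
        'track_name': 'True Love Waits',
        'release_type': '["ALBUM", "REMIX"]',  # unsupported key, stringified
        'inserted_timestamp': 1496856185,
    },
}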
    def test_to_influx(self):
        listen = Listen(timestamp=int(time.time()),
                        user_name='testuser',
                        artist_msid=uuid.uuid4(),
                        recording_msid=uuid.uuid4(),
                        data={
                            'artist_name': 'Radiohead',
                            'track_name': 'True Love Waits',
                            'additional_info': {
                                'release_type': ["ALBUM", "REMIX"],
                            }
                        })

        data = listen.to_influx(quote(listen.user_name))

        # Make sure every value that we don't explicitly support is a string
        for key in data['fields']:
            if key not in Listen.SUPPORTED_KEYS:
                self.assertIsInstance(data['fields'][key], str)

        # Check values
        self.assertEqual(data['measurement'], quote(listen.user_name))
        self.assertEqual(data['time'], listen.ts_since_epoch)
        self.assertEqual(data['tags']['user_name'], listen.user_name)
        self.assertEqual(data['fields']['artist_msid'], listen.artist_msid)
        self.assertEqual(data['fields']['recording_msid'],
                         listen.recording_msid)
        self.assertEqual(data['fields']['track_name'],
                         listen.data['track_name'])
        self.assertEqual(data['fields']['artist_name'],
                         listen.data['artist_name'])
Example #3
    def test_to_timescale(self):
        listen = Listen(
            timestamp=int(time.time()),
            user_name='testuser',
            artist_msid=str(uuid.uuid4()),
            dedup_tag=3,
            user_id=1,
            data={
                'artist_name': 'Radiohead',
                'track_name': 'True Love Waits',
                'additional_info': {
                    'release_type': ["ALBUM", "REMIX"],
                    'recording_msid': str(uuid.uuid4()),
                }
            }
        )

        listened_at, track_name, user_name, data = listen.to_timescale()

        # Check data is of type string
        self.assertIsInstance(data, str)

        # Convert returned data to json
        json_data = ujson.loads(data)

        # Check that the required fields are dumped into data
        self.assertIn('track_metadata', json_data)
        self.assertIn('additional_info', json_data['track_metadata'])

        # Check that the required fields are dumped into data
        self.assertEqual(listened_at, listen.ts_since_epoch)
        self.assertEqual(track_name, listen.data['track_name'])
        self.assertEqual(user_name, listen.user_name)
        self.assertEqual(json_data['user_id'], listen.user_id)
        self.assertEqual(json_data['track_metadata']['artist_name'], listen.data['artist_name'])
Example #4
    def test_from_json_null_values(self):
        data = {
            "listened_at": 1618353413,
            "track_metadata": {
                "additional_info": {
                    "recording_mbid": "99e087e1-5649-4e8c-b84f-eea05b8e143a",
                    "release_mbid": "4b6ca48c-f7db-439d-ba57-6104b5fec61e",
                    "artist_mbid": "e1564e98-978b-4947-8698-f6fd6f8b0181\u0000\ufeff9ad10546-b081-4cc8-a487-3d2eece82d9e\u0000\ufeff5245e5cd-4408-4d9e-a037-c71a53edce83",
                    "artist_msid": "392f2883-724f-4c63-b155-81a7cc89a499",
                    "release_msid": "632207f8-150f-4342-99ad-0fd5a6687e63"
                },
                "artist_name": "Fort Minor Feat. Holly Brook & Jonah Matranga",
                "track_name": "some name"
            }
        }
        with self.assertRaises(ValueError):
            Listen.from_json(data)
Example #5
    def test_from_json(self):
        json_row = {"track_metadata": {"additional_info": {}}}

        json_row.update({'listened_at': 123456})
        listen = Listen.from_json(json_row)

        self.assertEqual(listen.timestamp, json_row['listened_at'])

        del json_row['listened_at']
        json_row.update({'playing_now': True})
        listen = Listen.from_json(json_row)

        self.assertEqual(listen.timestamp, None)
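
A minimal sketch of the timestamp rule the two assertions above exercise (an assumption about Listen.from_json, not the real implementation):

def timestamp_from_json(json_row):
    # A playing-now listen carries no 'listened_at' key; regular listens do.
    if json_row.get('playing_now'):
        return None
    return json_row.get('listened_at')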
Example #6
    def callback(self, ch, method, properties, body):

        listens = ujson.loads(body)

        msb_listens = []
        for chunk in chunked(listens, MAX_ITEMS_PER_MESSYBRAINZ_LOOKUP):
            msb_listens.extend(self.messybrainz_lookup(chunk))

        submit = []
        for listen in msb_listens:
            try:
                submit.append(Listen.from_json(listen))
            except ValueError:
                pass

        ret = self.insert_to_listenstore(submit)

        # If there is an error, we do not ack the message so that rabbitmq redelivers it later.
        if ret == LISTEN_INSERT_ERROR_SENTINEL:
            return ret

        while True:
            try:
                self.incoming_ch.basic_ack(delivery_tag=method.delivery_tag)
                break
            except pika.exceptions.ConnectionClosed:
                self.connect_to_rabbitmq()

        return ret
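
chunked is imported from a utility module in the real code base; a minimal equivalent, shown here for reference only:

def chunked(iterable, size):
    """Yield successive lists of at most `size` items from `iterable`."""
    chunk = []
    for item in iterable:
        chunk.append(item)
        if len(chunk) == size:
            yield chunk
            chunk = []
    if chunk:
        yield chunk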
    def fetch_listens_from_storage(self, user_name, from_ts, to_ts, limit, order):
        """ The timestamps are stored as UTC in the postgres datebase while on retrieving
            the value they are converted to the local server's timezone. So to compare
            datetime object we need to create a object in the same timezone as the server.

            from_ts: seconds since epoch, in float
            to_ts: seconds since epoch, in float
        """

        # Quote single quote characters which could be used to mount an injection attack.
        # Sadly, influxdb does not provide a means to do this in the client library
        query = 'SELECT * FROM ' + get_escaped_measurement_name(user_name)

        if from_ts is not None:
            query += " WHERE time > " + get_influx_query_timestamp(from_ts)
        else:
            query += " WHERE time < " + get_influx_query_timestamp(to_ts)

        query += " ORDER BY time " + ORDER_TEXT[order] + " LIMIT " + str(limit)
        try:
            results = self.influx.query(query)
        except Exception as err:
            self.log.error("Cannot query influx while getting listens for user: %s: %s", user_name, str(err), exc_info=True)
            return []

        listens = []
        for result in results.get_points(measurement=get_measurement_name(user_name)):
            listens.append(Listen.from_influx(result))

        if order == ORDER_ASC:
            listens.reverse()

        return listens
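
get_influx_query_timestamp is assumed to render an epoch value in InfluxQL time syntax; one plausible sketch (the real helper may differ):

def get_influx_query_timestamp(ts):
    # InfluxQL accepts epoch literals with an explicit precision suffix,
    # e.g. "WHERE time > 1496856185s".
    return '{}s'.format(int(ts))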
def create_test_data_for_timescalelistenstore(user_name: str, user_id: int, test_data_file_name: str = None):
    """Create listens for timescalelistenstore tests.

    From a json file in testdata it creates Listen objects with a specified user_name for tests.

    Args:
        user_name: MusicBrainz username of a user.
        user_id: listenbrainz row id of the user
        test_data_file_name: If specified use the given file to create Listen objects.
                                   DEFAULT = 'timescale_listenstore_test_listens.json'

    Returns:
        A list of Listen objects.
    """
    if not test_data_file_name:
        test_data_file_name = 'timescale_listenstore_test_listens.json'

    test_data_file = os.path.join(TEST_DATA_PATH, test_data_file_name)
    with open(test_data_file, 'r') as f:
        listens = json.load(f)

    test_data = []
    for listen in listens['payload']:
        listen['user_name'] = user_name
        listen['user_id'] = user_id
        test_data.append(Listen().from_json(listen))

    return test_data
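
Typical usage in a test setup might look like this (hypothetical fixture names):

# test_data = create_test_data_for_timescalelistenstore('testuser', user_id=1)
# self.logstore.insert(test_data)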
    def fetch_listens_from_storage(self, user_name, from_ts, to_ts, limit,
                                   order):
        """ The timestamps are stored as UTC in the postgres datebase while on retrieving
            the value they are converted to the local server's timezone. So to compare
            datetime object we need to create a object in the same timezone as the server.

            from_ts: seconds since epoch, in float
            to_ts: seconds since epoch, in float
        """

        # Quote single quote characters which could be used to mount an injection attack.
        # Sadly, influxdb does not provide a means to do this in the client library
        query = 'SELECT * FROM ' + get_escaped_measurement_name(user_name)

        if from_ts is not None:
            query += " WHERE time > " + get_influx_query_timestamp(from_ts)
        else:
            query += " WHERE time < " + get_influx_query_timestamp(to_ts)

        query += " ORDER BY time " + ORDER_TEXT[order] + " LIMIT " + str(limit)
        try:
            results = self.influx.query(query)
        except Exception as err:
            self.log.error("Cannot query influx: %s" % str(err))
            return []

        listens = []
        for result in results.get_points(
                measurement=get_measurement_name(user_name)):
            listens.append(Listen.from_influx(result))

        if order == ORDER_ASC:
            listens.reverse()

        return listens
Example #10
def generate_data(test_user_id, user_name, from_ts, num_records, inserted_ts=None):
    test_data = []
    artist_msid = str(uuid.uuid4())

    for i in range(num_records):
        if not inserted_ts:
            inserted_timestamp = datetime.utcnow()
        else:
            inserted_timestamp = datetime.utcfromtimestamp(inserted_ts)
        timestamp = datetime.utcfromtimestamp(from_ts)
        item = Listen(
            user_name=user_name,
            user_id=test_user_id,
            timestamp=timestamp,
            artist_msid=artist_msid,
            recording_msid=str(uuid.uuid4()),
            inserted_timestamp=inserted_timestamp,
            data={
                'artist_name': 'Frank Ocean',
                'track_name': 'Crack Rock',
                'additional_info': {},
            },
        )
        test_data.append(item)
        from_ts += 1   # Add one second
        if inserted_ts:
            inserted_ts += 1   # Add one second

    return test_data
Example #11
def generate_data(test_user_id, user_name, from_ts, num_records):
    test_data = []
    artist_msid = str(uuid.uuid4())

    if from_ts is None:  # check for playing-now listens
        timestamp = None
    else:
        from_ts += 1  # Add one second
        timestamp = datetime.utcfromtimestamp(from_ts)

    for i in range(num_records):
        item = Listen(
            user_name=user_name,
            user_id=test_user_id,
            timestamp=timestamp,
            artist_msid=artist_msid,
            recording_msid=str(uuid.uuid4()),
            data={
                'artist_name': 'Frank Ocean',
                'track_name': 'Crack Rock',
                'additional_info': {},
            },
        )
        test_data.append(item)
    return test_data
    def fetch_recent_listens_for_users(self, user_list, limit=2, max_age=3600):
        """ Fetch recent listens for a list of users, given a limit which applies per user. If you
            have a limit of 3 and 3 users you should get 9 listens if they are available.

            user_list: A list containing the users for which you'd like to retrieve recent listens.
            limit: the maximum number of listens for each user to fetch.
            max_age: Only return listens if they are no more than max_age seconds old. Default 3600 seconds
        """

        args = {'user_list': tuple(user_list), 'ts': int(time.time()) - max_age, 'limit': limit}
        query = """SELECT * FROM (
                              SELECT listened_at, track_name, user_name, created, data,
                                     row_number() OVER (partition by user_name ORDER BY listened_at DESC) AS rownum
                                FROM listen
                               WHERE user_name IN :user_list
                                 AND listened_at > :ts
                            GROUP BY user_name, listened_at, track_name, created, data
                            ORDER BY listened_at DESC) tmp
                           WHERE rownum <= :limit"""

        listens = []
        with timescale.engine.connect() as connection:
            curs = connection.execute(sqlalchemy.text(query), args)
            while True:
                result = curs.fetchone()
                if not result:
                    break

                listens.append(Listen.from_timescale(result[0], result[1], result[2], result[3], result[4]))

        return listens
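
The row_number() window numbers each user's listens newest-first, so the outer rownum filter caps rows per user at `limit`. A hypothetical call fetching at most two listens per user from the last hour:

# recent = store.fetch_recent_listens_for_users(['alice', 'bob'], limit=2, max_age=3600)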
Example #13
    def test_update_and_get_recent_listens(self):

        recent = self._redis.get_recent_listens()
        self.assertEqual(recent, [])

        listens = []
        t = int(time.time())
        for i in range(RedisListenStore.RECENT_LISTENS_MAX * 3):
            listen = Listen(user_id=self.testuser['id'],
                            user_name=self.testuser['musicbrainz_id'],
                            timestamp=t - i,
                            data={
                                'artist_name': str(uuid.uuid4()),
                                'track_name': str(uuid.uuid4()),
                                'additional_info': {},
                            })
            listens.append(listen)
            self._redis.update_recent_listens(listens)

        recent = self._redis.get_recent_listens()
        self.assertEqual(len(recent), RedisListenStore.RECENT_LISTENS_MAX)
        self.assertIsInstance(recent[0], Listen)
        for i, r in enumerate(recent):
            self.assertEqual(r.timestamp, listens[i].timestamp)

        recent = self._redis.get_recent_listens(5)
        self.assertEqual(len(recent), 5)
        for i, r in enumerate(recent):
            self.assertEqual(r.timestamp, listens[i].timestamp)
Example #14
def generate_data(from_date, num_records, user_name):
    test_data = []
    current_date = to_epoch(from_date)
    artist_msid = str(uuid.uuid4())

    user = db_user.get_by_mb_id(user_name)
    if not user:
        db_user.create(user_name)
        user = db_user.get_by_mb_id(user_name)

    for i in range(num_records):
        current_date += 1   # Add one second
        item = Listen(
            user_id=user['id'],
            user_name=user_name,
            timestamp=datetime.utcfromtimestamp(current_date),
            artist_msid=artist_msid,
            recording_msid=str(uuid.uuid4()),
            release_msid=str(uuid.uuid4()),
            data={
                'artist_name': 'Test Artist Pls ignore',
                'track_name': 'Hello Goodbye',
                'additional_info': {},
            },
        )
        test_data.append(item)
    return test_data
    def fetch_recent_listens_for_users(self, user_list, limit=2, max_age=3600):
        """ Fetch recent listens for a list of users, given a limit which applies per user. If you 
            have a limit of 3 and 3 users you should get 9 listens if they are available.

            user_list: A list containing the users for which you'd like to retrieve recent listens.
            limit: the maximum number of listens for each user to fetch.
            max_age: Only return listens if they are no more than max_age seconds old. Default 3600 seconds
        """

        escaped_user_list = []
        for user_name in user_list:
            escaped_user_list.append(get_escaped_measurement_name(user_name))

        query = "SELECT username, * FROM " + ",".join(escaped_user_list)
        query += " WHERE time > " + get_influx_query_timestamp(
            int(time.time()) - max_age)
        query += " ORDER BY time DESC LIMIT " + str(limit)
        try:
            results = self.influx.query(query)
        except Exception as err:
            self.log.error(
                "Cannot query influx while getting listens for users: %s: %s",
                user_list,
                str(err),
                exc_info=True)
            return []

        listens = []
        for user in user_list:
            for result in results.get_points(
                    measurement=get_measurement_name(user)):
                listens.append(Listen.from_influx(result))

        return listens
Example #16
    def fetch_listens_for_multiple_users_from_storage(self, user_names: List[str], from_ts: float, to_ts: float, limit: int, order: int, time_range: int=3):
        """ The timestamps are stored as UTC in the postgres datebase while on retrieving
            the value they are converted to the local server's timezone. So to compare
            datetime object we need to create a object in the same timezone as the server.

            from_ts: seconds since epoch, as a float
            to_ts: seconds since epoch, as a float
            limit: the maximum number of items to return
            order: 0 for ASCending order, 1 for DESCending order
            time_range: the time range (in units of 5 days) to search for listens. If None is given
                        3 ranges (15 days) are searched. If -1 is given then all listens are searched
                        which is slow and should be avoided if at all possible.
        """

        if time_range is None:
            time_range = 3

        if time_range < 0:
            max_timestamp_window = -1
        else:
            max_timestamp_window = SECONDS_IN_TIME_RANGE * time_range
            if to_ts is None:
                to_ts = from_ts + max_timestamp_window
            elif from_ts is None:
                from_ts = to_ts - max_timestamp_window

        query = """SELECT listened_at, track_name, created, data, user_name
                     FROM listen
                    WHERE user_name IN :user_names """

        if max_timestamp_window < 0:
            if from_ts and to_ts:
                query += """AND listened_at > :from_ts
                            AND listened_at < :to_ts """
            elif from_ts is not None:
                query += "AND listened_at > :from_ts "
            else:
                query += "AND listened_at < :to_ts "
        else:
            query += """AND listened_at > :from_ts
                        AND listened_at < :to_ts """

        query += "ORDER BY listened_at " + ORDER_TEXT[order] + " LIMIT :limit"

        listens = []
        with timescale.engine.connect() as connection:
            curs = connection.execute(sqlalchemy.text(query), user_names=tuple(user_names), from_ts=from_ts, to_ts=to_ts, limit=limit)
            while True:
                result = curs.fetchone()
                if not result:
                    break

                listens.append(Listen.from_timescale(result[0], result[1], result[4], result[2], result[3]))

        if order == ORDER_ASC:
            listens.reverse()

        return listens
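
SECONDS_IN_TIME_RANGE is not shown in this snippet; given the docstring's 5-day unit it is presumably defined along these lines:

SECONDS_IN_TIME_RANGE = 5 * 24 * 60 * 60  # one 5-day search window, in seconds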
Example #17
    def convert_row(self, row):
        return Listen(user_id=row[1],
                      user_name=row[2],
                      timestamp=row[3],
                      artist_msid=row[4],
                      release_msid=row[5],
                      recording_msid=row[6],
                      data=row[7])

    def get_playing_now(self, user_id):
        """ Return the current playing song of the user """
        data = self.redis.get('playing_now' + ':' + str(user_id))
        if not data:
            return None
        data = ujson.loads(data)
        data.update({'listened_at': MIN_ID + 1})
        return Listen.from_json(data)

    def _create_test_data(self, user_name):
        test_data = []
        for jdata in TEST_LISTEN_JSON:
            x = ujson.loads(jdata)
            x['user_name'] = user_name
            test_data.append(Listen().from_json(x))
        self.logstore.insert(test_data)
        return len(test_data)
    def test_from_influx(self):
        """ Test for the from_influx method """

        influx_row = {
            "time": "2017-06-07T17:23:05Z",
            "artist_mbids": "abaa7001-0d80-4e58-be5d-d2d246fd9d87",
            "artist_msid": "aa6130f2-a12d-47f3-8ffd-d0f71340de1f",
            "artist_name": "Majid Jordan",
            "best_song": "definitely",
            "genius_link": "https://genius.com/Majid-jordan-every-step-every-way-lyrics",
            "lastfm_link": "https://www.last.fm/music/Majid+Jordan/_/Every+Step+Every+Way",
            "other_stuff": "teststuffplsignore",
            "recording_mbid": None,
            "recording_msid": "db9a7483-a8f4-4a2c-99af-c8ab58850200",
            "release_msid": "cf138a00-05d5-4b35-8fce-181efcc15785",
            "release_name": "Majid Jordan",
            "track_name": "Every Step Every Way",
            "user_name": "iliekcomputers",
            "we_dict_now.hello": "afb",
            "we_dict_now.we_nested_now.hi": "312",
            "tags": "sing, song",
            "inserted_timestamp": 1525557084,
        }

        listen = Listen.from_influx(influx_row)

        # Check user name
        self.assertEqual(listen.user_name, influx_row['user_name'])

        # Check time stamp
        dt = datetime.strptime(influx_row['time'], '%Y-%m-%dT%H:%M:%SZ')
        ts = int(dt.strftime("%s"))
        self.assertEqual(listen.ts_since_epoch, ts)

        # Check artist mbids
        self.assertIsInstance(listen.data['additional_info']['artist_mbids'], list)
        self.assertEqual(listen.data['additional_info']['artist_mbids'], influx_row['artist_mbids'].split(','))

        # Check tags
        self.assertIsInstance(listen.data['additional_info']['tags'], list)
        self.assertEqual(listen.data['additional_info']['tags'], influx_row['tags'].split(','))

        # Check track name
        self.assertEqual(listen.data['track_name'], influx_row['track_name'])

        # Check additional info
        self.assertEqual(listen.data['additional_info']['best_song'], influx_row['best_song'])

        # Check msids
        self.assertEqual(listen.artist_msid, influx_row['artist_msid'])
        self.assertEqual(listen.release_msid, influx_row['release_msid'])
        self.assertEqual(listen.recording_msid, influx_row['recording_msid'])

        # make sure additional info does not contain stuff like artist names, track names
        self.assertNotIn('track_name', listen.data['additional_info'])
        self.assertNotIn('artist_name', listen.data['additional_info'])
        self.assertNotIn('release_name', listen.data['additional_info'])
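
The dotted field names above ('we_dict_now.hello', 'we_dict_now.we_nested_now.hi') suggest that from_influx rebuilds nested dicts from flattened keys. A minimal sketch of such unflattening, assumed rather than taken from the implementation:

def unflatten(flat):
    nested = {}
    for key, value in flat.items():
        parts = key.split('.')
        node = nested
        for part in parts[:-1]:
            node = node.setdefault(part, {})
        node[parts[-1]] = value
    return nested

# unflatten({'we_dict_now.hello': 'afb'}) -> {'we_dict_now': {'hello': 'afb'}}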
    def test_from_influx(self):
        """ Test for the from_influx method """

        influx_row = {
            "time": "2017-06-07T17:23:05Z",
            "artist_mbids": "abaa7001-0d80-4e58-be5d-d2d246fd9d87",
            "artist_msid": "aa6130f2-a12d-47f3-8ffd-d0f71340de1f",
            "artist_name": "Majid Jordan",
            "best_song": "definitely",
            "genius_link":
            "https://genius.com/Majid-jordan-every-step-every-way-lyrics",
            "lastfm_link":
            "https://www.last.fm/music/Majid+Jordan/_/Every+Step+Every+Way",
            "other_stuff": "teststuffplsignore",
            "recording_mbid": None,
            "recording_msid": "db9a7483-a8f4-4a2c-99af-c8ab58850200",
            "release_msid": "cf138a00-05d5-4b35-8fce-181efcc15785",
            "release_name": "Majid Jordan",
            "track_name": "Every Step Every Way",
            "user_name": "iliekcomputers",
            "we_dict_now.hello": "afb",
            "we_dict_now.we_nested_now.hi": "312",
            "tags": "sing, song"
        }

        listen = Listen.from_influx(influx_row)

        # Check user name
        self.assertEqual(listen.user_name, influx_row['user_name'])

        # Check time stamp
        dt = datetime.strptime(influx_row['time'], '%Y-%m-%dT%H:%M:%SZ')
        ts = int(dt.strftime("%s"))
        self.assertEqual(listen.ts_since_epoch, ts)

        # Check artist mbids
        self.assertIsInstance(listen.data['additional_info']['artist_mbids'],
                              list)
        self.assertEqual(listen.data['additional_info']['artist_mbids'],
                         influx_row['artist_mbids'].split(','))

        # Check tags
        self.assertIsInstance(listen.data['additional_info']['tags'], list)
        self.assertEqual(listen.data['additional_info']['tags'],
                         influx_row['tags'].split(','))

        # Check track name
        self.assertEqual(listen.data['track_name'], influx_row['track_name'])

        # Check additional info
        self.assertEqual(listen.data['additional_info']['best_song'],
                         influx_row['best_song'])

        # Check msids
        self.assertEqual(listen.artist_msid, influx_row['artist_msid'])
        self.assertEqual(listen.release_msid, influx_row['release_msid'])
        self.assertEqual(listen.recording_msid, influx_row['recording_msid'])
    def dump_user(self, username, fileobj, dump_time):
        """ Dump specified user's listens into specified file object.

        Args:
            username (str): the MusicBrainz ID of the user whose listens are to be dumped
            fileobj (file): the file into which listens should be written
            dump_time (datetime): the time at which the specific data dump was initiated

        Returns:
            int: the number of bytes this user's listens take in the dump file
        """
        t0 = time.time()
        offset = 0
        bytes_written = 0
        listen_count = 0

        # Get this user's listens in chunks
        while True:
            result = self.get_listens_batch_for_dump(username, dump_time,
                                                     offset)
            rows_added = 0
            for row in result.get_points(get_measurement_name(username)):
                listen = Listen.from_influx(row).to_api()
                listen['user_name'] = username
                try:
                    bytes_written += fileobj.write(ujson.dumps(listen))
                    bytes_written += fileobj.write('\n')
                    rows_added += 1
                except IOError as e:
                    self.log.critical(
                        'IOError while writing listens into file for user %s',
                        username,
                        exc_info=True)
                    raise
                except Exception as e:
                    self.log.error(
                        'Exception while creating json for user %s: %s',
                        username,
                        str(e),
                        exc_info=True)
                    raise

            listen_count += rows_added
            if not rows_added:
                break

            offset += DUMP_CHUNK_SIZE

        time_taken = time.time() - t0
        self.log.info(
            'Listens for user %s dumped, total %d listens written at %.2f listens / sec!',
            username, listen_count, listen_count / time_taken)

        # the size for this user should not include the last newline we wrote
        # hence return bytes_written - 1 as the size in the dump for this user
        return bytes_written - 1
    def get_recent_listens(self, max=RECENT_LISTENS_MAX):
        """
            Get the max number of most recent listens
        """
        recent = []
        for listen in cache._r.zrevrange(cache._prep_key(self.RECENT_LISTENS_KEY), 0, max - 1):
            recent.append(Listen.from_json(ujson.loads(listen)))

        return recent
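
The write side of this key is not shown; it is presumably a sorted-set insert scored by listen timestamp, roughly (sketch only, using the same cache helpers as above):

    def update_recent_listens(self, listens):
        # Score each API-serialized listen by its epoch timestamp so that
        # zrevrange returns newest first, then trim to RECENT_LISTENS_MAX.
        key = cache._prep_key(self.RECENT_LISTENS_KEY)
        for listen in listens:
            cache._r.zadd(key, {ujson.dumps(listen.to_api()): listen.ts_since_epoch})
        cache._r.zremrangebyrank(key, 0, -(self.RECENT_LISTENS_MAX + 1))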
Example #24
    def import_listens_dump(self, archive_path, threads=None):
        """ Imports listens into InfluxDB from a ListenBrainz listens dump .tar.xz archive.

        Args:
            archive_path (str): the path to the listens dump .tar.xz archive to be imported
            threads (int): the number of threads to be used for decompression (defaults to 1)
        """

        self.log.info('Beginning import of listens from dump %s...',
                      archive_path)

        pxz_command = ['pxz', '--decompress', '--stdout', archive_path]
        if threads is not None:
            pxz_command.append('-T{threads}'.format(threads=threads))
        pxz = subprocess.Popen(pxz_command, stdout=subprocess.PIPE)

        with tarfile.open(fileobj=pxz.stdout, mode='r|') as tar:
            for member in tar:
                file_name = member.name.split('/')[-1]

                if file_name == 'SCHEMA_SEQUENCE':
                    self.log.info(
                        'Checking if schema version of dump matches...')
                    schema_seq = int(tar.extractfile(member).read().strip())
                    if schema_seq != LISTENS_DUMP_SCHEMA_VERSION:
                        raise SchemaMismatchException(
                            'Incorrect schema version! Expected: %d, got: %d. '
                            'Please ensure that the data dump version matches the code version '
                            'in order to import the data.' %
                            (LISTENS_DUMP_SCHEMA_VERSION, schema_seq))

                elif file_name.endswith('.listens'):

                    # remove .listens from the filename to get the username
                    user_name = file_name[:-8]
                    self.log.info('Importing user %s', user_name)
                    listens = []
                    listen_count = 0

                    # iterate through files and keep writing listens in chunks
                    for listen in tar.extractfile(member):
                        influx_listen = Listen.from_json(
                            ujson.loads(listen)).to_influx(quote(user_name))
                        listens.append(influx_listen)
                        listen_count += 1

                        if listen_count > DUMP_CHUNK_SIZE:
                            self.write_points_to_db(listens)
                            listen_count = 0
                            listens = []

                    # if some listens are left, write them to db
                    if listen_count > 0:
                        self.write_points_to_db(listens)

        self.log.info('Import of listens from dump %s done!', archive_path)
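
write_points_to_db is assumed to be a thin retrying wrapper around the influxdb client's write_points; a plausible sketch:

    def write_points_to_db(self, points):
        while True:
            try:
                self.influx.write_points(points, time_precision='s')
                return
            except Exception:
                self.log.error('Error while writing listens to influx, retrying...',
                               exc_info=True)
                time.sleep(3)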
Example #25
    def get_recent_listens(self, max=RECENT_LISTENS_MAX):
        """
            Get the max number of most recent listens
        """
        recent = []
        for listen in self.redis.zrevrange(self.ns + self.RECENT_LISTENS_KEY,
                                           0, max - 1):
            recent.append(Listen.from_json(ujson.loads(listen)))

        return recent
def generate_data(test_user_id, from_ts, num_records):
    test_data = []
    artist_msid = str(uuid.uuid4())

    for i in range(num_records):
        from_ts += 1  # Add one second
        item = Listen(user_id=test_user_id,
                      timestamp=datetime.utcfromtimestamp(from_ts),
                      artist_msid=artist_msid,
                      recording_msid=str(uuid.uuid4()))
        test_data.append(item)
    return test_data
Example #27
    def send_listens(self, event_name, message):
        listens = json.loads(message.body.decode("utf-8"))
        for data in listens:
            if event_name == "playing_now":
                listen = NowPlayingListen(user_id=data["user_id"],
                                          user_name=data["user_name"],
                                          data=data["track_metadata"])
            else:
                data["track_metadata"] = data["data"]
                del data["data"]
                listen = Listen.from_json(data)
            self.socketio.emit(event_name,
                               json.dumps(listen.to_api()),
                               to=listen.user_name)
        message.ack()
Example #28
    def write_incremental_listens(self, start_time, end_time, temp_dir):
        """ Dump listens in the format for the ListenBrainz dump.

        Args:
            start_time and end_time (datetime): the range of time for the listens dump.
            temp_dir (str): the dir to use to write files before adding to archive
        """
        t0 = time.monotonic()
        offset = 0
        listen_count = 0

        unwritten_listens = {}

        while True:
            query, args = self.get_incremental_listens_query_batch(
                start_time, end_time, offset)
            rows_added = 0
            with timescale.engine.connect() as connection:
                curs = connection.execute(sqlalchemy.text(query), args)
                while True:
                    result = curs.fetchone()
                    if not result:
                        break

                    listen = Listen.from_timescale(result[0], result[1],
                                                   result[2], result[3],
                                                   result[4]).to_json()
                    timestamp = listen['timestamp']

                    if timestamp.year not in unwritten_listens:
                        unwritten_listens[timestamp.year] = {}
                    if timestamp.month not in unwritten_listens[
                            timestamp.year]:
                        unwritten_listens[timestamp.year][timestamp.month] = []

                    unwritten_listens[timestamp.year][timestamp.month].append(
                        listen)
                    rows_added += 1

            if rows_added == 0:
                break

            listen_count += rows_added
            offset += DUMP_CHUNK_SIZE

        self.write_incremental_listens_to_disk(unwritten_listens, temp_dir)
        self.log.info("%d listens dumped at %.2f listens / sec", listen_count,
                      listen_count / (time.monotonic() - t0))
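
For clarity, the accumulator handed to write_incremental_listens_to_disk is a two-level dict keyed by year, then month (illustrative):

# unwritten_listens = {
#     2021: {4: [listen_a, listen_b],   # April 2021
#            5: [listen_c]},            # May 2021
# }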
    def get_playing_now(self, user_id):
        """ Return the current playing song of the user

            Arguments:
                user_id (int): the id of the user in the db

            Returns:
                Listen object which is the currently playing song of the user

        """
        data = self.redis.get('playing_now:{}'.format(user_id))
        if not data:
            return None
        data = ujson.loads(data)
        data.update({'listened_at': MIN_ID+1})
        return Listen.from_json(data)
Example #30
    def get_playing_now(self, user_id):
        """ Return the current playing song of the user

            Arguments:
                user_id (int): the id of the user in the db

            Returns:
                Listen object which is the currently playing song of the user

        """
        data = self.redis.get('playing_now:{}'.format(user_id))
        if not data:
            return None
        data = ujson.loads(data)
        data.update({'playing_now': True})
        return Listen.from_json(data)
    def get_playing_now(self, user_id):
        """ Return the current playing song of the user

            Arguments:
                user_id (int): the id of the user in the db

            Returns:
                Listen object which is the currently playing song of the user

        """
        data = cache.get(self.PLAYING_NOW_KEY + str(user_id))
        if not data:
            return None
        data = ujson.loads(data)
        data.update({'playing_now': True})
        return Listen.from_json(data)
    def callback(self, ch, method, properties, body):

        listens = ujson.loads(body)
        non_null_listens = []

        for listen in listens:
            try:
                check_recursively_for_nulls(listen)
            except ValueError:
                # temporary to make sure fix is working
                current_app.logger.error(
                    "Found null byte in listen. Skipping!", exc_info=True)
                continue
            non_null_listens.append(listen)

        msb_listens = []
        for chunk in chunked(non_null_listens,
                             MAX_ITEMS_PER_MESSYBRAINZ_LOOKUP):
            msb_listens.extend(self.messybrainz_lookup(chunk))

        submit = []
        for listen in msb_listens:
            try:
                submit.append(Listen.from_json(listen))
            except ValueError:
                pass

        ret = self.insert_to_listenstore(submit)

        # If there is an error, we do not ack the message so that rabbitmq redelivers it later.
        if ret == LISTEN_INSERT_ERROR_SENTINEL:
            return ret

        while True:
            try:
                self.incoming_ch.basic_ack(delivery_tag=method.delivery_tag)
                break
            except pika.exceptions.ConnectionClosed:
                self.connect_to_rabbitmq()

        return ret
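
check_recursively_for_nulls is the validator called at the top of this callback (and exercised by test_from_json_null_values earlier); a minimal sketch of its assumed behaviour:

def check_recursively_for_nulls(value):
    # Walk the payload and reject any string containing a null byte.
    if isinstance(value, str):
        if '\x00' in value:
            raise ValueError('null byte found in listen')
    elif isinstance(value, dict):
        for k, v in value.items():
            check_recursively_for_nulls(k)
            check_recursively_for_nulls(v)
    elif isinstance(value, list):
        for item in value:
            check_recursively_for_nulls(item)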
Example #33
    def fetch_recent_listens_for_users(self, user_list, limit=2, max_age=3600):
        """ Fetch recent listens for a list of users, given a limit which applies per user. If you
            have a limit of 3 and 3 users you should get 9 listens if they are available.

            user_list: A list containing the users for which you'd like to retrieve recent listens.
            limit: the maximum number of listens for each user to fetch.
            max_age: Only return listens if they are no more than max_age seconds old. Default 3600 seconds
        """

        args = {
            'user_list': tuple(user_list),
            'ts': int(time.time()) - max_age,
            'limit': limit
        }
        query = """SELECT * FROM (
                              SELECT listened_at, track_name, user_name, created, data, recording_mbid, release_mbid, artist_mbids,
                                     row_number() OVER (partition by user_name ORDER BY listened_at DESC) AS rownum
                                FROM listen l
                     FULL OUTER JOIN listen_join_listen_mbid_mapping lj
                                  ON (data->'track_metadata'->'additional_info'->>'recording_msid')::uuid = lj.recording_msid
                     FULL OUTER JOIN listen_mbid_mapping m
                                  ON lj.listen_mbid_mapping = m.id
                               WHERE user_name IN :user_list
                                 AND listened_at > :ts
                            GROUP BY user_name, listened_at, track_name, created, data, recording_mbid, release_mbid, artist_mbids
                            ORDER BY listened_at DESC) tmp
                               WHERE rownum <= :limit"""

        listens = []
        with timescale.engine.connect() as connection:
            curs = connection.execute(sqlalchemy.text(query), args)
            while True:
                result = curs.fetchone()
                if not result:
                    break

                listens.append(Listen.from_timescale(*result[0:8]))

        return listens
def create_test_data_for_influxlistenstore(user_name):
    """Create listens for influxlistenstore tests.

    From a json file 'influx_listenstore_test_listens.json' in testdata
    it creates Listen objects with a specified user_name for tests.

    Args:
        user_name (str): MusicBrainz username of a user.

    Returns:
        A list of Listen objects.
    """
    test_data_file = os.path.join(TEST_DATA_PATH,
                                  'influx_listenstore_test_listens.json')
    with open(test_data_file, 'r') as f:
        listens = json.load(f)

    test_data = []
    for listen in listens['payload']:
        listen['user_name'] = user_name
        test_data.append(Listen().from_json(listen))

    return test_data
    def import_listens_dump(self, archive_path, threads=DUMP_DEFAULT_THREAD_COUNT):
        """ Imports listens into InfluxDB from a ListenBrainz listens dump .tar.xz archive.

        Args:
            archive_path (str): the path to the listens dump .tar.xz archive to be imported
            threads (int): the number of threads to be used for decompression
                           (defaults to DUMP_DEFAULT_THREAD_COUNT)

        Returns:
            int: the number of users for whom listens have been imported
        """

        self.log.info('Beginning import of listens from dump %s...', archive_path)

        # construct the pxz command to decompress the archive
        pxz_command = ['pxz', '--decompress', '--stdout', archive_path, '-T{threads}'.format(threads=threads)]

        # run the command once to ensure schema version is correct
        # and load the index
        pxz = subprocess.Popen(pxz_command, stdout=subprocess.PIPE)

        index = None
        with tarfile.open(fileobj=pxz.stdout, mode='r|') as tar:
            schema_check_done = False
            index_loaded = False
            for member in tar:
                file_name = member.name.split('/')[-1]
                if file_name == 'SCHEMA_SEQUENCE':
                    self.log.info('Checking if schema version of dump matches...')
                    schema_seq = int(tar.extractfile(member).read().strip())
                    if schema_seq != LISTENS_DUMP_SCHEMA_VERSION:
                        raise SchemaMismatchException('Incorrect schema version! Expected: %d, got: %d. '
                                        'Please ensure that the data dump version matches the code version '
                                        'in order to import the data.'
                                        % (LISTENS_DUMP_SCHEMA_VERSION, schema_seq))
                    schema_check_done = True

                elif file_name == 'index.json':
                    with tar.extractfile(member) as f:
                        index = ujson.load(f)
                    index_loaded = True

                if schema_check_done and index_loaded:
                    self.log.info('Schema version matched and index.json loaded!')
                    self.log.info('Starting import of listens...')
                    break
            else:
                raise SchemaMismatchException('Metadata files missing in dump, please ensure that the dump file is valid.')

        # close pxz command and start over again, this time with the aim of importing all listens
        pxz.stdout.close()

        file_contents = defaultdict(list)
        for user, info in index.items():
            file_contents[info['file_name']].append({
                'user_name': user,
                'offset': info['offset'],
                'size': info['size'],
            })

        for file_name in file_contents:
            file_contents[file_name] = sorted(file_contents[file_name], key=lambda x: x['offset'])

        pxz = subprocess.Popen(pxz_command, stdout=subprocess.PIPE)

        users_done = 0
        with tarfile.open(fileobj=pxz.stdout, mode='r|') as tar:
            for member in tar:
                file_name = member.name.split('/')[-1]
                if file_name.endswith('.listens'):

                    file_name = file_name[:-8]
                    with tar.extractfile(member) as f:
                        for user in file_contents[file_name]:
                            self.log.info('Importing user %s...', user['user_name'])
                            assert f.tell() == user['offset']
                            bytes_read = 0
                            listens = []
                            while bytes_read < user['size']:
                                line = f.readline()
                                bytes_read += len(line)
                                listen = Listen.from_json(ujson.loads(line)).to_influx(quote(user['user_name']))
                                listens.append(listen)

                                if len(listens) > DUMP_CHUNK_SIZE:
                                    self.write_points_to_db(listens)
                                    listens = []

                            if len(listens) > 0:
                                self.write_points_to_db(listens)

                            self.log.info('Import of user %s done!', user['user_name'])
                            users_done += 1

        self.log.info('Import of listens from dump %s done!', archive_path)
        pxz.stdout.close()
        return users_done
    def dump_user(self, username, fileobj, dump_time):
        """ Dump specified user's listens into specified file object.

        Args:
            username (str): the MusicBrainz ID of the user whose listens are to be dumped
            fileobj (file): the file into which listens should be written
            dump_time (datetime): the time at which the specific data dump was initiated

        Returns:
            int: the number of bytes this user's listens take in the dump file
        """
        t0 = time.time()
        offset = 0
        bytes_written = 0
        listen_count = 0

        # Get this user's listens in chunks
        while True:
            # loop until we get this chunk of listens
            while True:
                try:
                    result = self.influx.query("""
                        SELECT *
                          FROM {measurement}
                         WHERE time <= {timestamp}
                      ORDER BY time DESC
                         LIMIT {limit}
                        OFFSET {offset}
                    """.format(
                        measurement=get_escaped_measurement_name(username),
                        timestamp=get_influx_query_timestamp(dump_time.strftime('%s')),
                        limit=DUMP_CHUNK_SIZE,
                        offset=offset,
                    ))
                    break
                except Exception as e:
                    self.log.error('Error while getting listens to dump for user %s: %s', username, str(e), exc_info=True)
                    time.sleep(3)

            rows_added = 0
            for row in result.get_points(get_measurement_name(username)):
                listen = Listen.from_influx(row).to_api()
                listen['user_name'] = username
                try:
                    bytes_written += fileobj.write(ujson.dumps(listen))
                    bytes_written += fileobj.write('\n')
                    rows_added += 1
                except IOError as e:
                    self.log.critical('IOError while writing listens into file for user %s', username, exc_info=True)
                    raise
                except Exception as e:
                    self.log.error('Exception while creating json for user %s: %s', username, str(e), exc_info=True)
                    raise

            listen_count += rows_added
            if not rows_added:
                break

            offset += DUMP_CHUNK_SIZE

        time_taken = time.time() - t0
        self.log.info('Listens for user %s dumped, total %d listens written at %.2f listens / sec!',
            username, listen_count, listen_count / time_taken)

        # the size for this user should not include the last newline we wrote
        # hence return bytes_written - 1 as the size in the dump for this user
        return bytes_written - 1