def fetch_recent_listens_for_users(self, user_list, limit=2, max_age=3600):
        """ Fetch recent listens for a list of users, given a limit which applies per user. If you
            have a limit of 3 and 3 users you should get 9 listens if they are available.

            user_list: A list containing the users for which you'd like to retrieve recent listens.
            limit: the maximum number of listens for each user to fetch.
            max_age: Only return listens if they are no more than max_age seconds old. Default 3600 seconds
        """

        args = {'user_list': tuple(user_list), 'ts': int(time.time()) - max_age, 'limit': limit}
        query = """SELECT * FROM (
                              SELECT listened_at, track_name, user_name, created, data,
                                     row_number() OVER (partition by user_name ORDER BY listened_at DESC) AS rownum
                                FROM listen
                               WHERE user_name IN :user_list
                                 AND listened_at > :ts
                            GROUP BY user_name, listened_at, track_name, created, data
                            ORDER BY listened_at DESC) tmp
                           WHERE rownum <= :limit"""

        listens = []
        with timescale.engine.connect() as connection:
            curs = connection.execute(sqlalchemy.text(query), args)
            while True:
                result = curs.fetchone()
                if not result:
                    break

                listens.append(Listen.from_timescale(result[0], result[1], result[2], result[3], result[4]))

        return listens
Exemple #2
0
    def fetch_listens_for_multiple_users_from_storage(self, user_names: List[str], from_ts: float, to_ts: float, limit: int, order: int, time_range: int=3):
        """ The timestamps are stored as UTC in the postgres datebase while on retrieving
            the value they are converted to the local server's timezone. So to compare
            datetime object we need to create a object in the same timezone as the server.

            from_ts: seconds since epoch, in float
            to_ts: seconds since epoch, in float
            limit: the maximum number of items to return
            order: 0 for ASCending order, 1 for DESCending order
            time_range: the time range (in units of 5 days) to search for listens. If none is given
                        3 ranges (15 days) are searched. If -1 is given then all listens are searched
                        which is slow and should be avoided if at all possible.
        """

        if time_range is None:
            time_range = 3

        if time_range < 0:
            max_timestamp_window = -1
        else:
            max_timestamp_window = SECONDS_IN_TIME_RANGE * time_range
            if to_ts is None:
                to_ts = from_ts + max_timestamp_window
            elif from_ts is None:
                from_ts = to_ts - max_timestamp_window

        query = """SELECT listened_at, track_name, created, data, user_name
                     FROM listen
                    WHERE user_name IN :user_names """


        if max_timestamp_window < 0:
            if from_ts and to_ts:
                query += """AND listened_at > :from_ts
                            AND listened_at < :to_ts """
            elif from_ts is not None:
                query += "AND listened_at > :from_ts "
            else:
                query += "AND listened_at < :to_ts "
        else:
            query += """AND listened_at > :from_ts
                        AND listened_at < :to_ts """

        query += "ORDER BY listened_at " + ORDER_TEXT[order] + " LIMIT :limit"

        listens = []
        with timescale.engine.connect() as connection:
            curs = connection.execute(sqlalchemy.text(query), user_names=tuple(user_names), from_ts=from_ts, to_ts=to_ts, limit=limit)
            while True:
                result = curs.fetchone()
                if not result:
                    break

                listens.append(Listen.from_timescale(result[0], result[1], result[4], result[2], result[3]))

        if order == ORDER_ASC:
            listens.reverse()

        return listens
Exemple #3
0
    def write_incremental_listens(self, start_time, end_time, temp_dir):
        """ Dump listens in the format for the ListenBrainz dump.

        Args:
            start_time and end_time (datetime): the range of time for the listens dump.
            temp_dir (str): the dir to use to write files before adding to archive
        """
        t0 = time.monotonic()
        offset = 0
        listen_count = 0

        unwritten_listens = {}

        while True:
            query, args = self.get_incremental_listens_query_batch(
                start_time, end_time, offset)
            rows_added = 0
            with timescale.engine.connect() as connection:
                curs = connection.execute(sqlalchemy.text(query), args)
                while True:
                    result = curs.fetchone()
                    if not result:
                        break

                    listen = Listen.from_timescale(result[0], result[1],
                                                   result[2], result[3],
                                                   result[4]).to_json()
                    timestamp = listen['timestamp']

                    if timestamp.year not in unwritten_listens:
                        unwritten_listens[timestamp.year] = {}
                    if timestamp.month not in unwritten_listens[
                            timestamp.year]:
                        unwritten_listens[timestamp.year][timestamp.month] = []

                    unwritten_listens[timestamp.year][timestamp.month].append(
                        listen)
                    rows_added += 1

            if rows_added == 0:
                break

            listen_count += rows_added
            offset += DUMP_CHUNK_SIZE

        self.write_incremental_listens_to_disk(unwritten_listens, temp_dir)
        self.log.info("%d listens dumped at %.2f listens / sec", listen_count,
                      listen_count / (time.monotonic() - t0))
Exemple #4
0
    def fetch_recent_listens_for_users(self, user_list, limit=2, max_age=3600):
        """ Fetch recent listens for a list of users, given a limit which applies per user. If you
            have a limit of 3 and 3 users you should get 9 listens if they are available.

            user_list: A list containing the users for which you'd like to retrieve recent listens.
            limit: the maximum number of listens for each user to fetch.
            max_age: Only return listens if they are no more than max_age seconds old. Default 3600 seconds
        """

        args = {
            'user_list': tuple(user_list),
            'ts': int(time.time()) - max_age,
            'limit': limit
        }
        query = """SELECT * FROM (
                              SELECT listened_at, track_name, user_name, created, data, recording_mbid, release_mbid, artist_mbids,
                                     row_number() OVER (partition by user_name ORDER BY listened_at DESC) AS rownum
                                FROM listen l
                     FULL OUTER JOIN listen_join_listen_mbid_mapping lj
                                  ON (data->'track_metadata'->'additional_info'->>'recording_msid')::uuid = lj.recording_msid
                     FULL OUTER JOIN listen_mbid_mapping m
                                  ON lj.listen_mbid_mapping = m.id
                               WHERE user_name IN :user_list
                                 AND listened_at > :ts
                            GROUP BY user_name, listened_at, track_name, created, data, recording_mbid, release_mbid, artist_mbids
                            ORDER BY listened_at DESC) tmp
                               WHERE rownum <= :limit"""

        listens = []
        with timescale.engine.connect() as connection:
            curs = connection.execute(sqlalchemy.text(query), args)
            while True:
                result = curs.fetchone()
                if not result:
                    break

                listens.append(Listen.from_timescale(*result[0:8]))

        return listens
Exemple #5
0
    def write_listens(self,
                      temp_dir,
                      tar_file,
                      archive_name,
                      start_time_range=None,
                      end_time_range=None,
                      full_dump=True):
        """ Dump listens in the format for the ListenBrainz dump.

        Args:
            end_time_range (datetime): the range of time for the listens dump.
            temp_dir (str): the dir to use to write files before adding to archive
            full_dump (bool): the type of dump
        """
        t0 = time.monotonic()
        listen_count = 0

        # This right here is why we should ONLY be using seconds timestamps. Someone could
        # pass in a timezone aware timestamp (when listens have no timezones) or one without.
        # If you pass the wrong one and a test invokes a command line any failures are
        # invisible causing massive hair-pulling. F**K DATETIME.
        if start_time_range:
            start_time_range = datetime.utcfromtimestamp(
                datetime.timestamp(start_time_range))
        if end_time_range:
            end_time_range = datetime.utcfromtimestamp(
                datetime.timestamp(end_time_range))

        year = start_time_range.year
        month = start_time_range.month
        while True:
            start_time = datetime(year, month, 1)
            start_time = max(start_time_range, start_time)
            if start_time > end_time_range:
                break

            next_month = month + 1
            next_year = year
            if next_month > 12:
                next_month = 1
                next_year += 1

            end_time = datetime(next_year, next_month, 1)
            end_time = end_time - timedelta(seconds=1)
            if end_time > end_time_range:
                end_time = end_time_range

            filename = os.path.join(temp_dir, str(year), "%d.listens" % month)
            try:
                os.makedirs(os.path.join(temp_dir, str(year)))
            except FileExistsError:
                pass

            query, args = None, None
            if full_dump:
                query, args = self.get_listens_query_for_dump(
                    int(start_time.strftime('%s')),
                    int(end_time.strftime('%s')))
            else:
                query, args = self.get_incremental_listens_query(
                    start_time, end_time)

            rows_added = 0
            with timescale.engine.connect() as connection:
                curs = connection.execute(sqlalchemy.text(query), args)
                if curs.rowcount:
                    with open(filename, "w") as out_file:
                        while True:
                            result = curs.fetchone()
                            if not result:
                                break

                            listen = Listen.from_timescale(
                                result[0], result[1], result[2], result[3],
                                result[4]).to_json()
                            out_file.write(ujson.dumps(listen) + "\n")
                            rows_added += 1
                    tar_file.add(filename,
                                 arcname=os.path.join(archive_name, 'listens',
                                                      str(year),
                                                      "%d.listens" % month))

                    listen_count += rows_added
                    self.log.info("%d listens dumped for %s at %.2f listens/s",
                                  listen_count,
                                  start_time.strftime("%Y-%m-%d"),
                                  listen_count / (time.monotonic() - t0))

            month = next_month
            year = next_year
            rows_added = 0
Exemple #6
0
    def test_from_timescale(self):
        """ Test for the from_timescale method """

        timescale_row = {
            "listened_at": 1525557084,
            "created": 1525557084,
            "track_name": "Every Step Every Way",
            "user_name": "iliekcomputers",
            "data": {
                'track_metadata': {
                    "artist_name": "Majid Jordan",
                    "release_name": "Majid Jordan",
                    'additional_info': {
                        "artist_msid": "aa6130f2-a12d-47f3-8ffd-d0f71340de1f",
                        "artist_mbids": ["abaa7001-0d80-4e58-be5d-d2d246fd9d87"],
                        "release_msid": "cf138a00-05d5-4b35-8fce-181efcc15785",
                        'release_mbid': '8294645a-f996-44b6-9060-7f189b9f59f3',
                        "recording_mbid": None,
                        "recording_msid": "db9a7483-a8f4-4a2c-99af-c8ab58850200",
                        "tags": ["sing, song"],
                        "best_song": "definitely",
                        "genius_link": "https://genius.com/Majid-jordan-every-step-every-way-lyrics",
                        "lastfm_link": "https://www.last.fm/music/Majid+Jordan/_/Every+Step+Every+Way",
                        "other_stuff": "teststuffplsignore",
                        "we_dict_now.hello": "afb",
                        "we_dict_now.we_nested_now.hi": "312"
                    }
                },
                "user_id": 1
            }
        }

        listen = Listen.from_timescale(timescale_row['listened_at'],
                                       timescale_row['track_name'],
                                       timescale_row['user_name'],
                                       timescale_row['created'],
                                       timescale_row['data'])

        # Check user name
        self.assertEqual(listen.user_name, timescale_row['user_name'])

        # Check time stamp
        ts = timescale_row['listened_at']
        self.assertEqual(listen.ts_since_epoch, ts)

        # Check artist mbids
        self.assertEqual(listen.data['additional_info']['artist_mbids'],
                         timescale_row['data']['track_metadata']['additional_info']['artist_mbids'])

        # Check tags
        self.assertEqual(listen.data['additional_info']['tags'], timescale_row['data']
                         ['track_metadata']['additional_info']['tags'])

        # Check track name
        self.assertEqual(listen.data['track_name'], timescale_row['data']
                         ['track_metadata']['track_name'])

        # Check additional info
        self.assertEqual(listen.data['additional_info']['best_song'],
                         timescale_row['data']['track_metadata']['additional_info']['best_song'])

        # Check msids
        self.assertEqual(
            listen.artist_msid, timescale_row['data']['track_metadata']['additional_info']['artist_msid'])
        self.assertEqual(
            listen.release_msid, timescale_row['data']['track_metadata']['additional_info']['release_msid'])
        self.assertEqual(listen.data['track_name'], timescale_row['track_name'])

        # make sure additional info does not contain stuff like artist names, track names
        self.assertNotIn('track_name', listen.data['additional_info'])
        self.assertNotIn('artist_name', listen.data['additional_info'])
        self.assertNotIn('release_name', listen.data['additional_info'])
    def fetch_listens_for_multiple_users_from_storage(self,
                                                      user_names: List[str],
                                                      from_ts: float,
                                                      to_ts: float, limit: int,
                                                      order: int):
        """ The timestamps are stored as UTC in the postgres datebase while on retrieving
            the value they are converted to the local server's timezone. So to compare
            datetime object we need to create a object in the same timezone as the server.

            If neither from_ts nor to_ts is provided, the latest listens for the user are returned.
            Returns a tuple of (listens, min_user_timestamp, max_user_timestamp)

            from_ts: seconds since epoch, in float
            to_ts: seconds since epoch, in float
            limit: the maximum number of items to return
            order: 0 for DESCending order, 1 for ASCending order
        """

        min_user_ts = max_user_ts = None
        for user_name in user_names:
            min_ts, max_ts = self.get_timestamps_for_user(user_name)
            min_user_ts = min(min_ts, min_user_ts or min_ts)
            max_user_ts = max(max_ts, max_user_ts or max_ts)

        if to_ts is None and from_ts is None:
            to_ts = max_user_ts + 1

        if min_user_ts == 0 and max_user_ts == 0:
            return ([], min_user_ts, max_user_ts)

        window_size = DEFAULT_FETCH_WINDOW
        query = """SELECT listened_at, track_name, user_name, created, data, mm.recording_mbid, release_mbid, artist_mbids
                     FROM listen
          FULL OUTER JOIN mbid_mapping mm
                       ON (data->'track_metadata'->'additional_info'->>'recording_msid')::uuid = mm.recording_msid
          FULL OUTER JOIN mbid_mapping_metadata m
                       ON mm.recording_mbid = m.recording_mbid
                    WHERE user_name IN :user_names
                      AND listened_at > :from_ts
                      AND listened_at < :to_ts
                 ORDER BY listened_at """ + ORDER_TEXT[order] + " LIMIT :limit"

        if from_ts and to_ts:
            to_dynamic = False
            from_dynamic = False
        elif from_ts is not None:
            to_ts = from_ts + window_size
            to_dynamic = True
            from_dynamic = False
        else:
            from_ts = to_ts - window_size
            to_dynamic = False
            from_dynamic = True

        listens = []
        done = False
        with timescale.engine.connect() as connection:
            t0 = time.monotonic()

            passes = 0
            while True:
                passes += 1

                # Oh shit valve. I'm keeping it here for the time being. :)
                if passes == 10:
                    done = True
                    break

                curs = connection.execute(sqlalchemy.text(query),
                                          user_names=tuple(user_names),
                                          from_ts=from_ts,
                                          to_ts=to_ts,
                                          limit=limit)
                while True:
                    result = curs.fetchone()
                    if not result:
                        if not to_dynamic and not from_dynamic:
                            done = True
                            break

                        if from_ts < min_user_ts - 1:
                            done = True
                            break

                        if to_ts > int(
                                time.time()) + ListenStore.MAX_FUTURE_SECONDS:
                            done = True
                            break

                        if to_dynamic:
                            from_ts += window_size - 1
                            window_size *= WINDOW_SIZE_MULTIPLIER
                            to_ts += window_size

                        if from_dynamic:
                            to_ts -= window_size
                            window_size *= WINDOW_SIZE_MULTIPLIER
                            from_ts -= window_size

                        break

                    listens.append(Listen.from_timescale(*result))
                    if len(listens) == limit:
                        done = True
                        break

                if done:
                    break

            fetch_listens_time = time.monotonic() - t0

        if order == ORDER_ASC:
            listens.reverse()

        self.log.info("fetch listens %s %.2fs (%d passes)" %
                      (str(user_names), fetch_listens_time, passes))

        return (listens, min_user_ts, max_user_ts)
    def fetch_recent_listens_for_users(self,
                                       users,
                                       min_ts: int = None,
                                       max_ts: int = None,
                                       per_user_limit=2,
                                       limit=10):
        """ Fetch recent listens for a list of users, given a limit which applies per user. If you
            have a limit of 3 and 3 users you should get 9 listens if they are available.

            user_ids: A list containing the users for which you'd like to retrieve recent listens.
            min_ts: Only return listens with listened_at after this timestamp
            max_ts: Only return listens with listened_at before this timestamp
            per_user_limit: the maximum number of listens for each user to fetch
            limit: the maximum number of listens overall to fetch
        """
        user_id_map = {user["id"]: user["musicbrainz_id"] for user in users}

        filters_list = ["user_id IN :user_ids"]
        args = {
            "user_ids": tuple(user_id_map.keys()),
            "per_user_limit": per_user_limit,
            "limit": limit
        }
        if min_ts:
            filters_list.append("listened_at > :min_ts")
            args["min_ts"] = min_ts
        if max_ts:
            filters_list.append("listened_at < :max_ts")
            args["max_ts"] = max_ts
        filters = " AND ".join(filters_list)

        query = f"""SELECT * FROM (
                              SELECT listened_at, track_name, user_id, created, data, mm.recording_mbid, release_mbid, artist_mbids,
                                     row_number() OVER (partition by user_id ORDER BY listened_at DESC) AS rownum
                                FROM listen l
                     FULL OUTER JOIN mbid_mapping m
                                  ON (data->'track_metadata'->'additional_info'->>'recording_msid')::uuid = m.recording_msid
                     FULL OUTER JOIN mbid_mapping_metadata mm
                                  ON mm.recording_mbid = m.recording_mbid
                               WHERE {filters}
                            GROUP BY user_id, listened_at, track_name, created, data, mm.recording_mbid, release_mbid, artist_mbids
                            ORDER BY listened_at DESC) tmp
                               WHERE rownum <= :per_user_limit
                            ORDER BY listened_at DESC   
                               LIMIT :limit"""

        listens = []
        with timescale.engine.connect() as connection:
            curs = connection.execute(sqlalchemy.text(query), args)
            while True:
                result = curs.fetchone()
                if not result:
                    break
                user_name = user_id_map[result["user_id"]]
                listens.append(
                    Listen.from_timescale(
                        listened_at=result["listened_at"],
                        track_name=result["track_name"],
                        user_id=result["user_id"],
                        created=result["created"],
                        data=result["data"],
                        recording_mbid=result["recording_mbid"],
                        release_mbid=result["release_mbid"],
                        artist_mbids=result["artist_mbids"],
                        user_name=user_name))
        return listens
    def fetch_listens(self,
                      user: Dict,
                      from_ts: int = None,
                      to_ts: int = None,
                      limit: int = DEFAULT_LISTENS_PER_FETCH):
        """ The timestamps are stored as UTC in the postgres datebase while on retrieving
            the value they are converted to the local server's timezone. So to compare
            datetime object we need to create a object in the same timezone as the server.

            If neither from_ts nor to_ts is provided, the latest listens for the user are returned.
            Returns a tuple of (listens, min_user_timestamp, max_user_timestamp)

            from_ts: seconds since epoch, in float. if specified, listens will be returned in ascending order. otherwise
                listens will be returned in descending order
            to_ts: seconds since epoch, in float
            limit: the maximum number of items to return
        """
        if from_ts and to_ts and from_ts >= to_ts:
            raise ValueError("from_ts should be less than to_ts")
        if from_ts:
            order = ORDER_ASC
        else:
            order = ORDER_DESC

        min_user_ts, max_user_ts = self.get_timestamps_for_user(user["id"])

        if min_user_ts == 0 and max_user_ts == 0:
            return [], min_user_ts, max_user_ts

        if to_ts is None and from_ts is None:
            to_ts = max_user_ts + 1

        window_size = DEFAULT_FETCH_WINDOW
        query = """SELECT listened_at, track_name, user_id, created, data, mm.recording_mbid, release_mbid, artist_mbids
                     FROM listen
          FULL OUTER JOIN mbid_mapping mm
                       ON (data->'track_metadata'->'additional_info'->>'recording_msid')::uuid = mm.recording_msid
          FULL OUTER JOIN mbid_mapping_metadata m
                       ON mm.recording_mbid = m.recording_mbid
                    WHERE user_id = :user_id
                      AND listened_at > :from_ts
                      AND listened_at < :to_ts
                 ORDER BY listened_at """ + ORDER_TEXT[order] + " LIMIT :limit"

        if from_ts and to_ts:
            to_dynamic = False
            from_dynamic = False
        elif from_ts is not None:
            to_ts = from_ts + window_size
            to_dynamic = True
            from_dynamic = False
        else:
            from_ts = to_ts - window_size
            to_dynamic = False
            from_dynamic = True

        listens = []
        done = False
        with timescale.engine.connect() as connection:
            t0 = time.monotonic()

            passes = 0
            while True:
                passes += 1

                # Oh shit valve. I'm keeping it here for the time being. :)
                if passes == 10:
                    done = True
                    break

                curs = connection.execute(sqlalchemy.text(query),
                                          user_id=user["id"],
                                          from_ts=from_ts,
                                          to_ts=to_ts,
                                          limit=limit)
                while True:
                    result = curs.fetchone()
                    if not result:
                        if not to_dynamic and not from_dynamic:
                            done = True
                            break

                        if from_ts < min_user_ts - 1:
                            done = True
                            break

                        if to_ts > int(time.time()) + MAX_FUTURE_SECONDS:
                            done = True
                            break

                        if to_dynamic:
                            from_ts += window_size - 1
                            window_size *= WINDOW_SIZE_MULTIPLIER
                            to_ts += window_size

                        if from_dynamic:
                            to_ts -= window_size
                            window_size *= WINDOW_SIZE_MULTIPLIER
                            from_ts -= window_size

                        break

                    listens.append(
                        Listen.from_timescale(
                            listened_at=result["listened_at"],
                            track_name=result["track_name"],
                            user_id=result["user_id"],
                            created=result["created"],
                            data=result["data"],
                            recording_mbid=result["recording_mbid"],
                            release_mbid=result["release_mbid"],
                            artist_mbids=result["artist_mbids"],
                            user_name=user["musicbrainz_id"]))

                    if len(listens) == limit:
                        done = True
                        break

                if done:
                    break

            fetch_listens_time = time.monotonic() - t0

        if order == ORDER_ASC:
            listens.reverse()

        self.log.info("fetch listens %s %.2fs (%d passes)" %
                      (user["musicbrainz_id"], fetch_listens_time, passes))

        return listens, min_user_ts, max_user_ts