def fetch_recent_listens_for_users(self, user_list, limit=2, max_age=3600):
    """ Fetch the most recent listens for each user in user_list.

        The limit applies per user: with a limit of 3 and 3 users, up to
        9 listens are returned if that many are available.

        user_list: the users whose recent listens should be retrieved.
        limit: maximum number of listens to fetch per user.
        max_age: only listens at most max_age seconds old are returned.
                 Default 3600 seconds.
    """
    cutoff_ts = int(time.time()) - max_age
    params = {'user_list': tuple(user_list), 'ts': cutoff_ts, 'limit': limit}
    query = """SELECT * FROM (
                      SELECT listened_at, track_name, user_name, created, data,
                             row_number() OVER (partition by user_name ORDER BY listened_at DESC) AS rownum
                        FROM listen
                       WHERE user_name IN :user_list
                         AND listened_at > :ts
                    GROUP BY user_name, listened_at, track_name, created, data
                    ORDER BY listened_at DESC) tmp
               WHERE rownum <= :limit"""

    recent = []
    with timescale.engine.connect() as connection:
        cursor = connection.execute(sqlalchemy.text(query), params)
        for row in cursor.fetchall():
            recent.append(Listen.from_timescale(row[0], row[1], row[2], row[3], row[4]))

    return recent
def fetch_listens_for_multiple_users_from_storage(self, user_names: List[str], from_ts: float,
                                                  to_ts: float, limit: int, order: int,
                                                  time_range: int = 3):
    """ Fetch listens for the given users inside a bounded (or unbounded) time window.

        The timestamps are stored as UTC in the postgres datebase while on
        retrieving the value they are converted to the local server's timezone.
        So to compare datetime object we need to create a object in the same
        timezone as the server.

        from_ts: seconds since epoch, in float
        to_ts: seconds since epoch, in float
        limit: the maximum number of items to return
        order: 0 for ASCending order, 1 for DESCending order
        time_range: the time range (in units of 5 days) to search for listens. If none is given
                    3 ranges (15 days) are searched. If -1 is given then all listens are searched
                    which is slow and should be avoided if at all possible.
    """
    if time_range is None:
        time_range = 3

    if time_range < 0:
        # Unbounded search: do NOT derive a window endpoint. (Previously the
        # missing endpoint was computed as from_ts + (-1), which made
        # to_ts < from_ts and always returned an empty result set.)
        max_timestamp_window = -1
    else:
        max_timestamp_window = SECONDS_IN_TIME_RANGE * time_range
        # Fill in whichever endpoint is missing from the bounded window.
        if to_ts is None:
            to_ts = from_ts + max_timestamp_window
        elif from_ts is None:
            from_ts = to_ts - max_timestamp_window

    query = """SELECT listened_at, track_name, created, data, user_name
                 FROM listen
                WHERE user_name IN :user_names """

    if max_timestamp_window < 0:
        # Compare against None explicitly: an epoch timestamp of 0 is a
        # legitimate bound and must not be treated as "missing".
        if from_ts is not None and to_ts is not None:
            query += """AND listened_at > :from_ts
                        AND listened_at < :to_ts """
        elif from_ts is not None:
            query += "AND listened_at > :from_ts "
        elif to_ts is not None:
            query += "AND listened_at < :to_ts "
    else:
        query += """AND listened_at > :from_ts
                    AND listened_at < :to_ts """
    query += "ORDER BY listened_at " + ORDER_TEXT[order] + " LIMIT :limit"

    listens = []
    with timescale.engine.connect() as connection:
        curs = connection.execute(sqlalchemy.text(query), user_names=tuple(user_names),
                                  from_ts=from_ts, to_ts=to_ts, limit=limit)
        while True:
            result = curs.fetchone()
            if not result:
                break

            # Column order in the query is (listened_at, track_name, created,
            # data, user_name); from_timescale expects user_name third.
            listens.append(Listen.from_timescale(result[0], result[1], result[4],
                                                 result[2], result[3]))
    if order == ORDER_ASC:
        listens.reverse()
    return listens
def write_incremental_listens(self, start_time, end_time, temp_dir):
    """ Dump listens in the ListenBrainz dump format.

        Args:
            start_time and end_time (datetime): the range of time for the listens dump.
            temp_dir (str): the dir to use to write files before adding to archive
    """
    dump_start = time.monotonic()
    offset = 0
    total_rows = 0
    # Listens grouped as {year: {month: [listen-json, ...]}} before being
    # flushed to disk in one pass at the end.
    pending = {}

    while True:
        query, args = self.get_incremental_listens_query_batch(
            start_time, end_time, offset)
        batch_rows = 0
        with timescale.engine.connect() as connection:
            curs = connection.execute(sqlalchemy.text(query), args)
            for row in curs.fetchall():
                listen = Listen.from_timescale(row[0], row[1], row[2],
                                               row[3], row[4]).to_json()
                ts = listen['timestamp']
                pending.setdefault(ts.year, {}).setdefault(ts.month, []).append(listen)
                batch_rows += 1

        # An empty batch means the query range is exhausted.
        if batch_rows == 0:
            break

        total_rows += batch_rows
        offset += DUMP_CHUNK_SIZE

    self.write_incremental_listens_to_disk(pending, temp_dir)
    self.log.info("%d listens dumped at %.2f listens / sec", total_rows,
                  total_rows / (time.monotonic() - dump_start))
def fetch_recent_listens_for_users(self, user_list, limit=2, max_age=3600):
    """ Fetch the most recent listens for each user in user_list, joined
        against the MBID mapping tables.

        The limit applies per user: with a limit of 3 and 3 users, up to
        9 listens are returned if that many are available.

        user_list: the users whose recent listens should be retrieved.
        limit: maximum number of listens to fetch per user.
        max_age: only listens at most max_age seconds old are returned.
                 Default 3600 seconds.
    """
    cutoff_ts = int(time.time()) - max_age
    params = {
        'user_list': tuple(user_list),
        'ts': cutoff_ts,
        'limit': limit
    }
    query = """SELECT * FROM (
                      SELECT listened_at, track_name, user_name, created, data,
                             recording_mbid, release_mbid, artist_mbids,
                             row_number() OVER (partition by user_name ORDER BY listened_at DESC) AS rownum
                        FROM listen l
             FULL OUTER JOIN listen_join_listen_mbid_mapping lj
                          ON (data->'track_metadata'->'additional_info'->>'recording_msid')::uuid = lj.recording_msid
             FULL OUTER JOIN listen_mbid_mapping m
                          ON lj.listen_mbid_mapping = m.id
                       WHERE user_name IN :user_list
                         AND listened_at > :ts
                    GROUP BY user_name, listened_at, track_name, created, data,
                             recording_mbid, release_mbid, artist_mbids
                    ORDER BY listened_at DESC) tmp
               WHERE rownum <= :limit"""

    recent = []
    with timescale.engine.connect() as connection:
        cursor = connection.execute(sqlalchemy.text(query), params)
        # The first 8 selected columns map positionally onto from_timescale.
        for row in cursor.fetchall():
            recent.append(Listen.from_timescale(*row[0:8]))

    return recent
def write_listens(self, temp_dir, tar_file, archive_name,
                  start_time_range=None, end_time_range=None, full_dump=True):
    """ Dump listens in the format for the ListenBrainz dump.

        Listens are written one calendar month at a time into
        <temp_dir>/<year>/<month>.listens files which are then added to the
        given tar archive.

        Args:
            temp_dir (str): the dir to use to write files before adding to archive
            tar_file: open tar archive the per-month listen files are added to
            archive_name (str): top-level directory name used inside the archive
            start_time_range and end_time_range (datetime): the range of time for
                the listens dump.
            full_dump (bool): the type of dump
    """
    t0 = time.monotonic()
    listen_count = 0

    # This right here is why we should ONLY be using seconds timestamps. Someone could
    # pass in a timezone aware timestamp (when listens have no timezones) or one without.
    # If you pass the wrong one and a test invokes a command line any failures are
    # invisible causing massive hair-pulling. F**K DATETIME.
    # Round-tripping through timestamp()/utcfromtimestamp() normalizes both
    # aware and naive inputs to naive UTC datetimes.
    if start_time_range:
        start_time_range = datetime.utcfromtimestamp(
            datetime.timestamp(start_time_range))
    if end_time_range:
        end_time_range = datetime.utcfromtimestamp(
            datetime.timestamp(end_time_range))

    year = start_time_range.year
    month = start_time_range.month
    while True:
        # Clamp the start of this month's window to the requested range.
        start_time = datetime(year, month, 1)
        start_time = max(start_time_range, start_time)
        if start_time > end_time_range:
            break

        next_month = month + 1
        next_year = year
        if next_month > 12:
            next_month = 1
            next_year += 1

        # End of the window: one second before the first instant of next month,
        # clamped to the requested range.
        end_time = datetime(next_year, next_month, 1)
        end_time = end_time - timedelta(seconds=1)
        if end_time > end_time_range:
            end_time = end_time_range

        filename = os.path.join(temp_dir, str(year), "%d.listens" % month)
        try:
            os.makedirs(os.path.join(temp_dir, str(year)))
        except FileExistsError:
            pass

        query, args = None, None
        if full_dump:
            # strftime('%s') converts the window edges to epoch seconds.
            query, args = self.get_listens_query_for_dump(
                int(start_time.strftime('%s')),
                int(end_time.strftime('%s')))
        else:
            query, args = self.get_incremental_listens_query(
                start_time, end_time)

        rows_added = 0
        with timescale.engine.connect() as connection:
            curs = connection.execute(sqlalchemy.text(query), args)
            if curs.rowcount:
                with open(filename, "w") as out_file:
                    while True:
                        result = curs.fetchone()
                        if not result:
                            break
                        listen = Listen.from_timescale(
                            result[0], result[1],
                            result[2], result[3], result[4]).to_json()
                        out_file.write(ujson.dumps(listen) + "\n")
                        rows_added += 1
                # Only add the file when the month actually had listens;
                # otherwise the file was never created.
                tar_file.add(filename,
                             arcname=os.path.join(archive_name, 'listens', str(year),
                                                  "%d.listens" % month))
                listen_count += rows_added
                self.log.info("%d listens dumped for %s at %.2f listens/s",
                              listen_count, start_time.strftime("%Y-%m-%d"),
                              listen_count / (time.monotonic() - t0))

        month = next_month
        year = next_year
        rows_added = 0
def test_from_timescale(self):
    """ Test for the from_timescale method.

        Builds a representative timescale row and checks that every piece of
        the row ends up in the right place on the resulting Listen object.
    """
    timescale_row = {
        "listened_at": 1525557084,
        "created": 1525557084,
        "track_name": "Every Step Every Way",
        "user_name": "iliekcomputers",
        "data": {
            'track_metadata': {
                "artist_name": "Majid Jordan",
                "release_name": "Majid Jordan",
                'additional_info': {
                    "artist_msid": "aa6130f2-a12d-47f3-8ffd-d0f71340de1f",
                    "artist_mbids": ["abaa7001-0d80-4e58-be5d-d2d246fd9d87"],
                    "release_msid": "cf138a00-05d5-4b35-8fce-181efcc15785",
                    'release_mbid': '8294645a-f996-44b6-9060-7f189b9f59f3',
                    "recording_mbid": None,
                    "recording_msid": "db9a7483-a8f4-4a2c-99af-c8ab58850200",
                    "tags": ["sing, song"],
                    "best_song": "definitely",
                    "genius_link": "https://genius.com/Majid-jordan-every-step-every-way-lyrics",
                    "lastfm_link": "https://www.last.fm/music/Majid+Jordan/_/Every+Step+Every+Way",
                    "other_stuff": "teststuffplsignore",
                    "we_dict_now.hello": "afb",
                    "we_dict_now.we_nested_now.hi": "312"
                }
            },
            "user_id": 1
        }
    }
    listen = Listen.from_timescale(timescale_row['listened_at'],
                                   timescale_row['track_name'],
                                   timescale_row['user_name'],
                                   timescale_row['created'],
                                   timescale_row['data'])

    # Check user name
    self.assertEqual(listen.user_name, timescale_row['user_name'])

    # Check time stamp
    ts = timescale_row['listened_at']
    self.assertEqual(listen.ts_since_epoch, ts)

    # Check artist mbids
    self.assertEqual(listen.data['additional_info']['artist_mbids'],
                     timescale_row['data']['track_metadata']['additional_info']['artist_mbids'])

    # Check tags
    self.assertEqual(listen.data['additional_info']['tags'],
                     timescale_row['data']['track_metadata']['additional_info']['tags'])

    # Check track name. The fixture stores track_name at the top level of the
    # row, not inside track_metadata — indexing
    # timescale_row['data']['track_metadata']['track_name'] would KeyError.
    self.assertEqual(listen.data['track_name'], timescale_row['track_name'])

    # Check additional info
    self.assertEqual(listen.data['additional_info']['best_song'],
                     timescale_row['data']['track_metadata']['additional_info']['best_song'])

    # Check msids
    self.assertEqual(listen.artist_msid,
                     timescale_row['data']['track_metadata']['additional_info']['artist_msid'])
    self.assertEqual(listen.release_msid,
                     timescale_row['data']['track_metadata']['additional_info']['release_msid'])
    self.assertEqual(listen.data['track_name'], timescale_row['track_name'])

    # make sure additional info does not contain stuff like artist names, track names
    self.assertNotIn('track_name', listen.data['additional_info'])
    self.assertNotIn('artist_name', listen.data['additional_info'])
    self.assertNotIn('release_name', listen.data['additional_info'])
def fetch_listens_for_multiple_users_from_storage(self, user_names: List[str], from_ts: float,
                                                  to_ts: float, limit: int, order: int):
    """ Fetch listens for several users, expanding the search window in passes
        until the limit is met or the users' listen range is exhausted.

        The timestamps are stored as UTC in the postgres datebase while on
        retrieving the value they are converted to the local server's timezone.
        So to compare datetime object we need to create a object in the same
        timezone as the server.

        If neither from_ts nor to_ts is provided, the latest listens for the
        user are returned.
        Returns a tuple of (listens, min_user_timestamp, max_user_timestamp)

        from_ts: seconds since epoch, in float
        to_ts: seconds since epoch, in float
        limit: the maximum number of items to return
        order: 0 for DESCending order, 1 for ASCending order
    """
    # Aggregate the min/max listened_at across all requested users.
    min_user_ts = max_user_ts = None
    for user_name in user_names:
        min_ts, max_ts = self.get_timestamps_for_user(user_name)
        min_user_ts = min(min_ts, min_user_ts or min_ts)
        max_user_ts = max(max_ts, max_user_ts or max_ts)

    if to_ts is None and from_ts is None:
        # No bounds given: anchor just past the newest listen.
        to_ts = max_user_ts + 1

    # No listens at all for any of these users.
    if min_user_ts == 0 and max_user_ts == 0:
        return ([], min_user_ts, max_user_ts)

    window_size = DEFAULT_FETCH_WINDOW
    query = """SELECT listened_at, track_name, user_name, created, data,
                      mm.recording_mbid, release_mbid, artist_mbids
                 FROM listen
      FULL OUTER JOIN mbid_mapping mm
                   ON (data->'track_metadata'->'additional_info'->>'recording_msid')::uuid = mm.recording_msid
      FULL OUTER JOIN mbid_mapping_metadata m
                   ON mm.recording_mbid = m.recording_mbid
                WHERE user_name IN :user_names
                  AND listened_at > :from_ts
                  AND listened_at < :to_ts
             ORDER BY listened_at """ + ORDER_TEXT[order] + " LIMIT :limit"

    # Decide which end of the window may grow between passes:
    # both bounds given -> fixed window; only one given -> the other end
    # starts one window away and is extended until enough rows are found.
    if from_ts and to_ts:
        to_dynamic = False
        from_dynamic = False
    elif from_ts is not None:
        to_ts = from_ts + window_size
        to_dynamic = True
        from_dynamic = False
    else:
        from_ts = to_ts - window_size
        to_dynamic = False
        from_dynamic = True

    listens = []
    done = False
    with timescale.engine.connect() as connection:
        t0 = time.monotonic()
        passes = 0
        while True:
            passes += 1

            # Oh shit valve. I'm keeping it here for the time being. :)
            # Hard cap on window-expansion passes to avoid runaway queries.
            if passes == 10:
                done = True
                break

            curs = connection.execute(sqlalchemy.text(query),
                                      user_names=tuple(user_names),
                                      from_ts=from_ts,
                                      to_ts=to_ts,
                                      limit=limit)
            while True:
                result = curs.fetchone()
                if not result:
                    # Window exhausted without filling the limit.
                    if not to_dynamic and not from_dynamic:
                        done = True
                        break

                    # Stop when the window has moved past the oldest listen...
                    if from_ts < min_user_ts - 1:
                        done = True
                        break

                    # ...or past "now" (plus allowed clock skew).
                    if to_ts > int(time.time()) + ListenStore.MAX_FUTURE_SECONDS:
                        done = True
                        break

                    # Slide and geometrically grow the window, then re-query.
                    if to_dynamic:
                        from_ts += window_size - 1
                        window_size *= WINDOW_SIZE_MULTIPLIER
                        to_ts += window_size

                    if from_dynamic:
                        to_ts -= window_size
                        window_size *= WINDOW_SIZE_MULTIPLIER
                        from_ts -= window_size

                    break

                listens.append(Listen.from_timescale(*result))
                if len(listens) == limit:
                    done = True
                    break

            if done:
                break

        fetch_listens_time = time.monotonic() - t0

    if order == ORDER_ASC:
        listens.reverse()
    self.log.info("fetch listens %s %.2fs (%d passes)" % (str(user_names),
                                                          fetch_listens_time, passes))

    return (listens, min_user_ts, max_user_ts)
def fetch_recent_listens_for_users(self, users, min_ts: int = None, max_ts: int = None,
                                   per_user_limit=2, limit=10):
    """ Fetch the most recent listens for a list of users.

        per_user_limit applies per user: with a per_user_limit of 3 and
        3 users, up to 9 listens may be returned (subject to limit).

        users: the users for which to retrieve recent listens
        min_ts: only include listens with listened_at after this timestamp
        max_ts: only include listens with listened_at before this timestamp
        per_user_limit: maximum number of listens to fetch per user
        limit: maximum number of listens to fetch overall
    """
    id_to_name = {user["id"]: user["musicbrainz_id"] for user in users}

    conditions = ["user_id IN :user_ids"]
    params = {
        "user_ids": tuple(id_to_name),
        "per_user_limit": per_user_limit,
        "limit": limit
    }

    if min_ts:
        conditions.append("listened_at > :min_ts")
        params["min_ts"] = min_ts
    if max_ts:
        conditions.append("listened_at < :max_ts")
        params["max_ts"] = max_ts

    filters = " AND ".join(conditions)

    query = f"""SELECT * FROM (
                  SELECT listened_at, track_name, user_id, created, data,
                         mm.recording_mbid, release_mbid, artist_mbids,
                         row_number() OVER (partition by user_id ORDER BY listened_at DESC) AS rownum
                    FROM listen l
         FULL OUTER JOIN mbid_mapping m
                      ON (data->'track_metadata'->'additional_info'->>'recording_msid')::uuid = m.recording_msid
         FULL OUTER JOIN mbid_mapping_metadata mm
                      ON mm.recording_mbid = m.recording_mbid
                   WHERE {filters}
                GROUP BY user_id, listened_at, track_name, created, data,
                         mm.recording_mbid, release_mbid, artist_mbids
                ORDER BY listened_at DESC) tmp
               WHERE rownum <= :per_user_limit
            ORDER BY listened_at DESC
               LIMIT :limit"""

    recent = []
    with timescale.engine.connect() as connection:
        cursor = connection.execute(sqlalchemy.text(query), params)
        for row in cursor.fetchall():
            recent.append(
                Listen.from_timescale(
                    listened_at=row["listened_at"],
                    track_name=row["track_name"],
                    user_id=row["user_id"],
                    created=row["created"],
                    data=row["data"],
                    recording_mbid=row["recording_mbid"],
                    release_mbid=row["release_mbid"],
                    artist_mbids=row["artist_mbids"],
                    user_name=id_to_name[row["user_id"]]))

    return recent
def fetch_listens(self, user: Dict, from_ts: int = None, to_ts: int = None,
                  limit: int = DEFAULT_LISTENS_PER_FETCH):
    """ Fetch listens for one user, expanding the search window in passes
        until the limit is met or the user's listen range is exhausted.

        The timestamps are stored as UTC in the postgres datebase while on
        retrieving the value they are converted to the local server's timezone.
        So to compare datetime object we need to create a object in the same
        timezone as the server.

        If neither from_ts nor to_ts is provided, the latest listens for the
        user are returned.
        Returns a tuple of (listens, min_user_timestamp, max_user_timestamp)

        from_ts: seconds since epoch, in float. if specified, listens will be
                 returned in ascending order. otherwise listens will be
                 returned in descending order
        to_ts: seconds since epoch, in float
        limit: the maximum number of items to return
    """
    if from_ts and to_ts and from_ts >= to_ts:
        raise ValueError("from_ts should be less than to_ts")
    # from_ts given -> ascending; otherwise descending.
    if from_ts:
        order = ORDER_ASC
    else:
        order = ORDER_DESC

    min_user_ts, max_user_ts = self.get_timestamps_for_user(user["id"])

    # No listens at all for this user.
    if min_user_ts == 0 and max_user_ts == 0:
        return [], min_user_ts, max_user_ts

    if to_ts is None and from_ts is None:
        # No bounds given: anchor just past the newest listen.
        to_ts = max_user_ts + 1

    window_size = DEFAULT_FETCH_WINDOW
    query = """SELECT listened_at, track_name, user_id, created, data,
                      mm.recording_mbid, release_mbid, artist_mbids
                 FROM listen
      FULL OUTER JOIN mbid_mapping mm
                   ON (data->'track_metadata'->'additional_info'->>'recording_msid')::uuid = mm.recording_msid
      FULL OUTER JOIN mbid_mapping_metadata m
                   ON mm.recording_mbid = m.recording_mbid
                WHERE user_id = :user_id
                  AND listened_at > :from_ts
                  AND listened_at < :to_ts
             ORDER BY listened_at """ + ORDER_TEXT[order] + " LIMIT :limit"

    # Decide which end of the window may grow between passes:
    # both bounds given -> fixed window; only one given -> the other end
    # starts one window away and is extended until enough rows are found.
    if from_ts and to_ts:
        to_dynamic = False
        from_dynamic = False
    elif from_ts is not None:
        to_ts = from_ts + window_size
        to_dynamic = True
        from_dynamic = False
    else:
        from_ts = to_ts - window_size
        to_dynamic = False
        from_dynamic = True

    listens = []
    done = False
    with timescale.engine.connect() as connection:
        t0 = time.monotonic()
        passes = 0
        while True:
            passes += 1

            # Oh shit valve. I'm keeping it here for the time being. :)
            # Hard cap on window-expansion passes to avoid runaway queries.
            if passes == 10:
                done = True
                break

            curs = connection.execute(sqlalchemy.text(query),
                                      user_id=user["id"],
                                      from_ts=from_ts,
                                      to_ts=to_ts,
                                      limit=limit)
            while True:
                result = curs.fetchone()
                if not result:
                    # Window exhausted without filling the limit.
                    if not to_dynamic and not from_dynamic:
                        done = True
                        break

                    # Stop when the window has moved past the oldest listen...
                    if from_ts < min_user_ts - 1:
                        done = True
                        break

                    # ...or past "now" (plus allowed clock skew).
                    if to_ts > int(time.time()) + MAX_FUTURE_SECONDS:
                        done = True
                        break

                    # Slide and geometrically grow the window, then re-query.
                    if to_dynamic:
                        from_ts += window_size - 1
                        window_size *= WINDOW_SIZE_MULTIPLIER
                        to_ts += window_size

                    if from_dynamic:
                        to_ts -= window_size
                        window_size *= WINDOW_SIZE_MULTIPLIER
                        from_ts -= window_size

                    break

                listens.append(
                    Listen.from_timescale(
                        listened_at=result["listened_at"],
                        track_name=result["track_name"],
                        user_id=result["user_id"],
                        created=result["created"],
                        data=result["data"],
                        recording_mbid=result["recording_mbid"],
                        release_mbid=result["release_mbid"],
                        artist_mbids=result["artist_mbids"],
                        user_name=user["musicbrainz_id"]))

                if len(listens) == limit:
                    done = True
                    break

            if done:
                break

        fetch_listens_time = time.monotonic() - t0

    if order == ORDER_ASC:
        listens.reverse()
    self.log.info("fetch listens %s %.2fs (%d passes)" % (user["musicbrainz_id"],
                                                          fetch_listens_time, passes))

    return listens, min_user_ts, max_user_ts