def _get_stats_from_cache():
    """Get submission statistics from cache"""
    stats = cache.get(STATS_CACHE_KEY, namespace=STATS_CACHE_NAMESPACE)
    last_collected = cache.get(STATS_CACHE_LAST_UPDATE_KEY,
                               namespace=STATS_CACHE_NAMESPACE)
    return last_collected, stats
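# The snippet above only reads the statistics back out of the cache. Below is a
# minimal sketch of the counterpart that could populate these keys. The helper
# name _set_stats_in_cache and the choice of expiry-less writes are assumptions;
# only cache.set (as used throughout this file) and the constants referenced
# above are relied on.
from datetime import datetime, timezone


def _set_stats_in_cache(stats):
    """Hypothetical helper: store freshly collected submission statistics in cache."""
    cache.set(STATS_CACHE_KEY, stats, namespace=STATS_CACHE_NAMESPACE)
    # Record when the statistics were collected so callers can tell how stale they are.
    cache.set(STATS_CACHE_LAST_UPDATE_KEY, datetime.now(timezone.utc),
              namespace=STATS_CACHE_NAMESPACE)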
def test_delete_with_namespace(self):
    key = "testing"
    namespace = "spaaaaaaace"
    self.assertTrue(cache.set(key, u"Пример", namespace=namespace))
    self.assertEqual(cache.get(key, namespace=namespace), u"Пример")
    self.assertEqual(cache.delete(key, namespace=namespace), 1)
    self.assertIsNone(cache.get(key, namespace=namespace))
def test_listen_counts_in_cache(self):
    count = self._create_test_data(self.testuser_name)
    self.assertEqual(count, self.logstore.get_listen_count_for_user(self.testuser_name, need_exact=True))
    user_key = '{}{}'.format(REDIS_INFLUX_USER_LISTEN_COUNT, self.testuser_name)
    self.assertEqual(count, int(cache.get(user_key, decode=False)))
    batch = generate_data(self.testuser_id, self.testuser_name, int(time.time()), 1)
    self.logstore.insert(batch)
    self.assertEqual(count + 1, int(cache.get(user_key, decode=False)))
def test_datetime(self):
    self.assertTrue(cache.set('some_time', datetime.datetime.now()))
    self.assertEqual(type(cache.get('some_time')), datetime.datetime)

    dictionary = {
        "id": 1,
        "created": datetime.datetime.now(),
    }
    self.assertTrue(cache.set('some_other_time', dictionary))
    self.assertEqual(cache.get('some_other_time'), dictionary)
def test_single_dict_fancy(self):
    dictionary = {
        "fancy": u"Да",
        "тест": 11,
    }
    cache.set('some_dict', dictionary)
    self.assertEqual(cache.get('some_dict'), dictionary)
def test_single_dict(self):
    dictionary = {
        "fancy": "yeah",
        "wow": 11,
    }
    self.assertTrue(cache.set('some_dict', dictionary))
    self.assertEqual(cache.get('some_dict'), dictionary)
def mappings(mbid=None):
    """Get mappings to Spotify for a specified MusicBrainz ID.

    Returns:
        List containing Spotify URIs that are mapped to specified MBID.
    """
    if _base_url is None:
        flash.warn(lazy_gettext(_UNAVAILABLE_MSG))
        return []

    data = cache.get(mbid, _CACHE_NAMESPACE)
    if not data:
        try:
            session = requests.Session()
            session.mount(_base_url, HTTPAdapter(max_retries=2))
            resp = session.post(
                url=_base_url + 'mapping',
                headers={'Content-Type': 'application/json'},
                data=json.dumps({'mbid': mbid}),
            )
            resp.raise_for_status()
            data = resp.json().get('mappings')
        except RequestException:
            flash.warn(lazy_gettext("Spotify mapping server is unavailable. You will not see an embedded player."))
            return []
        cache.set(key=mbid, namespace=_CACHE_NAMESPACE, val=data)
    return data
def get_last_submitted_recordings():
    """Get list of last submitted recordings.

    Returns:
        List of dictionaries with basic info about last submitted recordings:
        mbid (MusicBrainz ID), artist (name), and title.
    """
    cache_key = "last-submitted-recordings"
    last_submissions = cache.get(cache_key)
    if not last_submissions:
        with db.engine.connect() as connection:
            # We are getting results with an offset of 10 rows because we'd
            # prefer to show recordings for which we have already calculated
            # high-level data. This might not be the best way to do that.
            result = connection.execute("""SELECT ll.gid,
                                                  llj.data->'metadata'->'tags'->'artist'->>0,
                                                  llj.data->'metadata'->'tags'->'title'->>0
                                             FROM lowlevel ll
                                             JOIN lowlevel_json llj
                                               ON ll.id = llj.id
                                         ORDER BY ll.id DESC
                                            LIMIT 5
                                           OFFSET 10""")
            last_submissions = result.fetchall()
        last_submissions = [
            {
                "mbid": str(r[0]),
                "artist": r[1],
                "title": r[2],
            } for r in last_submissions if r[1] and r[2]
        ]
        cache.set(cache_key, last_submissions, time=LAST_MBIDS_CACHE_TIMEOUT)
    return last_submissions
def browse_release_groups(*, artist_id, release_types=None, limit=None, offset=None):
    """Get all release groups linked to an artist.

    Args:
        artist_id (uuid): MBID of the artist.
        release_types (list): List of types of release groups to be fetched.
        limit (int): Max number of release groups to return.
        offset (int): Offset that can be used in conjunction with the limit.

    Returns:
        Tuple containing the list of dictionaries of release groups ordered by release year
        and the total count of the release groups.
    """
    artist_id = str(artist_id)
    includes_data = defaultdict(dict)
    if release_types is None:
        release_types = []
    release_types = [release_type.capitalize() for release_type in release_types]
    key = cache.gen_key(artist_id, limit, offset, *release_types)
    release_groups = cache.get(key)
    if not release_groups:
        with mb_session() as db:
            release_groups_query = _browse_release_groups_query(db, artist_id, release_types)
            count = release_groups_query.count()
            release_groups = release_groups_query.order_by(
                case([(models.ReleaseGroupMeta.first_release_date_year.is_(None), 1)], else_=0),
                models.ReleaseGroupMeta.first_release_date_year.desc()
            ).limit(limit).offset(offset).all()
            for release_group in release_groups:
                includes_data[release_group.id]['meta'] = release_group.meta
            release_groups = ([to_dict_release_groups(release_group, includes_data[release_group.id])
                               for release_group in release_groups], count)
        cache.set(key=key, val=release_groups, time=DEFAULT_CACHE_EXPIRATION)
    return release_groups
def get_release_group_by_id(mbid):
    """Get release group using the MusicBrainz ID."""
    key = cache.gen_key(mbid)
    release_group = cache.get(key)
    if not release_group:
        release_group = _get_release_group_by_id(mbid)
        cache.set(key=key, val=release_group, time=DEFAULT_CACHE_EXPIRATION)
    return release_group_rel.process(release_group)
def get_recording_by_id(mbid):
    mbid = str(mbid)
    recording = cache.get(mbid)
    if not recording:
        try:
            recording = musicbrainzngs.get_recording_by_id(
                mbid, includes=['artists', 'releases', 'media'])['recording']
        except ResponseError as e:
            raise DataUnavailable(e)
        cache.set(mbid, recording, time=CACHE_TIMEOUT)
    return recording
def get_total_listen_count(self, cache_value=True):
    """ Returns the total number of listens stored in the ListenStore.
        First checks the brainzutils cache for the value, if not present there
        makes a query to the db and caches it in brainzutils cache.
    """
    if cache_value:
        count = cache.get(InfluxListenStore.REDIS_INFLUX_TOTAL_LISTEN_COUNT, decode=False)
        if count:
            return int(count)

    try:
        result = self.influx.query("""SELECT %s
                                        FROM "%s"
                                    ORDER BY time DESC
                                       LIMIT 1""" % (COUNT_MEASUREMENT_NAME, TIMELINE_COUNT_MEASUREMENT))
    except (InfluxDBServerError, InfluxDBClientError) as err:
        self.log.error("Cannot query influx: %s" % str(err), exc_info=True)
        raise

    try:
        item = result.get_points(measurement=TIMELINE_COUNT_MEASUREMENT).__next__()
        count = int(item[COUNT_MEASUREMENT_NAME])
        timestamp = convert_to_unix_timestamp(item['time'])
    except (KeyError, ValueError, StopIteration):
        timestamp = 0
        count = 0

    # Now sum counts that have been added in the interval we're interested in
    try:
        result = self.influx.query("""SELECT sum(%s) as total
                                        FROM "%s"
                                       WHERE time > %s""" % (COUNT_MEASUREMENT_NAME,
                                                             TEMP_COUNT_MEASUREMENT,
                                                             get_influx_query_timestamp(timestamp)))
    except (InfluxDBServerError, InfluxDBClientError) as err:
        self.log.error("Cannot query influx: %s" % str(err), exc_info=True)
        raise

    try:
        data = result.get_points(measurement=TEMP_COUNT_MEASUREMENT).__next__()
        count += int(data['total'])
    except StopIteration:
        pass

    if cache_value:
        cache.set(
            InfluxListenStore.REDIS_INFLUX_TOTAL_LISTEN_COUNT,
            int(count),
            InfluxListenStore.TOTAL_LISTEN_COUNT_CACHE_TIME,
            encode=False,
        )
    return count
def get_place_by_id(mbid):
    """Get place with the MusicBrainz ID.

    Args:
        mbid (uuid): MBID(gid) of the place.
    Returns:
        Dictionary containing the place information.
    """
    key = cache.gen_key(mbid)
    place = cache.get(key)
    if not place:
        place = _get_place_by_id(mbid)
        cache.set(key=key, val=place, time=DEFAULT_CACHE_EXPIRATION)
    return place_rel.process(place)
def get_event_by_id(mbid):
    """Get event with the MusicBrainz ID.

    Args:
        mbid (uuid): MBID(gid) of the event.
    Returns:
        Dictionary containing the event information.
    """
    key = cache.gen_key(mbid)
    event = cache.get(key)
    if not event:
        event = _get_event_by_id(mbid)
        cache.set(key=key, val=event, time=DEFAULT_CACHE_EXPIRATION)
    return event
def get_release_by_id(mbid):
    """Get release with the MusicBrainz ID.

    Args:
        mbid (uuid): MBID(gid) of the release.
    Returns:
        Dictionary containing the release information.
    """
    key = cache.gen_key(mbid)
    release = cache.get(key)
    if not release:
        release = _get_release_by_id(mbid)
        cache.set(key=key, val=release, time=DEFAULT_CACHE_EXPIRATION)
    return release
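# get_place_by_id, get_event_by_id and get_release_by_id above all repeat the same
# cache-aside pattern: build a key with cache.gen_key, try cache.get, fall back to the
# database lookup, then cache.set the result. A minimal sketch of a decorator that
# factors this out is shown below; the decorator name and the way the key is built
# from a prefix plus the MBID are assumptions, and only cache.gen_key/get/set as used
# in the snippets above are relied on.
import functools


def cached_by_mbid(prefix, expiry):
    """Hypothetical helper: cache the result of a single-MBID lookup function."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(mbid):
            key = cache.gen_key(prefix, mbid)
            entity = cache.get(key)
            if not entity:
                entity = func(mbid)
                cache.set(key=key, val=entity, time=expiry)
            return entity
        return wrapper
    return decorator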
def create_record(cls, access_token, ip_address):
    """Creates new access log record with a current timestamp.

    It also checks if `DIFFERENT_IP_LIMIT` is exceeded within the last
    `CLEANUP_RANGE_MINUTES` and alerts admins if that's the case.

    Args:
        access_token: Access token used to access the API.
        ip_address: IP address used to access the API.

    Returns:
        New access log record.
    """
    new_record = cls(
        token=access_token,
        ip_address=ip_address,
    )
    db.session.add(new_record)
    db.session.commit()

    # Checking if DIFFERENT_IP_LIMIT is exceeded
    count = cls.query \
        .distinct(cls.ip_address) \
        .filter(cls.timestamp > datetime.now(pytz.utc) - timedelta(minutes=CLEANUP_RANGE_MINUTES),
                cls.token == access_token) \
        .count()
    if count > DIFFERENT_IP_LIMIT:
        msg = ("Hourly access threshold exceeded for token %s\n\n"
               "This token has been used from %s different IP "
               "addresses during the last %s minutes.") % \
              (access_token, count, CLEANUP_RANGE_MINUTES)
        logging.info(msg)
        # Checking if a notification for admins about this token abuse has
        # been sent in the last hour. This info is kept in cache.
        key = "alert_sent_%s" % access_token
        if not cache.get(key):
            send_mail(
                subject="[MetaBrainz] Hourly access threshold exceeded",
                recipients=current_app.config['NOTIFICATION_RECIPIENTS'],
                text=msg,
            )
            cache.set(key, True, 3600)  # 1 hour

    return new_record
def get_timestamps_for_user(self, user_name):
    """ Return the max_ts and min_ts for a given user and cache the result in brainzutils cache
    """
    tss = cache.get(REDIS_USER_TIMESTAMPS % user_name)
    if tss:
        (min_ts, max_ts) = tss.split(",")
        min_ts = int(min_ts)
        max_ts = int(max_ts)
    else:
        query = 'SELECT first(artist_msid) FROM ' + get_escaped_measurement_name(user_name)
        min_ts = self._select_single_timestamp(query, get_measurement_name(user_name))

        query = 'SELECT last(artist_msid) FROM ' + get_escaped_measurement_name(user_name)
        max_ts = self._select_single_timestamp(query, get_measurement_name(user_name))

        cache.set(REDIS_USER_TIMESTAMPS % user_name, "%d,%d" % (min_ts, max_ts), USER_CACHE_TIME)

    return min_ts, max_ts
def insert(self, listens):
    """ Insert a batch of listens.
    """
    submit = []
    user_names = {}
    for listen in listens:
        user_names[listen.user_name] = 1
        submit.append(listen.to_influx(quote(listen.user_name)))

    if not self.influx.write_points(submit, time_precision='s'):
        self.log.error("Cannot write data to influx. (write_points returned False), data=%s",
                       json.dumps(submit, indent=3))

    # If we reach this point, we were able to write the listens to the InfluxListenStore.
    # So update the listen counts of the users cached in brainzutils cache.
    for data in submit:
        user_key = "{}{}".format(REDIS_INFLUX_USER_LISTEN_COUNT, data['fields']['user_name'])
        cached_count = cache.get(user_key, decode=False)
        if cached_count:
            cache.increment(user_key)

    # Invalidate cached data for user
    for user_name in user_names.keys():
        cache.delete(REDIS_USER_TIMESTAMPS % user_name)

    if len(listens):
        # Enter a measurement to count items inserted
        submit = [{
            'measurement': TEMP_COUNT_MEASUREMENT,
            'tags': {
                COUNT_MEASUREMENT_NAME: len(listens)
            },
            'fields': {
                COUNT_MEASUREMENT_NAME: len(listens)
            }
        }]
        try:
            if not self.influx.write_points(submit):
                self.log.error("Cannot write listen count to influx. (write_points returned False)")
        except (InfluxDBServerError, InfluxDBClientError, ValueError) as err:
            self.log.error("Cannot write data to influx: %s, data: %s", str(err),
                           json.dumps(submit, indent=3), exc_info=True)
            raise
def get_listen_count_for_user(self, user_name, need_exact=False):
    """Get the total number of listens for a user. The number of listens comes from
       brainzutils cache unless an exact number is asked for.

    Args:
        user_name: the user to get listens for
        need_exact: if True, get an exact number of listens directly from the ListenStore
    """
    if not need_exact:
        # check if the user's listen count is already in cache
        # if already present return it directly instead of calculating it again
        # decode is set to False as we have not encoded the value when we set it
        # in brainzutils cache as we need to call increment operation which requires
        # an integer value
        user_key = '{}{}'.format(REDIS_INFLUX_USER_LISTEN_COUNT, user_name)
        count = cache.get(user_key, decode=False)
        if count:
            return int(count)

    try:
        results = self.influx.query(
            'SELECT count(*) FROM ' + get_escaped_measurement_name(user_name))
    except (InfluxDBServerError, InfluxDBClientError) as e:
        self.log.error("Cannot query influx: %s" % str(e), exc_info=True)
        raise

    # get the number of listens from the json
    try:
        count = results.get_points(measurement=get_measurement_name(
            user_name)).__next__()['count_recording_msid']
    except (KeyError, StopIteration):
        count = 0

    # put this value into brainzutils cache with an expiry time
    user_key = "{}{}".format(REDIS_INFLUX_USER_LISTEN_COUNT, user_name)
    cache.set(user_key, int(count), InfluxListenStore.USER_LISTEN_COUNT_CACHE_TIME, encode=False)
    return int(count)
def insert(self, listens):
    """ Insert a batch of listens. Returns a list of (listened_at, track_name, user_name)
        that indicates which rows were inserted into the DB. If the row is not listed in the
        return values, it was a duplicate.
    """
    submit = []
    user_names = {}
    for listen in listens:
        user_names[listen.user_name] = 1
        submit.append(listen.to_timescale())

    query = """INSERT INTO listen (listened_at, track_name, user_name, data)
                    VALUES %s
               ON CONFLICT (listened_at, track_name, user_name)
                DO NOTHING
                 RETURNING listened_at, track_name, user_name"""

    inserted_rows = []
    conn = timescale.engine.raw_connection()
    with conn.cursor() as curs:
        execute_values(curs, query, submit, template=None)
        while True:
            result = curs.fetchone()
            if not result:
                break
            inserted_rows.append((result[0], result[1], result[2]))

    conn.commit()

    # Update the listen counts of the users cached in brainzutils cache.
    for _, _, user_name in inserted_rows:
        user_key = "{}{}".format(self.ns + REDIS_TIMESCALE_USER_LISTEN_COUNT, user_name)
        cached_count = cache.get(user_key, decode=False)
        if cached_count:
            cache.increment(user_key)

    # Invalidate cached data for user
    for user_name in user_names:
        cache.delete(self.ns + REDIS_USER_TIMESTAMPS % user_name)

    return inserted_rows
def get_place_by_id(id):
    """Get place with the MusicBrainz ID.

    Returns:
        Place object with the following includes: artist-rels, place-rels,
        release-group-rels, url-rels.
    """
    key = cache.gen_key(id)
    place = cache.get(key)
    if not place:
        try:
            place = musicbrainzngs.get_place_by_id(id, [
                'artist-rels', 'place-rels', 'release-group-rels', 'url-rels'
            ]).get('place')
        except ResponseError as e:
            if e.cause.code == 404:
                return None
            else:
                raise InternalServerError(e.cause.msg)
        cache.set(key=key, val=place, time=DEFAULT_CACHE_EXPIRATION)
    return place
def get_place_by_id(mbid):
    """Get place with the MusicBrainz ID.

    Args:
        mbid (uuid): MBID(gid) of the place.
    Returns:
        Dictionary containing the place information.
    """
    key = cache.gen_key('place', mbid)
    place = cache.get(key)
    if not place:
        place = db.get_place_by_id(
            mbid,
            includes=['artist-rels', 'place-rels', 'release-group-rels', 'url-rels'],
            unknown_entities_for_missing=True,
        )
        cache.set(key=key, val=place, time=DEFAULT_CACHE_EXPIRATION)
    return place_rel.process(place)
def get_event_by_id(mbid):
    """Get event with the MusicBrainz ID.

    Args:
        mbid (uuid): MBID(gid) of the event.
    Returns:
        Dictionary containing the event information.
    """
    key = cache.gen_key('event', mbid)
    event = cache.get(key)
    if not event:
        event = db.fetch_multiple_events(
            [mbid],
            includes=['artist-rels', 'place-rels', 'series-rels', 'url-rels', 'release-group-rels'],
        ).get(mbid)
        cache.set(key=key, val=event, time=DEFAULT_CACHE_EXPIRATION)
    return event
def get_timestamps_for_user(self, user_name):
    """ Return the max_ts and min_ts for a given user and cache the result in brainzutils cache
    """
    tss = cache.get(REDIS_USER_TIMESTAMPS + user_name)
    if tss:
        (min_ts, max_ts) = tss.split(",")
        min_ts = int(min_ts)
        max_ts = int(max_ts)
    else:
        t0 = time.monotonic()
        min_ts = self._select_single_timestamp(True, user_name)
        max_ts = self._select_single_timestamp(False, user_name)
        cache.set(REDIS_USER_TIMESTAMPS + user_name, "%d,%d" % (min_ts, max_ts), expirein=0)
        # intended for production monitoring
        self.log.info("timestamps %s %.2fs" % (user_name, time.monotonic() - t0))

    return min_ts, max_ts
def _fetch_access_token(refresh=False) -> str:
    """Get an access token from the OAuth credentials.

    https://developer.spotify.com/web-api/authorization-guide/#client-credentials-flow
    """
    key = cache.gen_key("spotify_oauth_access_token")
    access_token = cache.get(key)
    if refresh or not access_token:
        client_id = app.config.get("SPOTIFY_CLIENT_ID")
        client_secret = app.config.get("SPOTIFY_CLIENT_SECRET")
        auth_value = b64encode(bytes(f"{client_id}:{client_secret}", "utf-8")).decode("utf-8")
        response = requests.post(
            "https://accounts.spotify.com/api/token",
            data={"grant_type": "client_credentials"},
            headers={"Authorization": f"Basic {auth_value}"},
        ).json()
        access_token = response.get("access_token")
        if not access_token:
            raise SpotifyException("Could not fetch access token for Spotify API")
        # Making the token stored in cache expire at the same time as the actual token
        cache.set(key=key, val=access_token, time=response.get("expires_in", 10))
    return access_token
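# A sketch of how a caller might use _fetch_access_token above, retrying once with
# refresh=True if Spotify rejects a token that is still sitting in the cache. The
# album endpoint URL and the get_spotify_album helper name are illustrative
# assumptions, not part of the snippet above; only requests and _fetch_access_token
# are used.
import requests


def get_spotify_album(album_id):
    """Hypothetical caller: fetch an album, refreshing the cached token on a 401."""
    url = f"https://api.spotify.com/v1/albums/{album_id}"
    resp = requests.get(url, headers={"Authorization": f"Bearer {_fetch_access_token()}"})
    if resp.status_code == 401:
        # The cached token may have expired early or been revoked; force a refresh and retry once.
        resp = requests.get(url, headers={"Authorization": f"Bearer {_fetch_access_token(refresh=True)}"})
    resp.raise_for_status()
    return resp.json()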
def lookup_ips(users):
    """ Try to look up and cache as many reverse DNS entries as possible in a window of time """

    data = []
    timeout = time.monotonic() + StatsView.IP_ADDR_TIMEOUT
    for user in users:
        row = list(user)
        reverse = cache.get(user[0])
        if not reverse:
            if time.monotonic() < timeout:
                reverse = StatsView.dns_lookup(user[0])
            else:
                reverse = None
        if reverse:
            cache.set(user[0], reverse, 3600)
            row[0] = reverse
        data.append(row)

    return data
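# StatsView.dns_lookup is referenced above but not shown in this snippet. Below is a
# minimal sketch of what such a helper could look like, not the actual implementation:
# the use of socket.gethostbyaddr and the "return None on failure" behaviour are
# assumptions (returning None matches how lookup_ips treats a missing result).
import socket


def dns_lookup(ip_address):
    """Hypothetical helper (would live on StatsView as a staticmethod): reverse-resolve an IP."""
    try:
        hostname, _aliases, _addresses = socket.gethostbyaddr(ip_address)
        return hostname
    except (socket.herror, socket.gaierror, OSError):
        return None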
def get_top_users_overall():
    """Gets top contributors since the beginning.

    Returns:
        List of dictionaries where each dictionary has the following structure:
        {
            "id": (str),
            "display_name": (str),
            "review_count": (int),
            "comment_count": (int),
            "vote_count": (int),
            "score": (int),
        }
    """
    key = cache.gen_key("top_users_overall", _CACHE_NAMESPACE)
    top_users = cache.get(key, _CACHE_NAMESPACE)

    # if we could not fetch results from cache, or fetched results have to be updated
    if not top_users:
        try:
            results = get_top_users(
                review_weight=5,
                comment_weight=2,
                vote_weight=1,
            )
            top_users = {
                "users": results,
            }
            cache.set(key=key, val=top_users, namespace=_CACHE_NAMESPACE, time=_DEFAULT_CACHE_EXPIRATION)
        except db_exceptions.NoDataFoundException:
            return None
    return top_users["users"]
def get_release_group_by_id(id):
    """Get release group with the MusicBrainz ID.

    Returns:
        Release group object with the following includes: artists, releases,
        release-group-rels, url-rels, work-rels, tags.
    """
    key = cache.gen_key(id)
    release_group = cache.get(key)
    if not release_group:
        try:
            release_group = musicbrainzngs.get_release_group_by_id(id, [
                'artists', 'releases', 'release-group-rels', 'url-rels', 'work-rels', 'tags'
            ]).get('release-group')
        except ResponseError as e:
            if e.cause.code == 404:
                return None
            else:
                raise InternalServerError(e.cause.msg)
        cache.set(key=key, val=release_group, time=DEFAULT_CACHE_EXPIRATION)
    return release_group_rel.process(release_group)
def get_listen_count_for_user(self, user_name, need_exact=False):
    """Get the total number of listens for a user. The number of listens comes from
       brainzutils cache unless an exact number is asked for.

    Args:
        user_name: the user to get listens for
        need_exact: if True, get an exact number of listens directly from the ListenStore
    """
    if not need_exact:
        # check if the user's listen count is already in cache
        # if already present return it directly instead of calculating it again
        # decode is set to False as we have not encoded the value when we set it
        # in brainzutils cache as we need to call increment operation which requires
        # an integer value
        user_key = '{}{}'.format(self.ns + REDIS_TIMESCALE_USER_LISTEN_COUNT, user_name)
        count = cache.get(user_key, decode=False)
        if count:
            return int(count)

    query = "SELECT SUM(count) FROM listen_count WHERE user_name = :user_name"
    try:
        with timescale.engine.connect() as connection:
            result = connection.execute(sqlalchemy.text(query), {
                "user_name": user_name,
            })
            count = int(result.fetchone()[0] or 0)
    except psycopg2.OperationalError as e:
        self.log.error("Cannot query timescale listen_count: %s" % str(e), exc_info=True)
        raise

    # put this value into brainzutils cache with an expiry time
    user_key = "{}{}".format(self.ns + REDIS_TIMESCALE_USER_LISTEN_COUNT, user_name)
    cache.set(user_key, count, TimescaleListenStore.USER_LISTEN_COUNT_CACHE_TIME, encode=False)
    return count
def get_timestamps_for_user(self, user_name):
    """ Return the max_ts and min_ts for a given user and cache the result in brainzutils cache
    """
    tss = cache.get(REDIS_USER_TIMESTAMPS % user_name)
    if tss:
        (min_ts, max_ts) = tss.split(",")
        min_ts = int(min_ts)
        max_ts = int(max_ts)
    else:
        query = 'SELECT first(artist_msid) FROM ' + get_escaped_measurement_name(user_name)
        min_ts = self._select_single_timestamp(query, get_measurement_name(user_name))

        query = 'SELECT last(artist_msid) FROM ' + get_escaped_measurement_name(user_name)
        max_ts = self._select_single_timestamp(query, get_measurement_name(user_name))

        cache.set(REDIS_USER_TIMESTAMPS % user_name, "%d,%d" % (min_ts, max_ts), USER_CACHE_TIME)

    return min_ts, max_ts
def get_listen_count_for_user(self, user_name, need_exact=False):
    """Get the total number of listens for a user. The number of listens comes from
       brainzutils cache unless an exact number is asked for.

    Args:
        user_name: the user to get listens for
        need_exact: if True, get an exact number of listens directly from the ListenStore
    """
    if not need_exact:
        # check if the user's listen count is already in cache
        # if already present return it directly instead of calculating it again
        # decode is set to False as we have not encoded the value when we set it
        # in brainzutils cache as we need to call increment operation which requires
        # an integer value
        user_key = '{}{}'.format(REDIS_INFLUX_USER_LISTEN_COUNT, user_name)
        count = cache.get(user_key, decode=False)
        if count:
            return int(count)

    try:
        results = self.influx.query('SELECT count(*) FROM ' + get_escaped_measurement_name(user_name))
    except (InfluxDBServerError, InfluxDBClientError) as e:
        self.log.error("Cannot query influx: %s" % str(e), exc_info=True)
        raise

    # get the number of listens from the json
    try:
        count = results.get_points(measurement=get_measurement_name(user_name)).__next__()['count_recording_msid']
    except (KeyError, StopIteration):
        count = 0

    # put this value into brainzutils cache with an expiry time
    user_key = "{}{}".format(REDIS_INFLUX_USER_LISTEN_COUNT, user_name)
    cache.set(user_key, int(count), InfluxListenStore.USER_LISTEN_COUNT_CACHE_TIME, encode=False)
    return int(count)
def get_total_listen_count(self):
    """ Returns the total number of listens stored in the ListenStore.
        First checks the brainzutils cache for the value, if not present there
        makes a query to the db and caches it in brainzutils cache.
    """
    count = cache.get(REDIS_TOTAL_LISTEN_COUNT)
    if count:
        return count

    query = "SELECT SUM(count) AS value FROM listen_user_metadata"
    try:
        with timescale.engine.connect() as connection:
            result = connection.execute(sqlalchemy.text(query))
            # psycopg2 returns the `value` as a DECIMAL type which is not recognized
            # by msgpack/redis, so cast to a python int first.
            count = int(result.fetchone()["value"] or 0)
    except psycopg2.OperationalError:
        self.log.error("Cannot query listen counts:", exc_info=True)
        raise

    cache.set(REDIS_TOTAL_LISTEN_COUNT, count, expirein=REDIS_USER_LISTEN_COUNT_EXPIRY)
    return count
def test_single_no_encode(self):
    self.assertTrue(cache.set("no encode", 1, encode=False))
    self.assertEqual(cache.get("no encode", decode=False), b"1")
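# The test above shows that values stored with encode=False are written as raw bytes
# and must be read back with decode=False. The listen-count code elsewhere in this
# file relies on exactly that, because cache.increment only works on a plain integer
# value stored in Redis. A minimal sketch of the same pattern outside a test; the key
# name is an illustrative assumption, and only cache.set/get/increment as used in the
# snippets above are relied on.
counter_key = "example_counter"
cache.set(counter_key, 10, encode=False)            # stored as b"10", not msgpack-encoded
cache.increment(counter_key)                        # Redis increments the raw integer in place
count = int(cache.get(counter_key, decode=False))   # read the raw bytes back and cast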
def get_popular(cls, limit=None):
    """Get list of popular reviews.

    Popularity is determined by rating of a particular review. Rating is the
    difference between positive and negative votes. In this case only votes
    from the last month are used to calculate rating.

    Results are cached for 1 hour.

    Args:
        limit: Maximum number of reviews to return.

    Returns:
        Randomized list of popular reviews which are converted into
        dictionaries using to_dict method.
    """
    cache_key = cache.gen_key('popular_reviews', limit)
    reviews = cache.get(cache_key, Review.CACHE_NAMESPACE)

    if not reviews:
        # Selecting reviews for distinct release groups
        # TODO(roman): There is a problem with selecting popular reviews like
        # this: if there are multiple reviews for a release group we don't
        # choose the most popular.
        distinct_subquery = db.session.query(Review) \
            .filter(Review.is_draft == False) \
            .distinct(Review.entity_id).subquery()

        # Randomizing results to get some variety
        rand_subquery = db.session.query(aliased(Review, distinct_subquery)) \
            .order_by(func.random()).subquery()

        # Sorting reviews by rating
        query = db.session.query(aliased(Review, rand_subquery))

        # Preparing base query for getting votes
        vote_query_base = db.session.query(
            Vote.revision_id, Vote.vote, func.count().label('c')) \
            .group_by(Vote.revision_id, Vote.vote) \
            .filter(Vote.rated_at > datetime.now() - timedelta(weeks=4))

        # Getting positive votes
        votes_pos = vote_query_base.subquery('votes_pos')
        query = query.outerjoin(Revision).outerjoin(
            votes_pos, and_(votes_pos.c.revision_id == Revision.id,
                            votes_pos.c.vote == True))

        # Getting negative votes
        votes_neg = vote_query_base.subquery('votes_neg')
        query = query.outerjoin(Revision).outerjoin(
            votes_neg, and_(votes_neg.c.revision_id == Revision.id,
                            votes_neg.c.vote == False))

        query = query.order_by(
            desc(func.coalesce(votes_pos.c.c, 0) - func.coalesce(votes_neg.c.c, 0)))

        if limit is not None:
            # Selecting more reviews than needed so we'll have something
            # different to show (shuffling is done below).
            query = query.limit(limit * 4)

        reviews = query.all()
        reviews = [review.to_dict(confidential=True) for review in reviews]
        cache.set(cache_key, reviews, 1 * 60 * 60, Review.CACHE_NAMESPACE)  # 1 hour

    shuffle(reviews)  # a bit more variety
    return reviews[:limit]
def add_legacy_listens_to_queue(self):
    """Fetch more legacy listens from the listens table by doing a left join
       on the matched listens, finding the next chunk of legacy listens to look up.
       Listens are added to the queue with a low priority."""

    # Check to see where we need to pick up from, or start new
    if not self.legacy_listens_index_date:
        dt = cache.get(LEGACY_LISTENS_INDEX_DATE_CACHE_KEY, decode=False) or b""
        try:
            self.legacy_listens_index_date = int(
                datetime.datetime.strptime(str(dt, "utf-8"), "%Y-%m-%d").timestamp())
            self.app.logger.info("Loaded date index from cache: %d %s" % (
                self.legacy_listens_index_date, str(dt)))
        except ValueError:
            self.legacy_listens_index_date = int(
                datetime.datetime.now().timestamp())
            self.app.logger.info("Use date index now()")

    # Check to see if we're done
    if self.legacy_listens_index_date < DATA_START_YEAR_IN_SECONDS - LEGACY_LISTENS_LOAD_WINDOW:
        self.app.logger.info(
            "Finished looking up all legacy listens! Wooo!")
        self.legacy_next_run = monotonic() + UNMATCHED_LISTENS_COMPLETED_TIMEOUT
        self.legacy_listens_index_date = int(datetime.datetime.now().timestamp())
        self.num_legacy_listens_loaded = 0
        dt = datetime.datetime.fromtimestamp(self.legacy_listens_index_date)
        cache.set(LEGACY_LISTENS_INDEX_DATE_CACHE_KEY,
                  dt.strftime("%Y-%m-%d"), expirein=0, encode=False)
        return

    # Load listens
    self.app.logger.info("Load more legacy listens for %s" % datetime.datetime.fromtimestamp(
        self.legacy_listens_index_date).strftime("%Y-%m-%d"))

    query = """SELECT data->'track_metadata'->'additional_info'->>'recording_msid'::TEXT AS recording_msid,
                      track_name,
                      data->'track_metadata'->'artist_name' AS artist_name
                 FROM listen
            LEFT JOIN listen_join_listen_mbid_mapping lj
                   ON data->'track_metadata'->'additional_info'->>'recording_msid' = lj.recording_msid::text
                WHERE lj.recording_msid IS NULL
                  AND listened_at <= :max_ts
                  AND listened_at > :min_ts"""

    count = 0
    with timescale.engine.connect() as connection:
        curs = connection.execute(sqlalchemy.text(query),
                                  max_ts=self.legacy_listens_index_date,
                                  min_ts=self.legacy_listens_index_date - LEGACY_LISTENS_LOAD_WINDOW)
        while True:
            result = curs.fetchone()
            if not result:
                break

            self.queue.put(JobItem(LEGACY_LISTEN, [{"data": {"artist_name": result[2],
                                                             "track_name": result[1]},
                                                    "recording_msid": result[0],
                                                    "legacy": True}]))
            count += 1

    # update cache entry and count
    self.legacy_listens_index_date -= LEGACY_LISTENS_LOAD_WINDOW
    dt = datetime.datetime.fromtimestamp(self.legacy_listens_index_date)
    cache.set(LEGACY_LISTENS_INDEX_DATE_CACHE_KEY,
              dt.strftime("%Y-%m-%d"), expirein=0, encode=False)
    self.num_legacy_listens_loaded = count
def test_single_with_namespace(self):
    self.assertTrue(cache.set("test", 42, namespace="testing"))
    self.assertEqual(cache.get("test", namespace="testing"), 42)
def get_popular(limit=None):
    """Get a list of popular reviews.

    Popularity is determined by the 'popularity' of a particular review, which is
    the difference between positive and negative votes. Only votes from the last
    month are used to calculate popularity, to make results more varied.

    Args:
        limit (int): Maximum number of reviews to return.

    Returns:
        Randomized list of popular reviews which are converted into
        dictionaries using to_dict method.
    """
    cache_key = cache.gen_key("popular_reviews", limit)
    reviews = cache.get(cache_key, REVIEW_CACHE_NAMESPACE)
    defined_limit = 4 * limit if limit else None

    if not reviews:
        with db.engine.connect() as connection:
            results = connection.execute(sqlalchemy.text("""
                SELECT review.id,
                       review.entity_id,
                       review.entity_type,
                       review.user_id,
                       review.edits,
                       review.is_draft,
                       review.is_hidden,
                       review.license_id,
                       review.language,
                       review.source,
                       review.source_url,
                       SUM(
                           CASE WHEN vote = 't' THEN 1
                                WHEN vote = 'f' THEN -1
                                WHEN vote IS NULL THEN 0
                           END
                       ) AS popularity,
                       latest_revision.id AS latest_revision_id,
                       latest_revision.timestamp AS latest_revision_timestamp,
                       latest_revision.text AS text,
                       latest_revision.rating AS rating
                  FROM review
                  JOIN revision ON revision.review_id = review.id
             LEFT JOIN (
                        SELECT revision_id, vote
                          FROM vote
                         WHERE rated_at > :last_month
                       ) AS votes_last_month
                    ON votes_last_month.revision_id = revision.id
                  JOIN (
                        revision JOIN (
                            SELECT review.id AS review_uuid,
                                   MAX(timestamp) AS latest_timestamp
                              FROM review
                              JOIN revision ON review.id = review_id
                          GROUP BY review.id
                        ) AS latest
                          ON latest.review_uuid = revision.review_id
                         AND latest.latest_timestamp = revision.timestamp
                       ) AS latest_revision
                    ON review.id = latest_revision.review_id
                 WHERE entity_id IN (
                        SELECT DISTINCT entity_id
                          FROM (
                                SELECT entity_id
                                  FROM review
                              ORDER BY RANDOM()
                               ) AS randomized_entity_ids
                       )
                   AND latest_revision.text IS NOT NULL
                   AND review.is_hidden = 'f'
                   AND review.is_draft = 'f'
              GROUP BY review.id, latest_revision.id
              ORDER BY popularity
                 LIMIT :limit
                """), {
                "limit": defined_limit,
                "last_month": datetime.now() - timedelta(weeks=4)
            })
            reviews = results.fetchall()

        reviews = [dict(review) for review in reviews]
        if reviews:
            for review in reviews:
                review["rating"] = RATING_SCALE_1_5.get(review["rating"])
                review["last_revision"] = {
                    "id": review.pop("latest_revision_id"),
                    "timestamp": review.pop("latest_revision_timestamp"),
                    "text": review["text"],
                    "rating": review["rating"],
                    "review_id": review["id"],
                }
            reviews = [to_dict(review, confidential=True) for review in reviews]
        cache.set(cache_key, reviews, 1 * 60 * 60, REVIEW_CACHE_NAMESPACE)  # 1 hour

    shuffle(reviews)
    return reviews[:limit]
def test_expire(self):
    cache.set("a", 1, time=100)
    self.assertEqual(cache.expire("a", 1), True)
    sleep(1.1)
    self.assertEqual(cache.get("a"), None)
def add_legacy_listens_to_queue(self):
    """Fetch more legacy listens from the listens table by doing a left join
       on the matched listens, finding the next chunk of legacy listens to look up.
       Listens are added to the queue with a low priority."""

    # Find listens that have no entry in the mapping yet.
    legacy_query = """SELECT data->'track_metadata'->'additional_info'->>'recording_msid'::TEXT AS recording_msid
                        FROM listen
                   LEFT JOIN mbid_mapping m
                          ON data->'track_metadata'->'additional_info'->>'recording_msid' = m.recording_msid::text
                       WHERE m.recording_msid IS NULL
                         AND listened_at <= :max_ts
                         AND listened_at > :min_ts"""

    # Find mapping rows that need to be rechecked
    recheck_query = """SELECT recording_msid
                         FROM mbid_mapping
                        WHERE last_updated = '1970-01-01'
                        LIMIT %d""" % RECHECK_BATCH_SIZE

    # Check to see where we need to pick up from, or start new
    if not self.legacy_listens_index_date:
        dt = cache.get(LEGACY_LISTENS_INDEX_DATE_CACHE_KEY, decode=False) or b""
        try:
            self.legacy_listens_index_date = int(
                datetime.datetime.strptime(str(dt, "utf-8"), "%Y-%m-%d").timestamp())
            self.app.logger.info("Loaded date index from cache: %d %s" %
                                 (self.legacy_listens_index_date, str(dt)))
        except ValueError:
            self.legacy_listens_index_date = int(
                datetime.datetime.now().timestamp())
            self.app.logger.info("Use date index now()")

    # Check to see if we're done
    if self.legacy_listens_index_date < DATA_START_YEAR_IN_SECONDS - LEGACY_LISTENS_LOAD_WINDOW:
        self.app.logger.info(
            "Finished looking up all legacy listens! Wooo!")
        self.legacy_next_run = monotonic() + UNMATCHED_LISTENS_COMPLETED_TIMEOUT
        self.legacy_listens_index_date = int(
            datetime.datetime.now().timestamp())
        self.num_legacy_listens_loaded = 0
        dt = datetime.datetime.fromtimestamp(self.legacy_listens_index_date)
        cache.set(LEGACY_LISTENS_INDEX_DATE_CACHE_KEY,
                  dt.strftime("%Y-%m-%d"), expirein=0, encode=False)
        return

    # Check to see if any listens have been marked for re-check
    count = self.fetch_and_queue_listens(recheck_query, {})
    if count > 0:
        self.app.logger.info("Loaded %d listens to be rechecked." % count)
        return
    else:
        # If none, check for old legacy listens
        count = self.fetch_and_queue_listens(legacy_query, {
            "max_ts": self.legacy_listens_index_date,
            "min_ts": self.legacy_listens_index_date - LEGACY_LISTENS_LOAD_WINDOW
        })
        self.app.logger.info(
            "Loaded %s more legacy listens for %s" %
            (count, datetime.datetime.fromtimestamp(
                self.legacy_listens_index_date).strftime("%Y-%m-%d")))

    # update cache entry and count
    self.legacy_listens_index_date -= LEGACY_LISTENS_LOAD_WINDOW
    dt = datetime.datetime.fromtimestamp(self.legacy_listens_index_date)
    cache.set(LEGACY_LISTENS_INDEX_DATE_CACHE_KEY,
              dt.strftime("%Y-%m-%d"), expirein=0, encode=False)
    self.num_legacy_listens_loaded = count
def test_no_init(self):
    cache._r = None
    with self.assertRaises(RuntimeError):
        cache.set("test", "testing")
    with self.assertRaises(RuntimeError):
        cache.get("test")
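# The test above shows that cache.set/get raise RuntimeError until the cache module has
# been initialised. A minimal sketch of the initialisation step, mirroring the
# init_cache(...) call used in recalculate_all_user_data below; the config attribute
# names are taken from that snippet and may differ in other deployments.
init_cache(host=config.REDIS_HOST,
           port=config.REDIS_PORT,
           namespace=config.REDIS_NAMESPACE)
# After this call, cache.set / cache.get / cache.delete work as in the tests above.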
def recalculate_all_user_data():

    timescale.init_db_connection(config.SQLALCHEMY_TIMESCALE_URI)
    db.init_db_connection(config.SQLALCHEMY_DATABASE_URI)
    init_cache(host=config.REDIS_HOST, port=config.REDIS_PORT,
               namespace=config.REDIS_NAMESPACE)

    # Find the created timestamp of the last listen
    query = "SELECT max(created) FROM listen WHERE created > :date"
    try:
        with timescale.engine.connect() as connection:
            result = connection.execute(sqlalchemy.text(query),
                                        date=datetime.now() - timedelta(weeks=4))
            row = result.fetchone()
            last_created_ts = row[0]
    except psycopg2.OperationalError as e:
        logger.error("Cannot query ts to fetch latest listen: %s" % str(e), exc_info=True)
        raise

    logger.info("Last created timestamp: " + str(last_created_ts))

    # Select a list of users
    user_list = []
    query = 'SELECT musicbrainz_id FROM "user"'
    try:
        with db.engine.connect() as connection:
            result = connection.execute(sqlalchemy.text(query))
            for row in result:
                user_list.append(row[0])
    except psycopg2.OperationalError as e:
        logger.error("Cannot query db to fetch user list: %s" % str(e), exc_info=True)
        raise

    logger.info("Fetched %d users. Setting empty cache entries." % len(user_list))

    # Reset the timestamps and listen counts to 0 for all users
    for user_name in user_list:
        cache.set(REDIS_USER_LISTEN_COUNT + user_name, 0, expirein=0, encode=False)
        cache.set(REDIS_USER_TIMESTAMPS + user_name, "0,0", expirein=0)

    # Tabulate all of the listen counts/timestamps for all users
    logger.info("Scan the whole listen table...")
    listen_counts = defaultdict(int)
    user_timestamps = {}
    query = "SELECT listened_at, user_name FROM listen WHERE created <= :ts"
    try:
        with timescale.engine.connect() as connection:
            result = connection.execute(sqlalchemy.text(query), ts=last_created_ts)
            for row in result:
                ts = row[0]
                user_name = row[1]
                if user_name not in user_timestamps:
                    user_timestamps[user_name] = [ts, ts]
                else:
                    if ts > user_timestamps[user_name][1]:
                        user_timestamps[user_name][1] = ts
                    if ts < user_timestamps[user_name][0]:
                        user_timestamps[user_name][0] = ts

                listen_counts[user_name] += 1

    except psycopg2.OperationalError as e:
        logger.error("Cannot query db to fetch listen counts: %s" % str(e), exc_info=True)
        raise

    logger.info("Setting updated cache entries.")
    # Set the timestamps and listen counts for all users
    for user_name in user_list:
        try:
            cache.increment(REDIS_USER_LISTEN_COUNT + user_name, amount=listen_counts[user_name])
        except KeyError:
            pass

        try:
            tss = cache.get(REDIS_USER_TIMESTAMPS + user_name)
            (min_ts, max_ts) = tss.split(",")
            min_ts = int(min_ts)
            max_ts = int(max_ts)
            if min_ts and min_ts < user_timestamps[user_name][0]:
                user_timestamps[user_name][0] = min_ts
            if max_ts and max_ts > user_timestamps[user_name][1]:
                user_timestamps[user_name][1] = max_ts
            cache.set(REDIS_USER_TIMESTAMPS + user_name,
                      "%d,%d" % (user_timestamps[user_name][0], user_timestamps[user_name][1]),
                      expirein=0)
        except KeyError:
            pass
def test_delete(self):
    key = "testing"
    self.assertTrue(cache.set(key, u"Пример"))
    self.assertEqual(cache.get(key), u"Пример")
    self.assertEqual(cache.delete(key), 1)
    self.assertIsNone(cache.get(key))
def review_list_handler():
    """Get list of reviews.

    **Request Example:**

    .. code-block:: bash

        $ curl "https://critiquebrainz.org/ws/1/review/?limit=1&offset=50" \\
               -X GET

    **Response Example:**

    .. code-block:: json

        {
          "count": 9197,
          "limit": 1,
          "offset": 50,
          "reviews": [
            {
              "created": "Fri, 16 May 2008 00:00:00 GMT",
              "edits": 0,
              "entity_id": "09259937-6477-3959-8b10-af1cbaea8e6e",
              "entity_type": "release_group",
              "id": "c807d0b4-0dd0-43fe-a7c4-d29bb61f389e",
              "language": "en",
              "last_updated": "Fri, 16 May 2008 00:00:00 GMT",
              "license": {
                "full_name": "Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported",
                "id": "CC BY-NC-SA 3.0",
                "info_url": "https://creativecommons.org/licenses/by-nc-sa/3.0/"
              },
              "popularity": 0,
              "source": "BBC",
              "source_url": "http://www.bbc.co.uk/music/reviews/vh54",
              "text": "TEXT CONTENT OF REVIEW",
              "rating": 5,
              "user": {
                "created": "Wed, 07 May 2014 16:20:47 GMT",
                "display_name": "Jenny Nelson",
                "id": "3bf3fe0c-6db2-4746-bcf1-f39912113852",
                "karma": 0,
                "user_type": "Noob"
              },
              "votes": {
                "positive": 0,
                "negative": 0
              }
            }
          ]
        }

    :json uuid entity_id: UUID of the release group that is being reviewed
    :json string entity_type: One of the supported reviewable entities. 'release_group' or 'event' etc. **(optional)**

    :query user_id: user's UUID **(optional)**
    :query sort: ``popularity`` or ``published_on`` **(optional)**
    :query limit: results limit, min is 1, max is 50, default is 50 **(optional)**
    :query offset: result offset, default is 0 **(optional)**
    :query language: language code (ISO 639-1) **(optional)**

    :resheader Content-Type: *application/json*
    """
    # TODO: This checking is added to keep old clients working and needs to be removed.
    release_group = Parser.uuid('uri', 'release_group', optional=True)
    if release_group:
        entity_id = release_group
        entity_type = 'release_group'
    else:
        entity_id = Parser.uuid('uri', 'entity_id', optional=True)
        entity_type = Parser.string('uri', 'entity_type', valid_values=ENTITY_TYPES, optional=True)

    user_id = Parser.uuid('uri', 'user_id', optional=True)
    # TODO: "rating" sort value is deprecated and needs to be removed.
    sort = Parser.string('uri', 'sort', valid_values=['popularity', 'published_on', 'rating'], optional=True)
    if sort == 'rating':
        sort = 'popularity'
    limit = Parser.int('uri', 'limit', min=1, max=50, optional=True) or 50
    offset = Parser.int('uri', 'offset', optional=True) or 0
    language = Parser.string('uri', 'language', min=2, max=3, optional=True)
    if language and language not in supported_languages:
        raise InvalidRequest(desc='Unsupported language')

    # TODO(roman): Ideally caching logic should live inside the model. Otherwise it
    # becomes hard to track all this stuff.
    cache_key = cache.gen_key('list', entity_id, user_id, sort, limit, offset, language)
    cached_result = cache.get(cache_key, REVIEW_CACHE_NAMESPACE)
    if cached_result:
        reviews = cached_result['reviews']
        count = cached_result['count']
    else:
        reviews, count = db_review.list_reviews(
            entity_id=entity_id,
            entity_type=entity_type,
            user_id=user_id,
            sort=sort,
            limit=limit,
            offset=offset,
            language=language,
        )
        reviews = [db_review.to_dict(p) for p in reviews]
        cache.set(cache_key, {
            'reviews': reviews,
            'count': count,
        }, namespace=REVIEW_CACHE_NAMESPACE)

    return jsonify(limit=limit, offset=offset, count=count, reviews=reviews)
def test_single(self):
    self.assertTrue(cache.set("test2", "Hello!"))
    self.assertEqual(cache.get("test2"), "Hello!")
def test_single_fancy(self):
    self.assertTrue(cache.set("test3", u"Привет!"))
    self.assertEqual(cache.get("test3"), u"Привет!")
def review_list_handler():
    """Get list of reviews.

    **Request Example:**

    .. code-block:: bash

        $ curl "https://critiquebrainz.org/ws/1/review/?limit=1&offset=50" \\
               -X GET

    **Response Example:**

    .. code-block:: json

        {
          "count": 9197,
          "limit": 1,
          "offset": 50,
          "reviews": [
            {
              "created": "Fri, 16 May 2008 00:00:00 GMT",
              "edits": 0,
              "entity_id": "09259937-6477-3959-8b10-af1cbaea8e6e",
              "entity_type": "release_group",
              "id": "c807d0b4-0dd0-43fe-a7c4-d29bb61f389e",
              "language": "en",
              "last_updated": "Fri, 16 May 2008 00:00:00 GMT",
              "license": {
                "full_name": "Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported",
                "id": "CC BY-NC-SA 3.0",
                "info_url": "https://creativecommons.org/licenses/by-nc-sa/3.0/"
              },
              "popularity": 0,
              "source": "BBC",
              "source_url": "http://www.bbc.co.uk/music/reviews/vh54",
              "text": "TEXT CONTENT OF REVIEW",
              "rating": 5,
              "user": {
                "created": "Wed, 07 May 2014 16:20:47 GMT",
                "display_name": "Jenny Nelson",
                "id": "3bf3fe0c-6db2-4746-bcf1-f39912113852",
                "karma": 0,
                "user_type": "Noob"
              },
              "votes": {
                "positive": 0,
                "negative": 0
              }
            }
          ]
        }

    :json uuid entity_id: UUID of the release group that is being reviewed
    :json string entity_type: One of the supported reviewable entities. 'release_group' or 'event' etc. **(optional)**

    :query user_id: user's UUID **(optional)**
    :query sort: ``popularity`` or ``published_on`` **(optional)**
    :query limit: results limit, min is 1, max is 50, default is 50 **(optional)**
    :query offset: result offset, default is 0 **(optional)**
    :query language: language code (ISO 639-1) **(optional)**

    :resheader Content-Type: *application/json*
    """
    # TODO: This checking is added to keep old clients working and needs to be removed.
    release_group = Parser.uuid('uri', 'release_group', optional=True)
    if release_group:
        entity_id = release_group
        entity_type = 'release_group'
    else:
        entity_id = Parser.uuid('uri', 'entity_id', optional=True)
        entity_type = Parser.string('uri', 'entity_type', valid_values=ENTITY_TYPES, optional=True)

    user_id = Parser.uuid('uri', 'user_id', optional=True)
    sort = Parser.string(
        'uri', 'sort', valid_values=['popularity', 'published_on', 'rating', 'created'], optional=True)
    # "rating" and "created" sort values are deprecated but allowed here for backward compatibility
    if sort == 'created':
        sort = 'published_on'
    if sort == 'rating':
        sort = 'popularity'
    limit = Parser.int('uri', 'limit', min=1, max=50, optional=True) or 50
    offset = Parser.int('uri', 'offset', optional=True) or 0
    language = Parser.string('uri', 'language', min=2, max=3, optional=True)
    if language and language not in supported_languages:
        raise InvalidRequest(desc='Unsupported language')

    # TODO(roman): Ideally caching logic should live inside the model. Otherwise it
    # becomes hard to track all this stuff.
    cache_key = cache.gen_key('list', entity_id, user_id, sort, limit, offset, language)
    cached_result = cache.get(cache_key, REVIEW_CACHE_NAMESPACE)
    if cached_result:
        reviews = cached_result['reviews']
        count = cached_result['count']
    else:
        reviews, count = db_review.list_reviews(
            entity_id=entity_id,
            entity_type=entity_type,
            user_id=user_id,
            sort=sort,
            limit=limit,
            offset=offset,
            language=language,
        )
        reviews = [db_review.to_dict(p) for p in reviews]
        cache.set(cache_key, {
            'reviews': reviews,
            'count': count,
        }, namespace=REVIEW_CACHE_NAMESPACE)

    return jsonify(limit=limit, offset=offset, count=count, reviews=reviews)
def test_expireat(self):
    cache.set("a", 1, time=100)
    self.assertEqual(cache.expireat("a", int(time() + 1)), True)
    sleep(1.1)
    self.assertEqual(cache.get("a"), None)
def get_popular(limit=None):
    """Get a list of popular reviews.

    Popularity is determined by the 'popularity' of a particular review, which is
    the difference between positive and negative votes. Only votes from the last
    month are used to calculate popularity, to make results more varied.

    Args:
        limit (int): Maximum number of reviews to return.

    Returns:
        Randomized list of popular reviews which are converted into
        dictionaries using to_dict method.
    """
    cache_key = cache.gen_key("popular_reviews", limit)
    reviews = cache.get(cache_key, REVIEW_CACHE_NAMESPACE)
    defined_limit = 4 * limit if limit else None

    if not reviews:
        with db.engine.connect() as connection:
            results = connection.execute(
                sqlalchemy.text("""
                    SELECT review.id,
                           review.entity_id,
                           review.entity_type,
                           review.user_id,
                           review.edits,
                           review.is_draft,
                           review.is_hidden,
                           review.license_id,
                           review.language,
                           review.source,
                           review.source_url,
                           SUM(
                               CASE WHEN vote = 't' THEN 1
                                    WHEN vote = 'f' THEN -1
                                    WHEN vote IS NULL THEN 0
                               END
                           ) AS popularity,
                           latest_revision.id AS latest_revision_id,
                           latest_revision.timestamp AS latest_revision_timestamp,
                           latest_revision.text AS text,
                           latest_revision.rating AS rating
                      FROM review
                      JOIN revision ON revision.review_id = review.id
                 LEFT JOIN (
                            SELECT revision_id, vote
                              FROM vote
                             WHERE rated_at > :last_month
                           ) AS votes_last_month
                        ON votes_last_month.revision_id = revision.id
                      JOIN (
                            revision JOIN (
                                SELECT review.id AS review_uuid,
                                       MAX(timestamp) AS latest_timestamp
                                  FROM review
                                  JOIN revision ON review.id = review_id
                              GROUP BY review.id
                            ) AS latest
                              ON latest.review_uuid = revision.review_id
                             AND latest.latest_timestamp = revision.timestamp
                           ) AS latest_revision
                        ON review.id = latest_revision.review_id
                     WHERE entity_id IN (
                            SELECT DISTINCT entity_id
                              FROM (
                                    SELECT entity_id
                                      FROM review
                                  ORDER BY RANDOM()
                                   ) AS randomized_entity_ids
                           )
                       AND latest_revision.text IS NOT NULL
                  GROUP BY review.id, latest_revision.id
                  ORDER BY popularity
                     LIMIT :limit
                    """), {
                    "limit": defined_limit,
                    "last_month": datetime.now() - timedelta(weeks=4)
                })
            reviews = results.fetchall()

        reviews = [dict(review) for review in reviews]
        if reviews:
            for review in reviews:
                review["rating"] = RATING_SCALE_1_5.get(review["rating"])
                review["last_revision"] = {
                    "id": review.pop("latest_revision_id"),
                    "timestamp": review.pop("latest_revision_timestamp"),
                    "text": review["text"],
                    "rating": review["rating"],
                    "review_id": review["id"],
                }
            reviews = [
                to_dict(review, confidential=True) for review in reviews
            ]
        cache.set(cache_key, reviews, 1 * 60 * 60, REVIEW_CACHE_NAMESPACE)  # 1 hour

    shuffle(reviews)
    return reviews[:limit]