def process_all_spotify_users():
    """ Import Spotify plays for the current batch of active users.

    The users to process are fetched from the Spotify service and handled one
    at a time; an error for one user is logged and does not stop the others.
    The running total of imported listens is reported as the "spotify_reader"
    metric whenever the metric submission time has passed.

    Returns:
        (success, failure) where
            success: the number of users whose plays were imported without error.
            failure: the number of users for whom the import raised an error.
        (0, 0) is returned when the user list cannot be fetched or is empty.
    """
    global _listens_imported_since_start, _metric_submission_time

    service = SpotifyService()
    try:
        pending_users = service.get_active_users_to_process()
    except DatabaseException as err:
        current_app.logger.error('Cannot get list of users due to error %s', str(err), exc_info=True)
        return 0, 0

    if not pending_users:
        return 0, 0

    current_app.logger.info('Process %d users...' % len(pending_users))

    imported_ok, errored = 0, 0
    for user in pending_users:
        try:
            _listens_imported_since_start += process_one_user(user, service)
        except ExternalServiceError as err:
            current_app.logger.critical(
                'spotify_reader could not import listens: %s', str(err), exc_info=True)
            errored += 1
        except Exception as err:
            current_app.logger.critical(
                'spotify_reader could not import listens: %s', str(err), exc_info=True)
            errored += 1
        else:
            imported_ok += 1

        # Submit the running total once the submission deadline has passed,
        # then push the deadline forward by one interval.
        if time.monotonic() > _metric_submission_time:
            _metric_submission_time += METRIC_UPDATE_INTERVAL
            metrics.set("spotify_reader", imported_listens=_listens_imported_since_start)

    current_app.logger.info('Processed %d users successfully!', imported_ok)
    current_app.logger.info('Encountered errors while processing %d users.', errored)
    return imported_ok, errored
def test_set(self, rpush):
    """ Check the line that metrics.set() hands to rpush for a mixed-type metric.

    The metric carries an explicit timestamp plus int, float, bool and string
    fields, and the expected line includes the PRIVATE_IP set here as the server tag.
    """
    metrics.init('listenbrainz.org')
    os.environ["PRIVATE_IP"] = "127.0.0.1"

    fields = {
        "test_i": 2,
        "test_fl": 0.3,
        "test_t": True,
        "test_f": False,
        "test_s": "gobble",
    }
    metrics.set("my_metric", timestamp=1619629462352960742, **fields)

    expected_line = (
        'my_metric,dc=hetzner,server=127.0.0.1,project=listenbrainz.org '
        'test_i=2i,test_fl=0.300000,test_t=t,test_f=f,test_s="gobble" '
        '1619629462352960742'
    )
    rpush.assert_called_with(metrics.REDIS_METRICS_KEY, expected_line)
def update_metrics(self, stats):
    """ Log a status line and report the mapping statistics as metrics.

    Nothing is logged or reported while stats["total"] is 0.

    Args:
        stats: dict of counters. The keys read here are total, processed,
               exact_match, high_quality, med_quality, low_quality, no_match,
               errors, legacy and legacy_match.
    """
    if stats["total"] == 0:
        return

    total = stats["total"]

    # Throughput since the previous call; 0 until a previous value exists.
    previous = self.last_processed
    listens_per_sec = int((stats["processed"] - previous) / UPDATE_INTERVAL) if previous else 0
    self.last_processed = stats["processed"]

    matched = stats["exact_match"] + stats["high_quality"] + stats["med_quality"] + stats["low_quality"]
    percent = matched / total * 100.00

    status_line = "loaded %d processed %d matched %d not %d legacy: %d queue: %d %d l/s" % (
        total,
        stats["processed"],
        matched,
        stats["no_match"],
        stats["legacy"],
        self.queue.qsize(),
        listens_per_sec,
    )
    self.app.logger.info(status_line)

    def share(key):
        # Percentage of all loaded listens that fall into the given counter.
        return stats[key] / total * 100.00

    metrics.set("listenbrainz-mbid-mapping-writer",
                total_match_p=percent,
                exact_match_p=share("exact_match"),
                high_quality_p=share("high_quality"),
                med_quality_p=share("med_quality"),
                low_quality_p=share("low_quality"),
                no_match_p=share("no_match"),
                errors_p=share("errors"),
                total_listens=total,
                total_processed=stats["processed"],
                exact_match=stats["exact_match"],
                high_quality=stats["high_quality"],
                med_quality=stats["med_quality"],
                low_quality=stats["low_quality"],
                no_match=stats["no_match"],
                errors=stats["errors"],
                legacy=stats["legacy"],
                legacy_match=stats["legacy_match"],
                qsize=self.queue.qsize(),
                listens_per_sec=listens_per_sec,
                legacy_index_date=datetime.date.fromtimestamp(
                    self.legacy_listens_index_date).strftime("%Y-%m-%d"))
def main():
    """ Run the Spotify reader loop inside the web application's context.

    Each iteration processes one batch of users and, when at least one user
    was handled, reports and logs the batch timing. The loop then sleeps for
    10 seconds before starting the next batch. It never returns.
    """
    app = listenbrainz.webserver.create_app()
    with app.app_context():
        current_app.logger.info('Spotify Reader started...')
        while True:
            batch_started = time.monotonic()
            success, failure = process_all_spotify_users()
            total_users = success + failure

            if total_users > 0:
                total_time = time.monotonic() - batch_started
                avg_time = total_time / total_users
                timing = {
                    "users_processed": total_users,
                    "time_to_process_all_users": total_time,
                    "time_to_process_one_user": avg_time,
                }
                metrics.set("spotify_reader", **timing)
                current_app.logger.info(
                    'All %d users in batch have been processed.', total_users)
                current_app.logger.info(
                    'Total time taken: %.2f s, average time per user: %.2f s.',
                    total_time, avg_time)

            time.sleep(10)
def compare_coverart(mb_query, lb_query, mb_caa_index, lb_caa_index, mb_compare_key, lb_compare_key):
    """ The core cover art comparison function. Given two sets of queries, index values, and
        comparison keys this function can perform a complete sync as well as an incremental update.

        The queries must fetch chunks of data from the MB and LB tables ordered by
        the corresponding compare key. The starting indexes (the current comparison index
        into the data) must be provided and match the type of the comparison keys.

        Rows are walked in lockstep, merge style: a key found only in MB is handed to
        process_cover_art(), a key found only in LB is passed to delete_from_lb(), and a key
        present on both sides is skipped.

        Args:
            mb_query: query for a chunk of MB rows; executed with
                      (mb_caa_index, SYNC_BATCH_SIZE) as parameters.
            lb_query: query for a chunk of LB rows, executed with
                      (lb_caa_index, SYNC_BATCH_SIZE); if None, LB is never read and
                      every MB row is treated as missing from LB.
            mb_caa_index: starting comparison index for the MB query.
            lb_caa_index: starting comparison index for the LB query.
            mb_compare_key: column of an MB row used for comparison.
            lb_compare_key: column of an LB row used for comparison.
    """

    with psycopg2.connect(config.MBID_MAPPING_DATABASE_URI) as mb_conn:
        with mb_conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as mb_curs:
            with psycopg2.connect(config.SQLALCHEMY_DATABASE_URI) as lb_conn:
                with lb_conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as lb_curs:
                    mb_count, lb_count = get_cover_art_counts(mb_curs, lb_curs)
                    log("CAA count: %d\n LB count: %d" % (mb_count, lb_count))

                    threads = []
                    mb_row = None
                    lb_row = None

                    mb_rows = []
                    lb_rows = []

                    mb_done = False
                    lb_done = lb_query is None

                    extra = 0
                    missing = 0
                    processed = 0

                    while True:
                        # Refill whichever side has run out of buffered rows.
                        # NOTE(review): a refill can happen while a row from the previous
                        # batch is still held in mb_row/lb_row and the index has not yet
                        # advanced past it -- confirm the callers' queries cannot return
                        # that held row a second time.
                        if len(mb_rows) == 0 and not mb_done:
                            mb_curs.execute(
                                mb_query, (mb_caa_index, SYNC_BATCH_SIZE))
                            mb_rows = mb_curs.fetchall()
                            if len(mb_rows) == 0:
                                mb_done = True

                        if len(lb_rows) == 0 and not lb_done:
                            lb_curs.execute(
                                lb_query, (lb_caa_index, SYNC_BATCH_SIZE))
                            lb_rows = lb_curs.fetchall()
                            if len(lb_rows) == 0:
                                lb_done = True

                        if not mb_row and len(mb_rows) > 0:
                            mb_row = mb_rows.pop(0)

                        if not lb_row and len(lb_rows) > 0:
                            lb_row = lb_rows.pop(0)

                        # Both sides exhausted: the comparison is complete.
                        if not lb_row and not mb_row:
                            break

                        processed += 1
                        if processed % 100000 == 0:
                            log("processed %d of %d: missing %d extra %d" %
                                (processed, mb_count, missing, extra))

                        # If the item is in MB, but not in LB, add to LB.
                        # mb_row is None once MB is exhausted while LB still has rows;
                        # the guard lets those leftover LB rows reach the removal branch
                        # below instead of raising TypeError on None[...].
                        if lb_row is None or (
                                mb_row is not None
                                and mb_row[mb_compare_key] < lb_row[lb_compare_key]):
                            process_cover_art(threads, mb_row)
                            missing += 1
                            mb_caa_index = mb_row[mb_compare_key]
                            mb_row = None
                            continue

                        # If the item is in LB, but not in MB, remove from LB
                        if mb_row is None or mb_row[mb_compare_key] > lb_row[lb_compare_key]:
                            extra += 1
                            delete_from_lb(lb_row[lb_compare_key])
                            lb_caa_index = lb_row[lb_compare_key]
                            lb_row = None
                            continue

                        # If the caa_id is present in both, skip both
                        if mb_row[mb_compare_key] == lb_row[lb_compare_key]:
                            mb_caa_index = mb_row[mb_compare_key]
                            lb_caa_index = lb_row[lb_compare_key]
                            lb_row = None
                            mb_row = None
                            continue

                        # The keys compared as neither <, > nor ==. Raise explicitly
                        # (rather than `assert False`) so the check is not stripped
                        # under -O, which would leave the loop spinning on the same rows.
                        raise AssertionError("cover art compare keys are not comparable")

                    join_threads(threads)
                    log("Finished! added/skipped %d removed %d from release_color" % (missing, extra))

                    mb_count, lb_count = get_cover_art_counts(mb_curs, lb_curs)
                    log("CAA count: %d\n LB count: %d" % (mb_count, lb_count))

                    metrics.init("listenbrainz")
                    metrics.set("listenbrainz-caa-mapper",
                                caa_front_count=mb_count, lb_caa_count=lb_count)
def insert_to_listenstore(self, data):
    """ Insert a batch of listens into the ListenStore and publish the new ones.

    The store reports back which rows were actually inserted; the listens in
    `data` matching those rows are published to the unique exchange and
    pushed to the recent-listens list in redis. Incoming/unique counters are
    submitted as the "timescale_writer" metric once the submission time has
    passed, and are reset afterwards.

    Args:
        data: the listens to be inserted into the ListenStore

    Returns:
        0 if data is empty, LISTEN_INSERT_ERROR_SENTINEL if the insert raised
        psycopg2.OperationalError, otherwise the number of listens in data.
    """
    if not data:
        return 0

    batch_size = len(data)
    self.incoming_listens += batch_size

    try:
        rows_inserted = self.ls.insert(data)
    except psycopg2.OperationalError as err:
        current_app.logger.error(
            "Cannot write data to listenstore: %s. Sleep." % str(err), exc_info=True)
        sleep(self.ERROR_RETRY_DELAY)
        return LISTEN_INSERT_ERROR_SENTINEL

    if not rows_inserted:
        return len(data)

    try:
        self.redis_listenstore.increment_listen_count_for_day(
            day=datetime.utcnow(), count=len(rows_inserted))
    except Exception:
        # Not critical, so if this errors out, just log it to Sentry and move forward
        current_app.logger.error(
            "Could not update listen count per day in redis", exc_info=True)

    # Keys of the rows the store says it inserted, built from their first three columns.
    inserted_keys = {
        '%d-%s-%s' % (row[0], row[1], row[2])
        for row in rows_inserted
    }
    unique = [
        listen for listen in data
        if '%d-%s-%s' % (listen.ts_since_epoch, listen.data['track_name'], listen.user_name)
        in inserted_keys
    ]

    if not unique:
        return len(data)

    # Keep trying to publish, reconnecting whenever the connection was closed.
    published = False
    while not published:
        try:
            self.unique_ch.basic_publish(
                exchange=current_app.config['UNIQUE_EXCHANGE'],
                routing_key='',
                body=ujson.dumps(unique),
                properties=pika.BasicProperties(delivery_mode=2, ),
            )
            published = True
        except pika.exceptions.ConnectionClosed:
            self.connect_to_rabbitmq()

    self.redis_listenstore.update_recent_listens(unique)
    self.unique_listens += len(unique)

    if monotonic() > self.metric_submission_time:
        self.metric_submission_time += METRIC_UPDATE_INTERVAL
        metrics.set("timescale_writer",
                    incoming_listens=self.incoming_listens,
                    unique_listens=self.unique_listens)
        self.incoming_listens = 0
        self.unique_listens = 0

    return len(data)