def process_all_spotify_users():
    """ Get a batch of users to be processed and import their Spotify plays.

    Returns:
        (success, failure) where
            success: the number of users whose plays were successfully imported.
            failure: the number of users for whom we faced errors while importing.
    """

    global _listens_imported_since_start, _metric_submission_time

    service = SpotifyService()
    try:
        users = service.get_active_users_to_process()
    except DatabaseException as e:
        current_app.logger.error('Cannot get list of users due to error %s',
                                 str(e),
                                 exc_info=True)
        return 0, 0

    if not users:
        return 0, 0

    current_app.logger.info('Processing %d users...', len(users))
    success = 0
    failure = 0
    for u in users:
        try:
            _listens_imported_since_start += process_one_user(u, service)
            success += 1
        except ExternalServiceError as e:
            current_app.logger.critical(
                'spotify_reader could not import listens: %s',
                str(e),
                exc_info=True)
            failure += 1
        except Exception as e:
            current_app.logger.critical(
                'spotify_reader: unexpected error while importing listens: %s',
                str(e),
                exc_info=True)
            failure += 1

    if time.monotonic() > _metric_submission_time:
        _metric_submission_time += METRIC_UPDATE_INTERVAL
        metrics.set("spotify_reader",
                    imported_listens=_listens_imported_since_start)

    current_app.logger.info('Processed %d users successfully!', success)
    current_app.logger.info('Encountered errors while processing %d users.',
                            failure)
    return success, failure
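
The function above relies on module-level counters and a throttle timestamp that are defined elsewhere in the module. A minimal sketch of how that state could be initialized; the interval value here is an assumption, not the project's actual setting:

import time

METRIC_UPDATE_INTERVAL = 60  # seconds between metric submissions (assumed value)

# running total of listens imported since the process started
_listens_imported_since_start = 0
# next monotonic-clock time at which metrics should be submitted
_metric_submission_time = time.monotonic() + METRIC_UPDATE_INTERVAL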
Example 2
    def test_set(self, rpush):
        # rpush is a mocked redis rpush call injected by the test (e.g. via a patch decorator)
        metrics.init('listenbrainz.org')
        os.environ["PRIVATE_IP"] = "127.0.0.1"
        metrics.set("my_metric",
                    timestamp=1619629462352960742,
                    test_i=2,
                    test_fl=.3,
                    test_t=True,
                    test_f=False,
                    test_s="gobble")
        rpush.assert_called_with(
            metrics.REDIS_METRICS_KEY,
            'my_metric,dc=hetzner,server=127.0.0.1,project=listenbrainz.org test_i=2i,test_fl=0.300000,test_t=t,test_f=f,test_s="gobble" 1619629462352960742'
        )
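
The string asserted above follows the InfluxDB line protocol. A minimal sketch (hypothetical helper, not the actual metrics module) of how each field value maps to that encoding:

def _encode_field(value):
    """Hypothetical helper showing the field-value encoding seen in the asserted string."""
    if isinstance(value, bool):      # check before int: bool is a subclass of int
        return "t" if value else "f"
    if isinstance(value, int):
        return "%di" % value         # integers carry an "i" suffix, e.g. test_i=2i
    if isinstance(value, float):
        return "%f" % value          # floats are rendered with six decimals, e.g. 0.300000
    return '"%s"' % value            # strings are double-quoted

# _encode_field(2) == "2i"; _encode_field(.3) == "0.300000"
# _encode_field(True) == "t"; _encode_field("gobble") == '"gobble"'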
Example 3
    def update_metrics(self, stats):
        """ Calculate stats and print status to stdout and report metrics."""

        if stats["total"] != 0:
            if self.last_processed:
                listens_per_sec = int(
                    (stats["processed"] - self.last_processed) / UPDATE_INTERVAL)
            else:
                listens_per_sec = 0
            self.last_processed = stats["processed"]

            percent = (stats["exact_match"] + stats["high_quality"] + stats["med_quality"] +
                       stats["low_quality"]) / stats["total"] * 100.00
            self.app.logger.info("loaded %d processed %d matched %d not %d legacy: %d queue: %d %d l/s" %
                                 (stats["total"], stats["processed"], stats["exact_match"] + stats["high_quality"] +
                                  stats["med_quality"] +
                                  stats["low_quality"], stats["no_match"],
                                     stats["legacy"], self.queue.qsize(), listens_per_sec))

            metrics.set("listenbrainz-mbid-mapping-writer",
                        total_match_p=percent,
                        exact_match_p=stats["exact_match"] /
                        stats["total"] * 100.00,
                        high_quality_p=stats["high_quality"] /
                        stats["total"] * 100.00,
                        med_quality_p=stats["med_quality"] /
                        stats["total"] * 100.00,
                        low_quality_p=stats["low_quality"] /
                        stats["total"] * 100.00,
                        no_match_p=stats["no_match"] / stats["total"] * 100.00,
                        errors_p=stats["errors"] / stats["total"] * 100.00,
                        total_listens=stats["total"],
                        total_processed=stats["processed"],
                        exact_match=stats["exact_match"],
                        high_quality=stats["high_quality"],
                        med_quality=stats["med_quality"],
                        low_quality=stats["low_quality"],
                        no_match=stats["no_match"],
                        errors=stats["errors"],
                        legacy=stats["legacy"],
                        legacy_match=stats["legacy_match"],
                        qsize=self.queue.qsize(),
                        listens_per_sec=listens_per_sec,
                        legacy_index_date=datetime.date.fromtimestamp(self.legacy_listens_index_date).strftime("%Y-%m-%d"))
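
For illustration, a stats dictionary carrying the keys update_metrics() reads; the numbers below are made up:

example_stats = {
    "total": 1000, "processed": 800,
    "exact_match": 500, "high_quality": 150, "med_quality": 80, "low_quality": 40,
    "no_match": 30, "errors": 0, "legacy": 10, "legacy_match": 5,
}
# writer.update_metrics(example_stats)   # 'writer' is a hypothetical instance of the class above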
Example 4
def main():
    app = listenbrainz.webserver.create_app()
    with app.app_context():
        current_app.logger.info('Spotify Reader started...')
        while True:
            t = time.monotonic()
            success, failure = process_all_spotify_users()
            total_users = success + failure
            if total_users > 0:
                total_time = time.monotonic() - t
                avg_time = total_time / total_users
                metrics.set("spotify_reader",
                            users_processed=total_users,
                            time_to_process_all_users=total_time,
                            time_to_process_one_user=avg_time)
                current_app.logger.info(
                    'All %d users in batch have been processed.', total_users)
                current_app.logger.info(
                    'Total time taken: %.2f s, average time per user: %.2f s.',
                    total_time, avg_time)
            time.sleep(10)
Example 5
def compare_coverart(mb_query, lb_query, mb_caa_index, lb_caa_index, mb_compare_key, lb_compare_key):
    """ The core cover art comparison function. Given two sets of queries, index values, and 
        comparison keys this function can perform a complete sync as well as an incremental update.

        The queries must fetch chunks of data from the MB and LB tables ordered by
        the corresponding compare key. The starting indexes (the current comparison index
        into the data) must be provided and match the type of the comparison keys. """

    with psycopg2.connect(config.MBID_MAPPING_DATABASE_URI) as mb_conn:
        with mb_conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as mb_curs:
            with psycopg2.connect(config.SQLALCHEMY_DATABASE_URI) as lb_conn:
                with lb_conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as lb_curs:

                    mb_count, lb_count = get_cover_art_counts(mb_curs, lb_curs)
                    log("CAA count: %d\n LB count: %d" % (mb_count, lb_count))

                    threads = []
                    mb_row = None
                    lb_row = None

                    mb_rows = []
                    lb_rows = []

                    mb_done = False
                    lb_done = lb_query is None

                    extra = 0
                    missing = 0
                    processed = 0

                    while True:
                        if len(mb_rows) == 0 and not mb_done:
                            mb_curs.execute(
                                mb_query, (mb_caa_index, SYNC_BATCH_SIZE))
                            mb_rows = mb_curs.fetchall()
                            if len(mb_rows) == 0:
                                mb_done = True

                        if len(lb_rows) == 0 and not lb_done:
                            lb_curs.execute(
                                lb_query, (lb_caa_index, SYNC_BATCH_SIZE))
                            lb_rows = lb_curs.fetchall()
                            if len(lb_rows) == 0:
                                lb_done = True

                        if not mb_row and len(mb_rows) > 0:
                            mb_row = mb_rows.pop(0)

                        if not lb_row and len(lb_rows) > 0:
                            lb_row = lb_rows.pop(0)

                        if not lb_row and not mb_row:
                            break

                        processed += 1
                        if processed % 100000 == 0:
                            log("processed %d of %d: missing %d extra %d" %
                                (processed, mb_count, missing, extra))

                        # If the item is in MB, but not in LB, add to LB
                        if lb_row is None or (mb_row is not None and mb_row[mb_compare_key] < lb_row[lb_compare_key]):
                            process_cover_art(threads, mb_row)
                            missing += 1
                            mb_caa_index = mb_row[mb_compare_key]
                            mb_row = None
                            continue

                        # If the item is in LB, but not in MB, remove from LB
                        if mb_row is None or mb_row[mb_compare_key] > lb_row[lb_compare_key]:
                            extra += 1
                            delete_from_lb(lb_row[lb_compare_key])
                            lb_caa_index = lb_row[lb_compare_key]
                            lb_row = None
                            continue

                        # If the caa_id is present in both, skip both
                        if mb_row[mb_compare_key] == lb_row[lb_compare_key]:
                            mb_caa_index = mb_row[mb_compare_key]
                            lb_caa_index = lb_row[lb_compare_key]
                            lb_row = None
                            mb_row = None
                            continue

                        assert False

                    join_threads(threads)
                    log( "Finished! added/skipped %d removed %d from release_color" % (missing, extra))

                    mb_count, lb_count = get_cover_art_counts(mb_curs, lb_curs)
                    log("CAA count: %d\n LB count: %d" % (mb_count, lb_count))

                    metrics.init("listenbrainz")
                    metrics.set("listenbrainz-caa-mapper",
                                caa_front_count=mb_count, lb_caa_count=lb_count)
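
The comparison loop above is a two-pointer merge over two streams sorted by their compare keys: a key present only on the MB side is missing from LB, a key present only on the LB side is extra, and equal keys are skipped. A self-contained sketch of the same decision logic over plain lists of keys (illustrative only, not the database-backed version above):

def merge_compare(mb_keys, lb_keys):
    missing, extra = [], []
    i = j = 0
    while i < len(mb_keys) or j < len(lb_keys):
        if j == len(lb_keys) or (i < len(mb_keys) and mb_keys[i] < lb_keys[j]):
            missing.append(mb_keys[i])   # in MB only -> would be added to LB
            i += 1
        elif i == len(mb_keys) or mb_keys[i] > lb_keys[j]:
            extra.append(lb_keys[j])     # in LB only -> would be removed from LB
            j += 1
        else:
            i += 1                       # present on both sides -> skip
            j += 1
    return missing, extra

# merge_compare([1, 2, 4, 5], [2, 3, 5]) == ([1, 4], [3])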
Example 6
    def insert_to_listenstore(self, data):
        """
        Inserts a batch of listens into the ListenStore. Timescale reports back which
        rows were actually inserted into the DB, allowing us to send those down the
        unique queue.

        Args:
            data: the batch of listens to be inserted into the ListenStore

        Returns: the number of listens successfully sent, or LISTEN_INSERT_ERROR_SENTINEL
        if there was an error while inserting listens
        """

        if not data:
            return 0

        self.incoming_listens += len(data)
        try:
            rows_inserted = self.ls.insert(data)
        except psycopg2.OperationalError as err:
            current_app.logger.error(
                "Cannot write data to listenstore: %s. Sleep." % str(err),
                exc_info=True)
            sleep(self.ERROR_RETRY_DELAY)
            return LISTEN_INSERT_ERROR_SENTINEL

        if not rows_inserted:
            return len(data)

        try:
            self.redis_listenstore.increment_listen_count_for_day(
                day=datetime.utcnow(), count=len(rows_inserted))
        except Exception:
            # Not critical, so if this errors out, just log it to Sentry and move forward
            current_app.logger.error(
                "Could not update listen count per day in redis",
                exc_info=True)

        unique = []
        inserted_index = {}
        for inserted in rows_inserted:
            inserted_index['%d-%s-%s' %
                           (inserted[0], inserted[1], inserted[2])] = 1

        for listen in data:
            k = '%d-%s-%s' % (listen.ts_since_epoch, listen.data['track_name'],
                              listen.user_name)
            if k in inserted_index:
                unique.append(listen)

        if not unique:
            return len(data)

        while True:
            try:
                self.unique_ch.basic_publish(
                    exchange=current_app.config['UNIQUE_EXCHANGE'],
                    routing_key='',
                    body=ujson.dumps(unique),
                    properties=pika.BasicProperties(delivery_mode=2, ),
                )
                break
            except pika.exceptions.ConnectionClosed:
                self.connect_to_rabbitmq()

        self.redis_listenstore.update_recent_listens(unique)
        self.unique_listens += len(unique)

        if monotonic() > self.metric_submission_time:
            self.metric_submission_time += METRIC_UPDATE_INTERVAL
            metrics.set("timescale_writer",
                        incoming_listens=self.incoming_listens,
                        unique_listens=self.unique_listens)
            self.incoming_listens = 0
            self.unique_listens = 0

        return len(data)
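
As the docstring says, only rows that the database confirms as inserted are forwarded to the unique queue. A minimal sketch of that filtering step in isolation (hypothetical helper; the attribute names mirror the Listen objects used above):

def filter_unique(data, rows_inserted):
    # rows_inserted holds (listened_at, track_name, user_name) tuples reported by the DB
    inserted_keys = {'%d-%s-%s' % (ts, track, user) for ts, track, user in rows_inserted}
    return [listen for listen in data
            if '%d-%s-%s' % (listen.ts_since_epoch, listen.data['track_name'],
                             listen.user_name) in inserted_keys]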