def get_total_listen_count(self, cache_value=True):
        """ Returns the total number of listens stored in the ListenStore.
            First checks the brainzutils cache for the value, if not present there
            makes a query to the db and caches it in brainzutils cache.
        """

        if cache_value:
            count = cache.get(
                InfluxListenStore.REDIS_INFLUX_TOTAL_LISTEN_COUNT,
                decode=False)
            if count:
                return int(count)

        try:
            result = self.influx.query("""SELECT %s
                                            FROM "%s"
                                        ORDER BY time DESC
                                           LIMIT 1""" % (COUNT_MEASUREMENT_NAME, TIMELINE_COUNT_MEASUREMENT))
        except (InfluxDBServerError, InfluxDBClientError) as err:
            self.log.error("Cannot query influx: %s" % str(err), exc_info=True)
            raise

        try:
            item = result.get_points(
                measurement=TIMELINE_COUNT_MEASUREMENT).__next__()
            count = int(item[COUNT_MEASUREMENT_NAME])
            timestamp = convert_to_unix_timestamp(item['time'])
        except (KeyError, ValueError, StopIteration):
            timestamp = 0
            count = 0

        # Now sum counts that have been added in the interval we're interested in
        try:
            result = self.influx.query("""SELECT sum(%s) as total
                                            FROM "%s"
                                           WHERE time > %s""" % (COUNT_MEASUREMENT_NAME, TEMP_COUNT_MEASUREMENT, get_influx_query_timestamp(timestamp)))
        except (InfluxDBServerError, InfluxDBClientError) as err:
            self.log.error("Cannot query influx: %s" % str(err), exc_info=True)
            raise

        try:
            data = result.get_points(
                measurement=TEMP_COUNT_MEASUREMENT).__next__()
            count += int(data['total'])
        except StopIteration:
            pass

        if cache_value:
            cache.set(
                InfluxListenStore.REDIS_INFLUX_TOTAL_LISTEN_COUNT,
                int(count),
                InfluxListenStore.TOTAL_LISTEN_COUNT_CACHE_TIME,
                encode=False,
            )
        return count
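Both queries above lean on two time helpers that are not shown in these snippets: convert_to_unix_timestamp and get_influx_query_timestamp. Below is a minimal sketch of what they could look like, inferred only from how they are used here (InfluxDB returning RFC3339 time strings and InfluxQL accepting nanosecond-precision epoch integers in WHERE clauses); the real ListenBrainz helpers may differ.

from datetime import datetime, timezone

def convert_to_unix_timestamp(influx_time):
    # InfluxDB returns times as RFC3339 strings, e.g. '2018-02-18T14:32:05Z';
    # assumption: the stored points have no fractional seconds.
    dt = datetime.strptime(influx_time, '%Y-%m-%dT%H:%M:%SZ')
    return int(dt.replace(tzinfo=timezone.utc).timestamp())

def get_influx_query_timestamp(ts):
    # InfluxQL compares `time` against nanosecond epoch integers by default,
    # so append nine zeros to a second-precision Unix timestamp.
    return '%d000000000' % int(ts)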
Example No. 2
    def from_influx(cls, row):
        """ Factory to make Listen objects from an influx row
        """
        def convert_comma_seperated_string_to_list(string):
            if not string:
                return []
            return [val for val in string.split(',')]

        t = convert_to_unix_timestamp(row['time'])

        data = {
            'release_msid': row.get('release_msid'),
            'release_mbid': row.get('release_mbid'),
            'release_name': row.get('release_name'),
            'recording_mbid': row.get('recording_mbid'),
            'release_group_mbid': row.get('release_group_mbid'),
            'artist_mbids': convert_comma_seperated_string_to_list(row.get('artist_mbids', '')),
            'tags': convert_comma_seperated_string_to_list(row.get('tags', '')),
            'work_mbids': convert_comma_seperated_string_to_list(row.get('work_mbids', '')),
            'isrc': row.get('isrc'),
            'spotify_id': row.get('spotify_id'),
            'tracknumber': row.get('tracknumber'),
            'track_mbid': row.get('track_mbid'),
        }

        # The influx row can contain many fields that are user-generated.
        # We only need to add those fields which have some value in them to additional_info.
        # Also, we need to make sure that we don't add fields like time, user_name etc. into
        # the additional_info.
        for key, value in row.items():
            if key not in data and \
               key not in ['time', 'user_name', 'recording_msid', 'artist_mbids', 'tags'] and \
               value is not None:
                data[key] = value

        return cls(timestamp=t,
                   user_name=row.get('user_name'),
                   artist_msid=row.get('artist_msid'),
                   recording_msid=row.get('recording_msid'),
                   release_msid=row.get('release_msid'),
                   data={
                       'additional_info': data,
                       'artist_name': row.get('artist_name'),
                       'track_name': row.get('track_name'),
                   })
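For orientation, a row as yielded by result.get_points() and fed into this factory might look like the dict below. The field names come from the snippet; the values are invented, and it is assumed that from_influx is a classmethod on the Listen model.

row = {
    'time': '2018-02-18T14:32:05Z',
    'user_name': 'rob',
    'artist_name': 'Portishead',
    'track_name': 'Glory Box',
    'artist_msid': 'db9a7483-a8f4-4a2c-99f3-062f56faea99',
    'recording_msid': '2cfad207-3f55-4aec-8120-86cf66e34d59',
    'artist_mbids': '8f6bd1e4-fbe1-4f50-aa9b-94c450ec0f11',
    'tags': 'trip-hop,90s',
}
listen = Listen.from_influx(row)  # timestamp, names and additional_info filled from the row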
    def get_total_listen_count(self, cache_value=True):
        """ Returns the total number of listens stored in the ListenStore.
            First checks the brainzutils cache for the value, if not present there
            makes a query to the db and caches it in brainzutils cache.
        """

        if cache_value:
            count = cache.get(InfluxListenStore.REDIS_INFLUX_TOTAL_LISTEN_COUNT, decode=False)
            if count:
                return int(count)

        try:
            result = self.influx.query("""SELECT %s
                                            FROM "%s"
                                        ORDER BY time DESC
                                           LIMIT 1""" % (COUNT_MEASUREMENT_NAME, TIMELINE_COUNT_MEASUREMENT))
        except (InfluxDBServerError, InfluxDBClientError) as err:
            self.log.error("Cannot query influx: %s" % str(err), exc_info=True)
            raise

        try:
            item = result.get_points(measurement=TIMELINE_COUNT_MEASUREMENT).__next__()
            count = int(item[COUNT_MEASUREMENT_NAME])
            timestamp = convert_to_unix_timestamp(item['time'])
        except (KeyError, ValueError, StopIteration):
            timestamp = 0
            count = 0

        # Now sum counts that have been added in the interval we're interested in
        try:
            result = self.influx.query("""SELECT sum(%s) as total
                                            FROM "%s"
                                           WHERE time > %s""" % (COUNT_MEASUREMENT_NAME, TEMP_COUNT_MEASUREMENT, get_influx_query_timestamp(timestamp)))
        except (InfluxDBServerError, InfluxDBClientError) as err:
            self.log.error("Cannot query influx: %s" % str(err), exc_info=True)
            raise

        try:
            data = result.get_points(measurement=TEMP_COUNT_MEASUREMENT).__next__()
            count += int(data['total'])
        except StopIteration:
            pass

        if cache_value:
            cache.set(
                InfluxListenStore.REDIS_INFLUX_TOTAL_LISTEN_COUNT,
                int(count),
                InfluxListenStore.TOTAL_LISTEN_COUNT_CACHE_TIME,
                encode=False,
            )
        return count
Example No. 4
    def convert_to_influx_insert_format(self, row, measurement):
        data = {
            'measurement': measurement,
            'time': convert_to_unix_timestamp(row['time']),
        }

        data['fields'] = row
        data['fields'].pop('time')

        try:
            dedup_tag = data['fields'].pop('dedup_tag')
            data['tags'] = {'dedup_tag': dedup_tag}
        except KeyError:
            pass  # no dedup tag, don't need to do anything

        return data
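Roughly, the transformation moves everything except 'time' into the point's fields, converts 'time' to a Unix timestamp, and promotes an optional dedup_tag into an InfluxDB tag. An invented before/after, where store stands for an instance of the class this method belongs to and the measurement name is made up:

row = {
    'time': '2018-02-18T14:32:05Z',
    'artist_name': 'Portishead',
    'track_name': 'Glory Box',
    'dedup_tag': 1,
}
point = store.convert_to_influx_insert_format(row, '"rob"')
# point == {
#     'measurement': '"rob"',
#     'time': 1518964325,
#     'fields': {'artist_name': 'Portishead', 'track_name': 'Glory Box'},
#     'tags': {'dedup_tag': 1},
# }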
Example No. 5
    def from_influx(cls, row):
        """ Factory to make Listen objects from an influx row
        """
        t = convert_to_unix_timestamp(row['time'])
        mbids = []
        artist_mbids = row.get('artist_mbids')
        if artist_mbids:
            for mbid in artist_mbids.split(','):
                mbids.append(mbid)

        tags = []
        influx_tags = row.get('tags')
        if influx_tags:
            for tag in influx_tags.split(','):
                tags.append(tag)

        data = {
            'artist_mbids': mbids,
            'release_msid': row.get('release_msid'),
            'release_mbid': row.get('release_mbid'),
            'release_name': row.get('release_name'),
            'recording_mbid': row.get('recording_mbid'),
            'tags': tags,
        }

        # The influx row can contain many fields that are user-generated.
        # We only need to add those fields which have some value in them to additional_info.
        # Also, we need to make sure that we don't add fields like time, user_name etc. into
        # the additional_info.
        for key, value in row.items():
            if key not in [
                    'time', 'user_name', 'recording_msid', 'artist_mbids',
                    'tags'
            ] and value is not None:
                data[key] = value

        return cls(timestamp=t,
                   user_name=row.get('user_name'),
                   artist_msid=row.get('artist_msid'),
                   recording_msid=row.get('recording_msid'),
                   release_msid=row.get('release_msid'),
                   data={
                       'additional_info': data,
                       'artist_name': row.get('artist_name'),
                       'track_name': row.get('track_name'),
                   })
Example No. 6
    def write(self, listen_dicts):
        submit = []
        unique = []
        duplicate_count = 0
        unique_count = 0

        # Partition the listens on the basis of user names
        # and then store the time range for each user
        users = {}
        for listen in listen_dicts:

            t = int(listen['listened_at'])
            user_name = listen['user_name']

            if user_name not in users:
                users[user_name] = {
                    'min_time': t,
                    'max_time': t,
                    'listens': [listen],
                }
                continue

            if t > users[user_name]['max_time']:
                users[user_name]['max_time'] = t

            if t < users[user_name]['min_time']:
                users[user_name]['min_time'] = t

            users[user_name]['listens'].append(listen)

        # get listens in the time range for each user and
        # remove duplicates on the basis of timestamps
        for user_name in users:

            # get the range of time that we need to get from influx for
            # deduplication of listens
            min_time = users[user_name]['min_time']
            max_time = users[user_name]['max_time']

            query = """SELECT time, recording_msid
                         FROM %s
                        WHERE time >= %s
                          AND time <= %s
                    """ % (get_escaped_measurement_name(user_name), get_influx_query_timestamp(min_time), get_influx_query_timestamp(max_time))

            while True:
                try:
                    results = self.influx.query(query)
                    break
                except Exception as e:
                    self.log.error("Cannot query influx: %s" % str(e))
                    sleep(3)

            # collect all the timestamps for this given time range.

            timestamps = defaultdict(list) # dict of list of listens indexed by timestamp
            for result in results.get_points(measurement=get_measurement_name(user_name)):
                timestamps[convert_to_unix_timestamp(result['time'])].append(result)

            for listen in users[user_name]['listens']:
                # Check if a listen with the same timestamp and recording msid is already present in
                # Influx DB and if it is, mark current listen as duplicate
                t = int(listen['listened_at'])
                recording_msid = listen['recording_msid']
                dup = False

                if t in timestamps:
                    for row in timestamps[t]:
                        if row['recording_msid'] == recording_msid:
                            duplicate_count += 1
                            dup = True
                            break
                    else:
                        # if there are listens with the same timestamp but different
                        # metadata, we add a tag specifically for making sure that
                        # influxdb doesn't drop one of the listens. This value
                        # is monotonically increasing and defaults to 0
                        listen['dedup_tag'] = len(timestamps[t])

                if not dup:
                    unique_count += 1
                    submit.append(Listen.from_json(listen))
                    unique.append(listen)
                    timestamps[t].append({
                        'time': convert_timestamp_to_influx_row_format(t),
                        'recording_msid': recording_msid
                    })

        t0 = time()
        submitted_count = self.insert_to_listenstore(submit)
        self.time += time() - t0

        self.log.error("dups: %d, unique: %d, submitted: %d" % (duplicate_count, unique_count, submitted_count))
        if not unique_count:
            return True

        while True:
            try:
                self.unique_ch.basic_publish(
                    exchange=self.config.UNIQUE_EXCHANGE,
                    routing_key='',
                    body=ujson.dumps(unique),
                    properties=pika.BasicProperties(delivery_mode=2),
                )
                break
            except pika.exceptions.ConnectionClosed:
                self.connect_to_rabbitmq()

        return True
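The core of the duplicate check above, stripped of the InfluxDB round trip: a listen is dropped only when an existing listen shares both its timestamp and recording_msid, and listens that merely share a timestamp get a monotonically increasing dedup_tag so InfluxDB does not overwrite one point with the other. A self-contained sketch of that rule (the function name and argument shapes are made up for illustration):

from collections import defaultdict

def dedup_listens(incoming, existing):
    # existing: {timestamp: [{'recording_msid': ...}, ...]} already stored
    timestamps = defaultdict(list, existing)
    unique = []
    for listen in incoming:
        t = int(listen['listened_at'])
        msid = listen['recording_msid']
        if any(row['recording_msid'] == msid for row in timestamps[t]):
            continue  # same time and same recording: a true duplicate
        if timestamps[t]:
            # same timestamp, different metadata: tag it so both points survive
            listen['dedup_tag'] = len(timestamps[t])
        unique.append(listen)
        timestamps[t].append({'recording_msid': msid})
    return unique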
Example No. 7
    def from_influx(cls, row):
        """ Factory to make Listen objects from an influx row
        """

        t = convert_to_unix_timestamp(row['time'])

        data = {
            'release_msid': row.get('release_msid'),
            'release_mbid': row.get('release_mbid'),
            'recording_mbid': row.get('recording_mbid'),
            'release_group_mbid': row.get('release_group_mbid'),
            'artist_mbids': convert_comma_seperated_string_to_list(row.get('artist_mbids', '')),
            'tags': convert_comma_seperated_string_to_list(row.get('tags', '')),
            'work_mbids': convert_comma_seperated_string_to_list(row.get('work_mbids', '')),
            'isrc': row.get('isrc'),
            'spotify_id': row.get('spotify_id'),
            'tracknumber': row.get('tracknumber'),
            'track_mbid': row.get('track_mbid'),
        }

        # The influx row can contain many fields that are user-generated.
        # We only need to add those fields which have some value in them to additional_info.
        # Also, we need to make sure that we don't add fields like time, user_name etc. into
        # the additional_info.
        for key, value in row.items():
            if key not in data and key not in Listen.TOP_LEVEL_KEYS + Listen.PRIVATE_KEYS and value is not None:
                try:
                    value = ujson.loads(value)
                    data[key] = value
                    continue
                except (ValueError, TypeError):
                    pass

                # there are some lists in the database that were converted to string
                # via str(list) so they can't be loaded via json.
                # Example: "['Blank & Jones']"
                # However, yaml parses them safely and correctly
                try:
                    value = yaml.safe_load(value)
                    data[key] = value
                    continue
                except (ValueError, yaml.scanner.ScannerError,
                        yaml.parser.ParserError, Exception):
                    pass

                data[key] = value

        return cls(timestamp=t,
                   user_name=row.get('user_name'),
                   artist_msid=row.get('artist_msid'),
                   recording_msid=row.get('recording_msid'),
                   release_msid=row.get('release_msid'),
                   inserted_timestamp=row.get('inserted_timestamp'),
                   data={
                       'additional_info': data,
                       'artist_name': row.get('artist_name'),
                       'track_name': row.get('track_name'),
                       'release_name': row.get('release_name'),
                   })
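The yaml.safe_load fallback exists because values stored via str(list) use single quotes and therefore are not valid JSON, while YAML parses them into proper lists. A quick illustration, using the string from the comment above:

import ujson
import yaml

stored = "['Blank & Jones']"
try:
    ujson.loads(stored)        # raises ValueError: single quotes are not JSON
except ValueError:
    pass
print(yaml.safe_load(stored))  # ['Blank & Jones']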
Example No. 8
    def from_influx(cls, row):
        """ Factory to make Listen objects from an influx row
        """

        def convert_comma_seperated_string_to_list(string):
            if not string:
                return []
            return [val for val in string.split(',')]

        t = convert_to_unix_timestamp(row['time'])

        data = {
            'release_msid': row.get('release_msid'),
            'release_mbid': row.get('release_mbid'),
            'recording_mbid': row.get('recording_mbid'),
            'release_group_mbid': row.get('release_group_mbid'),
            'artist_mbids': convert_comma_seperated_string_to_list(row.get('artist_mbids', '')),
            'tags': convert_comma_seperated_string_to_list(row.get('tags', '')),
            'work_mbids': convert_comma_seperated_string_to_list(row.get('work_mbids', '')),
            'isrc': row.get('isrc'),
            'spotify_id': row.get('spotify_id'),
            'tracknumber': row.get('tracknumber'),
            'track_mbid': row.get('track_mbid'),
        }

        # The influx row can contain many fields that are user-generated.
        # We only need to add those fields which have some value in them to additional_info.
        # Also, we need to make sure that we don't add fields like time, user_name etc. into
        # the additional_info.
        for key, value in row.items():
            if key not in data and key not in Listen.TOP_LEVEL_KEYS + Listen.PRIVATE_KEYS and value is not None:
                try:
                    value = ujson.loads(value)
                    data[key] = value
                    continue
                except (ValueError, TypeError):
                    pass

                # there are some lists in the database that were converted to string
                # via str(list) so they can't be loaded via json.
                # Example: "['Blank & Jones']"
                # However, yaml parses them safely and correctly
                try:
                    value = yaml.safe_load(value)
                    data[key] = value
                    continue
                except (ValueError, yaml.YAMLError):  # yaml parse errors are not ValueErrors
                    pass

                data[key] = value

        return cls(
            timestamp=t,
            user_name=row.get('user_name'),
            artist_msid=row.get('artist_msid'),
            recording_msid=row.get('recording_msid'),
            release_msid=row.get('release_msid'),
            inserted_timestamp=row.get('inserted_timestamp'),
            data={
                'additional_info': data,
                'artist_name': row.get('artist_name'),
                'track_name': row.get('track_name'),
                'release_name': row.get('release_name'),
            }
        )
    def write(self, listen_dicts):
        submit = []
        unique = []
        duplicate_count = 0
        unique_count = 0

        # Partition the listens on the basis of user names
        # and then store the time range for each user
        users = {}
        for listen in listen_dicts:

            t = int(listen['listened_at'])
            user_name = listen['user_name']

            if user_name not in users:
                users[user_name] = {
                    'min_time': t,
                    'max_time': t,
                    'listens': [listen],
                }
                continue

            if t > users[user_name]['max_time']:
                users[user_name]['max_time'] = t

            if t < users[user_name]['min_time']:
                users[user_name]['min_time'] = t

            users[user_name]['listens'].append(listen)

        # get listens in the time range for each user and
        # remove duplicates on the basis of timestamps
        for user_name in users:

            # get the range of time that we need to get from influx for
            # deduplication of listens
            min_time = users[user_name]['min_time']
            max_time = users[user_name]['max_time']

            # querying for artist_name here, since a field must be included in the query.
            query = """SELECT time, artist_name
                         FROM %s
                        WHERE time >= %s
                          AND time <= %s
                    """ % (get_escaped_measurement_name(user_name),
                           get_influx_query_timestamp(min_time),
                           get_influx_query_timestamp(max_time))

            while True:
                try:
                    results = self.influx.query(query)
                    break
                except Exception as e:
                    self.log.error("Cannot query influx: %s" % str(e))
                    sleep(3)

            # collect all the timestamps for this given time range.
            timestamps = {}
            for result in results.get_points(
                    measurement=get_measurement_name(user_name)):
                timestamps[convert_to_unix_timestamp(result['time'])] = 1

            for listen in users[user_name]['listens']:
                # Check if this listen is already present in Influx DB and if it is
                # mark current listen as duplicate
                t = int(listen['listened_at'])
                if t in timestamps:
                    duplicate_count += 1
                    continue
                else:
                    unique_count += 1
                    submit.append(Listen.from_json(listen))
                    unique.append(listen)
                    timestamps[t] = 1

        t0 = time()
        submitted_count = self.insert_to_listenstore(submit)
        self.time += time() - t0

        self.log.error("dups: %d, unique: %d, submitted: %d" %
                       (duplicate_count, unique_count, submitted_count))
        if not unique_count:
            return True

        while True:
            try:
                self.unique_ch.basic_publish(exchange='unique',
                                             routing_key='',
                                             body=ujson.dumps(unique),
                                             properties=pika.BasicProperties(
                                                 delivery_mode=2, ))
                break
            except pika.exceptions.ConnectionClosed:
                self.connect_to_rabbitmq()

        return True
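Unlike the variant shown earlier, this version keys the duplicate check on the timestamp alone, so two different recordings listened to at the same second would collapse into one. A minimal sketch of that simpler rule (the function name is made up):

def dedup_by_timestamp(incoming, existing_timestamps):
    seen = set(existing_timestamps)
    unique = []
    for listen in incoming:
        t = int(listen['listened_at'])
        if t in seen:
            continue  # something was already listened to at this second
        seen.add(t)
        unique.append(listen)
    return unique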