Python get_escaped_measurement_name Examples, listenbrainz.utils.get_escaped_measurement_name Python Examples

Example #1

0

Show file

File: influx_listenstore.py Project: Salies/listenbrainz-server

    def fetch_recent_listens_for_users(self, user_list, limit=2, max_age=3600):
        """ Fetch recent listens for a list of users, given a limit which applies per user. If you
            have a limit of 3 and 3 users you should get 9 listens if they are available.

            user_list: A list containing the users for which you'd like to retrieve recent listens.
            limit: the maximum number of listens for each user to fetch.
            max_age: Only return listens if they are no more than max_age seconds old. Default 3600 seconds

            Returns a list with one Listen.from_influx() result per returned point, grouped
            in the order of user_list. An empty list is returned when user_list is empty or
            when the influx query fails (the failure is logged).
        """

        # With no users the FROM clause would be empty and the query malformed, so
        # skip the round trip (and the error it would log) entirely.
        if not user_list:
            return []

        escaped_user_list = [
            get_escaped_measurement_name(user_name) for user_name in user_list
        ]

        query = "SELECT username, * FROM " + ",".join(escaped_user_list)
        query += " WHERE time > " + get_influx_query_timestamp(
            int(time.time()) - max_age)
        query += " ORDER BY time DESC LIMIT " + str(limit)
        try:
            results = self.influx.query(query)
        except Exception as err:
            self.log.error(
                "Cannot query influx while getting listens for users: %s: %s",
                user_list,
                str(err),
                exc_info=True)
            return []

        # Points are read back one measurement at a time so the output follows
        # the order of user_list.
        listens = []
        for user in user_list:
            listens.extend(
                Listen.from_influx(result)
                for result in results.get_points(
                    measurement=get_measurement_name(user)))

        return listens

Example #2

0

Show file

File: influx_listenstore.py Project: Salies/listenbrainz-server

 def get_listens_batch_for_dump(self, username, dump_time, offset):
     """ Fetch one chunk of a user's listens for a dump, retrying until the query succeeds.

     The query selects rows with time <= dump_time, newest first, limited to
     DUMP_CHUNK_SIZE rows starting at `offset`. Any exception is logged and the
     query is attempted again after a 3 second pause.

     Args:
         username: the user whose measurement is queried
         dump_time: object with a strftime() method giving the upper time bound
         offset: number of rows to skip

     Returns:
         whatever self.influx.query() returns for the chunk
     """
     while True:
         try:
             params = {
                 'measurement': get_escaped_measurement_name(username),
                 'timestamp': get_influx_query_timestamp(
                     dump_time.strftime('%s')),
                 'limit': DUMP_CHUNK_SIZE,
                 'offset': offset,
             }
             query = """
                 SELECT *
                   FROM {measurement}
                  WHERE time <= {timestamp}
               ORDER BY time DESC
                  LIMIT {limit}
                 OFFSET {offset}
             """.format(**params)
             return self.influx.query(query)
         except Exception as e:
             self.log.error(
                 'Error while getting listens to dump for user %s: %s',
                 username,
                 str(e),
                 exc_info=True)
             # back off briefly before trying the same chunk again
             time.sleep(3)

Example #3

0

Show file

File: influx_listenstore.py Project: Uditgulati/listenbrainz-server

    def fetch_listens_from_storage(self, user_name, from_ts, to_ts, limit, order):
        """ Fetch a user's listens from influx on one side of a timestamp.

            NOTE(review): the previous docstring talked about postgres storing UTC timestamps
            and converting them to the server's timezone; nothing in this method does that,
            so it looks stale -- confirm before relying on it.

            user_name: the user whose measurement is queried
            from_ts: seconds since epoch, in float. When not None, listens with time > from_ts
                     are selected and to_ts is ignored.
            to_ts: seconds since epoch, in float. Used (time < to_ts) only when from_ts is None.
            limit: maximum number of rows requested from influx
            order: key into ORDER_TEXT giving the sort direction used in the query

            Returns a list of Listen.from_influx() results, reversed when order is ORDER_ASC,
            or [] if the query fails (the failure is logged).
        """

        # Quote single quote characters which could be used to mount an injection attack.
        # Sadly, influxdb does not provide a means to do this in the client library
        query = 'SELECT * FROM ' + get_escaped_measurement_name(user_name)

        # The leading space keeps the WHERE keyword separated from the measurement
        # name instead of relying on the helper's return value ending in a delimiter.
        if from_ts is not None:
            query += " WHERE time > " + get_influx_query_timestamp(from_ts)
        else:
            query += " WHERE time < " + get_influx_query_timestamp(to_ts)

        query += " ORDER BY time " + ORDER_TEXT[order] + " LIMIT " + str(limit)
        try:
            results = self.influx.query(query)
        except Exception as err:
            self.log.error("Cannot query influx while getting listens for user: %s: %s", user_name, str(err), exc_info=True)
            return []

        listens = [
            Listen.from_influx(result)
            for result in results.get_points(measurement=get_measurement_name(user_name))
        ]

        if order == ORDER_ASC:
            listens.reverse()

        return listens

Example #4

0

Show file

File: influx_listenstore.py Project: shivam-tripathi/listenbrainz-server

    def fetch_listens_from_storage(self, user_name, from_ts, to_ts, limit,
                                   order):
        """ Fetch a user's listens from influx on one side of a timestamp.

            NOTE(review): the previous docstring talked about postgres storing UTC timestamps
            and converting them to the server's timezone; nothing in this method does that,
            so it looks stale -- confirm before relying on it.

            user_name: the user whose measurement is queried
            from_ts: seconds since epoch, in float. When not None, listens with time > from_ts
                     are selected and to_ts is ignored.
            to_ts: seconds since epoch, in float. Used (time < to_ts) only when from_ts is None.
            limit: maximum number of rows requested from influx
            order: key into ORDER_TEXT giving the sort direction used in the query

            Returns a list of Listen.from_influx() results, reversed when order is ORDER_ASC,
            or [] if the query fails (the failure is logged).
        """

        # Quote single quote characters which could be used to mount an injection attack.
        # Sadly, influxdb does not provide a means to do this in the client library
        query = 'SELECT * FROM ' + get_escaped_measurement_name(user_name)

        # The leading space keeps the WHERE keyword separated from the measurement
        # name instead of relying on the helper's return value ending in a delimiter.
        if from_ts is not None:
            query += " WHERE time > " + get_influx_query_timestamp(from_ts)
        else:
            query += " WHERE time < " + get_influx_query_timestamp(to_ts)

        query += " ORDER BY time " + ORDER_TEXT[order] + " LIMIT " + str(limit)
        try:
            results = self.influx.query(query)
        except Exception as err:
            # Lazy formatting plus the traceback, so a failed query can be diagnosed.
            self.log.error("Cannot query influx: %s", str(err), exc_info=True)
            return []

        listens = [
            Listen.from_influx(result)
            for result in results.get_points(
                measurement=get_measurement_name(user_name))
        ]

        if order == ORDER_ASC:
            listens.reverse()

        return listens

Example #5

0

Show file

    def get_incremental_listens_batch(self, username, start_time, end_time,
                                      offset):
        """ Get a batch of listens for an incremental listen dump.

        This uses the `inserted_timestamp` field to get listens: rows with
        start_time < inserted_timestamp <= end_time are selected, newest first,
        limited to DUMP_CHUNK_SIZE rows starting at `offset`.

        Args:
            username: the user whose measurement is queried
            start_time: object with a strftime() method; exclusive lower bound
            end_time: object with a strftime() method; inclusive upper bound
            offset: number of rows to skip

        Returns:
            whatever self.influx.query() returns for the batch

        Raises:
            Exception: any error raised while building or running the query is
                logged and re-raised; there is no retry.
        """
        # The original wrapped this in `while True:`, but the body always either
        # returned or re-raised on its first pass, so the loop could never repeat.
        try:
            query = """
                    SELECT *
                      FROM {measurement}
                     WHERE inserted_timestamp > {start_timestamp}
                       AND inserted_timestamp <= {end_timestamp}
                  ORDER BY time DESC
                     LIMIT {limit}
                    OFFSET {offset}
                """.format(
                measurement=get_escaped_measurement_name(username),
                start_timestamp=int(start_time.strftime('%s')),
                end_timestamp=int(end_time.strftime('%s')),
                limit=DUMP_CHUNK_SIZE,
                offset=offset,
            )
            return self.influx.query(query)
        except Exception as e:
            self.log.error(
                'Error while getting listens to dump for user %s: %s',
                username,
                str(e),
                exc_info=True)
            raise

Example #6

0

Show file

    def get_listens_batch_for_dump(self, username, end_time, offset):
        """ Get a batch of listens for the full dump.

        The `inserted_timestamp` field is deliberately not part of this query: not all
        listens in the production database have it, so full dumps must not depend on
        that key. Rows with time <= end_time are selected, newest first, limited to
        DUMP_CHUNK_SIZE rows starting at `offset`.

        Any exception is logged and the same query is retried after 3 seconds, so
        this only returns once a query has succeeded.
        """
        while True:
            try:
                params = {
                    'measurement': get_escaped_measurement_name(username),
                    'timestamp': get_influx_query_timestamp(
                        end_time.strftime('%s')),
                    'limit': DUMP_CHUNK_SIZE,
                    'offset': offset,
                }
                query = """
                    SELECT *
                      FROM {measurement}
                     WHERE time <= {timestamp}
                  ORDER BY time DESC
                     LIMIT {limit}
                    OFFSET {offset}
                """.format(**params)
                return self.influx.query(query)
            except Exception as e:
                self.log.error(
                    'Error while getting listens to dump for user %s: %s',
                    username,
                    str(e),
                    exc_info=True)
                # back off briefly before trying the same chunk again
                time.sleep(3)

Example #7

0

Show file

File: influx_listenstore.py Project: Uditgulati/listenbrainz-server

    def get_timestamps_for_user(self, user_name):
        """ Return (min_ts, max_ts) for a user, using the brainzutils cache when possible.

        A cached value is a "min,max" string; on a cache miss both timestamps are
        looked up through _select_single_timestamp and the pair is cached for
        USER_CACHE_TIME.
        """

        cached = cache.get(REDIS_USER_TIMESTAMPS % user_name)
        if cached:
            (low, high) = cached.split(",")
            return int(low), int(high)

        # cache miss: ask influx for the first and last point of the measurement
        first_query = 'SELECT first(artist_msid) FROM ' + get_escaped_measurement_name(user_name)
        min_ts = self._select_single_timestamp(first_query, get_measurement_name(user_name))

        last_query = 'SELECT last(artist_msid) FROM ' + get_escaped_measurement_name(user_name)
        max_ts = self._select_single_timestamp(last_query, get_measurement_name(user_name))

        serialized = "%d,%d" % (min_ts, max_ts)
        cache.set(REDIS_USER_TIMESTAMPS % user_name, serialized, USER_CACHE_TIME)

        return min_ts, max_ts

Example #8

0

Show file

    def condition(self, user_name):
        """ Tell whether a user's measurement has a non-integer `tracknumber` field.

        Runs SHOW FIELD KEYS on the user's measurement and inspects each returned
        point's 'fieldKey' / 'fieldType'.

        Returns True if user has bad datatype in track numbers, False otherwise
        """
        field_keys = self.ls.query("SHOW FIELD KEYS FROM %s" % get_escaped_measurement_name(user_name))
        return any(
            field['fieldKey'] == 'tracknumber' and field['fieldType'] != 'integer'
            for field in field_keys.get_points()
        )

Example #9

0

Show file

File: influx_listenstore.py Project: Salies/listenbrainz-server

    def get_timestamps_for_user(self, user_name):
        """ Look up the (min_ts, max_ts) pair for a user, caching it in brainzutils cache.

        The cache stores the pair as a "min,max" string. When nothing is cached,
        both values are fetched via _select_single_timestamp and then stored for
        USER_CACHE_TIME.
        """

        cached_pair = cache.get(REDIS_USER_TIMESTAMPS % user_name)
        if cached_pair:
            (oldest, newest) = cached_pair.split(",")
            return int(oldest), int(newest)

        # nothing cached: query the first and the last point of the measurement
        oldest_query = 'SELECT first(artist_msid) FROM ' + get_escaped_measurement_name(user_name)
        min_ts = self._select_single_timestamp(oldest_query,
                                               get_measurement_name(user_name))

        newest_query = 'SELECT last(artist_msid) FROM ' + get_escaped_measurement_name(user_name)
        max_ts = self._select_single_timestamp(newest_query,
                                               get_measurement_name(user_name))

        cache_value = "%d,%d" % (min_ts, max_ts)
        cache.set(REDIS_USER_TIMESTAMPS % user_name, cache_value,
                  USER_CACHE_TIME)

        return min_ts, max_ts

Example #10

0

Show file

File: influx_listenstore.py Project: Salies/listenbrainz-server

    def get_listen_count_for_user(self, user_name, need_exact=False):
        """Return the total number of listens for a user.

        Unless an exact number is requested, a truthy value found in brainzutils cache
        is returned directly. Otherwise the count is queried from influx, written
        back to the cache with an expiry time, and returned.

        Args:
            user_name: the user to get listens for
            need_exact: if True, get an exact number of listens directly from the ListenStore

        Raises:
            InfluxDBServerError, InfluxDBClientError: re-raised after logging when the
                count query fails.
        """
        user_key = '{}{}'.format(REDIS_INFLUX_USER_LISTEN_COUNT, user_name)

        if not need_exact:
            # The value is stored un-encoded (so that increment works on it),
            # hence it must be read back without decoding.
            cached_count = cache.get(user_key, decode=False)
            if cached_count:
                return int(cached_count)

        try:
            results = self.influx.query(
                'SELECT count(*) FROM ' + get_escaped_measurement_name(user_name))
        except (InfluxDBServerError, InfluxDBClientError) as e:
            self.log.error("Cannot query influx: %s" % str(e), exc_info=True)
            raise

        # pull the count out of the first returned point; no point means no listens
        try:
            points = results.get_points(
                measurement=get_measurement_name(user_name))
            count = next(points)['count_recording_msid']
        except (KeyError, StopIteration):
            count = 0

        total = int(count)
        cache.set(user_key,
                  total,
                  InfluxListenStore.USER_LISTEN_COUNT_CACHE_TIME,
                  encode=False)
        return total

Example #11

0

Show file

File: influx_listenstore.py Project: Ayush21298/listenbrainz-server

    def get_listen_count_for_user(self, user_name, need_exact=False):
        """Return the total number of listens for a user.

        Unless an exact number is requested, a truthy value found in the redis cache
        is returned directly. Otherwise the count is queried from influx, stored
        in redis with an expiry time, and returned.

        Args:
            user_name: the user to get listens for
            need_exact: if True, get an exact number of listens directly from the ListenStore

        Raises:
            InfluxDBServerError, InfluxDBClientError: re-raised after logging when the
                count query fails.
        """

        if not need_exact:
            # serve the cached value when there is one instead of recounting
            cached_count = self.redis.get(REDIS_INFLUX_USER_LISTEN_COUNT + user_name)
            if cached_count:
                return int(cached_count)

        try:
            results = self.influx.query(
                'SELECT count(*) FROM ' + get_escaped_measurement_name(user_name))
        except (InfluxDBServerError, InfluxDBClientError) as e:
            self.log.error("Cannot query influx: %s" % str(e))
            raise

        # pull the count out of the first returned point; no point means no listens
        try:
            points = results.get_points(
                measurement=get_measurement_name(user_name))
            count = next(points)['count_recording_msid']
        except (KeyError, StopIteration):
            count = 0

        # remember the freshly computed value, with an expiry time
        user_key = "{}{}".format(REDIS_INFLUX_USER_LISTEN_COUNT, user_name)
        expiry = InfluxListenStore.USER_LISTEN_COUNT_CACHE_TIME
        self.redis.setex(user_key, count, expiry)
        return int(count)

Example #12

0

Show file

File: influx_listenstore.py Project: Uditgulati/listenbrainz-server

    def get_listen_count_for_user(self, user_name, need_exact=False):
        """Return the total number of listens for a user.

        Unless an exact number is requested, a truthy value found in brainzutils cache
        is returned directly. Otherwise the count is queried from influx, written
        back to the cache with an expiry time, and returned.

        Args:
            user_name: the user to get listens for
            need_exact: if True, get an exact number of listens directly from the ListenStore

        Raises:
            InfluxDBServerError, InfluxDBClientError: re-raised after logging when the
                count query fails.
        """
        user_key = '{}{}'.format(REDIS_INFLUX_USER_LISTEN_COUNT, user_name)

        if not need_exact:
            # The cached value was stored without encoding (the increment operation
            # needs a plain integer), so it is fetched without decoding too.
            cached = cache.get(user_key, decode=False)
            if cached:
                return int(cached)

        try:
            results = self.influx.query('SELECT count(*) FROM ' + get_escaped_measurement_name(user_name))
        except (InfluxDBServerError, InfluxDBClientError) as e:
            self.log.error("Cannot query influx: %s" % str(e), exc_info=True)
            raise

        # the count lives in the first returned point; no point means no listens
        try:
            first_point = next(results.get_points(measurement=get_measurement_name(user_name)))
            count = first_point['count_recording_msid']
        except (KeyError, StopIteration):
            count = 0

        exact_count = int(count)
        cache.set(user_key, exact_count, InfluxListenStore.USER_LISTEN_COUNT_CACHE_TIME, encode=False)
        return exact_count

Example #13

0

Show file

File: influx_listenstore.py Project: shivam-tripathi/listenbrainz-server

    def dump_user(self, username, fileobj, dump_time):
        """ Dump specified user's listens into specified file object.

        Listens are fetched from influx in chunks of DUMP_CHUNK_SIZE (newest first,
        up to dump_time) and written one JSON document per line. A failing query is
        retried every 3 seconds until it succeeds; a failing write is logged and
        re-raised.

        Args:
            username (str): the MusicBrainz ID of the user whose listens are to be dumped
            fileobj (file): the file into which listens should be written
            dump_time (datetime): the time at which the specific data dump was initiated

        Returns:
            int: the number of bytes this user's listens take in the dump file
                 (bytes written minus one, i.e. -1 when nothing was written)
        """
        t0 = time.time()
        offset = 0
        bytes_written = 0
        listen_count = 0

        # Get this user's listens in chunks
        while True:
            # loop until we get this chunk of listens
            while True:
                try:
                    result = self.influx.query("""
                        SELECT *
                          FROM {measurement}
                         WHERE time <= {timestamp}
                      ORDER BY time DESC
                         LIMIT {limit}
                        OFFSET {offset}
                    """.format(
                        measurement=get_escaped_measurement_name(username),
                        timestamp=get_influx_query_timestamp(
                            dump_time.strftime('%s')),
                        limit=DUMP_CHUNK_SIZE,
                        offset=offset,
                    ))
                    break
                except Exception as e:
                    # `username` is the only user identifier in scope here; the
                    # previous `user['musicbrainz_id']` raised NameError instead
                    # of logging, which aborted the retry loop.
                    self.log.error('Error while getting listens for user %s',
                                   username)
                    self.log.error(str(e))
                    time.sleep(3)

            rows_added = 0
            for row in result.get_points(get_measurement_name(username)):
                listen = Listen.from_influx(row).to_api()
                try:
                    bytes_written += fileobj.write(ujson.dumps(listen))
                    bytes_written += fileobj.write('\n')
                    rows_added += 1
                except IOError as e:
                    log_ioerrors(self.log, e)
                    raise
                except Exception as e:
                    self.log.error(
                        'Exception while creating json for user: %s',
                        username)
                    self.log.error(str(e))
                    raise

            listen_count += rows_added
            # an empty chunk means every listen has been fetched
            if not rows_added:
                break

            offset += DUMP_CHUNK_SIZE

        time_taken = time.time() - t0
        self.log.info(
            'Listens for user %s dumped, total %d listens written at %.2f listens / sec!',
            username, listen_count, listen_count / time_taken)

        # the size for this user should not include the last newline we wrote
        # hence return bytes_written - 1 as the size in the dump for this user
        return bytes_written - 1

Example #14

0

Show file

    def dump_listens(self, location, dump_time=None, threads=None):
        """ Fetches listens of each user from her measurement and dumps them into a file.
            These files are compressed into an archive.

        The archive is produced by streaming a tar file into a `pxz` subprocess whose
        output is written to `<location>/listenbrainz-listens-dump-<time>.tar.xz`.
        Listens are first written to per-user files in a temporary directory.

        Args:
            location: the directory where the listens dump archive should be created
            dump_time (datetime): the time at which the data dump was started;
                defaults to the time of the call
            threads (int): the number of threads to user for compression

        Returns:
            the path to the dump archive
        """

        # Resolve the default at call time: a `datetime.today()` default in the
        # signature is evaluated once, when the function is defined, and would
        # then be reused by every later call.
        if dump_time is None:
            dump_time = datetime.today()

        self.log.info('Beginning dump of listens from InfluxDB...')

        self.log.info(
            'Getting list of users whose listens are to be dumped...')
        users = db_user.get_all_users()
        self.log.info('Total number of users: %d', len(users))

        archive_name = 'listenbrainz-listens-dump-{time}'.format(
            time=dump_time.strftime('%Y%m%d-%H%M%S'))
        archive_path = os.path.join(
            location, '{filename}.tar.xz'.format(filename=archive_name))
        with open(archive_path, 'w') as archive:

            pxz_command = ['pxz', '--compress']
            if threads is not None:
                pxz_command.append('-T {threads}'.format(threads=threads))

            pxz = subprocess.Popen(pxz_command,
                                   stdin=subprocess.PIPE,
                                   stdout=archive)

            with tarfile.open(fileobj=pxz.stdin, mode='w|') as tar:

                # NOTE(review): temp_dir is only removed on the success path below;
                # an exception raised before that leaves it behind.
                temp_dir = tempfile.mkdtemp()

                try:
                    # add timestamp
                    timestamp_path = os.path.join(temp_dir, 'TIMESTAMP')
                    with open(timestamp_path, 'w') as f:
                        f.write(dump_time.isoformat(' '))
                    tar.add(timestamp_path,
                            arcname=os.path.join(archive_name, 'TIMESTAMP'))

                    # add schema version
                    schema_version_path = os.path.join(temp_dir,
                                                       'SCHEMA_SEQUENCE')
                    with open(schema_version_path, 'w') as f:
                        f.write(str(LISTENS_DUMP_SCHEMA_VERSION))
                    tar.add(schema_version_path,
                            arcname=os.path.join(archive_name,
                                                 'SCHEMA_SEQUENCE'))

                    # add copyright notice
                    tar.add(DUMP_LICENSE_FILE_PATH,
                            arcname=os.path.join(archive_name, 'COPYING'))

                except IOError as e:
                    log_ioerrors(self.log, e)
                    raise
                except Exception as e:
                    self.log.error('Exception while adding dump metadata: %s',
                                   str(e))
                    raise

                listens_path = os.path.join(temp_dir, 'listens')
                create_path(listens_path)

                # get listens from all measurements and write them to files in
                # a temporary dir before adding them to the archive
                for user in users:
                    username = user['musicbrainz_id']
                    offset = 0

                    user_listens_file = '{username}.listens'.format(
                        username=username)
                    user_listens_path = os.path.join(listens_path,
                                                     user_listens_file)

                    with open(user_listens_path, 'w') as f:
                        # Get this user's listens in chunks
                        while True:

                            # loop until we get this chunk of listens
                            while True:
                                try:
                                    result = self.influx.query("""
                                        SELECT *
                                          FROM {measurement}
                                         WHERE time <= {timestamp}
                                      ORDER BY time DESC
                                         LIMIT {limit}
                                        OFFSET {offset}
                                    """.format(
                                        measurement=
                                        get_escaped_measurement_name(username),
                                        timestamp=get_influx_query_timestamp(
                                            dump_time.strftime('%s')),
                                        limit=DUMP_CHUNK_SIZE,
                                        offset=offset,
                                    ))
                                    break
                                except Exception as e:
                                    self.log.error(
                                        'Error while getting listens for user %s',
                                        user['musicbrainz_id'])
                                    self.log.error(str(e))
                                    time.sleep(3)

                            rows = list(
                                result.get_points(
                                    get_measurement_name(username)))
                            # an empty chunk means this user is fully dumped
                            if not rows:
                                break

                            for row in rows:
                                listen = Listen.from_influx(row).to_api()
                                try:
                                    f.write(ujson.dumps(listen))
                                    f.write('\n')
                                except IOError as e:
                                    log_ioerrors(self.log, e)
                                    raise
                                except Exception as e:
                                    self.log.error(
                                        'Exception while creating json for user: %s',
                                        user['musicbrainz_id'])
                                    self.log.error(str(e))
                                    raise

                            offset += DUMP_CHUNK_SIZE

                # add the listens directory to the archive
                self.log.info('Got all listens, adding them to the archive...')
                tar.add(listens_path,
                        arcname=os.path.join(archive_name, 'listens'))

                # remove the temporary directory
                shutil.rmtree(temp_dir)

            pxz.stdin.close()
            # Closing stdin only signals end-of-input; wait for the compressor to
            # exit so the archive is complete before it is reported as done.
            pxz.wait()

        self.log.info('ListenBrainz listen dump done!')
        self.log.info('Dump present at %s!', archive_path)
        return archive_path

Example #15

0

Show file

File: influx_listenstore.py Project: Uditgulati/listenbrainz-server

    def dump_user(self, username, fileobj, dump_time):
        """ Dump specified user's listens into specified file object.

        Listens are fetched from influx in chunks of DUMP_CHUNK_SIZE (newest first,
        up to dump_time), tagged with the user name and written one JSON document
        per line. A failing query is retried every 3 seconds until it succeeds; a
        failing write is logged and re-raised.

        Args:
            username (str): the MusicBrainz ID of the user whose listens are to be dumped
            fileobj (file): the file into which listens should be written
            dump_time (datetime): the time at which the specific data dump was initiated

        Returns:
            int: the number of bytes this user's listens take in the dump file
                 (bytes written minus one, i.e. -1 when nothing was written)
        """
        t0 = time.time()
        offset = 0
        bytes_written = 0
        listen_count = 0

        # Get this user's listens in chunks
        while True:
            # loop until we get this chunk of listens
            while True:
                try:
                    result = self.influx.query("""
                        SELECT *
                          FROM {measurement}
                         WHERE time <= {timestamp}
                      ORDER BY time DESC
                         LIMIT {limit}
                        OFFSET {offset}
                    """.format(
                        measurement=get_escaped_measurement_name(username),
                        timestamp=get_influx_query_timestamp(dump_time.strftime('%s')),
                        limit=DUMP_CHUNK_SIZE,
                        offset=offset,
                    ))
                    break
                except Exception as e:
                    # `username` is the only user identifier in scope here; the
                    # previous `user['musicbrainz_id']` raised NameError instead
                    # of logging, which aborted the retry loop.
                    self.log.error('Error while getting listens to dump for user %s: %s', username, str(e), exc_info=True)
                    time.sleep(3)

            rows_added = 0
            for row in result.get_points(get_measurement_name(username)):
                listen = Listen.from_influx(row).to_api()
                listen['user_name'] = username
                try:
                    bytes_written += fileobj.write(ujson.dumps(listen))
                    bytes_written += fileobj.write('\n')
                    rows_added += 1
                except IOError as e:
                    self.log.critical('IOError while writing listens into file for user %s', username, exc_info=True)
                    raise
                except Exception as e:
                    self.log.error('Exception while creating json for user %s: %s', username, str(e), exc_info=True)
                    raise

            listen_count += rows_added
            # an empty chunk means every listen has been fetched
            if not rows_added:
                break

            offset += DUMP_CHUNK_SIZE

        time_taken = time.time() - t0
        self.log.info('Listens for user %s dumped, total %d listens written at %.2f listens / sec!',
            username, listen_count, listen_count / time_taken)

        # the size for this user should not include the last newline we wrote
        # hence return bytes_written - 1 as the size in the dump for this user
        return bytes_written - 1

Example #16

0

Show file

    def write(self, listen_dicts):
        """ Deduplicate a batch of listens against influx, store the new ones and publish them.

        Listens are grouped per user. For each user the rows between the earliest and
        latest `listened_at` of the batch are fetched from influx, and an incoming
        listen is counted as a duplicate when a row (or an earlier listen of the same
        batch) has the same timestamp and recording_msid. Non-duplicates are passed to
        insert_to_listenstore and, if there are any, published to the unique exchange.

        Args:
            listen_dicts: iterable of dicts, each read for the keys 'listened_at',
                'user_name' and 'recording_msid'

        Returns:
            True, always
        """
        submit = []
        unique = []
        duplicate_count = 0
        unique_count = 0

        # Group the listens by user name, tracking the time range covered per user
        users = {}
        for listen in listen_dicts:
            listened_at = int(listen['listened_at'])
            user_name = listen['user_name']

            bucket = users.get(user_name)
            if bucket is None:
                users[user_name] = {
                    'min_time': listened_at,
                    'max_time': listened_at,
                    'listens': [listen],
                }
                continue

            bucket['max_time'] = max(bucket['max_time'], listened_at)
            bucket['min_time'] = min(bucket['min_time'], listened_at)
            bucket['listens'].append(listen)

        # For each user, load what influx already has in that time range and
        # drop the incoming listens that are already there
        for user_name, bucket in users.items():

            query = """SELECT time, recording_msid
                         FROM %s
                        WHERE time >= %s
                          AND time <= %s
                    """ % (
                get_escaped_measurement_name(user_name),
                get_influx_query_timestamp(bucket['min_time']),
                get_influx_query_timestamp(bucket['max_time']),
            )

            # keep trying until influx answers
            while True:
                try:
                    results = self.influx.query(query)
                except Exception as e:
                    self.log.error("Cannot query influx: %s" % str(e))
                    sleep(3)
                else:
                    break

            # rows already known for this time range, indexed by unix timestamp
            timestamps = defaultdict(list)
            for result in results.get_points(measurement=get_measurement_name(user_name)):
                timestamps[convert_to_unix_timestamp(result['time'])].append(result)

            for listen in bucket['listens']:
                t = int(listen['listened_at'])
                recording_msid = listen['recording_msid']

                if t in timestamps:
                    same_recording = any(
                        row['recording_msid'] == recording_msid
                        for row in timestamps[t]
                    )
                    if same_recording:
                        # same timestamp and recording msid: already present
                        duplicate_count += 1
                        continue

                    # Same timestamp but different metadata: add a tag so that
                    # influxdb doesn't drop one of the listens. The value grows
                    # with the number of rows sharing this timestamp.
                    listen['dedup_tag'] = len(timestamps[t])

                unique_count += 1
                submit.append(Listen.from_json(listen))
                unique.append(listen)
                # remember it so later listens of this batch are checked against it too
                timestamps[t].append({
                    'time': convert_timestamp_to_influx_row_format(t),
                    'recording_msid': recording_msid
                })

        started = time()
        submitted_count = self.insert_to_listenstore(submit)
        self.time += time() - started

        self.log.error("dups: %d, unique: %d, submitted: %d" % (duplicate_count, unique_count, submitted_count))
        if not unique_count:
            return True

        # publish the unique listens, reconnecting whenever the connection was closed
        while True:
            try:
                self.unique_ch.basic_publish(
                    exchange=self.config.UNIQUE_EXCHANGE,
                    routing_key='',
                    body=ujson.dumps(unique),
                    properties=pika.BasicProperties(delivery_mode = 2,),
                )
            except pika.exceptions.ConnectionClosed:
                self.connect_to_rabbitmq()
            else:
                break

        return True

Example #17

0

Show file

File: influx-writer.py Project: Ayush21298/listenbrainz-server

    def write(self, listen_dicts):
        """Store the not-yet-seen listens of a batch and publish them.

        The batch is grouped per user. For every user one InfluxDB query
        is issued over the time window spanned by that user's listens, and
        any incoming listen whose ``listened_at`` timestamp already shows
        up in the query result (or earlier in the same batch) is counted
        as a duplicate and dropped. The remaining listens are handed to
        ``self.insert_to_listenstore`` and then published, JSON-encoded,
        on the ``unique`` exchange.

        Both the InfluxDB query and the publish step are retried without
        an upper bound until they succeed.

        listen_dicts: iterable of dicts; each one is read for the keys
                      ``listened_at`` and ``user_name``.

        Returns True on every path.
        """
        to_submit = []
        fresh = []
        duplicate_count = 0
        unique_count = 0

        # Group the listens per user while tracking the oldest and newest
        # timestamp seen for each of them.
        per_user = {}
        for item in listen_dicts:
            ts = int(item['listened_at'])
            name = item['user_name']

            entry = per_user.get(name)
            if entry is None:
                per_user[name] = {
                    'min_time': ts,
                    'max_time': ts,
                    'listens': [item],
                }
                continue

            entry['max_time'] = max(entry['max_time'], ts)
            entry['min_time'] = min(entry['min_time'], ts)
            entry['listens'].append(item)

        # For each user, look up what is already stored in that user's
        # time window and keep only listens with an unseen timestamp.
        for user_name, entry in per_user.items():
            window_start = entry['min_time']
            window_end = entry['max_time']

            # artist_name is selected only because the query must include
            # at least one field; just the timestamps are used below.
            query = """SELECT time, artist_name
                         FROM %s
                        WHERE time >= %s
                          AND time <= %s
                    """ % (get_escaped_measurement_name(user_name),
                           get_influx_query_timestamp(window_start),
                           get_influx_query_timestamp(window_end))

            # Keep retrying until the query goes through.
            while True:
                try:
                    results = self.influx.query(query)
                except Exception as err:
                    self.log.error("Cannot query influx: %s" % str(err))
                    sleep(3)
                else:
                    break

            # Timestamps already present for this user in the window.
            seen = {
                convert_to_unix_timestamp(row['time'])
                for row in results.get_points(
                    measurement=get_measurement_name(user_name))
            }

            for item in entry['listens']:
                ts = int(item['listened_at'])
                if ts in seen:
                    duplicate_count += 1
                    continue

                unique_count += 1
                to_submit.append(Listen.from_json(item))
                fresh.append(item)
                # Remember it so a repeat later in this batch is dropped too.
                seen.add(ts)

        # Accumulate the time spent inserting into self.time.
        started = time()
        submitted_count = self.insert_to_listenstore(to_submit)
        self.time += time() - started

        # These batch statistics are emitted through the error-level logger.
        self.log.error("dups: %d, unique: %d, submitted: %d" %
                       (duplicate_count, unique_count, submitted_count))
        if unique_count == 0:
            return True

        # Publish the unique listens, reconnecting whenever the connection
        # turns out to be closed.
        while True:
            try:
                self.unique_ch.basic_publish(
                    exchange='unique',
                    routing_key='',
                    body=ujson.dumps(fresh),
                    properties=pika.BasicProperties(delivery_mode=2, ),
                )
            except pika.exceptions.ConnectionClosed:
                self.connect_to_rabbitmq()
            else:
                break

        return True