Example #1
    def check_orphan(self, gid, at_time):
        """ returns true if gid is orphaned """
        #get last requested timestamp
        requested_str = self.rc.hget(S1.cache_key(gid),
                                     S1.cache_requested_key())
        if not requested_str:
            return False

        # compare to orphaned timeout
        if at_time - float(requested_str) < config.DEFAULT_ORPHANED_TIMEOUT:
            return False

        return True
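
check_orphan reads one hash field (the last-requested timestamp) and applies a single cutoff. A minimal dict-backed sketch of the same rule; is_orphan and the 1800-second timeout are illustrative stand-ins for the Redis hash and config.DEFAULT_ORPHANED_TIMEOUT:

    import time

    ORPHANED_TIMEOUT = 1800  # illustrative; mirrors config.DEFAULT_ORPHANED_TIMEOUT

    def is_orphan(cache, gid, at_time, timeout=ORPHANED_TIMEOUT):
        """cache maps gid -> last-requested epoch, like the Redis hash field."""
        requested = cache.get(gid)
        if requested is None:
            return False                        # never requested: not orphaned
        return at_time - float(requested) >= timeout

    now = time.time()
    cache = {'g1': now - 60, 'g2': now - 7200}
    assert not is_orphan(cache, 'g1', now)      # requested a minute ago
    assert is_orphan(cache, 'g2', now)          # stale for two hours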
Example #2
    def run(self, *args, **kwargs):
        self.logger.info('Publisher [{0}], starting...'.format(self.name))

        callback = {
            S1.msg_publish(): self._on_publish_updates,
            S1.msg_register(): self._on_register,
            S1.msg_update_avatar(): self._on_update_avatar,
        }

        channels = [S1.publisher_channel_name('all'), S1.publisher_channel_name(self.name)]
        channels.extend([name for name in self.providers.keys() if name != self.name])

        # this will start infinite loop (in Pubsub)
        self.listener(channels, callback)
        self.logger.warning('Publisher [{0}], listener exit!'.format(self.name))
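
The callback dict maps message types to bound handlers, so the pubsub listener dispatches without an if/elif chain. A rough sketch of that dispatch pattern; dispatch and the message names here are made up for illustration:

    def dispatch(callback, msg_type, *args):
        handler = callback.get(msg_type)
        if handler is None:
            return False                        # unknown message type: ignore
        handler(*args)
        return True

    callback = {
        'publish': lambda gid: print('publish', gid),
        'register': lambda gid: print('register', gid),
    }
    dispatch(callback, 'publish', 'gid-1')      # prints: publish gid-1
    dispatch(callback, 'unknown', 'gid-1')      # returns False, nothing printed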
Example #3
    def schedule_next_batch(self, allow_worker_start=False):
        try:
            self.logger.info('[{0}] wake up!'.format(self.name))
            # get the gid set until all processed
            while True:
                at_time = time.time()
                gid_set = self.data.balancer.get_next_poll_set(
                    at_time + self.period_s / 2.0)
                gid_set_len = len(gid_set)
                if not gid_set_len:
                    self.logger.warning('[{0}] Empty gid_set...'.format(
                        self.name))
                    return
                elif allow_worker_start and gid_set_len > self.gid_set_threshold:
                    self.logger.warning(
                        'Gid set count [{0}] above threshold, starting worker...'
                        .format(gid_set_len))
                    self.start_worker()

                self.logger.info(
                    '[{0}] Invoking poll for [{1}] items...'.format(
                        self.name, gid_set_len))

                # clean orphaned gids
                update_set = [
                    gid for gid in gid_set
                    if not self.data.check_orphan(gid, at_time)
                ]

                # post each gid to poller
                for gid in update_set:
                    # move next poll time for the gid to avoid duplicate polling
                    self.data.balancer.add_gid_set(gid,
                                                   at_time + self.gid_poll_s)
                    # post to pollers
                    self.broadcast_command(S1.poller_channel_name('all'),
                                           S1.msg_update(), gid)

                # update stats
                self.update_stats(at_time, len(update_set))

        except Exception as e:
            self.logger.warning('Exception in poller driver: {0}'.format(e))
            self.logger.exception(traceback.format_exc())
            self.data.unregister_poller(self.name)
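
The driver reads ahead by half a period (at_time + self.period_s / 2.0) so gids coming due mid-cycle still land in the current batch, and it re-scores each gid to at_time + self.gid_poll_s before posting so a second pass cannot pick the same gid up again. A dict-backed sketch of that look-ahead-and-re-score loop; next_poll_set stands in for the Redis sorted set:

    def next_poll_set(scores, up_to_epoch):
        """scores maps gid -> next-poll epoch; stands in for the sorted set."""
        return [gid for gid, at in scores.items() if at <= up_to_epoch]

    period_s, gid_poll_s, now = 2.0, 600, 1000.0
    scores = {'g1': 999.0, 'g2': 1000.5, 'g3': 2000.0}

    batch = next_poll_set(scores, now + period_s / 2.0)   # ['g1', 'g2']
    for gid in batch:
        scores[gid] = now + gid_poll_s          # push next poll out: no duplicate polling
    assert next_poll_set(scores, now + period_s / 2.0) == []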
Example #4
    def run(self, *args, **kwargs):
        self.kwargs = kwargs
        cfg = config.load_config(kwargs['config_path'], 'poller.json')
        self.gid_poll_s = cfg.get('gid_poll_s', self.gid_poll_s)
        self.period_s = cfg.get('period_s', self.period_s)
        self.workers_min = cfg.get('workers_min', self.workers_min)
        self.workers_max = cfg.get('workers_max', self.workers_max)

        self.logger.info(
            'Poller v[{0}], name=[{1}], poll delay=[{2}]s, period=[{3}]s starting...'
            .format(config.version, self.name, self.gid_poll_s, self.period_s))

        # give pubsub some time to subscribe... not using synchronous notifications...
        time.sleep(1)

        # register self as poller
        self.data.register_poller(self.name)

        # start worker processes
        for n in range(0, self.workers_min):
            self.start_worker()

        # drop message to self to do immediate poll round
        self.broadcast_data(S1.poller_channel_name(self.name), '#')
        # start listening
        self.listener([
            S1.poller_channel_name('all-out'),
            S1.poller_channel_name(self.name)
        ],
                      None,
                      timeout=self.period_s)
        self.logger.warning('Poller master listener exit!')

        # un-register self
        self.data.unregister_poller(self.name)

        # force kill any remaining workers
        while self.workers:
            name, proc = self.workers.popitem()
            self.logger.warning('Terminating remaining poller {0}!'.format(name))
            proc.terminate()
        self.logger.warning('Poller master process exit!')
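
Each cfg.get call above is a fall-back-to-current-default: a key present in poller.json overrides the attribute, a missing key leaves it alone. A sketch assuming the same key names:

    cfg = {'gid_poll_s': 300}                   # e.g. only one key set in poller.json

    gid_poll_s, period_s = 600, 2               # current defaults
    gid_poll_s = cfg.get('gid_poll_s', gid_poll_s)   # overridden -> 300
    period_s = cfg.get('period_s', period_s)         # missing -> stays 2
    assert (gid_poll_s, period_s) == (300, 2)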
Example #5
 def _is_key_in_stamp(key, stamp, spread_minutes):
     try:
         minute = int(S1.get_minute_fmt_minute(key))
         # minute-of-day (0..1439); floor division keeps it integral
         stamp_minute = stamp // 60 % 1440
         return (stamp_minute - spread_minutes) <= minute <= (
             stamp_minute + spread_minutes)
     except (TypeError, ValueError):
         # key is not a per-minute counter field
         return False
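
stamp // 60 % 1440 maps an epoch timestamp to a minute-of-day bucket (0..1439), so per-minute counters line up across days. A worked example:

    stamp = 1400000000                          # 2014-05-13 16:53:20 UTC
    minute_of_day = stamp // 60 % 1440
    assert minute_of_day == 16 * 60 + 53        # 1013, i.e. 16:53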
Example #6
    def __init__(self, logger, name, data, provider_names, config_path, dummy=False):
        """
        @type data: Data
        @type logger: Logger
        @type provider_names: list
        """
        super(Publisher, self).__init__(logger, name, data, provider_names, config_path, dummy)

        self.providers = {S1.publisher_channel_name(p): PublisherProviders.create(p, logger, data, config_path)
                          for p in provider_names}
Example #7
    def dump_gids(self):
        total = 0
        c = self.data.rc.hscan(S1.destination_key_fmt('children'))
        while len(c) > 1 and c[1]:
            total += len(c[1])
            for gid in c[1]:
                self.dump_gid(gid)

            # check if the next cursor is zero
            if c[0] == '0' or c[0] == 0:
                break

            # grab next set
            c = self.data.rc.hscan(S1.destination_key_fmt('children'), c[0])

        print('End of gid_set, total [{0}] GIDs.'.format(total))
        self.data_d.rc.delete(S1.register_set())
        print('Cleared register set.')
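
The loop above drives HSCAN by hand: call, process the page, repeat until the server hands back cursor 0. redis-py also exposes hscan_iter, which hides the cursor bookkeeping; a minimal sketch, assuming a reachable local Redis ('children' stands in for S1.destination_key_fmt('children')):

    import redis

    rc = redis.StrictRedis()                    # assumes a local Redis server
    total = 0
    for gid, _value in rc.hscan_iter('children'):
        total += 1                              # per-gid work would go here
    print('Scanned [{0}] GIDs.'.format(total))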
Example #8
    def dump_source(self, master_gid, gid):
        print('Copying source [{0}:{1}]...'.format(master_gid, gid))

        # add child gid to pollers first
        self.data_d.register_gid(gid)

        # link the gid into the master's list of child accounts
        print('Linking GID: [{0}]m <-- [{1}]s'.format(master_gid, gid))
        self.data_d.add_linked_account(master_gid, gid)

        destinations = self.data.get_destinations(gid)
        self.log.debug('{"dest": [')
        c = 0
        for destination in destinations:
            users = self.data.get_destination_users(gid, destination)
            for user in users:
                if c != 0:
                    self.log.debug(',')
                # dump destination
                self.dump_destination(master_gid, gid, destination, user)
                c += 1
        self.log.debug('],')

        # dump gid data keys
        self.log.debug('"keys": [')
        self.log.debug('"{0},"'.format(S1.gid_key(gid)))
        self.log.debug('"{0},"'.format(S1.gid_log_key(gid)))
        self.log.debug('"{0},"'.format(S1.links_key(gid)))
        self.log.debug('"{0}"'.format(S1.cache_key(gid)))
        self.log.debug(']}')

        # copy keys
        self.copy_hash(S1.gid_key(gid))
        self.copy_zset(S1.gid_log_key(gid))
        self.copy_hash(S1.cache_key(gid))
        self.copy_set(S1.links_key(gid))

        # copy tokens for all linked destinations (will overwrite some data)
        links = self.data.get_linked_accounts(master_gid) or dict()
        for k in links:
            # copy token
            p = k.split(':')
            if p[0] not in self.data.provider:
                continue

            token = self.data.get_user_token(gid, p[0], p[1])
            self.data_d.set_user_token(gid, p[0], p[1], token)
            # copy user params
            for p_name in S1.PROVIDER_PARAMS:
                p_val = self.data.provider[p[0]].get_user_param(p[1], p_name)
                if p_val:
                    self.data_d.provider[p[0]].set_user_param(p[1], p_name, p_val)
Example #9
 def get_next_poll_set(self, up_to_epoch):
     """
     grabs a range up to current time() from 'all' gid set
     @return: batch of gids to process or None if cursor reset is required
     """
     return self.rc.zrangebyscore(S1.gid_set('all'),
                                  0,
                                  up_to_epoch,
                                  start=0,
                                  num=200,
                                  withscores=False)
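
Scoring the 'all' sorted set by next-poll epoch turns it into a due-work queue: ZADD schedules a gid, and ZRANGEBYSCORE over [0, now] with a LIMIT pulls the next batch in due order. A minimal sketch using the redis-py 3.x mapping form of zadd; the key name and a local Redis server are assumptions:

    import time
    import redis

    rc = redis.StrictRedis()                    # assumes a local Redis server
    rc.zadd('gid:all', {'g1': time.time() - 5, 'g2': time.time() + 600})
    due = rc.zrangebyscore('gid:all', 0, time.time(), start=0, num=200)
    print(due)                                  # [b'g1'] -- 'g2' is not due yet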
Example #10
    def on_terminate(self, *args, **kwargs):
        """
        Called by signal handlers from ServiceBase
        WARNING: This can be called multiple times during process termination!
        """
        self.logger.warning('Poller master is terminating...')
        # stop workers
        while self.stop_worker():
            self.logger.warning('One worker stopped')

        # stop self
        self.send_exit(S1.poller_channel_name(self.name), self_target=True)
        self.logger.warning('Poller master terminate sequence complete!')
Example #11
    def __init__(self,
                 logger,
                 name,
                 data,
                 providers,
                 config_path,
                 dummy=False):
        """
            @type logger: Logger
            @type data: Data
            """
        super(Poller, self).__init__(logger, name, data, providers,
                                     config_path)

        self.kwargs = None
        # default poller driver period
        self.period_s = 2
        # how many gids are allowed to expire in period_s before a new worker is launched
        self.gid_set_threshold = 100
        # number of worker processes
        self.workers_min = 3
        # max number of worker processes
        self.workers_max = 4
        # default gid poll period, 10 min
        self.gid_poll_s = 600
        # default no poll period, 30 min
        self.gid_no_poll_s = 1800

        self.started_at = time.time()
        self.stats = {
            'hour': (self.started_at, 0),
            'day': (self.started_at, 0),
        }

        self.channel_handler = {
            S1.poller_channel_name('all-out'): self.on_all_out,
            S1.poller_channel_name(self.name): self.on_my_name
        }
Example #12
    def dump_gid(self, gid):
        print('Dumping user, GID: {0}'.format(gid))

        # get child bindings for this account
        children = set(self.data.get_destination_users(gid, 'children'))
        if not children or (len(children) == 1 and gid in children):
            if not self.data.rc.exists(S1.cache_key(gid)):
                print('****** SELF CHILD + NO CACHE, SKIPPED, GID: {0}'.format(gid))
                return

        # just to be safe
        children.add(gid)
        for child in children:
            self.dump_source(gid, child)
Example #13
    def is_cache(self, gid, option):
        # get the last request timestamp and filter
        requested_str = self.rc.hget(S1.cache_key(gid),
                                     S1.cache_requested_key())
        last_filter = self.rc.hget(S1.cache_key(gid), S1.cache_filter_key())
        # save the current request time and filter
        self.rc.hset(S1.cache_key(gid), S1.cache_requested_key(), time.time())
        self.rc.hset(S1.cache_key(gid), S1.cache_filter_key(), option)

        if requested_str:
            same_filter = not last_filter or option == last_filter
            if same_filter and time.time() - float(requested_str) < config.DEFAULT_FORCE_MISS_TIMEOUT:
                self.logger.warning('Force-request detected [{0}]'.format(gid))
            else:
                # cache hit
                return True
        else:
            self.logger.warning('New user detected [{0}]'.format(gid))

        # cache miss
        return False
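
So a repeat request with the same (or no) filter inside DEFAULT_FORCE_MISS_TIMEOUT is treated as the user forcing a refresh, any other prior request is a hit, and no prior request marks a new user. A pure-function sketch of that decision table; is_cache_hit and the 30-second timeout are illustrative:

    FORCE_MISS_TIMEOUT = 30   # illustrative; mirrors config.DEFAULT_FORCE_MISS_TIMEOUT

    def is_cache_hit(now, requested, last_filter, option):
        if requested is None:
            return False                        # new user: miss
        same_filter = not last_filter or option == last_filter
        if same_filter and now - float(requested) < FORCE_MISS_TIMEOUT:
            return False                        # rapid same-filter repeat: forced miss
        return True                             # cache hit

    assert is_cache_hit(100.0, None, None, 'f') is False    # new user
    assert is_cache_hit(100.0, 95.0, 'f', 'f') is False     # force-request
    assert is_cache_hit(100.0, 10.0, 'f', 'f') is True      # normal hit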
Example #14
 def add_gid_set(self, gid, at_epoch):
     # value-then-score argument order matches a pre-3.0 redis-py client;
     # redis-py 3.x expects a mapping instead: zadd(key, {gid: at_epoch})
     self.rc.zadd(S1.gid_set('all'), gid, at_epoch)
Example #15
    def process_activities_doc(self, gid, activities_doc, force=False):
        # validate received data
        updated = GoogleRSS.get_update_timestamp(activities_doc)
        if not updated:
            self.logger.warning(
                'Received empty data set for [{0}]'.format(gid))
            return

        # set last successful poll timestamp
        # users with no posts in their Google Plus feeds will not be able to connect,
        # as the FE monitors this timestamp before accepting a new account link
        self.data.cache.set_poll_stamp(gid, time.time())

        # set cache-specific meta-data
        last_updated = self.data.get_destination_update(gid, 'cache', gid)
        self.logger.info(
            'Received data for [{0}], updated [{1}], last_updated [{2}]'.
            format(gid, updated, last_updated))
        if last_updated and updated < last_updated:
            # Incomplete data?
            self.logger.warning('Warning: Updated timestamp jumped to past!')
            return

        # check if new update is in
        last_etag = self.data.get_destination_param(gid, 'cache', gid,
                                                    S1.etag_key())
        etag = GoogleRSS.get_item_etag(activities_doc)
        if not force and last_etag == etag:
            self.logger.debug('Same data for {0}, last_updated={1}'.format(
                gid, last_updated))
            return

        # save etag
        self.data.set_destination_param(gid, 'cache', gid, S1.etag_key(), etag)

        # set cache destination updated
        self.data.set_destination_update(gid, 'cache', gid, updated)

        # shorten long URLs for reshared items (or all items if the gid opts in)
        items = GoogleRSS.get_updated_since(activities_doc, last_updated)
        shorten = self.data.get_gid_shorten_urls(gid)
        urls = {
            url
            for item in items
            if shorten or GoogleRSS.get_item_is_share(item)
            for url in GoogleRSS.get_long_urls(item)
        }
        for url in urls:
            u = self.data.cache.get_short_url(url)
            if not u:
                u = self.shortener.get_short_url(url)
                self.data.cache.cache_short_url(url, u)

        # store the dataset
        self.data.cache.cache_activities_doc(gid, activities_doc)

        # notify publishers
        self.data.flush_updates(gid)

        # process stats data
        # new user ?
        if not last_updated:
            self.logger.warning(
                'Building new user activity map for {0}'.format(gid))
            self._build_user_activity_map(gid, activities_doc)
            # fake an update now as user is likely online when this code is executed
            self.data.cache.incr_num_minute_updates(gid, time.time())
        elif last_updated < updated:
            # increment update count for this minute
            self.logger.debug(
                'Updating user activity map for {0}, data updated={1}'.format(
                    gid, updated))
            self._build_user_activity_map(gid,
                                          activities_doc,
                                          last_updated=last_updated)
        else:
            self.logger.debug(
                'No activity map updates for {0}, data updated={1}'.format(
                    gid, updated))
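
The etag comparison above is the dedup gate: when the stored etag matches the document's etag and force is off, the feed is unchanged and the rest of the pipeline is skipped. A small sketch of that guard (should_process is a made-up name):

    def should_process(stored_etag, new_etag, force=False):
        return force or stored_etag != new_etag

    assert should_process('abc', 'abc') is False            # same data: skip
    assert should_process('abc', 'xyz') is True             # new data
    assert should_process('abc', 'abc', force=True) is True # forced reprocess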
Example #16
 def root_key(self, user):
     return S1.provider_root_key(self.name, user)
Example #17
 def set_poll_stamp(self, gid, stamp):
     self.rc.hset(S1.cache_key(gid), S1.polled_key(), stamp)
Example #18
 def reset_cache(self, gid):
     self.set_poll_stamp(gid, 0)
     self.rc.hset(S1.cache_key(gid), S1.cache_requested_key(), time.time())
Example #19
 def _stop_worker(self, p):
     self.logger.info('Stopping worker: {0}...'.format(p[0]))
     self.data.unregister_poller(p[0])
     self.send_exit(S1.poller_channel_name(p[0]))
     p[1].join()
Example #20
 def set_gid_max_results(self, gid, max_results):
     return self.rc.hset(S1.cache_key(gid), S1.cache_max_results_key(),
                         max_results)
Example #21
 def get_gid_max_results(self, gid):
     return self.get_hfield(S1.cache_key(gid), S1.cache_max_results_key(),
                            config.DEFAULT_MAX_RESULTS)
Example #22
 def get_short_url(self, url):
     return self.rc.hget(S1.cache_url_key(), url)
Example #23
 def cache_short_url(self, url, short_url):
     return self.rc.hset(S1.cache_url_key(), url, short_url)
Example #24
 def get_num_minute_updates(self, gid, stamp, spread_minutes):
     keys = self.rc.hkeys(S1.cache_key(gid))
     return sum([
         self.hget_int(S1.cache_key(gid), k) for k in keys
         if self._is_key_in_stamp(k, stamp, spread_minutes)
     ])
Example #25
 def incr_num_minute_updates(self, gid, stamp):
     # bucket by minute-of-day (0..1439); keep the minute integral so the
     # hash field name matches what _is_key_in_stamp parses back
     minute = int(stamp // 60) % 1440
     self.rc.hincrby(S1.cache_key(gid), S1.updated_minute_fmt(minute))
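
Together with get_num_minute_updates above, this forms a 1440-bucket per-minute activity histogram per gid: the writer increments the current minute's bucket, the reader sums the buckets inside a ±spread window. A dict-backed sketch of the round trip (counts stands in for the Redis hash):

    counts = {}                                 # minute-of-day -> update count

    def incr(stamp):
        minute = int(stamp // 60) % 1440
        counts[minute] = counts.get(minute, 0) + 1

    def updates_near(stamp, spread_minutes):
        m = stamp // 60 % 1440
        return sum(v for k, v in counts.items()
                   if m - spread_minutes <= k <= m + spread_minutes)

    incr(1400000000); incr(1400000030)          # two updates within minute 1013
    assert updates_near(1400000000, 2) == 2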
Example #26
 def get_next_registered(self):
     return self.rc.spop(S1.register_set())
Example #27
    logging.basicConfig(format='%(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)
    logger.addHandler(
        config.getLogHandler(os.path.join(args.log_path, 'poller_test.log')))
    logger.level = logging.DEBUG

    db = data.Data(logger, args.redis_host, args.redis_port, args.redis_db)

    while True:
        logger.warning(
            'Invoking registration for all, next poll in {0} seconds'.format(
                args.period))
        with open(args.gid_set) as f_set:
            gid_set = [gid.strip() for gid in f_set.readlines()]

        logger.info('Read [{0}] gids'.format(len(gid_set)))
        for n in range(0, len(gid_set)):
            gid = gid_set[randint(0, len(gid_set) - 1)]
            logger.info('Invoking registration for [{0}]'.format(gid))

            db.pubsub.broadcast_command(S1.publisher_channel_name('twitter'),
                                        S1.msg_register(), gid)

            t = randint(5, 20)
            logger.info('Sleeping for [{0}]'.format(t))
            time.sleep(t)
        # wait for the next period
        time.sleep(args.period)
Example #28
 def get_activities(self, gid):
     str_value = self.rc.hget(S1.cache_key(gid), S1.cache_items_key())
     return json.loads(str_value) if str_value else None
Example #29
    parser.add_argument('--period', default=60, type=int)
    args = parser.parse_args()

    logging.basicConfig(format='%(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)
    logger.addHandler(
        config.getLogHandler(os.path.join(args.log_path, 'poller_test.log')))
    logger.level = logging.DEBUG

    db = data.Data(logger, args.redis_host, args.redis_port, args.redis_db)

    while True:
        logger.warning(
            'Invoking poll for all, next poll in {0} seconds'.format(
                args.period))
        with open(args.gid_set) as f_set:
            gid_set = [gid.strip() for gid in f_set.readlines()]

        logger.info('Read [{0}] gids'.format(len(gid_set)))
        for n in range(0, len(gid_set)):
            gid = gid_set[randint(0, len(gid_set) - 1)]
            logger.info('Invoking rebalance for [{0}]'.format(gid))
            db.rc.sadd(S1.register_set(), gid)
            db.register_gid(gid)
            t = randint(5, 20)
            logger.info('Sleeping for [{0}]'.format(t))
            time.sleep(t)
        # wait for the next period
        time.sleep(args.period)
Example #30
 def get_poll_stamp(self, gid):
     polled_str = self.rc.hget(S1.cache_key(gid), S1.polled_key())
     if polled_str:
         return float(polled_str)
     return 0