def check_orphan(self, gid, at_time):
    """ returns True if gid is orphaned """
    # get last requested timestamp
    requested_str = self.rc.hget(S1.cache_key(gid), S1.cache_requested_key())
    if not requested_str:
        return False

    # compare to orphaned timeout
    if at_time - float(requested_str) < config.DEFAULT_ORPHANED_TIMEOUT:
        return False

    return True
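# Usage sketch (illustrative only; `cache` stands for an instance of the class
# above, `gids` for an iterable of account ids): a gid whose "requested" stamp
# is older than config.DEFAULT_ORPHANED_TIMEOUT is orphaned, so a driver can
# filter it out of a poll batch exactly as schedule_next_batch does below.
def _example_filter_orphans(cache, gids):
    import time
    now = time.time()
    return [gid for gid in gids if not cache.check_orphan(gid, now)]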
def run(self, *args, **kwargs):
    self.logger.info('Publisher [{0}], starting...'.format(self.name))

    callback = {
        S1.msg_publish(): self._on_publish_updates,
        S1.msg_register(): self._on_register,
        S1.msg_update_avatar(): self._on_update_avatar,
    }

    channels = [S1.publisher_channel_name('all'),
                S1.publisher_channel_name(self.name)]
    channels.extend([name for name in self.providers.keys() if name != self.name])

    # this starts an infinite loop (in PubSub)
    self.listener(channels, callback)

    self.logger.warning('Publisher [{0}], listener exit!'.format(self.name))
def schedule_next_batch(self, allow_worker_start=False):
    try:
        self.logger.info('[{0}] wake up!'.format(self.name))
        # keep fetching gid sets until everything due is processed
        while True:
            at_time = time.time()
            # look half a period ahead so gids coming due mid-sleep are not late
            gid_set = self.data.balancer.get_next_poll_set(at_time + self.period_s / 2.0)
            gid_set_len = len(gid_set)
            if not gid_set_len:
                self.logger.warning('[{0}] Empty gid_set...'.format(self.name))
                return
            elif allow_worker_start and gid_set_len > self.gid_set_threshold:
                self.logger.warning('Gid set count [{0}] above threshold, starting worker...'.format(gid_set_len))
                self.start_worker()

            self.logger.info('[{0}] Invoking poll for [{1}] items...'.format(self.name, gid_set_len))

            # drop orphaned gids
            update_set = [gid for gid in gid_set if not self.data.check_orphan(gid, at_time)]

            # post each gid to the pollers
            for gid in update_set:
                # move the gid's next poll time forward to avoid duplicate polling
                self.data.balancer.add_gid_set(gid, at_time + self.gid_poll_s)
                # post to pollers
                self.broadcast_command(S1.poller_channel_name('all'), S1.msg_update(), gid)

            # update stats
            self.update_stats(at_time, len(update_set))

    except Exception as e:
        self.logger.warning('Exception in poller driver: {0}'.format(e))
        self.logger.exception(traceback.format_exc())
        self.data.unregister_poller(self.name)
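# Claim-by-reschedule sketch (illustrative names). Re-scoring a gid before
# broadcasting it narrows, but does not atomically close, the window in which
# a second driver could pick up the same gid: the read in get_next_poll_set and
# the zadd in add_gid_set are two separate commands, not a transaction.
def _example_claim_gid(balancer, gid, at_time, gid_poll_s=600):
    balancer.add_gid_set(gid, at_time + gid_poll_s)  # push next poll forward first
    return gid  # only then hand the gid to a poller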
def run(self, *args, **kwargs):
    self.kwargs = kwargs
    cfg = config.load_config(kwargs['config_path'], 'poller.json')
    # fall back to the defaults set in __init__ for any missing keys
    self.gid_poll_s = cfg.get('gid_poll_s', self.gid_poll_s)
    self.period_s = cfg.get('period_s', self.period_s)
    self.workers_min = cfg.get('workers_min', self.workers_min)
    self.workers_max = cfg.get('workers_max', self.workers_max)

    self.logger.info('Poller v[{0}], name=[{1}], poll delay=[{2}]s, period=[{3}]s starting...'.format(config.version, self.name, self.gid_poll_s, self.period_s))

    # give pub/sub some time... not using synchronous notifications...
    time.sleep(1)

    # register self as poller
    self.data.register_poller(self.name)

    # start worker processes
    for n in range(0, self.workers_min):
        self.start_worker()

    # drop a message to self to trigger an immediate poll round
    self.broadcast_data(S1.poller_channel_name(self.name), '#')

    # start listening
    self.listener([S1.poller_channel_name('all-out'), S1.poller_channel_name(self.name)], None, timeout=self.period_s)

    self.logger.warning('Poller master listener exit!')

    # un-register self
    self.data.unregister_poller(self.name)

    # force kill any remaining workers
    while self.workers:
        p = self.workers.popitem()
        self.logger.warning('Terminating remaining poller {0}!'.format(p[0]))
        p[1].terminate()

    self.logger.warning('Poller master process exit!')
# @staticmethod assumed here: get_num_minute_updates calls this via self with
# three arguments, which only works if no implicit self is passed.
@staticmethod
def _is_key_in_stamp(key, stamp, spread_minutes):
    try:
        minute = int(S1.get_minute_fmt_minute(key))
        # minute of the day (0..1439) for the timestamp
        stamp_minute = int(stamp // 60) % 1440
        return (stamp_minute - spread_minutes) <= minute <= (stamp_minute + spread_minutes)
    except Exception:
        # keys that are not minute-format fields simply don't match
        return False
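# Worked example: an epoch two minutes past 02:00 UTC maps to minute-of-day
# 122, so spread_minutes=10 accepts keys for minutes 112..132. Note the window
# as written does not wrap past midnight (minute 5 with spread 10 will not
# match minutes 1435..1439).
def _example_minute_window(stamp, spread_minutes):
    stamp_minute = int(stamp // 60) % 1440
    return range(stamp_minute - spread_minutes, stamp_minute + spread_minutes + 1)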
def __init__(self, logger, name, data, provider_names, config_path, dummy=False):
    """
    @type data: Data
    @type logger: Logger
    @type provider_names: list
    """
    super(Publisher, self).__init__(logger, name, data, provider_names, config_path, dummy)
    # one provider instance per name, keyed by its pub/sub channel name
    self.providers = {
        S1.publisher_channel_name(p): PublisherProviders.create(p, logger, data, config_path)
        for p in provider_names
    }
def dump_gids(self):
    total = 0
    # HSCAN returns a (cursor, {field: value}) pair; iterate until the cursor is zero
    c = self.data.rc.hscan(S1.destination_key_fmt('children'))
    while len(c) > 1 and c[1]:
        total += len(c[1])
        for gid in c[1]:
            self.dump_gid(gid)

        # check if the next cursor is zero
        if c[0] == '0' or c[0] == 0:
            break

        # grab the next batch
        c = self.data.rc.hscan(S1.destination_key_fmt('children'), c[0])

    print('End of gid_set, total [{0}] GIDs.'.format(total))
    self.data_d.rc.delete(S1.register_set())
    print('Cleared register set.')
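# Equivalent walk (sketch): redis-py's hscan_iter hides the cursor bookkeeping
# done explicitly above; `rc` and `children_key` are illustrative names.
def _example_iter_children(rc, children_key):
    for gid, _value in rc.hscan_iter(children_key):
        print(gid)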
def dump_source(self, master_gid, gid):
    print('Copying source [{0}:{1}]...'.format(master_gid, gid))
    # register the child gid with pollers first
    self.data_d.register_gid(gid)

    # add the gid to the master's list of child accounts
    print('Linking GID: [{0}]m <-- [{1}]s'.format(master_gid, gid))
    self.data_d.add_linked_account(master_gid, gid)

    destinations = self.data.get_destinations(gid)
    self.log.debug('{"dest": [')
    c = 0
    for destination in destinations:
        users = self.data.get_destination_users(gid, destination)
        for user in users:
            if c != 0:
                self.log.debug(',')
            # dump destination
            self.dump_destination(master_gid, gid, destination, user)
            c += 1
    self.log.debug('],')

    # dump gid data keys (commas go outside the quoted key names)
    self.log.debug('"keys": [')
    self.log.debug('"{0}",'.format(S1.gid_key(gid)))
    self.log.debug('"{0}",'.format(S1.gid_log_key(gid)))
    self.log.debug('"{0}",'.format(S1.links_key(gid)))
    self.log.debug('"{0}"'.format(S1.cache_key(gid)))
    self.log.debug(']}')

    # copy keys
    self.copy_hash(S1.gid_key(gid))
    self.copy_zset(S1.gid_log_key(gid))
    self.copy_hash(S1.cache_key(gid))
    self.copy_set(S1.links_key(gid))

    # copy tokens for all linked destinations (will overwrite some data)
    links = self.data.get_linked_accounts(master_gid) or dict()
    for k in links:
        # copy token
        p = k.split(':')
        if p[0] not in self.data.provider:
            continue
        token = self.data.get_user_token(gid, p[0], p[1])
        self.data_d.set_user_token(gid, p[0], p[1], token)

        # copy user params
        for p_name in S1.PROVIDER_PARAMS:
            p_val = self.data.provider[p[0]].get_user_param(p[1], p_name)
            if p_val:
                self.data_d.provider[p[0]].set_user_param(p[1], p_name, p_val)
def get_next_poll_set(self, up_to_epoch):
    """
    Grabs a range of gids due up to the given epoch from the 'all' gid set.
    @return: batch of up to 200 gids to process (empty list when nothing is due)
    """
    return self.rc.zrangebyscore(S1.gid_set('all'), 0, up_to_epoch, start=0, num=200, withscores=False)
def on_terminate(self, *args, **kwargs):
    """
    Called by signal handlers from ServiceBase.
    WARNING: This can be called multiple times during process termination!
    """
    self.logger.warning('Poller master is terminating...')
    # stop workers
    while self.stop_worker():
        self.logger.warning('One worker stopped')
    # stop self
    self.send_exit(S1.poller_channel_name(self.name), self_target=True)
    self.logger.warning('Poller master terminate sequence complete!')
def __init__(self, logger, name, data, providers, config_path, dummy=False):
    """
    @type logger: Logger
    @type data: Data
    """
    super(Poller, self).__init__(logger, name, data, providers, config_path)
    self.kwargs = None
    # default poller driver period
    self.period_s = 2
    # how many gids may come due within period_s before a new worker is launched
    self.gid_set_threshold = 100
    # minimum number of worker processes
    self.workers_min = 3
    # maximum number of worker processes
    self.workers_max = 4
    # default gid poll period, 10 min
    self.gid_poll_s = 600
    # default no-poll period, 30 min
    self.gid_no_poll_s = 1800
    self.started_at = time.time()
    self.stats = {
        'hour': (self.started_at, 0),
        'day': (self.started_at, 0),
    }
    self.channel_handler = {
        S1.poller_channel_name('all-out'): self.on_all_out,
        S1.poller_channel_name(self.name): self.on_my_name
    }
def dump_gid(self, gid):
    print('Dumping user, GID: {0}'.format(gid))
    # get child bindings for this account
    children = set(self.data.get_destination_users(gid, 'children'))
    if not children or (len(children) == 1 and gid in children):
        if not self.data.rc.exists(S1.cache_key(gid)):
            print('****** SELF CHILD + NO CACHE, SKIPPED, GID: {0}'.format(gid))
            return

    # just to be safe, always include the master gid itself
    children.add(gid)
    for child in children:
        self.dump_source(gid, child)
def is_cache(self, gid, option):
    # get the last request timestamp and filter
    requested_str = self.rc.hget(S1.cache_key(gid), S1.cache_requested_key())
    last_filter = self.rc.hget(S1.cache_key(gid), S1.cache_filter_key())

    # save the current request time and filter
    self.rc.hset(S1.cache_key(gid), S1.cache_requested_key(), time.time())
    self.rc.hset(S1.cache_key(gid), S1.cache_filter_key(), option)

    if requested_str:
        if (not last_filter or option == last_filter) and \
                time.time() - float(requested_str) < config.DEFAULT_FORCE_MISS_TIMEOUT:
            # a repeat request with the same filter inside the timeout window
            # means the user is forcing a refresh
            self.logger.warning('Force-request detected [{0}]'.format(gid))
        else:
            # cache hit
            return True
    else:
        self.logger.warning('New user detected [{0}]'.format(gid))

    # cache miss
    return False
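# Decision sketch (illustrative; `cache` is an instance of the class above):
# a quick repeat request with an unchanged filter reads as a forced refresh and
# misses; any other repeat request hits; a gid with no "requested" stamp is a
# new user and misses.
def _example_serve(cache, gid, option):
    if cache.is_cache(gid, option):
        return cache.get_activities(gid)  # serve the cached document
    return None  # caller polls Google and rebuilds the cache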
def add_gid_set(self, gid, at_epoch):
    self.rc.zadd(S1.gid_set('all'), gid, at_epoch)
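# Compatibility note (the pinned redis-py version is not shown here): the
# value-then-score positional zadd above matches the legacy redis.Redis client
# from redis-py 2.x; StrictRedis expected score first, and redis-py 3.x only
# accepts a mapping, as sketched below with an illustrative key name.
def _example_add_gid_set_redis3(rc, gid, at_epoch):
    rc.zadd('gid_set:all', {gid: at_epoch})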
def process_activities_doc(self, gid, activities_doc, force=False):
    # validate received data
    updated = GoogleRSS.get_update_timestamp(activities_doc)
    if not updated:
        self.logger.warning('Received empty data set for [{0}]'.format(gid))
        return

    # set the last successful poll timestamp:
    # users with no posts in their Google Plus feeds would otherwise not be able
    # to connect, as the FE checks this timestamp before accepting a new account link
    self.data.cache.set_poll_stamp(gid, time.time())

    # get cache-specific metadata
    last_updated = self.data.get_destination_update(gid, 'cache', gid)
    self.logger.info('Received data for [{0}], updated [{1}], last_updated [{2}]'.format(gid, updated, last_updated))
    if updated < last_updated:
        # incomplete data?
        self.logger.warning('Warning: Updated timestamp jumped to past!')
        return

    # check if a new update has arrived
    last_etag = self.data.get_destination_param(gid, 'cache', gid, S1.etag_key())
    etag = GoogleRSS.get_item_etag(activities_doc)
    if not force and last_etag == etag:
        self.logger.debug('Same data for {0}, last_updated={1}'.format(gid, last_updated))
        return

    # save etag
    self.data.set_destination_param(gid, 'cache', gid, S1.etag_key(), etag)
    # set cache destination updated
    self.data.set_destination_update(gid, 'cache', gid, updated)

    # shorten urls for reshared items (or all items if the gid opts in)
    items = GoogleRSS.get_updated_since(activities_doc, last_updated)
    shorten = self.data.get_gid_shorten_urls(gid)
    urls = set([url
                for item in items
                if shorten or GoogleRSS.get_item_is_share(item)
                for url in GoogleRSS.get_long_urls(item)])
    for url in urls:
        u = self.data.cache.get_short_url(url)
        if not u:
            u = self.shortener.get_short_url(url)
            self.data.cache.cache_short_url(url, u)

    # store the dataset
    self.data.cache.cache_activities_doc(gid, activities_doc)
    # notify publishers
    self.data.flush_updates(gid)

    # process stats data
    if not last_updated:
        # new user
        self.logger.warning('Building new user activity map for {0}'.format(gid))
        self._build_user_activity_map(gid, activities_doc)
        # fake an update now, as the user is likely online while this code runs
        self.data.cache.incr_num_minute_updates(gid, time.time())
    elif last_updated < updated:
        # increment the update count for this minute
        self.logger.debug('Updating user activity map for {0}, data updated={1}'.format(gid, updated))
        self._build_user_activity_map(gid, activities_doc, last_updated=last_updated)
    else:
        self.logger.debug('No activity map updates for {0}, data updated={1}'.format(gid, updated))
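# Cache-aside sketch of the shortener lookup inside process_activities_doc:
# check redis first, call the shortener service only on a miss, then store the
# result. `cache` and `shortener` mirror the attributes used above.
def _example_shorten(cache, shortener, url):
    short = cache.get_short_url(url)
    if not short:
        short = shortener.get_short_url(url)
        cache.cache_short_url(url, short)
    return short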
def root_key(self, user):
    return S1.provider_root_key(self.name, user)
def set_poll_stamp(self, gid, stamp):
    self.rc.hset(S1.cache_key(gid), S1.polled_key(), stamp)
def reset_cache(self, gid):
    self.set_poll_stamp(gid, 0)
    self.rc.hset(S1.cache_key(gid), S1.cache_requested_key(), time.time())
def _stop_worker(self, p):
    self.logger.info('Stopping worker: {0}...'.format(p[0]))
    self.data.unregister_poller(p[0])
    self.send_exit(S1.poller_channel_name(p[0]))
    p[1].join()
def set_gid_max_results(self, gid, max_results):
    return self.rc.hset(S1.cache_key(gid), S1.cache_max_results_key(), max_results)
def get_gid_max_results(self, gid):
    return self.get_hfield(S1.cache_key(gid), S1.cache_max_results_key(), config.DEFAULT_MAX_RESULTS)
def get_short_url(self, url):
    return self.rc.hget(S1.cache_url_key(), url)
def cache_short_url(self, url, short_url):
    return self.rc.hset(S1.cache_url_key(), url, short_url)
def get_num_minute_updates(self, gid, stamp, spread_minutes):
    keys = self.rc.hkeys(S1.cache_key(gid))
    return sum(self.hget_int(S1.cache_key(gid), k)
               for k in keys
               if self._is_key_in_stamp(k, stamp, spread_minutes))
def incr_num_minute_updates(self, gid, stamp):
    # get the minute of the day (0..1439); integer division keeps the hash
    # field name a whole minute even for float epoch stamps
    minute = int(stamp // 60) % 1440
    self.rc.hincrby(S1.cache_key(gid), S1.updated_minute_fmt(minute))
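# Round-trip sketch tying incr_num_minute_updates to get_num_minute_updates:
# each update bumps a per-minute-of-day hash field, and the getter sums the
# fields within +/- spread_minutes of a timestamp, yielding a rough per-user
# activity profile. `cache` and `gid` are illustrative names.
def _example_activity_profile(cache, gid, stamp):
    cache.incr_num_minute_updates(gid, stamp)
    return cache.get_num_minute_updates(gid, stamp, spread_minutes=30)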
def get_next_registered(self):
    return self.rc.spop(S1.register_set())
logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %H:%M:%S')
logger = logging.getLogger(__name__)
logger.addHandler(config.getLogHandler(os.path.join(args.log_path, 'poller_test.log')))
logger.level = logging.DEBUG

db = data.Data(logger, args.redis_host, args.redis_port, args.redis_db)

while True:
    logger.warning('Invoking registration for all, next poll in {0} seconds'.format(args.period))
    with open(args.gid_set) as f_set:
        gid_set = [gid.strip() for gid in f_set.readlines()]

    logger.info('Read [{0}] gids'.format(len(gid_set)))
    for n in range(0, len(gid_set)):
        gid = gid_set[randint(0, len(gid_set) - 1)]
        logger.info('Invoking registration for [{0}]'.format(gid))
        db.pubsub.broadcast_command(S1.publisher_channel_name('twitter'), S1.msg_register(), gid)
        t = randint(5, 20)
        logger.info('Sleeping for [{0}]'.format(t))
        time.sleep(t)

    # get delay and wait
    time.sleep(args.period)
def get_activities(self, gid):
    str_value = self.rc.hget(S1.cache_key(gid), S1.cache_items_key())
    return json.loads(str_value) if str_value else None
parser.add_argument('--period', default=60, type=int)
args = parser.parse_args()

logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %H:%M:%S')
logger = logging.getLogger(__name__)
logger.addHandler(config.getLogHandler(os.path.join(args.log_path, 'poller_test.log')))
logger.level = logging.DEBUG

# name the instance `db` to avoid shadowing the imported `data` module
db = data.Data(logger, args.redis_host, args.redis_port, args.redis_db)

while True:
    logger.warning('Invoking poll for all, next poll in {0} seconds'.format(args.period))
    with open(args.gid_set) as f_set:
        gid_set = [gid.strip() for gid in f_set.readlines()]

    logger.info('Read [{0}] gids'.format(len(gid_set)))
    for n in range(0, len(gid_set)):
        gid = gid_set[randint(0, len(gid_set) - 1)]
        logger.info('Invoking rebalance for [{0}]'.format(gid))
        db.rc.sadd(S1.register_set(), gid)
        db.register_gid(gid)
        t = randint(5, 20)
        logger.info('Sleeping for [{0}]'.format(t))
        time.sleep(t)

    # get delay and wait
    time.sleep(args.period)
def get_poll_stamp(self, gid):
    polled_str = self.rc.hget(S1.cache_key(gid), S1.polled_key())
    if polled_str:
        return float(polled_str)
    return 0
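# Lifecycle sketch (illustrative): reset_cache zeroes the poll stamp and
# refreshes the "requested" stamp, which keeps check_orphan from dropping the
# gid while it waits to be re-polled; get_poll_stamp then stays at 0 until the
# poller records a successful poll via set_poll_stamp.
def _example_force_repoll(cache, gid):
    cache.reset_cache(gid)
    return cache.get_poll_stamp(gid)  # 0.0 until a poll succeeds again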