def __init__(self, *args, **kwargs):
    """Initialize the Twitter API client from an ``apikeys`` dict.

    Keyword Args:
        apikeys (dict): Twitter credentials; must contain ``app_key`` and
            ``app_secret``. May also carry ``oauth_token`` /
            ``oauth_token_secret`` (stripped when OAuth2 is used).
        oauth2 (bool): when True (the default), exchange app_key/app_secret
            for an application-level, read-only bearer token via OAuth2
            and pass it on as ``access_token``.

    Raises:
        MissingArgs: if ``apikeys`` is not supplied or is empty.
    """
    logger.info(kwargs)
    import copy
    apikeys = copy.copy(kwargs.pop('apikeys', None))

    if not apikeys:
        raise MissingArgs('apikeys is missing')

    self.apikeys = copy.copy(apikeys)  # keep an untouched copy

    # Default to OAuth2: application-level (read-only) access.
    oauth2 = kwargs.pop('oauth2', True)

    if oauth2:
        # User-level tokens are not needed for app-level auth; drop them if
        # present. Bug fix: pop with a default so a credentials dict without
        # these keys no longer raises KeyError.
        apikeys.pop('oauth_token', None)
        apikeys.pop('oauth_token_secret', None)

        twitter = twython.Twython(apikeys['app_key'], apikeys['app_secret'], oauth_version=2)
        access_token = twitter.obtain_access_token()
        kwargs['access_token'] = access_token

        # Once the bearer token is obtained, the secret must not be
        # forwarded to the underlying client.
        apikeys.pop('app_secret')

    kwargs.update(apikeys)

    super(TwitterAPI, self).__init__(*args, **kwargs)
def __init__(self, node_id, crawler_id, apikeys, handlers, redis_config, proxies=None):
    """Initialize a user-relationship crawler worker.

    Args:
        node_id: id of the node this crawler belongs to.
        crawler_id: id of this crawler instance.
        apikeys (dict): Twitter credentials; a private copy is kept.
        handlers: output handlers the crawled data is written to; required.
        redis_config: redis connection settings, forwarded to the base
            class and to the node's command queue.
        proxies: optional iterable of proxies; consumed one at a time by
            ``init_user_api``.

    Raises:
        MissingArgs: if ``handlers`` is None.
    """
    # Bug fix: compare against None with ``is``, not ``==`` (PEP 8).
    if handlers is None:
        raise MissingArgs("you need a handler to write the data to...")

    super(UserRelationshipCrawler, self).__init__(node_id, crawler_id, redis_config, handlers)

    self.apikeys = copy.copy(apikeys)  # private snapshot of the credentials

    # Maps incoming command names to the API method that serves them;
    # CRAWL_FRIENDS/CRAWL_FOLLOWERS distinguish full-user vs id-only crawls.
    self.tasks = {
        "TERMINATE": "TERMINATE",
        "CRAWL_FRIENDS": {
            "users": "find_all_friends",
            "ids": "find_all_friend_ids",
            "network_type": "friends"
        },
        "CRAWL_FOLLOWERS": {
            "users": "find_all_followers",
            "ids": "find_all_follower_ids",
            "network_type": "followers"
        },
        "CRAWL_USER_TIMELINE": "fetch_user_timeline",
        "CRAWL_TWEET": "fetch_tweet_by_id"
    }

    self.node_queue = NodeQueue(self.node_id, redis_config=redis_config)
    self.client_args = {"timeout": 300}
    # Lazily advanced iterator: each init_user_api call can take the next proxy.
    self.proxies = iter(proxies) if proxies else None
    self.user_api = None
    self.init_user_api()
def __init__(self, *args, **kwargs):
    """Initialize the streamer.

    Pulls ``apikeys`` and ``output`` out of ``kwargs``, opens the output
    file in append mode, and forwards the credentials (plus any remaining
    kwargs) to the base streaming class.

    Keyword Args:
        apikeys (dict): Twitter credentials; a private copy is kept.
        output (str): path of the file that received data is appended to.

    Raises:
        MissingArgs: if either ``apikeys`` or ``output`` is missing.
    """
    logger.info(kwargs)
    import copy

    apikeys = copy.copy(kwargs.pop('apikeys', None))
    output_path = copy.copy(kwargs.pop('output', None))

    # Validate both required arguments, apikeys first (same order and same
    # messages as before).
    for value, label in ((apikeys, 'apikeys'), (output_path, 'output')):
        if not value:
            raise MissingArgs('%s is missing' % label)

    self.apikeys = copy.copy(apikeys)  # private snapshot of the credentials
    # Kept open for the streamer's lifetime; append mode so restarts
    # don't clobber earlier output.
    self.output = open(output_path, 'a')
    self.counter = 0

    kwargs.update(apikeys)
    super(Streamer, self).__init__(*args, **kwargs)
def find_all_friend_ids(self, user_id=None, write_to_handlers=None, cmd_handlers=None, bucket="friend_ids"):
    """Page through the friends/ids endpoint for *user_id* and append each
    page of results (JSON-encoded) to every handler.

    Args:
        user_id: the user whose friend ids are crawled; required.
        write_to_handlers: objects with ``append(data, bucket=..., key=...)``
            that persist the results; defaults to no handlers.
        cmd_handlers: additional handlers that receive the same payload;
            defaults to no handlers.
        bucket: handler bucket name the results are filed under.

    Raises:
        MissingArgs: if ``user_id`` is falsy.
        MaxRetryReached: after MAX_RETRY_CNT consecutive non-rate-limit
            failures.
    """
    if not user_id:
        raise MissingArgs("user_id cannot be None")

    # Bug fix: the defaults were mutable lists ([]) shared across calls;
    # use None sentinels instead.
    write_to_handlers = [] if write_to_handlers is None else write_to_handlers
    cmd_handlers = [] if cmd_handlers is None else cmd_handlers

    retry_cnt = MAX_RETRY_CNT
    cursor = -1
    # Bug fix: the loop previously ran while retry_cnt > 1, so the
    # retry_cnt == 0 check below was unreachable and MaxRetryReached was
    # never raised — exhausted retries exited silently. With > 0 the final
    # attempt runs and the exception fires as intended.
    while cursor != 0 and retry_cnt > 0:
        try:
            friend_ids = self.get_friends_ids(user_id=user_id, cursor=cursor, count=200)

            # Serialize once; both handler groups receive the same payload.
            payload = json.dumps(friend_ids)
            for handler in write_to_handlers:
                handler.append(payload, bucket=bucket, key=user_id)
            for handler in cmd_handlers:
                handler.append(payload, bucket=bucket, key=user_id)

            # cursor == 0 signals the last page.
            cursor = int(friend_ids['next_cursor'])

            logger.debug("find #%d friend_ids... NEXT_CURSOR: %d" % (len(friend_ids["ids"]), cursor))
            time.sleep(2)  # be gentle between pages
        except twython.exceptions.TwythonRateLimitError:
            # Rate limits are waited out, not counted as retries.
            self.rate_limit_error_occured('friends', '/friends/ids')
        except Exception as exc:
            time.sleep(10)
            logger.debug("exception: %s" % exc)
            retry_cnt -= 1
            if retry_cnt == 0:
                raise MaxRetryReached("max retry reached due to %s" % (exc))

    logger.debug("finished find_all_friend_ids for %s..." % (user_id))