def get_profiles(self, ids=None, screen_names=None):
    """Fetch user profiles in batches of 100 (Twitter lookup cap).

    :param ids: optional iterable of numeric user IDs to look up
    :param screen_names: optional iterable of screen names to look up
    """
    # TODO Deal with timeouts
    if ids:
        lookup = []
        remain = len(ids)
        for id_ in ids:
            lookup.append(id_)
            if len(lookup) >= 100:  # limit 100 profiles per request
                self._stream_profiles(ids=lookup, by_id=True)
                remain -= len(lookup)
                # BUG FIX: format() was called with positional args against
                # named {lookup}/{remain} placeholders, raising KeyError.
                l.INFO("Fetching {} profiles. {} remain".format(
                    len(lookup), remain))
                # BUG FIX: reset the batch; previously the same IDs were
                # re-sent and the batch kept growing past 100.
                lookup = []
        if lookup:
            l.INFO("Fetching remaining %s profile(s)" % (len(lookup)))
            self._stream_profiles(ids=lookup, by_id=True)
    if screen_names:
        lookup = []
        remain = len(screen_names)
        for sn in screen_names:
            lookup.append(sn)
            if len(lookup) >= 100:  # limit 100 profiles per request
                self._stream_profiles(lookup, by_id=False)
                remain -= len(lookup)
                l.INFO("Fetching {} profiles. {} remain".format(
                    len(lookup), remain))
                lookup = []
        if lookup:
            l.INFO("Fetching remaining %s profile(s)" % (len(lookup)))
            self._stream_profiles(lookup, by_id=False)
def _stream_tweets_by_user_id(self, id_, limit=3200, **kwargs):
    """Stream a user's timeline in pages of 200 tweets via on_tweet().

    :param id_: numeric Twitter user ID
    :param limit: max tweets to gather for this user (the API caps
        retrievable history at 3200) — backward-compatible addition;
        the original silently discarded a caller-supplied ``limit``
    Remaining keyword args are passed through to GetUserTimeline.
    """
    # TODO rework this to use min/max tweets instead of assuming < 200
    # means done
    # BUG FIX: the original rebound kwargs to a fresh dict, throwing away
    # every option the caller passed in (including limit).
    kwargs.update(user_id=id_, count=200)
    tweets_gathered = 0
    # TODO consider breaking up/refactoring
    while True:
        try:
            l.INFO("Fetching 200 tweets %s" % (kwargs))
            tweets = self.api.GetUserTimeline(**kwargs)
            tweets_gathered += len(tweets)
        except Exception as e:
            l.WARN("%s kwargs %s" % (e, kwargs))
            return None
        l.INFO("Streaming tweets")
        for tweet in tweets:
            self.on_tweet(tweet)
        if tweets_gathered >= limit:
            # Mirrors the per-user cap in the sibling _stream_tweets.
            l.INFO("Per user limit hit {} tweets gathered".format(limit))
            break
        if len(tweets) < 200:
            # TODO Fix - Using <200 as proxy for end of user timeline
            l.INFO("Stream ended < 200 tweets")
            break
        tweet_ids = [tweet.id for tweet in tweets]
        if len(tweet_ids) > 0:
            # Next request start at oldest tweet in current request
            l.INFO("Setting max ID: {}".format(min(tweet_ids)))
            kwargs['max_id'] = min(tweet_ids)
def get_tweets(self, ids=None, screen_names=None, limit=3200):
    """Gather tweets for each given user ID and/or screen name.

    :param ids: optional iterable of numeric user IDs
    :param screen_names: optional iterable of screen names
    :param limit: max tweets to gather per user
    """
    if ids:
        for id_ in ids:
            l.INFO("Gathering tweets for user ID {}".format(id_))
            # BUG FIX: _stream_tweets takes user_id, not id_ — the
            # original keyword raised TypeError.
            self._stream_tweets(user_id=id_, limit=limit)
    if screen_names:
        for screen_name in screen_names:
            l.INFO("Gathering tweets for user {}".format(screen_name))
            self._stream_tweets(screen_name=screen_name, limit=limit)
def _stream_friends_by_screen_name(self, screen_name, request_limit=3):
    """Fetch friend IDs for a screen name and emit them via on_connection.

    :param screen_name: Twitter screen name whose friends to fetch
    :param request_limit: number of 5000-ID pages to request
    :return: list of friend user IDs
    """
    kwargs = dict(
        screen_name=screen_name,
        cursor=-1,
        total_count=request_limit * 5000
    )
    l.INFO("Getting friends %s" % (kwargs))
    friends = self.api.GetFriendIDs(**kwargs)
    l.INFO("Streaming connections %s friends found" % (len(friends)))
    for friend in friends:
        # BUG FIX: original referenced an undefined `user_id` (NameError)
        # and passed the friend ID as type_. Report the owning account and
        # the literal relationship label, matching _stream_followers'
        # type_='follower' convention.
        self.on_connection(screen_name, friend, type_='friend')
    return friends
def _stream_followers_by_id(self, user_id, request_limit):
    """Fetch follower IDs for a user ID and emit them via on_connection.

    :param user_id: numeric Twitter user ID whose followers to fetch
    :param request_limit: number of 5000-ID pages to request
    :return: list of follower user IDs
    """
    kwargs = dict(
        user_id=user_id,
        cursor=-1,
        total_count=request_limit * 5000
    )
    # BUG FIX: message said "friends" while fetching followers.
    l.INFO("Getting followers %s" % (kwargs))
    followers = self.api.GetFollowerIDs(**kwargs)
    l.INFO("Streaming connections %s followers found" % (len(followers)))
    for follower in followers:
        # BUG FIX: type_ was the follower ID itself; the relationship
        # label string is what _stream_followers passes.
        self.on_connection(user_id, follower, type_='follower')
    return followers
def _fetch_new(self, active_threads):
    """
    Adds new threads to thread cache

    :param active_threads: list of currently active threads
    :return: int number threads added
    """
    # IDs already present in the cache; set gives O(1) membership tests.
    cached_ids = {cached.id for cached in self.thread_cache}
    added = 0
    for thread_id in active_threads:
        if thread_id in cached_ids:
            continue
        self.thread_cache.append(self._fetch_one(thread_id))
        l.INFO("{} added to thread cache".format(thread_id))
        added += 1
    l.INFO("Processed {} new threads".format(added))
    return added
def _poll_thread(self, thread):
    """Polls 4chan thread for updates or archival"""
    # Throttle one API request per poll.
    time.sleep(self.sleep_per_request)
    new_posts = thread.update()
    if not new_posts:
        l.INFO("{} no updates".format(thread.id))
    else:
        l.INFO("{} has {} new updates".format(thread.id, new_posts))
        self.on_update(thread)
    if thread.archived:
        # Archived threads leave the cache and fire the archive hook.
        self.thread_cache.remove(thread)
        self.on_archive(thread)
        l.INFO("{} has been archived".format(thread.id))
def get_tweets(self, ids=None, screen_names=None, limit=3200):
    """Gather tweets for the given user IDs and/or screen names.

    :param ids: optional iterable of numeric user IDs
    :param screen_names: optional iterable of screen names (resolved
        to IDs before streaming)
    :param limit: max tweets to gather per user
    """
    # BUG FIX: copy instead of mutating a caller-supplied list with +=.
    ids = [] if ids is None else list(ids)
    if screen_names is not None:  # idiom: `is not None`, not `not ... is None`
        ids += self._screen_names_to_ids(screen_names)
    l.INFO("Getting tweets for ids: %s" % (ids))
    for id_ in ids:
        self._stream_tweets_by_user_id(id_, limit=limit)
def _fetch_one(self, thread_id):
    """
    Get a new thread

    :param thread_id: int single id of thread to initiate
    :return: thread object
    """
    # Throttle one API request per fetch.
    time.sleep(self.sleep_per_request)
    fetched = self.board.get_thread(thread_id)
    l.INFO("Fetching thread ID {}".format(thread_id))
    return fetched
def _stream_followers(self, user_id=None, screen_name=None,
                      request_limit=3):
    """Stream follower connections for a user given by ID or screen name.

    :param user_id: optional numeric Twitter user ID
    :param screen_name: optional screen name (resolved to an ID for the
        connection records)
    :param request_limit: number of 5000-ID pages to request
    """
    kwargs = dict(cursor=-1, total_count=request_limit * 5000)
    if user_id:
        kwargs['user_id'] = user_id
    if screen_name:
        kwargs['screen_name'] = screen_name
        # User ID needed for connection object
        profile = self._fetch_profile_by_screen_name(
            screen_name=[screen_name])[0]
        user_id = profile.id
    l.INFO("Getting followers %s" % (kwargs))
    follower_ids = self.api.GetFollowerIDs(**kwargs)
    l.INFO("Streaming connections %s followers found" % (len(follower_ids)))
    for follower_id in follower_ids:
        self.on_connection(user_id, follower_id, type_='follower')
def cli(ctx, config):
    """Load an optional S3 JSON config, then convert stdin with it.

    Falls back to an empty configuration when no config path was given,
    the file is missing/unreadable, or its JSON is malformed.
    """
    s3_config = {}
    try:
        with open(config, 'r') as config_file:
            s3_config = json.load(config_file)
        l.INFO("Using custom CSV configuration: %s" % (s3_config))
    # BUG FIX: TypeError only covers config=None; a missing file (IOError)
    # or malformed JSON (ValueError / json.JSONDecodeError) previously
    # crashed instead of falling back.
    except (TypeError, IOError, ValueError):
        l.WARN("Unable to parse s3 config")
    input_ = click.get_text_stream('stdin')
    convert(input_, configuration=s3_config)
def get_profiles(self, ids=None, stream=True):
    """Fetch user profiles in batches of 100 (Twitter lookup cap).

    :param ids: optional iterable of numeric user IDs
    :param stream: passed through to _fetch_users_by_id
    :return: list of fetched profile objects
    """
    # TODO profiles by screen_name
    # TODO Deal with timeouts
    # BUG FIX: len(None) raised TypeError when called with the default.
    if ids is None:
        ids = []
    lookup = []
    remain = len(ids)
    profiles = []
    for id_ in ids:
        lookup.append(id_)
        if len(lookup) >= 100:  # limit 100 profiles per request
            chunk = self._fetch_users_by_id(ids=lookup, stream=stream)
            remain -= len(lookup)
            # BUG FIX: format() was called with positional args against
            # named {lookup}/{remain} placeholders, raising KeyError.
            l.INFO("Fetching {} profiles. {} remain".format(
                len(lookup), remain))
            profiles += chunk
            # BUG FIX: reset the batch; previously the same IDs were
            # re-fetched and the batch kept growing past 100.
            lookup = []
    if lookup:
        l.INFO("Fetching remaining %s profile(s)" % (len(lookup)))
        profiles += self._fetch_users_by_id(ids=lookup, stream=stream)
    return profiles
def cli(ctx, config):
    """Load an optional CSV JSON config, then convert stdin with it.

    Falls back to the default configuration when no config path was
    given, the file is missing/unreadable, or its JSON is malformed.
    """
    csv_config = {}
    try:
        with open(config, 'r') as config_file:
            csv_config = json.load(config_file)
        l.INFO("Using custom CSV configuration: %s" % (csv_config))
    # BUG FIX: TypeError only covers config=None; a missing file (IOError)
    # or malformed JSON (ValueError / json.JSONDecodeError) previously
    # crashed instead of falling back.
    except (TypeError, IOError, ValueError):
        # NOTE(review): csv_config stays {} here while the log claims the
        # default is used — presumably convert() treats an empty dict as
        # CSV_DEFAULT_CONFIG; confirm against convert's implementation.
        l.WARN("Using default CSV configuration: %s" % (CSV_DEFAULT_CONFIG))
    input_ = click.get_text_stream('stdin')
    convert(input_, configuration=csv_config)
def follow(self):
    """
    Build a thread cache of active threads.
    Loop over threads until they are archived
    Ends on first loop after stop_timer limit is hit.
    If stop_timer=false follow runs indefinitely
    """
    self.start = datetime.datetime.utcnow()
    self.thread_cache = self.board.get_all_threads()
    l.INFO("Thread cache initialized {} active threads".format(
        len(self.thread_cache)))
    l.INFO("Running for {} minutes".format(self.stop_timer))
    # BUG FIX: docstring promises indefinite operation when stop_timer is
    # falsy, but `... and self.stop_timer` made the loop exit immediately
    # in that case.
    while not self.stop_timer or not self._time_expired():
        self.loop_start = datetime.datetime.utcnow()
        # BUG FIX: refresh the active-thread list every iteration; it was
        # fetched once before the loop, so threads created after start
        # were never added to the cache.
        active_threads = self.board.get_all_thread_ids()
        self._fetch_new(active_threads)
        self.update()
        l.INFO("Thread cache loop complete time elapsed: {}".format(
            datetime.datetime.utcnow() - self.loop_start))
        # Log before sleeping so the message explains the pause.
        l.INFO("Sleeping {} seconds before restart".format(
            self.sleep_per_loop))
        time.sleep(self.sleep_per_loop)
    end = datetime.datetime.utcnow()
    elapsed = end - self.start
    l.INFO("Stopping /{} collection".format(self.board.name.upper()))
    l.INFO("Time Elapsed {}".format(elapsed))
    return
def _stream_tweets(self, user_id=None, screen_name=None, limit=3200):
    """Stream a user's timeline in pages of 200 tweets via on_tweet().

    :param user_id: optional numeric Twitter user ID
    :param screen_name: optional screen name
    :param limit: max tweets to gather for this user (the API caps
        retrievable history at 3200)
    """
    # TODO rework this to use min/max tweets instead of assuming < 200
    # means done
    kwargs = dict(count=200)
    # BUG FIX: the original never put the target user into the request,
    # so GetUserTimeline was called without user_id/screen_name.
    if user_id:
        kwargs['user_id'] = user_id
    if screen_name:
        kwargs['screen_name'] = screen_name
    tweets_gathered = 0
    while True:
        try:
            l.INFO("Fetching 200 tweets %s" % (kwargs))
            tweets = self.api.GetUserTimeline(**kwargs)
            tweets_gathered += len(tweets)
        except Exception as e:
            l.WARN("%s kwargs %s" % (e, kwargs))
            return None
        l.INFO("Streaming tweets")
        for tweet in tweets:
            self.on_tweet(tweet)
        if tweets_gathered >= limit:
            l.INFO("Per user limit hit {} tweets gathered".format(limit))
            break
        if len(tweets) < 200:
            # TODO Fix - Using <200 as proxy for end of user timeline
            l.INFO("Stream ended < 200 tweets")
            break
        tweet_ids = [tweet.id for tweet in tweets]
        if len(tweet_ids) > 0:
            # Next request start at oldest tweet in current request
            l.INFO("Setting max ID: {}".format(min(tweet_ids)))
            kwargs['max_id'] = min(tweet_ids)
def cli(ctx, users, from_file, from_pipe):
    """Collect screen names from options, a CSV file, and/or stdin,
    then fetch their friend relationships.
    """
    collector = GetFriendsLogger()
    screen_names = []
    if users is not None:  # idiom: `is not None`, not `not ... is None`
        screen_names = users.split(',')
    if from_file is not None:
        reader = csv.reader(from_file)
        for row in reader:
            # Guard against blank lines; row[0] on an empty row raised
            # IndexError before.
            if row:
                screen_names.append(row[0])
    if from_pipe:
        try:
            stdin_text = (
                click.get_text_stream('stdin').read().strip()).split('\n')
            screen_names.extend(stdin_text)
        except Exception as e:
            raise RuntimeError("Error while reading pipe: %s" % (e))
    l.INFO("Getting user relationship for users: %s" % (screen_names))
    collector.get_friends(screen_names=screen_names)
def on_profile(self, profile):
    """Log a fetched profile."""
    message = "PROFILE: %s" % (profile,)
    l.INFO(message)
def on_tweet(self, tweet):
    """Log the tweet text as UTF-8-encoded bytes."""
    encoded_text = tweet.text.encode('utf-8')
    l.INFO(encoded_text)
def main(**kwargs):
    """Entry point; announces that SMTK is starting."""
    message = "Starting SMTK"
    l.INFO(message)
def on_connection(self, account, connection, type_):
    """Called when connection is found"""
    message = "{} found {} with {}".format(type_, account, connection)
    l.INFO(message)
def on_profile(self, profile):
    """Called when profile is found"""
    found = profile
    l.INFO(found)
def on_tweet(self, tweet):
    """ Called when tweet is found"""
    message = "TWEET FOUND: {}".format(tweet.text)
    l.INFO(message)
def google():
    """Handle detection of the google subcommand."""
    message = "Google Command Detected"
    l.INFO(message)
def on_tweet(self, tweet):
    """Log a found tweet's text as UTF-8-encoded bytes."""
    encoded_text = tweet.text.encode('utf-8')
    l.INFO("TWEET: %s" % (encoded_text,))
def target():
    """Handle detection of the target subcommand."""
    message = "Target Command Detected"
    l.INFO(message)
def twitter():
    """Handle detection of the twitter subcommand."""
    message = "Twitter Command Detected"
    l.INFO(message)
def update(self):
    """Cycle through thread_cache polling for updates"""
    # BUG FIX: _poll_thread removes archived threads from thread_cache,
    # so iterate over a snapshot — mutating the list while iterating it
    # directly skips the entry after each removal.
    for thread in list(self.thread_cache):
        self._poll_thread(thread)
    l.INFO("Active threads {}".format(len(self.thread_cache)))
def on_start(self): l.INFO(""" Starting GoogleImageCrawler for keyword: %s """ % (self.keyword)) singer.write_schema(self.stream_name, self.schema, ['image', 'link'])