def __init__(self, api, collector, n_statuses, time_limit=None, data_limit=None,
             json_path="./streaming.json", backup_path=None,
             filter_user=lambda x: True, filter_status=lambda x: True,
             attempts=None, backup=None, verbose=True):
    """
    Streamer constructor; collects streaming data and stores it into a file.

    :param api: tweepy API obj
    :param collector: Collector obj used for collecting data streamed
    :param n_statuses: number of statuses to collect
    :param time_limit: duration of the streaming in seconds; if None don't
        consider, so it will last until process interrupt
    :param data_limit: number of data to collect at most (streaming data),
        if None don't consider
    :param json_path: file's location where saving the data collected,
        if None print on the std output
    :param backup_path: backup file's path
    :param filter_user: user filter function: User --> Bool
    :param filter_status: status filter function: Status --> Bool
    :param attempts: number of reconnection attempts to perform in case of
        streaming failure, first connection excluded; if None always retry
        to reconnect
    :param backup: every how many seconds to backup, if None no backup is
        scheduled
    :param verbose: verbosity
    """
    super(OnlineStreamer, self).__init__()
    self.api = api
    self.time_limit = time_limit
    self.data_limit = data_limit
    self.json_path = json_path
    # default backup file is hidden and timestamped to avoid collisions
    self.backup_path = backup_path if backup_path is not None else \
        "./.backup{}".format(support.get_time())
    self.filter_user = filter_user
    self.filter_status = filter_status
    self.n_statuses = n_statuses
    self.attempts = attempts
    self.backup = backup
    self._verbose = verbose
    # verbosity function: behaves like print, or swallows everything
    # (accepts **kwargs too, so verboseprint(..., end="") cannot crash)
    self.verboseprint = print if self._verbose else lambda *args, **kwargs: None
    # collector needed for online data collection
    self.collector = collector
    self.count = 0
    # timing state, initialized once (the original assigned start_time and
    # last_backup twice: first to 0, then immediately overwritten)
    self.start_time = support.get_time()
    self.last_backup = self.start_time
    self.file = None
    self._closed = False
def on_data(self, raw_data):
    """
    Called when new data is available on the stream.

    Closes the stream (returns False) once `_closed` has been set by
    `on_status`; otherwise persists the raw payload (when a file is open)
    and delegates to the superclass, which dispatches to `on_status`.

    :param raw_data: raw JSON payload received from the streaming API
    :return: False to stop the stream once closed, otherwise the
        superclass result (implicitly None here)
    """
    logging.debug("New data received..")
    # if enough time was passed stop streaming or enough data was collected
    if self._closed:
        # if file has been opened, close it
        if self.file is not None:
            self.file.close()
            self.file = None
        # report either the configured limit or the measured elapsed time
        duration = self.time_limit if self.time_limit is not None else support.get_time(
        ) - self.start_time
        self.verboseprint(
            "Streaming duration = {} seconds".format(duration))
        logging.debug("Streaming terminated at {}".format(
            support.get_date()))
        logging.debug("Streaming duration = {} seconds".format(duration))
        # returning False stops the connection to the streaming server
        return False
    elif self.file is not None:
        # print the raw data on the file, one JSON payload per line
        self.file.write(raw_data)
        self.file.write("\n")
    # call on_data of the superclass (runs even when no file is open,
    # so statuses are still processed when json_path is None)
    super(OnlineStreamer, self).on_data(raw_data=raw_data)
def on_status(self, status):
    """
    Called when a status (parsed from raw data) is received from the stream.

    Performs a periodic backup when due, processes the status' author via the
    collector, and sets the closed flag once the data or time limit is hit
    (the next `on_data` call then terminates the stream).

    :param status: tweepy Status obj received from the stream
    """
    # periodic backup of the dataset collected so far
    if self.check_backup():
        self.last_backup = support.get_time()
        self.collector.save_dataset(path=self.backup_path)
    self.collector.process(screen_name=status.user.screen_name,
                           filter_account=self.filter_user,
                           filter_status=self.filter_status,
                           n_statuses=self.n_statuses)
    self.count += 1
    # stop at the limits; >= (not >) so that exactly data_limit items are
    # collected "at most", as documented — the original's strict > let one
    # extra status through before closing
    if (self.data_limit is not None and self.count >= self.data_limit) or \
            (self.time_limit is not None and
             (support.get_time() - self.start_time) > self.time_limit):
        self._closed = True
def check_backup(self):
    """
    Tell whether a scheduled backup is due.

    :return: True when backups are enabled and more than `backup` seconds
        elapsed since the last one, False otherwise
    """
    if self.backup is None:
        # backups disabled
        return False
    elapsed = support.get_time() - self.last_backup
    return elapsed > self.backup
"favourites_count": get_attribute, "statuses_count": get_attribute, "created_at": lambda user, feature_name: get_timestamp(get_attribute(user, feature_name)), "geo_enabled": get_attribute, "lang": get_attribute, "contributors_enabled": get_attribute, "profile_background_color": get_attribute, "profile_background_image_url_https": get_attribute, "profile_background_tile": get_attribute, "profile_image_url_https": get_attribute, "profile_link_color": get_attribute, "profile_text_color": get_attribute, "profile_use_background_image": get_attribute, "default_profile": get_attribute, "default_profile_image": get_attribute, "profile_crawled": lambda x, y: get_time(), "is_suspended": lambda x, y: 0, "following_followers_ratio": lambda user, _: user.friends_count / user.followers_count if user.followers_count != 0 else None, "followers_following_ratio": lambda user, _: user.followers_count / user.friends_count if user.friends_count != 0 else None} default_account_timeline_features = {"n_statuses_collected": lambda statuses_data, _: statuses_data.shape[0], "mean_status_length": lambda statuses_data, _: statuses_data["text_length"].mean(), "media_shared_urls": lambda statuses_data, _: reduce(lambda x,y: x+y, [], [x for x in statuses_data["media_urls"] if x is not None]), "mean_shared_media": lambda statuses_data, _: (len(reduce(lambda x,y: x+y, [], [x for x in statuses_data["media_urls"] if x is not None])) / statuses_data.shape[0]) if statuses_data.shape[0] != 0 else None, "quoted_user_ids": lambda statuses_data, _: [x for x in statuses_data["quoted_user_id"] if x is not None], "replied_status_ids": lambda statuses_data, _: [x for x in statuses_data["in_reply_to_status_id"] if x is not None], "replied_user_ids": lambda statuses_data, _: [x for x in statuses_data["in_reply_to_user_id"] if x is not None], "retweeted_status_ids": lambda statuses_data, _: [x for x in statuses_data["retweeted_status"] if x is not None], "retweeted_user_ids": lambda statuses_data, _: [x for x in 
statuses_data["retweeted_user_id"] if x is not None]}