Beispiel #1
0
    def __init__(self,
                 api,
                 collector,
                 n_statuses,
                 time_limit=None,
                 data_limit=None,
                 json_path="./streaming.json",
                 backup_path=None,
                 filter_user=lambda x: True,
                 filter_status=lambda x: True,
                 attempts=None,
                 backup=None,
                 verbose=True):
        """
        Streamer constructor: stores the streaming configuration and resets the runtime state
        (status counter, start time, last backup time, output file handle, closed flag).

        :param api: tweepy API obj
        :param collector: Collector obj used for collecting the streamed data
        :param n_statuses: number of statuses to collect
        :param time_limit: duration of the streaming, if None don't consider so it will last until process interrupt
        :param data_limit: number of data to collect at most (streaming data), if None don't consider
        :param json_path: file's location where saving the data collected, if None print on the std output
        :param backup_path: backup file's path, if None a "./.backup<current time>" path is generated
        :param filter_user: user filter function: User --> Bool
        :param filter_status: status filter function: Status --> Bool
        :param attempts: number of reconnection attempts to perform in case of streaming failure, first connection
                         excluded, if None always retry to reconnect
        :param backup: every how many seconds to backup, if None no backup is scheduled
        :param verbose: verbosity
        """

        super(OnlineStreamer, self).__init__()

        # streaming configuration
        self.api = api
        self.n_statuses = n_statuses
        self.time_limit = time_limit
        self.data_limit = data_limit
        self.json_path = json_path
        if backup_path is None:
            # no explicit path: derive a hidden backup file name from the current time
            backup_path = "./.backup{}".format(support.get_time())
        self.backup_path = backup_path
        self.filter_user = filter_user
        self.filter_status = filter_status
        self.attempts = attempts
        self.backup = backup

        # verbosity function; the silent variant accepts the same positional and
        # keyword arguments as print (sep=, end=, ...) so call sites behave the
        # same way whether verbose is on or off
        self._verbose = verbose
        self.verboseprint = print if self._verbose else lambda *args, **kwargs: None

        # collector needed for online data collection
        self.collector = collector

        # runtime state: the backup clock starts together with the streamer
        self.count = 0
        self.start_time = support.get_time()
        self.last_backup = self.start_time
        self.file = None
        self._closed = False
Beispiel #2
0
    def on_data(self, raw_data):
        """
        Handle a chunk of raw data received from the stream.

        While the streamer is open, the raw data is appended to the output file
        (when one is open), followed by a newline, and then handed to the
        superclass ``on_data``. Once the streamer has been flagged as closed,
        the output file is released, the streaming duration is reported and
        ``False`` is returned.

        :param raw_data: raw data received from the stream
        :return: False once the streamer is closed, None otherwise
        """

        logging.debug("New data received..")

        if not self._closed:
            out = self.file
            if out is not None:
                # mirror the raw data on the file, one item per line
                out.write(raw_data)
                out.write("\n")

            # let the superclass process the data
            super(OnlineStreamer, self).on_data(raw_data=raw_data)
            return None

        # enough time has passed or enough data was collected:
        # release the output file if it has been opened
        if self.file is not None:
            self.file.close()
            self.file = None

        # a configured time limit is reported as-is, otherwise the elapsed time is measured
        if self.time_limit is not None:
            duration = self.time_limit
        else:
            duration = support.get_time() - self.start_time

        duration_msg = "Streaming duration = {} seconds".format(duration)
        self.verboseprint(duration_msg)
        logging.debug("Streaming terminated at {}".format(support.get_date()))
        logging.debug(duration_msg)

        # stop connection w/ streaming server
        return False
Beispiel #3
0
    def on_status(self, status):
        """
        Process a status received from the stream.

        Saves a backup of the collected dataset when one is due, passes the
        screen name of the status' author to the collector, then flags the
        streamer as closed once ``data_limit`` statuses have been processed or
        more than ``time_limit`` seconds have passed since the start.

        :param status: status received; only ``status.user.screen_name`` is read here
        """

        if self.check_backup():
            self.last_backup = support.get_time()
            self.collector.save_dataset(path=self.backup_path)

        self.collector.process(screen_name=status.user.screen_name,
                               filter_account=self.filter_user,
                               filter_status=self.filter_status,
                               n_statuses=self.n_statuses)

        self.count += 1

        # ">=" so that at most data_limit statuses are processed: a strict ">"
        # would flag the streamer only after one status too many
        data_limit_reached = (self.data_limit is not None
                              and self.count >= self.data_limit)

        # the clock is read only when the data limit did not already close the streamer
        if data_limit_reached or (
                self.time_limit is not None
                and (support.get_time() - self.start_time) > self.time_limit):
            self._closed = True
Beispiel #4
0
    def check_backup(self):
        """
        Tell whether it is time to backup the data.

        :return: True when a backup interval is configured and more than that many
                 seconds have passed since the last backup, False otherwise
        """

        # no interval configured: backups are never due
        if self.backup is None:
            return False

        elapsed = support.get_time() - self.last_backup
        return elapsed > self.backup
Beispiel #5
0
                            "favourites_count": get_attribute,
                            "statuses_count": get_attribute,
                            "created_at": lambda user, feature_name: get_timestamp(get_attribute(user, feature_name)),
                            "geo_enabled": get_attribute,
                            "lang": get_attribute,
                            "contributors_enabled": get_attribute,
                            "profile_background_color": get_attribute,
                            "profile_background_image_url_https": get_attribute,
                            "profile_background_tile": get_attribute,
                            "profile_image_url_https": get_attribute,
                            "profile_link_color": get_attribute,
                            "profile_text_color": get_attribute,
                            "profile_use_background_image": get_attribute,
                            "default_profile": get_attribute,
                            "default_profile_image": get_attribute,
                            "profile_crawled": lambda x, y: get_time(),
                            "is_suspended": lambda x, y: 0,
                            "following_followers_ratio": lambda user, _: user.friends_count / user.followers_count if user.followers_count != 0 else None,
                            "followers_following_ratio": lambda user, _: user.followers_count / user.friends_count if user.friends_count != 0 else None}


def _flatten_media_urls(statuses_data):
    """Return the media URLs of all statuses as a single flat list.

    Entries of the ``media_urls`` column that are ``None`` are skipped; every
    other entry is iterated and its items are appended in order.
    """
    # The previous implementation called reduce(f, [], data): the empty list was
    # the iterable and the data the initializer, so nothing was ever concatenated.
    return [url
            for urls in statuses_data["media_urls"] if urls is not None
            for url in urls]


def _mean_shared_media(statuses_data):
    """Return the number of shared media URLs per status, or None when there are no statuses."""
    n_statuses = statuses_data.shape[0]
    if n_statuses == 0:
        return None
    return len(_flatten_media_urls(statuses_data)) / n_statuses


# Timeline features of an account: feature name -> function(statuses_data, feature_name).
# statuses_data is indexed by column name and exposes shape[0] as the number of statuses.
default_account_timeline_features = {
    "n_statuses_collected": lambda statuses_data, _: statuses_data.shape[0],
    "mean_status_length": lambda statuses_data, _: statuses_data["text_length"].mean(),
    "media_shared_urls": lambda statuses_data, _: _flatten_media_urls(statuses_data),
    "mean_shared_media": lambda statuses_data, _: _mean_shared_media(statuses_data),
    "quoted_user_ids": lambda statuses_data, _: [x for x in statuses_data["quoted_user_id"] if x is not None],
    "replied_status_ids": lambda statuses_data, _: [x for x in statuses_data["in_reply_to_status_id"] if x is not None],
    "replied_user_ids": lambda statuses_data, _: [x for x in statuses_data["in_reply_to_user_id"] if x is not None],
    "retweeted_status_ids": lambda statuses_data, _: [x for x in statuses_data["retweeted_status"] if x is not None],
    "retweeted_user_ids": lambda statuses_data, _: [x for x in statuses_data["retweeted_user_id"] if x is not None],
}