def backfill(path=None, db_name='slpng_giants', collection_name='tweets'):
    """
    Load previously captured stream files from disk into MongoDB.

    Scans the save directory recursively for tweet files, converts each
    line of stream-format JSON to the search API format and upserts it
    into the given collection, logging progress and a rough ETA.

    Args:
        path (str): Directory to scan for tweet files. Falls back to the
            configured 'output_dir' when not given.
        db_name (str): Name of the Mongo database to write to
        collection_name (str): Name of the Mongo collection to write to
    """
    tweets = create_collection(db_name, collection_name)
    config = Config.get()
    save_dir = os.path.expanduser(path or config['output_dir'])
    # NOTE: loop variable renamed from 'path' so it no longer shadows
    # the 'path' parameter.
    file_paths = glob.glob(os.path.join(save_dir, '**', '*.t*'), recursive=True)
    t0 = datetime.now()
    for idx, file_path in enumerate(file_paths):
        # Only files captured by the stream listener are backfilled.
        if os.path.basename(os.path.dirname(file_path)) != 'stream':
            continue
        try:
            for lidx, line in enumerate(utils.readlines(file_path)):
                try:
                    data = json.loads(line)
                except Exception:
                    TwiLogger.exception(
                        f'Backfill: Unable to read line {file_path}:{lidx + 1}')
                    continue
                # Delete notices carry no tweet payload; skip them.
                if data.get('delete'):
                    continue
                # The guard above guarantees this is a stream file, so the
                # original re-check of the parent directory was dropped.
                data = utils.stream_to_search(data)
                data = utils.timestamp_to_datetime(data)
                tweets.replace_one({'id': data['id']}, data, upsert=True)
            # Progress + ETA based on average time per file so far.
            t_delta = datetime.now() - t0
            average = t_delta / (idx + 1)
            remaining = str((len(file_paths) - (idx + 1)) * average).split('.')[0]
            TwiLogger.info(f'{idx + 1}/{len(file_paths)} '
                           f'{remaining} '
                           f'{os.sep.join(file_path.split(os.sep)[-3:])}')
        except Exception:
            TwiLogger.exception(f'Backfill: Unable to read file: {file_path}')
def id_to_screenname(self):
    """
    Map every followed user ID to its '@' handle, caching the result.

    The mapping is refreshed via the Twitter API at most once per
    15 minutes; within that window the cached dictionary is returned
    without any API calls.

    Returns:
        dict: Mapping from followed user ID to '@screen_name'
    """
    cache_ttl = timedelta(minutes=15)
    reference = datetime.now()
    elapsed = reference - (self._id_to_screenname_time or reference)
    cache_is_fresh = bool(self._id_to_screenname) and elapsed <= cache_ttl
    if not cache_is_fresh:
        for uid in self.follow:
            account = self.api.get_user(uid)
            self._id_to_screenname[uid] = '@{}'.format(account.screen_name)
        self._id_to_screenname_time = datetime.now()
        TwiLogger.info(self._id_to_screenname)
    return self._id_to_screenname
def on_data(self, json_data):
    """
    Defines the actions to take on data capture.

    Caches every user object embedded in the tweet, optionally mirrors
    the tweet to MongoDB, buffers it in memory, writes it to the current
    output file and logs a one-line summary.

    Args:
        json_data (str): String containing tweet data on JSON format

    Returns:
        bool: True if successful
    """
    self._rate_limit_retry_count = 0
    os.makedirs(self.output_dir, exist_ok=True)
    destination = os.path.join(self.output_dir, self.file_name)
    data = json.loads(json_data)

    created_at = data.get('created_at')
    if created_at:
        # Cache every embedded user object, stamped with capture time.
        for user in utils.collect_key_values('user', data):
            user['recorded_at'] = created_at
            self.users[user['id_str']] = user
        if self.config.get('full_user_mentions', False):
            self.update_mentions(data)

    # Add tweet to MongoDB. Work on a deep copy so the normalisation
    # does not alter the payload written to disk below.
    if self.config.get('use_mongo', True) and self.mongo_collection:
        try:
            document = utils.stream_to_search(
                utils.timestamp_to_datetime(copy.deepcopy(data))
            )
            self.mongo_collection.replace_one(
                {'id': document['id']}, document, upsert=True
            )
        except Exception:
            TwiLogger.exception(
                'Twicorder Listener: Unable to connect to MongoDB: '
            )

    self._data.append(data)
    utils.write(json.dumps(data) + '\n', destination)

    timestamp = '{:%d %b %Y %H:%M:%S}'.format(datetime.now())
    tweet = self.get_full_text(data)
    if not tweet:
        return True
    user = data.get('user', {}).get('screen_name', '-')
    oneline_tweet = tweet.replace('\n', ' ')
    TwiLogger.info(f'{timestamp}, @{user}: {oneline_tweet}')
    return True
def run(self):
    """
    Fetches queries from the queue and executes them.

    Runs until a ``None`` sentinel is pulled from the queue. Each query
    is re-run until it reports itself done, pausing briefly between runs
    to stay clear of rate limits. A failing query is logged with its
    traceback and abandoned rather than retried forever.
    """
    while True:
        self._query = self.queue.get()
        if self.query is None:
            TwiLogger.info(f'Terminating thread "{self.name}"')
            # Mark the sentinel as processed so a Queue.join() caller
            # is not left waiting on it forever.
            self.queue.task_done()
            break
        while not self.query.done:
            try:
                self.query.run()
            except Exception:
                # Logger.exception already appends the active traceback;
                # the original manual traceback.format_exc() duplicated it.
                TwiLogger.exception('Query run failed: ')
                break
            TwiLogger.info(self.query.fetch_log())
            time.sleep(.2)
        time.sleep(.5)
        self.queue.task_done()
def add(self, query):
    """
    Finds appropriate queue for given end point and adds it.

    Duplicate queries — already queued or currently running on the
    endpoint's worker thread — are logged and dropped.

    Args:
        query (BaseQuery): Query object
    """
    target_queue = self.get_queue(query.endpoint)
    if query in target_queue.queue:
        TwiLogger.info(f'Query with ID {query.uid} is already in the queue.')
        return
    worker = self.threads.get(query.endpoint)
    currently_running = worker is not None and worker.query == query
    if currently_running:
        TwiLogger.info(f'Query with ID {query.uid} is already running.')
        return
    target_queue.put(query)
    TwiLogger.info(query)
def track(self):
    """
    Compile the list of terms to track on the stream.

    Reads the configured 'track' terms, dropping empty entries, and —
    when configured to do so — extends them with the screen names of
    followed users.

    Returns:
        list: Track terms, or None when no terms are configured.
    """
    track_list = [t for t in self.config.get('track') or [] if t] or None
    if track_list and self.follow_also_tracks:
        track_list += self.id_to_screenname.values()
    # Lazy %-style argument: the original call
    # TwiLogger.info('Tracking: ', track_list) passed the list as a
    # format arg with no placeholder, so it was never rendered.
    TwiLogger.info('Tracking: %s', track_list)
    return track_list