def test_compute_average_location(self, tweet_with_place):
    pt = ProcessTweet(tweet_with_place)
    pt.compute_average_location()
    assert pt.processed_tweet['place'] == {
        'average_location': [-105.14544, 40.192138],
        'location_radius': 0.0
    }
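# A minimal conftest.py-style sketch of what the `tweet_with_place` fixture could look
# like. The values are chosen so the assertion above holds: a degenerate bounding box
# whose corners all coincide averages to [-105.14544, 40.192138] with a location radius
# of 0.0. Only the field names follow the Twitter v1.1 payload format; the concrete
# values (id_str, text, full_name, ...) are hypothetical and the real fixture may differ.
import pytest

@pytest.fixture
def tweet_with_place():
    return {
        'id_str': '1',
        'text': 'Example tweet with an attached place',
        'lang': 'en',
        'place': {
            'place_type': 'admin',
            'full_name': 'Colorado, USA',
            'country_code': 'US',
            'bounding_box': {
                'type': 'Polygon',
                'coordinates': [[
                    [-105.14544, 40.192138],
                    [-105.14544, 40.192138],
                    [-105.14544, 40.192138],
                    [-105.14544, 40.192138],
                ]],
            },
        },
    }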
def test_is_matching_project_locales(self, tweet):
    # without project_locales, every tweet matches
    pt = ProcessTweet(tweet)
    assert pt.is_matching_project_locales()
    # the fixture tweet is in English
    pt = ProcessTweet(tweet, project_locales=['en'])
    assert pt.is_matching_project_locales()
    pt = ProcessTweet(tweet, project_locales=['de'])
    assert not pt.is_matching_project_locales()
def process(self, tweet, retweet_count_increment=0.8):
    if not self.should_be_processed(tweet):
        return
    # get tokens
    pt = ProcessTweet(tweet, project_locales=self.project_locales)
    tokens = self.tokenize_tweet(tweet, pt)
    # determine count increment
    count_increment = 1
    if pt.is_retweet:
        count_increment = retweet_count_increment
    # add tokens to queues
    self.add_to_queue(self.pq_counts_weighted, tokens, count_increment)
    if pt.is_retweet:
        self.add_to_queue(self.pq_counts_retweets, tokens, 1)
    else:
        self.add_to_queue(self.pq_counts_tweets, tokens, 1)
def test_has_coords(self, tweet, tweet_with_coordinates):
    pt = ProcessTweet(tweet)
    assert not pt.has_coordinates
    pt = ProcessTweet(tweet_with_coordinates)
    assert pt.has_coordinates
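# Similarly, a sketch of what the `tweet_with_coordinates` fixture could look like:
# Twitter v1.1 exposes exact GPS positions under a top-level `coordinates` object as a
# GeoJSON point in [longitude, latitude] order. The concrete values are hypothetical.
import pytest

@pytest.fixture
def tweet_with_coordinates():
    return {
        'id_str': '2',
        'text': 'Example tweet with exact coordinates',
        'lang': 'en',
        'coordinates': {
            'type': 'Point',
            'coordinates': [-105.14544, 40.192138],
        },
    }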
def test_has_place(self, tweet, tweet_with_place):
    pt = ProcessTweet(tweet)
    assert not pt.has_place
    pt = ProcessTweet(tweet_with_place)
    assert pt.has_place
def test_should_be_annotated(self, tweet):
    pt = ProcessTweet(tweet, project_locales=['en'])
    pt.process()
    assert pt.should_be_annotated()
    pt = ProcessTweet(tweet, project_locales=['de'])
    assert not pt.should_be_annotated()
def handle_tweet(tweet, send_to_es=True, use_pq=True, debug=False, store_unmatched_tweets=False):
    """Matches an incoming tweet against all projects and fans it out to the S3/ES
    queues, the trending modules and the annotation priority queue."""
    logger = get_task_logger(__name__)
    if debug:
        logger.setLevel(logging.DEBUG)
    # reverse match to find project
    rtm = ReverseTweetMatcher(tweet=tweet)
    candidates = rtm.get_candidates()
    tweet_id = tweet['id_str']
    # open Redis connection only once
    # redis = Redis()
    # connection = redis.get_connection()
    connection = None
    if len(candidates) == 0:
        # Could not match keywords. This might occur quite frequently, e.g. when tweets are collected across different languages/keywords
        logger.info(f'Tweet {tweet_id} could not be matched against any existing projects.')
        if store_unmatched_tweets:
            # store to separate file for later analysis
            with open(os.path.join(config.PROJECT_ROOT, 'logs', 'reverse_match_errors', f'{tweet_id}.json'), 'w') as f:
                json.dump(tweet, f)
        return
    # queue up for s3 upload and add to priority queue
    logger.info("SUCCESS: Found {} matching project(s) ({}) for tweet".format(len(candidates), ', '.join(candidates)))
    redis_queue = RedisS3Queue(connection=connection)
    es_queue = ESQueue(connection=connection)
    stream_config_reader = ProjectConfig()
    for project in candidates:
        stream_config = stream_config_reader.get_config_by_slug(project)
        if stream_config['storage_mode'] == 'test_mode':
            logger.debug('Running in test mode. Not sending to S3 or ES.')
            return
        # add tracking info
        tweet['_tracking_info'] = stream_config_reader.get_tracking_info(project)
        tweet['_tracking_info']['matching_keywords'] = rtm.matching_keywords[project]
        # Queue up on Redis for subsequent upload
        redis_queue.push(json.dumps(tweet).encode(), project)
        # preprocess tweet
        pt = ProcessTweet(tweet, project_locales=stream_config['locales'])
        pt.process()
        # Possibly add tweet to trending tweets
        if stream_config['compile_trending_tweets']:
            trending_tweets = TrendingTweets(project, project_locales=stream_config['locales'], connection=connection)
            trending_tweets.process(tweet)
        # Extract trending topics
        if stream_config['compile_trending_topics']:
            trending_topics = TrendingTopics(project, project_locales=stream_config['locales'], project_keywords=stream_config['keywords'], connection=connection)
            trending_topics.process(tweet)
        if stream_config['compile_data_dump_ids'] and config.ENV == 'prd':
            data_dump_ids = DataDumpIds(project, connection=connection)
            data_dump_ids.add(tweet_id)
            if pt.has_place:
                data_dump_ids = DataDumpIds(project, mode='has_place', connection=connection)
                data_dump_ids.add(tweet_id)
            if pt.has_coordinates:
                data_dump_ids = DataDumpIds(project, mode='has_coordinates', connection=connection)
                data_dump_ids.add(tweet_id)
        if use_pq and pt.should_be_annotated():
            # add to Tweet ID queue for crowd labelling
            logger.info(f'Adding tweet {tweet_id} to priority queue...')
            processed_tweet = pt.get_processed_tweet()
            tid = TweetIdQueue(stream_config['es_index_name'], priority_threshold=3, connection=connection)
            processed_tweet['text'] = pt.get_text(anonymize=True)
            tid.add_tweet(tweet_id, processed_tweet, priority=0)
        if stream_config['image_storage_mode'] != 'inactive':
            pm = ProcessMedia(tweet, project, image_storage_mode=stream_config['image_storage_mode'])
            pm.process()
        if send_to_es and stream_config['storage_mode'] in ['s3-es', 's3-es-no-retweets']:
            if rtm.is_retweet and stream_config['storage_mode'] == 's3-es-no-retweets':
                # Do not store retweets on ES
                return
            # send to ES
            processed_tweet = pt.get_processed_tweet()
            logger.debug(f'Pushing processed tweet with id {tweet_id} to ES queue')
            es_tweet_obj = {'processed_tweet': processed_tweet, 'id': tweet_id}
            if len(stream_config['model_endpoints']) > 0:
                # prepare for prediction
                es_tweet_obj['text_for_prediction'] = {'text': pt.get_text(anonymize=True), 'id': tweet_id}
            es_queue.push(json.dumps(es_tweet_obj).encode(), project)
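# A minimal usage sketch for handle_tweet, assuming `raw_status` is a tweet payload as
# delivered by the Twitter streaming API (a dict carrying at least 'id_str' plus the
# fields ReverseTweetMatcher and ProcessTweet expect). The flags mirror the signature
# above: S3/ES submission and the priority queue are disabled and debug logging is on,
# which is how one might exercise the pipeline locally. The sample file name is
# hypothetical, and how the function is actually invoked in production (e.g. as an
# async worker task) is not shown in this excerpt.
if __name__ == '__main__':
    with open('example_tweet.json') as f:  # hypothetical sample payload
        raw_status = json.load(f)
    handle_tweet(raw_status, send_to_es=False, use_pq=False, debug=True, store_unmatched_tweets=False)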