def test_compute_average_location(self, tweet_with_place):
    pt = ProcessTweet(tweet_with_place)
    pt.compute_average_location()
    assert pt.processed_tweet['place'] == {
        'average_location': [-105.14544, 40.192138],
        'location_radius': 0.0
    }

def test_pt(self, tweet):
    pt = ProcessTweet(tweet)
    assert pt.is_matching_project_locales()
    pt = ProcessTweet(tweet, project_locales=['en'])
    assert pt.is_matching_project_locales()
    pt = ProcessTweet(tweet, project_locales=['de'])
    assert not pt.is_matching_project_locales()
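The first test above pins compute_average_location to a centroid of [-105.14544, 40.192138] with a location_radius of 0.0, which only works out if the place's bounding box collapses to a single point. A minimal sketch of a computation consistent with those asserted values (an assumption for illustration, not ProcessTweet's actual implementation):

import numpy as np

def compute_average_location(bounding_box_coords):
    # assumed approach: centroid of the bounding box corners, plus the
    # maximum corner-to-centroid distance (in degrees) as the radius
    coords = np.array(bounding_box_coords, dtype=float)  # (n_corners, 2) as [lng, lat]
    center = coords.mean(axis=0)
    radius = float(np.max(np.linalg.norm(coords - center, axis=1)))
    return center.tolist(), radius

# a degenerate box reproduces the asserted values:
# compute_average_location([[-105.14544, 40.192138]] * 4) -> ([-105.14544, 40.192138], 0.0)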
Example #3
def process(self, tweet, retweet_count_increment=0.8):
    if not self.should_be_processed(tweet):
        return
    # get tokens
    pt = ProcessTweet(tweet, project_locales=self.project_locales)
    tokens = self.tokenize_tweet(tweet, pt)
    # determine count increment
    count_increment = 1
    if pt.is_retweet:
        count_increment = retweet_count_increment
    # add tokens to queues
    self.add_to_queue(self.pq_counts_weighted, tokens, count_increment)
    if pt.is_retweet:
        self.add_to_queue(self.pq_counts_retweets, tokens, 1)
    else:
        self.add_to_queue(self.pq_counts_tweets, tokens, 1)
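A hedged usage sketch of this weighting (the TrendingTopics constructor arguments are taken from the handle_tweet example below; the project slug and keywords are made up): an original tweet contributes a full count of 1 per token to the weighted queue, while a retweet contributes only retweet_count_increment, 0.8 by default.

trending_topics = TrendingTopics(
    'my-project',                  # hypothetical project slug
    project_locales=['en'],
    project_keywords=['vaccine'],  # hypothetical keyword list
)
trending_topics.process(tweet)                                 # original tweet: weight 1
trending_topics.process(retweet, retweet_count_increment=0.5)  # retweet: custom down-weight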
def test_has_coords(self, tweet, tweet_with_coordinates):
    pt = ProcessTweet(tweet)
    assert not pt.has_coordinates
    pt = ProcessTweet(tweet_with_coordinates)
    assert pt.has_coordinates

def test_has_place(self, tweet, tweet_with_place):
    pt = ProcessTweet(tweet)
    assert not pt.has_place
    pt = ProcessTweet(tweet_with_place)
    assert pt.has_place

def test_should_be_annotated(self, tweet):
    pt = ProcessTweet(tweet, project_locales=['en'])
    pt.process()
    assert pt.should_be_annotated()
    pt = ProcessTweet(tweet, project_locales=['de'])
    assert not pt.should_be_annotated()
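The test snippets above all depend on pytest fixtures named tweet, tweet_with_coordinates, and tweet_with_place, whose payloads are not shown here. A minimal conftest.py sketch with made-up values consistent with the assertions (an English-language tweet, and a degenerate bounding box so the radius comes out as 0.0):

import pytest

@pytest.fixture
def tweet():
    # minimal English tweet; 'lang' is what is_matching_project_locales() checks
    return {'id_str': '1', 'text': 'Hello world', 'lang': 'en'}

@pytest.fixture
def tweet_with_coordinates(tweet):
    # point geometry in Twitter API order: [longitude, latitude]
    return {**tweet, 'coordinates': {'type': 'Point', 'coordinates': [-105.14544, 40.192138]}}

@pytest.fixture
def tweet_with_place(tweet):
    # all four corners identical, so average_location is the corner and location_radius is 0.0
    corner = [-105.14544, 40.192138]
    return {**tweet, 'place': {'bounding_box': {'type': 'Polygon', 'coordinates': [[corner] * 4]}}}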
Example #7
def handle_tweet(tweet, send_to_es=True, use_pq=True, debug=False, store_unmatched_tweets=False):
    logger = get_task_logger(__name__)
    if debug:
        logger.setLevel(logging.DEBUG)
    # reverse match to find project
    rtm = ReverseTweetMatcher(tweet=tweet)
    candidates = rtm.get_candidates()
    tweet_id = tweet['id_str']
    # open Redis connection only once
    # redis = Redis()
    # connection = redis.get_connection()
    connection = None
    if len(candidates) == 0:
        # Could not match keywords. This might occur quite frequently, e.g. when tweets are collected across different languages/keywords.
        logger.info(f'Tweet {tweet_id} could not be matched against any existing projects.')
        if store_unmatched_tweets:
            # store to separate file for later analysis
            with open(os.path.join(config.PROJECT_ROOT, 'logs', 'reverse_match_errors', f'{tweet_id}.json'), 'w') as f:
                json.dump(tweet, f)
        return
    # queue up for s3 upload and add to priority queue
    logger.info("SUCCESS: Found {} project(s) ({}) as a matching project for tweet".format(len(candidates), ', '.join(candidates)))
    redis_queue = RedisS3Queue(connection=connection)
    es_queue = ESQueue(connection=connection)
    stream_config_reader = ProjectConfig()
    for project in candidates:
        stream_config = stream_config_reader.get_config_by_slug(project)
        if stream_config['storage_mode'] == 'test_mode':
            logger.debug('Running in test mode. Not sending to S3 or ES.')
            return
        # add tracking info
        tweet['_tracking_info'] = stream_config_reader.get_tracking_info(project)
        tweet['_tracking_info']['matching_keywords'] = rtm.matching_keywords[project]
        # Queue up on Redis for subsequent upload
        redis_queue.push(json.dumps(tweet).encode(), project)
        # preprocess tweet
        pt = ProcessTweet(tweet, project_locales=stream_config['locales'])
        pt.process()
        # Possibly add tweet to trending tweets
        if stream_config['compile_trending_tweets']:
            trending_tweets = TrendingTweets(project, project_locales=stream_config['locales'], connection=connection)
            trending_tweets.process(tweet)
        # Extract trending topics
        if stream_config['compile_trending_topics']:
            trending_topics = TrendingTopics(project, project_locales=stream_config['locales'], project_keywords=stream_config['keywords'], connection=connection)
            trending_topics.process(tweet)
        if stream_config['compile_data_dump_ids'] and config.ENV == 'prd':
            data_dump_ids = DataDumpIds(project, connection=connection)
            data_dump_ids.add(tweet_id)
            if pt.has_place:
                data_dump_ids = DataDumpIds(project, mode='has_place', connection=connection)
                data_dump_ids.add(tweet_id)
            if pt.has_coordinates:
                data_dump_ids = DataDumpIds(project, mode='has_coordinates', connection=connection)
                data_dump_ids.add(tweet_id)
        if use_pq and pt.should_be_annotated():
            # add to Tweet ID queue for crowd labelling
            logger.info(f'Add tweet {tweet_id} to priority queue...')
            processed_tweet = pt.get_processed_tweet()
            tid = TweetIdQueue(stream_config['es_index_name'], priority_threshold=3, connection=connection)
            processed_tweet['text'] = pt.get_text(anonymize=True)
            tid.add_tweet(tweet_id, processed_tweet, priority=0)
        if stream_config['image_storage_mode'] != 'inactive':
            pm = ProcessMedia(tweet, project, image_storage_mode=stream_config['image_storage_mode'])
            pm.process()
        if send_to_es and stream_config['storage_mode'] in ['s3-es', 's3-es-no-retweets']:
            if rtm.is_retweet and stream_config['storage_mode'] == 's3-es-no-retweets':
                # Do not store retweets on ES
                return
            # send to ES
            processed_tweet = pt.get_processed_tweet()
            logger.debug(f'Pushing processed tweet with id {tweet_id} to ES queue')
            es_tweet_obj = {'processed_tweet': processed_tweet, 'id': tweet_id}
            if len(stream_config['model_endpoints']) > 0:
                # prepare for prediction
                es_tweet_obj['text_for_prediction'] = {'text': pt.get_text(anonymize=True), 'id': tweet_id}
            es_queue.push(json.dumps(es_tweet_obj).encode(), project)
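handle_tweet reads like the body of a background task: get_task_logger comes from Celery, and the function is evidently called once per incoming status. A hypothetical wiring sketch, with the app name, task name, and callback assumed rather than taken from the repo:

from celery import Celery

app = Celery('streamer')  # hypothetical app; broker config omitted

@app.task(name='handle-tweet')  # hypothetical task name
def handle_tweet_task(tweet):
    handle_tweet(tweet, send_to_es=True, use_pq=True)

# e.g. from a stream listener callback:
# handle_tweet_task.apply_async(args=[status_json], queue='tweets')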