def advance_search_dataset(q, f, num, event_id):
    """Run a tweet search and store every not-yet-seen tweet in ``db.paper``.

    A tweet counts as already seen when ``db.paper`` holds a document whose
    ``_id`` equals the tweet's ``'id'``; such tweets are left untouched.

    Args:
        q: Query text handed to ``TweetCriteria.setQuerySearch``; also stored
            with each inserted document under ``'q'``.
        f: Value handed to ``TweetCriteria.setTweetType``; also stored with
            each inserted document under ``'f'``.
        num: Value handed to ``TweetCriteria.setMaxTweets``.
        event_id: JSON text, decoded with ``json_util.object_hook``; the
            decoded value is stored under ``'event_id'``.

    Raises:
        Whatever ``json.loads`` raises for malformed ``event_id``. The value
        is decoded once, before the search runs, so bad input fails early.
    """
    _, db, _ = get_spider_config()
    try:
        # Loop-invariant: decode once instead of once per inserted tweet.
        event = json.loads(event_id, object_hook=json_util.object_hook)

        collection = db.paper
        criteria = (
            got.manager.TweetCriteria()
            .setQuerySearch(q)
            .setTweetType(f)
            .setMaxTweets(num)
        )

        for tweet in got.manager.TweetManager.getTweets(criteria):
            tweet_id = tweet['id']
            if collection.find_one({'_id': tweet_id}) is not None:
                continue  # already stored
            collection.insert_one({
                '_id': tweet_id,
                'tweet': tweet,
                'event_id': event,
                'f': f,
                'q': q,
            })
    finally:
        # Release the handle even when the search or an insert fails.
        # NOTE(review): assumes the object returned by get_spider_config()
        # exposes close() -- confirm against Config.
        db.close()
import re import json import fire import time from tqdm import tqdm from datetime import datetime, timedelta from collections import Counter from Config import get_spider_config _, db, r = get_spider_config() users = [ i['tweet']['user']['screen_name'] for i in db.korea_missile.find({}, {"tweet.user.screen_name": 1}) ] freq_users = [i[0] for i in Counter(users).most_common() if i[1] >= 5] def get_query_str(loc, triggers, target): # start = (now - time_delta).strftime("%Y-%m-%d %H:%M:%S") # now_str = now.strftime("%Y-%m-%d %H:%M:%S") return '(' + loc + ')' + ' ' + '(' + ' OR '.join( triggers) + ')' + ' ' + '(' + target + ')' def get_task(): locs = ["North Korea"] triggers = ["test", "launch", "fire"] targets = ["messile", "satellite", "rocket", "nuclear"] now = datetime.now() WAIT_TIME_MINUTES = 15 while True: