def search(search):
    # Assumed context for this excerpt: Flask's `session`/`render_template`, `datetime`,
    # `timedelta`, `pytz`, the local `twitter` helper module, the TIME_RANGE constant and
    # the TwitterSearch classes are imported/defined at module level.
    twitter_utils = twitter.Twitter()
    credentials = session.get('credentials')
    search = [search]
    try:
        tso = TwitterSearchOrder()
        tso.set_language('en')
        tso.set_keywords(search)
        tso.set_include_entities(False)  # Remove later if you want to use images
        query = TwitterSearch(consumer_key=session['consumer_key'],
                              consumer_secret=session['consumer_secret'],
                              access_token=session['token'],
                              access_token_secret=session['token_secret'])
        response = query.search_tweets(tso)
        t_range = datetime.now(pytz.utc) - timedelta(minutes=TIME_RANGE)
        tweets = [t for t in response['content']['statuses']
                  if twitter_utils.get_date(t) >= t_range]
        print("Current rate-limiting status: "
              + str(query.get_metadata()['x-rate-limit-reset']))
        return render_template(
            "page.html",
            search=search,
            tweets=tweets,
            music_config=twitter_utils.get_music_config(tweets))
    except TwitterSearchException as e:
        return str(e)
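# The view above relies on twitter_utils.get_date() to filter statuses by age, but that
# helper is not shown in this excerpt. A minimal sketch of what such a helper would look
# like, assuming it simply parses the tweet's `created_at` field (Twitter's
# "%a %b %d %H:%M:%S +0000 %Y" format) into an aware UTC datetime comparable with t_range:
from datetime import datetime
import pytz

def get_date(tweet):
    """Return the tweet's creation time as a timezone-aware UTC datetime."""
    parsed = datetime.strptime(tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
    return parsed.replace(tzinfo=pytz.utc)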
def twitSearch(tweetLastSeen):
    # Assumed context for this excerpt: `datetime`, `time`, the TwitterSearch classes and
    # the module-level twitterSearchCount counter are defined/imported elsewhere.
    #print("Debug: In function twitSearch()")
    tweetSearchCount = 0
    try:
        tso = TwitterSearchOrder()
        #tso.set_keywords(['disaster','banking'], or_operator=True)
        tso.set_keywords(['disaster', 'poverty', 'banking', 'homeless'], or_operator=True)
        #tso.add_keyword('poverty')
        #tso.add_keyword('disaster')
        #tso.add_keyword('banking')
        tso.set_language('en')
        tso.set_include_entities(False)
        tso.set_result_type('recent')

        if tweetLastSeen > 0:
            print("Debug: I have a previous value for lastseen_id, setting since_id() to: %s and asking for 100 results" % tweetLastSeen)
            tso.set_since_id(tweetLastSeen)
            tso.set_count(100)
        else:
            print("Debug: No value for lastseen_id, asking for one result")
            tso.set_count(1)

        print("Debug: The tso search string looks like this")
        print(tso.create_search_url())

        ts = TwitterSearch(
            consumer_key='',       # fill in with your application's credentials
            consumer_secret='',
            access_token='',
            access_token_secret='')

        ## def my_callback_function(current_ts_instance):  # accepts ONE argument: an instance of TwitterSearch
        ##     #print("In callback function")
        ##     queries, tweets_seen = current_ts_instance.get_statistics()
        ##     #query = current_ts_instance.get_statistics()
        ##     print("%s queries & %s tweets seen" % (queries, tweets_seen))
        ##     print("%s query" % (query))
        ##     #if queries > 0 and (queries % 5) == 0:  # trigger delay every 5th query
        ##     #    print("Thats 5 queries. Sleeping for 60 secs")
        ##     #    time.sleep(60)  # sleep for 60 seconds

        #queries, tweets_seen = ts.get_statistics()
        #print("Debug: %s queries & %s tweets seen" % (queries, tweets_seen))
        #print("Debug: About to iterate over search results from TwitterSearch instance")
        #for tweet in ts.search_tweets_iterable(tso, callback=my_callback_function):

        tweets_seen = 0
        currentTweetID = 0
        lastTweetID = 0
        for tweet in ts.search_tweets_iterable(tso):
            queries, tweets_seen_by_stats = ts.get_statistics()
            print("Debug: stats: %s queries & %s tweets seen" % (queries, tweets_seen_by_stats))
            rateLimitRemaining = ts.get_metadata()['x-rate-limit-remaining']
            rateLimitReset = ts.get_metadata()['x-rate-limit-reset']
            print("Debug: Rate limit resets at %s and has %s queries remaining"
                  % (datetime.datetime.fromtimestamp(int(rateLimitReset)), rateLimitRemaining))
            currentTweetID = tweet['id']
            print("Debug: Current tweetID %s" % currentTweetID)
            if currentTweetID > lastTweetID:
                print("Debug: Seen a more recent tweetID, updating lastTweetID")
                lastTweetID = currentTweetID
                tweets_seen = tweets_seen_by_stats
                break
            print('Debug: In tweet iter @%s tweet id: %s'
                  % (tweet['user']['screen_name'], tweet['id']))
            tweets_seen = tweets_seen + 1
            print("Debug: tweets_seen: %s" % tweets_seen)

        print('Debug: about to return tweet ID @%s' % lastTweetID)
        global twitterSearchCount
        twitterSearchCount = twitterSearchCount + 1
        print("Debug: This is twitter search number: %s" % twitterSearchCount)
        return lastTweetID, tweets_seen
    except TwitterSearchException as e:
        print(e)
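# A minimal sketch of how twitSearch() could be driven as a poller, assuming the
# module-level twitterSearchCount counter it increments is initialised first. The names
# poll(), poll_interval and lastSeenID are illustrative, not from the original.
import time

twitterSearchCount = 0

def poll(poll_interval=60):
    lastSeenID = 0
    while True:
        result = twitSearch(lastSeenID)
        if result:  # twitSearch returns None if the search raised an exception
            lastSeenID, seen = result
            print("Poll cycle done: newest id %s, %s tweets seen" % (lastSeenID, seen))
        time.sleep(poll_interval)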
class JCrawler(object):
    'Twitter crawler for Japanese tweets'

    tso = TwitterSearchOrder()
    ts = None
    con = None
    cursor = None
    DAILY_THRESHOLD = 50
    IDLE_SLEEP_TIME = 60
    total_api_queries = 0
    queries_processed = 0
    total_tweets_grabbed = 0
    max_tweets = -1
    global_min_id = -1
    global_max_id = -1
    depth = 10
    seed_keywords_array = None
    logfile = None
    exact_tweet_re = re.compile('^RT @')
    queries_to_update = 0
    consumer_key = ''
    consumer_secret = ''
    access_token = ''
    access_token_secret = ''

    def __init__(self, consumer_key, consumer_secret, access_token,
                 access_token_secret, seed_keywords_array=None, depth=1000,
                 global_min_id=-1, global_max_id=-1, max_tweets=-1):
        # self.load_settings()
        self.depth = depth
        self.seed_keywords_array = seed_keywords_array
        self.global_max_id = global_max_id
        self.global_min_id = global_min_id
        self.max_tweets = max_tweets
        self.consumer_key = consumer_key
        self.consumer_secret = consumer_secret
        self.access_token = access_token
        self.access_token_secret = access_token_secret
        # Open the logfile
        self.logfile = codecs.open("log/tweetlog", 'a', encoding='utf-8')
        if self.max_tweets == -1:
            self.max_tweets = float("inf")

    # def load_settings(self, filepath=""):
    #     if(filepath == ""):
    #         filepath = "settings.cfg"
    #     config = ConfigParser.RawConfigParser()
    #     config.read(filepath)

    def log(self, message):
        # print(message)
        self.logfile.write(message)

    def get_old_queries_from_DB(self):
        self.checkConnection()
        db_query = u"""SELECT * FROM `queries`
                       WHERE `next_update` <= %s AND total_updates > 0"""
        dict_cursor = self.con.cursor(pymysql.cursors.DictCursor)
        dict_cursor.execute(db_query, [int(time.time())])
        self.queries_to_update += dict_cursor.rowcount
        # Get the results
        rows = dict_cursor.fetchall()
        # query_array = []
        # for row in rows:
        #     query_array.append(row)
        return rows

    def update_queries(self):
        query_array = self.get_old_queries_from_DB()
        for query in query_array:
            if(self.total_tweets_grabbed >= self.max_tweets):
                break
            if "min_id" in query:
                min_id = query["min_id"]
            else:
                min_id = None
            if "max_id" in query:
                max_id = query["max_id"]
            else:
                max_id = None
            num_tweets_fetched = self.scrape_search_query(query, self.depth, min_id, max_id)
            self.calculate_next_update(query, num_tweets_fetched)

    def get_new_queries_from_DB(self):
        self.checkConnection()
        db_query = u"""SELECT * FROM `queries`
                       WHERE `next_update` <= %s AND total_updates = 0"""
        dict_cursor = self.con.cursor(pymysql.cursors.DictCursor)
        dict_cursor.execute(db_query, [int(time.time())])
        self.queries_to_update += dict_cursor.rowcount
        # Get the results
        rows = dict_cursor.fetchall()
        return rows

    def process_new_queries(self, query_string_array=None):
        if(query_string_array is None):
            query_array = self.get_new_queries_from_DB()
        else:
            query_array = []
            for string in query_string_array:
                query_array.append({'query': string})
        for query in query_array:
            if(self.total_tweets_grabbed >= self.max_tweets):
                break
            num_tweets_fetched = self.scrape_search_query(query, self.depth)
            self.calculate_next_update(query, num_tweets_fetched)

    def connect_DB(self, host, dbuser, dbpass, dbname):
        try:
            conn = pymysql.connect(host=host,
                                   user=dbuser,
                                   password=dbpass,
                                   database=dbname,
                                   charset='utf8mb4',
                                   use_unicode=True,
                                   init_command="set names utf8mb4")
            testcursor = conn.cursor()
            testcursor.execute("SELECT VERSION()")
            results = testcursor.fetchone()
            # Check if anything at all is returned
            if results:
                self.log('Connected to DB, version: %s' % results)
            else:
                self.log('Connection to DB failed')
            # Disable binary logging to save space
            testcursor.execute("SET sql_log_bin = 0")
            results = testcursor.fetchone()
            testcursor.execute("SET NET_WRITE_TIMEOUT = 2147480")
            results = testcursor.fetchone()
            testcursor.execute("SET WAIT_TIMEOUT = 2147480")
            results = testcursor.fetchone()
            self.con = conn
            return conn
        except pymysql.Error as e:
            self.log("\nError %d: %s" % (e.args[0], e.args[1]))

    def start(self):
        try:
            # self.log('Connected to DB: %s' % self.con.info())
            # while True:
            while self.total_tweets_grabbed < self.max_tweets:
                self.ReportStatus(1)
                self.log("\nUpdating old queries")
                self.update_queries()
                self.log("\nProcessing new queries")
                if(self.seed_keywords_array is not None):
                    self.process_new_queries(self.seed_keywords_array)
                    # Only process seed keywords once
                    self.seed_keywords_array = None
                else:
                    self.process_new_queries()
                # Keep script from wasting DB resources continually checking
                # for queries to update
                if(self.queries_to_update == 0):
                    self.ReportStatus(0)
                    time.sleep(self.IDLE_SLEEP_TIME)
                # while AnalysisIsRunning():
                #     self.ReportStatus(0)
                #     time.sleep(self.IDLE_SLEEP_TIME)
                if (not self.con.open):
                    self.con = self.connect_DB('localhost', 'root',
                                               'preyfromslyfluxlinus', 'jcrawler')
                #self.queries_to_update = 0
        except TwitterSearchException as e:
            if(e.code == 420):
                self.log("\nBlaze it for a hot sec: we're being rate limited. Waiting 15 minutes...")
                self.ReportStatus(2)
                time.sleep(900)
                self.log("\nAttempting to restart scrape process...")
                return self.start()  # resume the main loop after backing off
            elif(e.code == 403 or e.code == 503):
                self.log("\nTwitter is either experiencing high load or we are being rate limited. Waiting 1 hour...")
                self.ReportStatus(2)
                time.sleep(3600)
                self.log("\nAttempting to restart scrape process...")
                return self.start()  # resume the main loop after backing off
            else:
                # self.log(e)
                logging.exception("TwitterSearch had an error:")
                print(e)
        except pymysql.Error as e:
            self.log("\nError %d: %s" % (e.args[0], e.args[1]))
        else:
            self.log("\nQuitting")
            if self.con:
                self.con.close()
            if self.logfile:
                self.logfile.close()

    def restart(self):
        if(self.con.open):
            self.con.close()
        self.cursor = None
        self.total_tweets_grabbed = 0
        self.logfile = codecs.open("log/tweetlog", 'a', encoding='utf-8')
        self.start()

    def checkConnection(self):
        if(self.con is None or not self.con.open):
            self.con = self.connect_DB('localhost', 'root',
                                       'preyfromslyfluxlinus', 'jcrawler')
            self.cursor = self.con.cursor()

    def ReportStatus(self, status, msg=""):
        'Posts info about queries processed, currently processing, etc.'
        self.checkConnection()
        app = "jcrawler"
        if(status == 1):
            # Currently running
            variable = "processing"
            value = "1"
        elif(status == 0):
            # Done running
            variable = "processing"
            value = "0"
        elif(status == 2):
            # Sleeping
            variable = "processing"
            value = "2"
        controlQuery = u"""INSERT INTO `control`(`app`, `variable`, `value`)
                           VALUES(%s,%s,%s)
                           ON DUPLICATE KEY UPDATE value=VALUES(value)"""
        progressQuery = u"""INSERT INTO `progress`(`app`, `processed`)
                            VALUES(%s, %s)"""
        reportcursor = self.con.cursor()
        reportcursor.execute(controlQuery, (app, variable, value))
        reportcursor.execute(progressQuery, (app, self.total_tweets_grabbed))
        self.con.commit()
        reportcursor.close()

    def AnalysisIsRunning(self):
        self.checkConnection()
        isRunning = False
        statusQuery = u"""SELECT `value` FROM `control`
                          WHERE `app`='lexicalyzer' AND `variable`='processing'"""
        statuscursor = self.con.cursor()
        statuscursor.execute(statusQuery)
        value = statuscursor.fetchone()
        if(int(value[0]) > 0):
            isRunning = True
        else:
            isRunning = False
        statuscursor.close()
        return isRunning

    def add_queries_to_DB(self, query_array):
        self.checkConnection()
        for query_hash in query_array:
            # exist_stmt = u"""SELECT * FROM `queries`
            #                  WHERE `sha256_hash`=%s"""
            query = query_array[query_hash]
            prepstatement = u"""INSERT INTO `queries`(`max_id`, `min_id`, `query`,
                    `last_update`, `sha256_hash`, `next_update`, `penalty_level`, `total_tweets`)
                VALUES (%s,%s,%s,%s,%s,%s,%s,%s)
                ON DUPLICATE KEY UPDATE max_id=VALUES(max_id),
                    min_id=VALUES(min_id),
                    last_update=VALUES(last_update),
                    next_update=VALUES(next_update),
                    penalty_level=VALUES(penalty_level),
                    total_tweets=VALUES(total_tweets)"""
            # self.log("\nQuery: %s" % query)
            self.cursor.execute(prepstatement,
                                (query["max_id"], query["min_id"], query["query"],
                                 query["last_update"], query["sha256_hash"],
                                 query["next_update"], query["penalty_level"],
                                 query["total_tweets"]))

    def add_query_to_DB(self, query):
        prepstatement = u"""INSERT INTO `queries`(`max_id`, `min_id`, `query`,
                `last_update`, `sha256_hash`, `next_update`, `penalty_level`, `total_tweets`)
            VALUES (%s,%s,%s,%s,%s,%s,%s,%s)
            ON DUPLICATE KEY UPDATE max_id=VALUES(max_id),
                min_id=VALUES(min_id),
                last_update=VALUES(last_update),
                next_update=VALUES(next_update),
                penalty_level=VALUES(penalty_level),
                total_tweets=VALUES(total_tweets)"""
        self.cursor.execute(prepstatement,
                            (query["max_id"], query["min_id"], query["query"],
                             query["last_update"], query["sha256_hash"],
                             query["next_update"], query["penalty_level"],
                             query["total_tweets"]))

    def is_exact_retweet(self, tweet):
        return (self.exact_tweet_re.match(tweet["text"])
                or tweet["retweeted_status"]["text"] == tweet["text"])

    def gen_query_hash(self, query_text):
        return hashlib.sha256(query_text.encode("utf8")).hexdigest()

    def add_to_queries_list(self, query_text, tweet, queries_array):
        query_hash = self.gen_query_hash(query_text)
        if query_hash not in queries_array or queries_array[query_hash] is None:
            self.log("\nAdding new query to list")
            queries_array[query_hash] = {"query": query_text,
                                         "sha256_hash": query_hash,
                                         "max_id": tweet["id"],
                                         "min_id": tweet["id"],
                                         "next_update": int(time.time()),
                                         "penalty_level": 0,
                                         "total_tweets": 1,
                                         "last_update": int(time.time())}
        else:
            if(tweet["id"] >= queries_array[query_hash]["max_id"]):
                queries_array[query_hash]["max_id"] = tweet["id"]
            elif(tweet["id"] <= queries_array[query_hash]["min_id"]):
                queries_array[query_hash]["min_id"] = tweet["id"]
            queries_array[query_hash]["total_tweets"] += 1
            queries_array[query_hash]["last_update"] = int(time.time())

    def calculate_next_update(self, query, num_tweets_fetched):
        if num_tweets_fetched < self.DAILY_THRESHOLD:
            if("penalty_level" in query and query["penalty_level"] is not None):
                query["penalty_level"] += 1
            else:
                query["penalty_level"] = 1
            # Wait penalty_level days until next update
            query["next_update"] = int(time.time()) + (query["penalty_level"] * 24 * 60 * 60)
            self.log("\nPenalizing query by %s days. Next update: %s"
                     % (query["penalty_level"], query["next_update"]))
        else:
            query["penalty_level"] = 0
            # Wait one day
            query["next_update"] = int(time.time()) + (24 * 60 * 60)

    def scrape_search_query(self, search_query, depth, min_id=-1, max_id=-1):
        try:
            self.log("\nScraping...")
            # self.con = connect_DB('localhost', 'root',
            #                       'preyfromslyfluxlinus', 'jcrawler')
            self.cursor = self.con.cursor()
            self.tso.set_keywords([search_query["query"]])
            self.tso.set_language('ja')  # search for Japanese tweets only
            self.tso.set_locale('ja')
            if(self.global_min_id != -1):
                self.tso.set_since_id(self.global_min_id)
            elif(min_id != -1):
                self.tso.set_since_id(min_id)
            if(self.global_max_id != -1):
                self.tso.set_max_id(self.global_max_id)
            elif(max_id != -1):
                self.tso.set_max_id(max_id)
            sleep_time = 1
            last_num_of_queries = 0
            this_query_total_tweets = 0
            new_queries = {}
            # it's about time to create a TwitterSearch object with our secret tokens
            if self.ts is None:
                self.ts = TwitterSearch(
                    consumer_key='lHw0Fte6wfJnzxMrA9nRxqJJN',
                    consumer_secret='UQmsX0hC9wvuzhaMLZ4OpB'
                                    'FqfI4vYdMrHSNt0FYEmFGcsYU0iK',
                    access_token='4920693635-jk0qpriUrztwA2'
                                 'a7dOwp4EhQI86qHt4xLbq7uPU',
                    access_token_secret='3mTFWM8leIXQzaiqnH'
                                        'hwbspN6BzB5O8qMYKDnrgqHCKBz'
                )
            start = int(time.time())
            for tweet in self.ts.search_tweets_iterable(self.tso):
                self.ReportStatus(1)
                if ('coordinates' in tweet
                        and tweet["coordinates"] is not None
                        and 'coordinates' in tweet["coordinates"]
                        and tweet["coordinates"]["coordinates"]
                        and tweet["coordinates"]["coordinates"][0] is not None):
                    has_coordinates = 1
                    longitude = tweet["coordinates"]["coordinates"][0]
                    latitude = tweet["coordinates"]["coordinates"][1]
                else:
                    has_coordinates = 0
                    longitude = 181
                    latitude = 91
                if ('retweeted_status' in tweet
                        and tweet["retweeted_status"] is not None
                        and 'id' in tweet["retweeted_status"]
                        and tweet["retweeted_status"]["id"] is not None):
                    is_retweet = 1
                    retweet_id_int = tweet["retweeted_status"]["id"]
                    retweet_id_str = tweet["retweeted_status"]["id_str"]
                    if(self.is_exact_retweet(tweet)):
                        is_exact_retweet = 1
                        # Don't bother to save exact retweets anymore
                        continue
                    else:
                        is_exact_retweet = 0
                else:
                    is_retweet = 0
                    retweet_id_int = -1
                    retweet_id_str = ""
                    is_exact_retweet = 0
                this_query_total_tweets += 1
                self.total_tweets_grabbed += 1
                # Write to DB
                # Prepare and execute raw_tweet query
                prepstatement = u"""INSERT INTO `raw_tweets`(`id`, `text`,
                        `created_at`, `lang`, `retweet_count`, `source`,
                        `user_id`, `has_coordinates`, `longitude`,
                        `latitude`, `id_int`, `user_id_int`, `is_retweet`,
                        `retweet_id_int`, `retweet_id`, `is_exact_retweet`)
                    VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                    ON DUPLICATE KEY UPDATE id=id"""
                self.cursor.execute(prepstatement,
                                    (tweet["id_str"], tweet["text"], tweet["created_at"],
                                     tweet["lang"], tweet["retweet_count"], tweet["source"],
                                     tweet["user"]["id_str"], has_coordinates, longitude,
                                     latitude, tweet["id"], tweet["user"]["id"], is_retweet,
                                     retweet_id_int, retweet_id_str, is_exact_retweet))
                rawjsonstmt = u"""INSERT INTO `raw_tweets_json`(`id`, `id_int`, `raw_json`)
                    VALUES (%s,%s,%s)
                    ON DUPLICATE KEY UPDATE id=id"""
                self.cursor.execute(rawjsonstmt,
                                    (tweet["id_str"], tweet["id"], json.dumps(tweet)))
                # Also store the user data
                user = tweet["user"]
                userstatement = u"""INSERT INTO `twitter_users`(`user_id`, `created_at`,
                        `description`, `followers_count`, `friends_count`, `geo_enabled`,
                        `lang`, `location`, `name`, `protected`, `screen_name`,
                        `statuses_count`, `time_zone`, `url`, `utc_offset`, `verified`)
                    VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                    ON DUPLICATE KEY UPDATE
                        `description`=VALUES(`description`),
                        `friends_count`=VALUES(`friends_count`),
                        `geo_enabled`=VALUES(`geo_enabled`),
                        `lang`=VALUES(`lang`),
                        `location`=VALUES(`location`),
                        `name`=VALUES(`name`),
                        `protected`=VALUES(`protected`),
                        `screen_name`=VALUES(`screen_name`),
                        `statuses_count`=VALUES(`statuses_count`),
                        `time_zone`=VALUES(`time_zone`),
                        `url`=VALUES(`url`),
                        `utc_offset`=VALUES(`utc_offset`)"""
                self.cursor.execute(userstatement,
                                    (user["id_str"], user["created_at"], user["description"],
                                     user["followers_count"], user["friends_count"],
                                     user["geo_enabled"], user["lang"], user["location"],
                                     user["name"], user["protected"], user["screen_name"],
                                     user["statuses_count"], user["time_zone"], user["url"],
                                     user["utc_offset"], user["verified"]))
                hashtags = tweet["entities"]["hashtags"]
                for hashtag in hashtags:
                    hashtagstatement = u"""INSERT INTO `hashtags`(`tweet_id`, `hashtag`,
                            `start`, `end`)
                        VALUES (%s,%s,%s,%s)
                        ON DUPLICATE KEY UPDATE tweet_id=tweet_id"""
                    self.cursor.execute(hashtagstatement,
                                        (tweet["id_str"], hashtag["text"],
                                         hashtag["indices"][0], hashtag["indices"][1]))
                    self.add_to_queries_list(hashtag["text"], tweet, new_queries)
                user_mentions = tweet["entities"]["user_mentions"]
                for mention in user_mentions:
                    mentionstatement = u"""INSERT INTO `mentions`(`tweet_id`, `user_id`,
                            `start`, `end`)
                        VALUES (%s,%s,%s,%s)
                        ON DUPLICATE KEY UPDATE tweet_id=tweet_id"""
                    self.cursor.execute(mentionstatement,
                                        (tweet["id_str"], mention["id_str"],
                                         mention["indices"][0], mention["indices"][1]))
                embedded_urls = tweet["entities"]["urls"]
                for url in embedded_urls:
                    url_statement = u"""INSERT INTO `embedded_urls`(`tweet_id`, `url`,
                            `start`, `end`)
                        VALUES (%s,%s,%s,%s)
                        ON DUPLICATE KEY UPDATE tweet_id=tweet_id"""
                    self.cursor.execute(url_statement,
                                        (tweet["id_str"], url["url"],
                                         url["indices"][0], url["indices"][1]))
                # TODO: Investigate crash due to lack of x-rate-limit-remaining key
                if 'x-rate-limit-remaining' in self.ts.get_metadata():
                    self.log("\nCurrent rate-limiting status: %s"
                             % self.ts.get_metadata()['x-rate-limit-remaining'])
                # self.log('\n@%s tweeted: %s'
                #          % (tweet['user']['screen_name'], tweet['text']))
                self.add_queries_to_DB(new_queries)
                # Execute queries for real
                self.con.commit()
                # if current_num_of_queries is different from last_num_of_queries,
                # then a new page was loaded by TwitterSearch
                current_num_of_queries = self.ts.get_statistics()[0]
                if not last_num_of_queries == current_num_of_queries:
                    # Calculate current processing rate
                    now = int(time.time())
                    if('x-rate-limit-reset' in self.ts.get_metadata()):
                        reset = int(self.ts.get_metadata()['x-rate-limit-reset'])
                    else:
                        reset = 0
                    if((now - start) == 0):
                        rate = this_query_total_tweets
                    else:
                        rate = this_query_total_tweets / (now - start)
                    self.log("\nCurrent processing rate: %s tweets / s" % rate)
                    self.log("\nCurrent tweets processed: %s" % self.total_tweets_grabbed)
                    # Stop with 3 queries left before hitting the limit, just to be safe
                    if ('x-rate-limit-remaining' in self.ts.get_metadata()
                            and int(self.ts.get_metadata()['x-rate-limit-remaining']) <= 3
                            and now < reset):
                        longsleeptime = reset - now + 60
                        self.log('\nSleeping %s seconds'
                                 '\n(x-rate-limit-remaining=%s,'
                                 '\nx-rate-limit-reset=%s,'
                                 '\ntime now=%s)'
                                 % (longsleeptime,
                                    self.ts.get_metadata()['x-rate-limit-remaining'],
                                    self.ts.get_metadata()['x-rate-limit-reset'],
                                    now))
                        self.ReportStatus(2)
                        time.sleep(longsleeptime)
                        if(this_query_total_tweets >= depth):
                            break
                    elif ('x-rate-limit-remaining' in self.ts.get_metadata()
                            and int(self.ts.get_metadata()['x-rate-limit-remaining']) <= 3
                            and now >= reset):
                        # Wait a minute just in case there is a discrepancy between
                        # the rate limit we've been given and the actual one
                        self.log('\nSleeping 60 seconds (x-rate-limit-remaining=%s,'
                                 ' x-rate-limit-reset=%s, time now=%s)'
                                 % (self.ts.get_metadata()['x-rate-limit-remaining'],
                                    self.ts.get_metadata()['x-rate-limit-reset'],
                                    now))
                        self.ReportStatus(2)
                        time.sleep(60)
                        if(this_query_total_tweets >= depth):
                            break
                    elif('x-rate-limit-remaining' not in self.ts.get_metadata()):
                        self.log("\nx-rate-limit-remaining missing! Will sleep.")
                        self.ReportStatus(2)
                        time.sleep(900)
                    else:
                        last_num_of_queries = self.ts.get_statistics()[0]
                        # Wait between queries
                        self.log('\nSleeping'
                                 '\n(current_num_of_queries=%s,'
                                 '\nlast_num_of_queries=%s,'
                                 '\nx-rate-limit-remaining=%s,'
                                 '\nx-rate-limit-reset=%s)'
                                 % (current_num_of_queries,
                                    last_num_of_queries,
                                    self.ts.get_metadata()['x-rate-limit-remaining'],
                                    self.ts.get_metadata()['x-rate-limit-reset']))
                        self.ReportStatus(2)
                        time.sleep(sleep_time)
                        if(this_query_total_tweets >= depth):
                            break
                # Update query counter
                last_num_of_queries = self.ts.get_statistics()[0]
            self.log("\nThis query total tweets processed: %s" % this_query_total_tweets)
            return this_query_total_tweets
        except TwitterSearchException as e:
            if(e.code == 420):
                self.log("\nBlaze it for a hot sec: we're being rate limited. Waiting 15 minutes...")
                self.ReportStatus(2)
                time.sleep(900)
                self.log("\nAttempting to restart scrape process...")
                return self.scrape_search_query(search_query, depth, min_id, max_id)
            elif(e.code == 403 or e.code == 503):
                self.log("\nTwitter is either experiencing high load or we are being rate limited. Waiting 1 hour...")
                self.ReportStatus(2)
                time.sleep(3600)
                self.log("\nAttempting to restart scrape process...")
                return self.scrape_search_query(search_query, depth, min_id, max_id)
            else:
                # self.log(e)
                logging.exception("TwitterSearch had an error:")
                print(e)
        except pymysql.Error as e:
            self.log("\nError %d: %s" % (e.args[0], e.args[1]))
            if(e.args[0] == 2006):
                self.log("\nDetected dead MySQL server, waiting 30 seconds for server to restart...")
                self.ReportStatus(2)
                time.sleep(30)
                self.log("\nAttempting to restart scrape process...")
                return self.scrape_search_query(search_query, depth, min_id, max_id)
            else:
                self.log("\nQuitting")
                if self.con:
                    self.con.close()
                if self.logfile:
                    self.logfile.close()
                return this_query_total_tweets
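# A minimal sketch of how JCrawler appears intended to be driven, assuming the MySQL
# schema referenced above (queries, raw_tweets, hashtags, ...) already exists. The
# placeholder credentials and seed keyword list are illustrative only.
if __name__ == '__main__':
    crawler = JCrawler(consumer_key='<consumer_key>',
                       consumer_secret='<consumer_secret>',
                       access_token='<access_token>',
                       access_token_secret='<access_token_secret>',
                       seed_keywords_array=[u'地震', u'台風'],
                       depth=1000,
                       max_tweets=100000)
    crawler.connect_DB('localhost', '<dbuser>', '<dbpass>', 'jcrawler')
    crawler.start()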
ts = TwitterSearch(consumer_key=keys[0],
                   consumer_secret=keys[1],
                   access_token=keys[2],
                   access_token_secret=keys[3])
pp = pprint.PrettyPrinter(indent=4)
latest_id = 0
num_tweets = 0
next_max_id = 0
try:
    for i in range(iters):
        # first query the Twitter API
        response = ts.search_tweets(tso)
        # print rate limiting status
        print "Current api calls remaining: %s" % ts.get_metadata()['x-rate-limit-remaining']
        old_num_tweets = num_tweets
        # check all tweets according to their ID
        for tweet in response['content']['statuses']:
            num_tweets += 1
            tweet_id = tweet['id']
            with open(args.out_file, 'a') as fout:
                json.dump(tweet, fout)
                fout.write('\n')
                #pprint.pprint(tweet, stream=fout)
            if (latest_id == 0):
                latest_id = tweet_id
            # current ID is lower than current next_max_id?
            if (tweet_id < next_max_id) or (next_max_id == 0):
                # The excerpt is truncated here; presumably next_max_id is lowered to
                # tweet_id so the next search_tweets() call pages further back.
                next_max_id = tweet_id
class RestController:

    def __init__(self, config):
        self.config = config
        self.ts = TwitterSearch(
            consumer_key=config['TwitterAuth']['consumer_key'],
            consumer_secret=config['TwitterAuth']['consumer_secret'],
            access_token=config['TwitterAuth']['access_token'],
            access_token_secret=config['TwitterAuth']['access_token_secret'])
        self.searchParametersLog = []
        self.tsqList = []
        self.DBController = SearchDBController(config)

    #---Main public methods----
    def addNewSearchParams(self, searchParams):
        # if search parameters are unique and complete, add them to the
        # database search controller
        self.DBController.addSearchParams(searchParams)

    def basicSearch(self, collection_names):
        self._clearTsqList()
        self._readParamsFromDatabase(collection_names)
        self._updateQueriesAllowed()
        for tsq in self.tsqList:
            if tsq.queriesAllowed == 0:
                self.searchParametersLog.append("Failure")
                print("Not enough queries remaining")
                break
            try:
                tsq.performSearch()
                self._moveResultsToDatabase(tsq)
            except TwitterSearchException as e:
                self.searchParametersLog.append("TwitterSearch Failure")
                self.writeSearchLog('./')
                raise e
            except:
                self.searchParametersLog.append("Write Failure")
                self.writeSearchLog('./')
                raise
            self.searchParametersLog.append(
                "Success - %s: %d Tweets Written at %s"
                % (tsq.collection_name, len(tsq.buffer), str(datetime.now())))

    #---Debug methods----
    def getTweetsFromCollection(self, collection_name):
        tsq = self._findTsqFromCollectionName(collection_name)
        return tsq.buffer

    def firstTweetFromCollectionName(self, collection_name):
        tsq = self._findTsqFromCollectionName(collection_name)
        if (tsq):
            return tsq.buffer[0]

    def clearDBCollections(self):
        for col in self.DBController.getAllCollectionNames():
            if (col == "searchlookup"):
                self.DBController.clearCollection(col)
            else:
                self.DBController.dropCollection(col)

    def writeSearchLog(self, path):
        with open(path + 'searches.log', mode='a', encoding='utf-8') as logfile:
            logfile.write('\n'.join(
                str(entry) for entry in self.searchParametersLog))
            logfile.write('\n')
        self.searchParametersLog = []

    #---Private methods----
    def _readParamsFromDatabase(self, collection_names):
        # add the unique twitter search from database
        for name in collection_names:
            params = self.DBController.readSearchParamsFromCollectionName(name)
            self.tsqList.append(TwitterSearchQuery(self.ts, params))
            self.searchParametersLog.append(params)

    def _moveResultsToDatabase(self, tsq):
        if (tsq.buffer):
            self.DBController.writeTweets(tsq.collection_name, tsq.buffer)
            self.DBController.writeSinceId(tsq.collection_name, tsq.getSinceId())

    def _clearTsqList(self):
        self.tsqList = []

    def _findTsqFromCollectionName(self, collection_name):
        for tsq in self.tsqList:
            if (tsq.collection_name == collection_name):
                return tsq
        return None

    def _updateQueriesAllowed(self):
        for tsq in self.tsqList:
            try:
                # split the remaining rate-limit budget evenly across queued searches
                tsq.queriesAllowed = int(
                    self.ts.get_metadata()['x-rate-limit-remaining']) // len(self.tsqList)
            except TwitterSearchException as e:
                if (e.code == 1012):
                    # metadata unavailable: assume the full 180-request window
                    tsq.queriesAllowed = 180 // len(self.tsqList)
                else:
                    raise e
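# A minimal sketch of driving RestController, assuming a config dict shaped like the one
# read in __init__ and that SearchDBController already holds search parameters for the
# named collections. The collection name and placeholder credentials are illustrative.
config = {
    'TwitterAuth': {
        'consumer_key': '<consumer_key>',
        'consumer_secret': '<consumer_secret>',
        'access_token': '<access_token>',
        'access_token_secret': '<access_token_secret>',
    },
    # ... plus whatever SearchDBController expects (e.g. database settings)
}

controller = RestController(config)
controller.basicSearch(['my_collection'])
controller.writeSearchLog('./')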
tso.set_include_entities(True)
if UNTIL:
    tso.set_until(UNTIL)

# it's about time to create a TwitterSearch object with our secret tokens
ts = TwitterSearch(consumer_key=CONSUMER_KEY,
                   consumer_secret=CONSUMER_SECRET,
                   access_token=ACCESS_TOKEN,
                   access_token_secret=ACCESS_TOKEN_SECRET,
                   verify=True)
#ts.authenticate()
count = 0

# connect to mongo
connection = pymongo.Connection("mongodb://{0}".format(DB_URL), safe=True)
db = connection.twitter
users = db.users
tweets = db.tweets
new_users = 0

response = ts.search_tweets_iterable(tso)
for tweet in response:  # this is where the fun actually starts :)
    limit_remaining = ts.get_metadata()['x-rate-limit-remaining']
    limit_reset = ts.get_metadata()['x-rate-limit-reset']
    limit = ts.get_metadata()['x-rate-limit-limit']
    sleep = needs_sleep(limit_remaining, limit_reset)
    if sleep:
        print 'Sleeping {0} seconds to avoid reaching rate limit.'.format(sleep)
        time.sleep(sleep)
    tweet['twitteranalytics_project_id'] = PROJECT_ID
    if (tweets.find({"id": tweet['id'],
                     "twitteranalytics_project_id": PROJECT_ID}).count() == 0):
        dt = datetime.datetime.strptime(tweet['created_at'],
                                        '%a %b %d %H:%M:%S +0000 %Y')
        tweet['created_at_dt'] = dt
        if (START_DATE and END_DATE and dt >= START_DATE and dt <= END_DATE) \
                or (not (START_DATE and END_DATE)):
            tweets.insert(tweet)
    else:
        print 'We reached our newest stored tweet: {0}'.format(tweet['text'].encode('utf-8'))
        #break
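# The loop above calls needs_sleep(), which is not part of this excerpt. A plausible
# minimal implementation, given how its return value is used (falsy means "keep going",
# otherwise sleep that many seconds), might look like this; the threshold of 5 remaining
# calls and the safety margin are assumptions, not from the source.
import time

def needs_sleep(limit_remaining, limit_reset, threshold=5):
    """Return seconds to wait until the rate-limit window resets, or 0."""
    if int(limit_remaining) > threshold:
        return 0
    # sleep until the reset time, plus a small safety margin
    return max(int(limit_reset) - int(time.time()), 0) + 5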