def startMining(self):
    try:
        tso = TwitterSearchOrder()  # create a TwitterSearchOrder object
        tso.set_keywords(self.keywords)  # let's define all words we would like to have a look for
        # tso.set_language('de')  # we want to see German tweets only
        tso.set_include_entities(False)  # and don't give us all those entity information

        # it's about time to create a TwitterSearch object with our secret tokens
        ts = TwitterSearch(consumer_key=self.consumer_key,
                           consumer_secret=self.consumer_secret,
                           access_token=self.access_token,
                           access_token_secret=self.access_token_secret)

        sleep_for = 60  # sleep for 60 seconds
        last_amount_of_queries = 0  # used to detect when new queries are done

        # this is where the fun actually starts :)
        for tweet in ts.search_tweets_iterable(tso):
            print('@%s tweeted: %s' % (tweet['user']['screen_name'], tweet['text']))
            current_amount_of_queries = ts.get_statistics()[0]
            if not last_amount_of_queries == current_amount_of_queries:
                last_amount_of_queries = current_amount_of_queries
                time.sleep(sleep_for)

    except TwitterSearchException as e:  # take care of all those ugly errors if there are some
        print(e)
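# A minimal sketch of a host object for startMining() above, assuming the method
# hangs off a class that carries the keyword list and the four OAuth credentials
# it reads from self. The class name Miner and the placeholder credential strings
# are hypothetical, not part of the original source.
from TwitterSearch import TwitterSearch, TwitterSearchOrder, TwitterSearchException
import time

class Miner(object):
    def __init__(self, keywords, consumer_key, consumer_secret,
                 access_token, access_token_secret):
        self.keywords = keywords
        self.consumer_key = consumer_key
        self.consumer_secret = consumer_secret
        self.access_token = access_token
        self.access_token_secret = access_token_secret

Miner.startMining = startMining  # attach the function above as a method

# miner = Miner(['python'], '<consumer_key>', '<consumer_secret>',
#               '<access_token>', '<access_token_secret>')
# miner.startMining()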
def main():
    """
    Retrieve news tweets using the Twitter Search API with a filter for news.
    Important note: we downloaded TwitterSearch from
    https://github.com/ckoepp/TwitterSearch and added the filter to the search
    URL! This filter retrieves tweets from the requested date (or from now if
    left empty) that contain links to news web sites.
    """
    args = docopt("""Retrieve news tweets using the Twitter Search API with a filter for news.

    Important note: we downloaded TwitterSearch from https://github.com/ckoepp/TwitterSearch
    and added the filter to the search URL! This filter retrieves tweets from the
    requested date (or from now if left empty) that contain links to news web sites.

    This script will save the tweets in a file named by the date they were created at.

    Usage:
        get_news_tweets_stream.py <consumer_key> <consumer_secret> <access_token> <access_token_secret> [<until>]

    Arguments:
        consumer_key         Consumer key for the Twitter API
        consumer_secret      Consumer secret for the Twitter API
        access_token         Access token for the Twitter API
        access_token_secret  Access token secret for the Twitter API
        until                (Optional) date in the format YYYY/MM/dd. Retrieve
                             tweets only until this date. If this argument is not
                             specified, it will retrieve current tweets. The
                             Search API only supports up to one week ago.
    """)

    consumer_key = args['<consumer_key>']
    consumer_secret = args['<consumer_secret>']
    access_token = args['<access_token>']
    access_token_secret = args['<access_token_secret>']

    tso = TwitterSearchOrder()
    tso.set_keywords(['.'])
    tso.set_language('en')

    # Set until date
    if args['<until>']:
        # The original read args['-until'], which is not a key docopt produces
        # for the <until> argument declared in the usage string above.
        year, month, day = map(int, args['<until>'].split('/'))
        out_tweet_file = 'news_stream/tweets/%d_%02d_%02d' % (year, month, day - 1)
        tso.set_until(datetime.date(year, month, day))
    else:
        out_tweet_file = 'news_stream/tweets/' + time.strftime('%Y_%m_%d')

    sleep_for = 10
    last_amount_of_queries = 0
    ts = TwitterSearch(consumer_key=consumer_key,
                       consumer_secret=consumer_secret,
                       access_token=access_token,
                       access_token_secret=access_token_secret)

    tweets = set()
    with codecs.open(out_tweet_file, 'w', 'utf-8') as f_out:
        # Stop this manually
        while True:
            try:
                # Get the next batch of tweets
                for tweet in ts.search_tweets_iterable(tso):
                    text = clean_tweet(tweet['text'].encode(
                        sys.getdefaultencoding(), 'ignore').replace('\n', ' '))
                    if text not in tweets:
                        print >> f_out, '\t'.join(
                            (tweet['created_at'], str(tweet['id']),
                             tweet['user']['screen_name'], text))
                        tweets.add(text)
                    current_amount_of_queries = ts.get_statistics()[0]
                    # Handle API rate limit
                    if not last_amount_of_queries == current_amount_of_queries:
                        last_amount_of_queries = current_amount_of_queries
                        time.sleep(sleep_for)
            except TwitterSearchException:
                time.sleep(sleep_for)
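# A minimal sketch of the script entry point, assuming the original module
# imports docopt, codecs, sys, time, datetime, and TwitterSearch, and defines
# the clean_tweet() helper used above. Example invocation (Python 2, given the
# `print >>` syntax in main()):
#   python get_news_tweets_stream.py <consumer_key> <consumer_secret> \
#       <access_token> <access_token_secret> 2017/01/15
if __name__ == '__main__':
    main()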
from TwitterSearch import *
from settings import *
import time

try:
    tso = TwitterSearchOrder()
    tso.set_keywords(['brexit', 'May'])

    ts = TwitterSearch(
        consumer_key=twitter_consumer_key,
        consumer_secret=twitter_consumer_secret,
        access_token=twitter_access_token,
        access_token_secret=twitter_access_token_secret
    )

    sleep_for = 5  # sleep for 5 seconds
    last_amount_of_queries = 0  # used to detect when new queries are done

    for tweet in ts.search_tweets_iterable(tso):
        print(tweet)
        # print('@%s tweeted: %s' % (tweet['user']['screen_name'], tweet['text']))
        current_amount_of_queries = ts.get_statistics()[0]
        if not last_amount_of_queries == current_amount_of_queries:
            last_amount_of_queries = current_amount_of_queries
            time.sleep(sleep_for)

except TwitterSearchException as e:
    print(e)
# (fragment: the opening `try:` and the start of the TwitterSearch(...) call
#  are not shown in the original source)
        access_token_secret='qxvgonMEav4yHBKtdC8gXtOlI4S0VqlCveFpK7hmDu2hF')

    # for tweet in ts.search_tweets_iterable(tso):
    #     print('@%s tweeted: %s' % (tweet['user']['screen_name'], tweet['text']))
    # print(len(ts.search_tweets_iterable(tso)))

    sleep_for = 10  # sleep for 10 seconds
    last_amount_of_queries = 0  # used to detect when new queries are done

    analyzer = SentimentIntensityAnalyzer()
    for tweet in ts.search_tweets_iterable(tso):
        vs = analyzer.polarity_scores(tweet['text'])
        # print('@%s tweeted: %s' % (tweet['user']['screen_name'], tweet['text']))
        tweet_text = filterTweet(tweet['text'])
        # import pdb; pdb.set_trace()
        print("{:-<65} {}".format(tweet_text, str(vs)))
        # import pdb; pdb.set_trace()
        # queries, tweets_seen = current_ts_instance.get_statistics()
        current_amount_of_queries = ts.get_statistics()[0]
        if not last_amount_of_queries == current_amount_of_queries:
            last_amount_of_queries = current_amount_of_queries
            time.sleep(sleep_for)

except TwitterSearchException as e:  # take care of all those ugly errors if there are some
    print(e)
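# filterTweet() is called above but not defined in this fragment (which also
# assumes vaderSentiment's SentimentIntensityAnalyzer is imported earlier).
# A minimal sketch of what such a helper might do -- stripping URLs, mentions,
# and surplus whitespace before printing -- as a guess at the missing
# implementation, not the original:
import re

def filterTweet(text):
    text = re.sub(r'https?://\S+', '', text)  # drop links
    text = re.sub(r'@\w+', '', text)          # drop @mentions
    return ' '.join(text.split())             # collapse whitespace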
# (imports and the module-level counter assumed by this function; they are not
#  shown in the original fragment)
from TwitterSearch import *
import datetime

twitterSearchCount = 0  # incremented by twitSearch() below

def twitSearch(tweetLastSeen):
    # print("Debug: In function twitSearch()")
    tweetSearchCount = 0
    try:
        tso = TwitterSearchOrder()
        # tso.set_keywords(['disaster', 'banking'], or_operator=True)
        tso.set_keywords(['disaster', 'poverty', 'banking', 'homeless'], or_operator=True)
        # tso.add_keyword('poverty')
        # tso.add_keyword('disaster')
        # tso.add_keyword('banking')
        tso.set_language('en')
        tso.set_include_entities(False)
        tso.set_result_type('recent')
        if tweetLastSeen > 0:
            print("Debug: I have a previous value for lastseen_id, setting since_id() to: %s and asking for 100 results" % tweetLastSeen)
            tso.set_since_id(tweetLastSeen)
            tso.set_count(100)
        else:
            print("Debug: No value for lastseen_id, asking for one result")
            tso.set_count(1)
        print("Debug: The tso search string looks like this")
        print(tso.create_search_url())

        ts = TwitterSearch(
            consumer_key='',
            consumer_secret='',
            access_token='',
            access_token_secret='')

        ## def my_callback_function(current_ts_instance):  # accepts ONE argument: an instance of TwitterSearch
        ##     # print("In callback function")
        ##     queries, tweets_seen = current_ts_instance.get_statistics()
        ##     # query = current_ts_instance.get_statistics()
        ##     print("%s queries & %s tweets seen" % (queries, tweets_seen))
        ##     print("%s query" % (query))
        ##     # if queries > 0 and (queries % 5) == 0:  # trigger delay every 5th query
        ##     #     print("Thats 5 queries. Sleeping for 60 secs")
        ##     #     time.sleep(60)  # sleep for 60 seconds

        # queries, tweets_seen = ts.get_statistics()
        # print("Debug: %s queries & %s tweets seen" % (queries, tweets_seen))
        # print("Debug: About to iterate over search results from TwitterSearch instance")
        # for tweet in ts.search_tweets_iterable(tso, callback=my_callback_function):
        tweets_seen = 0
        currentTweetID = 0
        lastTweetID = 0
        for tweet in ts.search_tweets_iterable(tso):
            queries, tweets_seen_by_stats = ts.get_statistics()
            print("Debug: stats: %s queries & %s tweets seen" % (queries, tweets_seen_by_stats))
            rateLimitRemaining = ts.get_metadata()['x-rate-limit-remaining']
            # header names are case-insensitive; normalized from 'X-Rate-Limit-Reset'
            rateLimitReset = ts.get_metadata()['x-rate-limit-reset']
            print("Debug: Rate limit resets at %s and has %s queries remaining"
                  % (datetime.datetime.fromtimestamp(int(rateLimitReset)), rateLimitRemaining))
            currentTweetID = tweet['id']
            print("Debug: Current tweetID %s" % currentTweetID)
            if currentTweetID > lastTweetID:
                print("Debug: Seen a more recent tweetID, updating lastTweetID")
                lastTweetID = currentTweetID
                tweets_seen = tweets_seen_by_stats
                break
            print('Debug: In tweet iter @%s tweet id: %s' % (tweet['user']['screen_name'], tweet['id']))
            tweets_seen = tweets_seen + 1
            print("Debug: tweets_seen: %s" % tweets_seen)

        print('Debug: about to return tweet ID @%s' % lastTweetID)
        global twitterSearchCount
        twitterSearchCount = twitterSearchCount + 1
        print("Debug: This is twitter search number: %s" % twitterSearchCount)
        return lastTweetID, tweets_seen

    except TwitterSearchException as e:
        print(e)
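# A minimal sketch of a polling loop around twitSearch() above, assuming the
# caller carries the last-seen tweet id between calls so set_since_id() only
# asks for newer tweets. lastSeen and POLL_INTERVAL are hypothetical names;
# note twitSearch() returns None when a TwitterSearchException is caught.
import time

lastSeen = 0
POLL_INTERVAL = 60  # seconds between polls
while True:
    result = twitSearch(lastSeen)
    if result is not None:
        lastSeen, seen = result
        print("Newest tweet id: %s (%s tweets seen)" % (lastSeen, seen))
    time.sleep(POLL_INTERVAL)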
import time

def avoid_rate_limit(self, ts):  # callback: receives the current TwitterSearch instance
    queries, tweets_seen = ts.get_statistics()
    if queries > 0 and (queries % 5) == 0:  # trigger delay every 5th query
        time.sleep(30)  # sleep for 30 seconds
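# TwitterSearch invokes the callback passed to search_tweets_iterable() with the
# current TwitterSearch instance after each API query, which is how
# avoid_rate_limit() above is meant to be used (the commented-out
# my_callback_function earlier shows the same pattern). A minimal sketch; the
# class name Throttled and the run() wiring are hypothetical:
class Throttled(object):
    def run(self, ts, tso):
        for tweet in ts.search_tweets_iterable(tso, callback=self.avoid_rate_limit):
            print(tweet['text'])

Throttled.avoid_rate_limit = avoid_rate_limit  # attach the function defined above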
# (fragment: assumes ts, tso, and sleep_for are defined above, along with
#  `import pickle` and `import time`)
last_num_queries = 0
retrieved_tweets = set()
number_of_repeat_tweets = 0
try:
    tweetpkl = open('lol.pkl', 'wb')
    tweets = []
    for tweet in ts.search_tweets_iterable(tso):
        tweets.append(tweet)
        if tweet['id'] in retrieved_tweets:
            print("tweet has already been retrieved")
            number_of_repeat_tweets += 1
        else:
            retrieved_tweets.add(tweet['id'])
        cur_num_queries, tweets_recd = ts.get_statistics()
        if not last_num_queries == cur_num_queries:
            print("Queries done: %i. Tweets received: %i" % ts.get_statistics())
            last_num_queries = cur_num_queries
            print("Sleeping for", sleep_for, "seconds")
            time.sleep(sleep_for)
    pickle.dump(tweets, tweetpkl)
    print("Wrote tweets to file", tweetpkl.name)
    print("number_of_repeat_tweets: ", number_of_repeat_tweets)
    tweetpkl.close()
except TwitterSearchException as e:
    if e.code < 1000:
        print("HTTP status based exception: %i - %s" % (e.code, e.message))
    else:
        # The original was cut off here; TwitterSearch reserves codes >= 1000
        # for library-internal errors, so report those the same way.
        print("Library-internal exception: %i - %s" % (e.code, e.message))
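# A short sketch of reading the pickled tweets back from the file written above
# ('lol.pkl'), assuming the same Python version that wrote it:
import pickle

with open('lol.pkl', 'rb') as f:
    saved_tweets = pickle.load(f)
print("Loaded %d tweets" % len(saved_tweets))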
# Imports required by this class (not shown in the original fragment)
import codecs
import hashlib
import json
import logging
import re
import time

import pymysql
import pymysql.cursors
from TwitterSearch import TwitterSearch, TwitterSearchOrder, TwitterSearchException


class JCrawler(object):
    'Twitter crawler for Japanese tweets'

    tso = TwitterSearchOrder()
    ts = None
    con = None
    cursor = None
    DAILY_THRESHOLD = 50
    IDLE_SLEEP_TIME = 60
    total_api_queries = 0
    queries_processed = 0
    total_tweets_grabbed = 0
    max_tweets = -1
    global_min_id = -1
    global_max_id = -1
    depth = 10
    seed_keywords_array = None
    logfile = None
    exact_tweet_re = re.compile('^RT @')
    queries_to_update = 0
    consumer_key = ''
    consumer_secret = ''
    access_token = ''
    access_token_secret = ''

    def __init__(self, consumer_key, consumer_secret, access_token,
                 access_token_secret, seed_keywords_array=None, depth=1000,
                 global_min_id=-1, global_max_id=-1, max_tweets=-1):
        # self.load_settings()
        self.depth = depth
        self.seed_keywords_array = seed_keywords_array
        self.global_max_id = global_max_id
        self.global_min_id = global_min_id
        self.max_tweets = max_tweets
        self.consumer_key = consumer_key
        self.consumer_secret = consumer_secret
        self.access_token = access_token
        self.access_token_secret = access_token_secret
        # Open the logfile
        self.logfile = codecs.open("log/tweetlog", 'a', encoding='utf-8')
        if self.max_tweets == -1:
            self.max_tweets = float("inf")

    # def load_settings(self, filepath=""):
    #     if filepath == "":
    #         filepath = "settings.cfg"
    #     config = ConfigParser.RawConfigParser()
    #     config.read(filepath)

    def log(self, message):
        # print(message)
        self.logfile.write(message)

    def get_old_queries_from_DB(self):
        self.checkConnection()
        db_query = u"""SELECT * FROM `queries`
                       WHERE `next_update` <= %s AND total_updates > 0"""
        dict_cursor = self.con.cursor(pymysql.cursors.DictCursor)
        dict_cursor.execute(db_query, [int(time.time())])
        self.queries_to_update += dict_cursor.rowcount
        # Get the results
        rows = dict_cursor.fetchall()
        # query_array = []
        # for row in rows:
        #     query_array.append(row)
        return rows

    def update_queries(self):
        query_array = self.get_old_queries_from_DB()
        for query in query_array:
            if self.total_tweets_grabbed >= self.max_tweets:
                break
            min_id = query["min_id"] if "min_id" in query else None
            max_id = query["max_id"] if "max_id" in query else None
            # These were bare function calls in the original; they are methods
            # of this class, so they need the `self.` prefix.
            num_tweets_fetched = self.scrape_search_query(query, self.depth,
                                                          min_id, max_id)
            self.calculate_next_update(query, num_tweets_fetched)

    def get_new_queries_from_DB(self):
        self.checkConnection()
        db_query = u"""SELECT * FROM `queries`
                       WHERE `next_update` <= %s AND total_updates = 0"""
        dict_cursor = self.con.cursor(pymysql.cursors.DictCursor)
        dict_cursor.execute(db_query, [int(time.time())])
        self.queries_to_update += dict_cursor.rowcount
        # Get the results
        rows = dict_cursor.fetchall()
        return rows

    def process_new_queries(self, query_string_array=None):
        if query_string_array is None:
            query_array = self.get_new_queries_from_DB()
        else:
            query_array = [{'query': string} for string in query_string_array]
        for query in query_array:
            if self.total_tweets_grabbed >= self.max_tweets:
                break
            num_tweets_fetched = self.scrape_search_query(query, self.depth)
            self.calculate_next_update(query, num_tweets_fetched)

    def connect_DB(self, host, dbuser, dbpass, dbname):
        try:
            conn = pymysql.connect(host, dbuser, dbpass, dbname,
                                   charset='utf8mb4', use_unicode=True,
                                   init_command="set names utf8mb4")
            testcursor = conn.cursor()
            testcursor.execute("SELECT VERSION()")
            results = testcursor.fetchone()
            # Check if anything at all is returned
            if results:
                self.log('Connected to DB, version: %s' % results)
            else:
                self.log('Connection to DB failed')
            # Disable binary logging to save space
            testcursor.execute("SET sql_log_bin = 0")
            results = testcursor.fetchone()
            testcursor.execute("SET NET_WRITE_TIMEOUT = 2147480")
            results = testcursor.fetchone()
            testcursor.execute("SET WAIT_TIMEOUT = 2147480")
            results = testcursor.fetchone()
            self.con = conn
            return conn
        except pymysql.Error as e:
            self.log("\nError %d: %s" % (e.args[0], e.args[1]))

    def start(self):
        try:
            # self.log('Connected to DB: %s' % self.con.info())
            # while True:
            while self.total_tweets_grabbed < self.max_tweets:
                self.ReportStatus(1)
                self.log("\nUpdating old queries")
                self.update_queries()
                self.log("\nProcessing new queries")
                if self.seed_keywords_array is not None:
                    self.process_new_queries(self.seed_keywords_array)
                    # Only process seed keywords once
                    self.seed_keywords_array = None
                else:
                    self.process_new_queries()
                # Keep script from wasting DB resources continually checking
                # for queries to update
                if self.queries_to_update == 0:
                    self.ReportStatus(0)
                    time.sleep(self.IDLE_SLEEP_TIME)
                # while AnalysisIsRunning():
                #     self.ReportStatus(0)
                #     time.sleep(self.IDLE_SLEEP_TIME)
                if not self.con.open:
                    self.con = self.connect_DB('localhost', 'root',
                                               'preyfromslyfluxlinus',
                                               'jcrawler')
                # self.queries_to_update = 0
        except TwitterSearchException as e:
            # NOTE: search_query/depth/min_id/max_id are not defined in
            # start(); this handler appears copied from scrape_search_query().
            if e.code == 420:
                self.log("\nBlaze it for a hot sec: we're being rate limited. Waiting 15 minutes...")
                self.ReportStatus(2)
                time.sleep(900)
                self.log("\nAttempting to restart scrape process...")
                return self.scrape_search_query(search_query, depth, min_id, max_id)
            elif e.code == 403 or e.code == 503:
                self.log("\nTwitter is either experiencing high load or we are being rate limited. Waiting 1 hour...")
                self.ReportStatus(2)
                time.sleep(3600)
                self.log("\nAttempting to restart scrape process...")
                return self.scrape_search_query(search_query, depth, min_id, max_id)
            else:
                # self.log(e)
                logging.exception("TwitterSearch had an error:")
                print(e)
        except pymysql.Error as e:
            self.log("\nError %d: %s" % (e.args[0], e.args[1]))
        else:
            self.log("\nQuitting")
            if self.con:
                self.con.close()
            if self.logfile:
                self.logfile.close()

    def restart(self):
        if self.con.open:
            self.con.close()
        self.cursor = None
        self.total_tweets_grabbed = 0
        self.logfile = codecs.open("log/tweetlog", 'a', encoding='utf-8')
        self.start()

    def checkConnection(self):
        if self.con is None or not self.con.open:
            self.con = self.connect_DB('localhost', 'root',
                                       'preyfromslyfluxlinus', 'jcrawler')
            self.cursor = self.con.cursor()

    def ReportStatus(self, status, msg=""):
        'Posts info about queries processed, currently processing, etc.'
        self.checkConnection()
        app = "jcrawler"
        if status == 1:    # Currently running
            variable = "processing"
            value = "1"
        elif status == 0:  # Done running
            variable = "processing"
            value = "0"
        elif status == 2:  # Sleeping
            variable = "processing"
            value = "2"
        controlQuery = u"""INSERT INTO `control`(`app`, `variable`, `value`)
                           VALUES(%s,%s,%s)
                           ON DUPLICATE KEY UPDATE value=VALUES(value)"""
        progressQuery = u"""INSERT INTO `progress`(`app`, `processed`)
                            VALUES(%s, %s)"""
        reportcursor = self.con.cursor()
        reportcursor.execute(controlQuery, (app, variable, value))
        reportcursor.execute(progressQuery, (app, self.total_tweets_grabbed))
        self.con.commit()
        reportcursor.close()

    def AnalysisIsRunning(self):  # original lacked `self`; added so it works as a method
        self.checkConnection()
        statusQuery = u"""SELECT `value` FROM `control`
                          WHERE `app`='lexicalyzer' AND `variable`='processing'"""
        statuscursor = self.con.cursor()
        statuscursor.execute(statusQuery)
        # The original fetched from self.cursor here, a different cursor;
        # fetch from the cursor the query was executed on. The value column
        # holds strings ("0"/"1"/"2", see ReportStatus), so cast before comparing.
        value = statuscursor.fetchone()
        isRunning = int(value[0]) > 0
        statuscursor.close()
        return isRunning

    def add_queries_to_DB(self, query_array):
        self.checkConnection()
        for query_hash in query_array:
            # exist_stmt = u"""
            #     SELECT * FROM `queries`
            #     WHERE `sha256_hash` = %s
            # """
            query = query_array[query_hash]
            prepstatement = u"""INSERT INTO `queries`(`max_id`, `min_id`, `query`,
                    `last_update`, `sha256_hash`, `next_update`,
                    `penalty_level`, `total_tweets`)
                VALUES (%s,%s,%s,%s,%s,%s,%s,%s)
                ON DUPLICATE KEY UPDATE
                    max_id=VALUES(max_id), min_id=VALUES(min_id),
                    last_update=VALUES(last_update),
                    next_update=VALUES(next_update),
                    penalty_level=VALUES(penalty_level),
                    total_tweets=VALUES(total_tweets)"""
            # self.log("\nQuery: %s" % query)
            self.cursor.execute(prepstatement,
                                (query["max_id"], query["min_id"],
                                 query["query"], query["last_update"],
                                 query["sha256_hash"], query["next_update"],
                                 query["penalty_level"], query["total_tweets"]))

    def add_query_to_DB(self, query):
        prepstatement = u"""INSERT INTO `queries`(`max_id`, `min_id`, `query`,
                `last_update`, `sha256_hash`, `next_update`,
                `penalty_level`, `total_tweets`)
            VALUES (%s,%s,%s,%s,%s,%s,%s,%s)
            ON DUPLICATE KEY UPDATE
                max_id=VALUES(max_id), min_id=VALUES(min_id),
                last_update=VALUES(last_update),
                next_update=VALUES(next_update),
                penalty_level=VALUES(penalty_level),
                total_tweets=VALUES(total_tweets)"""
        self.cursor.execute(prepstatement,
                            (query["max_id"], query["min_id"], query["query"],
                             query["last_update"], query["sha256_hash"],
                             query["next_update"], query["penalty_level"],
                             query["total_tweets"]))

    def is_exact_retweet(self, tweet):
        return (self.exact_tweet_re.match(tweet["text"])
                or tweet["retweeted_status"]["text"] == tweet["text"])

    def gen_query_hash(self, query_text):
        return hashlib.sha256(query_text.encode("utf8")).hexdigest()

    def add_to_queries_list(self, query_text, tweet, queries_array):
        query_hash = self.gen_query_hash(query_text)
        if query_hash not in queries_array or queries_array[query_hash] is None:
            self.log("\nAdding new query to list")
            queries_array[query_hash] = {"query": query_text,
                                         "sha256_hash": query_hash,
                                         "max_id": tweet["id"],
                                         "min_id": tweet["id"],
                                         "next_update": int(time.time()),
                                         "penalty_level": 0,
                                         "total_tweets": 1,
                                         "last_update": int(time.time())}
        else:
            if tweet["id"] >= queries_array[query_hash]["max_id"]:
                queries_array[query_hash]["max_id"] = tweet["id"]
            elif tweet["id"] <= queries_array[query_hash]["min_id"]:
                queries_array[query_hash]["min_id"] = tweet["id"]
            queries_array[query_hash]["total_tweets"] += 1
            queries_array[query_hash]["last_update"] = int(time.time())

    def calculate_next_update(self, query, num_tweets_fetched):
        if num_tweets_fetched < self.DAILY_THRESHOLD:
            if "penalty_level" in query and query["penalty_level"] is not None:
                query["penalty_level"] += 1
            else:
                query["penalty_level"] = 1
            # Wait penalty_level days until next update
            query["next_update"] = int(time.time()) + (query["penalty_level"] * 24 * 60 * 60)
            self.log("\nPenalizing query by %s days. Next update: %s"
                     % (query["penalty_level"], query["next_update"]))
        else:
            query["penalty_level"] = 0
            # Wait one day
            query["next_update"] = int(time.time()) + (24 * 60 * 60)

    def scrape_search_query(self, search_query, depth, min_id=-1, max_id=-1):
        try:
            self.log("\nScraping...")
            # self.con = connect_DB('localhost',
            #                       'root',
            #                       'preyfromslyfluxlinus',
            #                       'jcrawler')
            self.cursor = self.con.cursor()
            self.tso.set_keywords([search_query["query"]])
            self.tso.set_language('ja')  # search for Japanese tweets only
            self.tso.set_locale('ja')
            if self.global_min_id != -1:
                self.tso.set_since_id(self.global_min_id)
            elif min_id != -1:
                self.tso.set_since_id(min_id)
            if self.global_max_id != -1:
                # original referenced a bare `global_max_id`; it is an attribute
                self.tso.set_max_id(self.global_max_id)
            elif max_id != -1:
                self.tso.set_max_id(max_id)

            sleep_time = 1
            last_num_of_queries = 0
            this_query_total_tweets = 0
            new_queries = {}

            # it's about time to create a TwitterSearch object with our secret tokens
            if self.ts is None:
                self.ts = TwitterSearch(
                    consumer_key='lHw0Fte6wfJnzxMrA9nRxqJJN',
                    consumer_secret='UQmsX0hC9wvuzhaMLZ4OpB'
                                    'FqfI4vYdMrHSNt0FYEmFGcsYU0iK',
                    access_token='4920693635-jk0qpriUrztwA2'
                                 'a7dOwp4EhQI86qHt4xLbq7uPU',
                    access_token_secret='3mTFWM8leIXQzaiqnH'
                                        'hwbspN6BzB5O8qMYKDnrgqHCKBz'
                )

            start = int(time.time())
            for tweet in self.ts.search_tweets_iterable(self.tso):
                self.ReportStatus(1)
                if ('coordinates' in tweet
                        and tweet["coordinates"] is not None
                        and 'coordinates' in tweet["coordinates"]
                        and tweet["coordinates"]["coordinates"]
                        and tweet["coordinates"]["coordinates"][0] is not None):
                    has_coordinates = 1
                    longitude = tweet["coordinates"]["coordinates"][0]
                    latitude = tweet["coordinates"]["coordinates"][1]
                else:
                    has_coordinates = 0
                    longitude = 181
                    latitude = 91

                if ('retweeted_status' in tweet
                        and tweet["retweeted_status"] is not None
                        and 'id' in tweet["retweeted_status"]
                        and tweet["retweeted_status"]["id"] is not None):
                    is_retweet = 1
                    retweet_id_int = tweet["retweeted_status"]["id"]
                    retweet_id_str = tweet["retweeted_status"]["id_str"]
                    if self.is_exact_retweet(tweet):
                        is_exact_retweet = 1
                        # Don't bother to save exact retweets anymore
                        continue
                    else:
                        is_exact_retweet = 0
                else:
                    is_retweet = 0
                    retweet_id_int = -1
                    retweet_id_str = ""
                    is_exact_retweet = 0

                this_query_total_tweets += 1
                self.total_tweets_grabbed += 1

                # Write to DB
                # Prepare and execute raw_tweet query
                prepstatement = u"""INSERT INTO `raw_tweets`(`id`, `text`,
                        `created_at`, `lang`, `retweet_count`, `source`,
                        `user_id`, `has_coordinates`, `longitude`,
                        `latitude`, `id_int`, `user_id_int`, `is_retweet`,
                        `retweet_id_int`, `retweet_id`, `is_exact_retweet`)
                    VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                    ON DUPLICATE KEY UPDATE id=id"""
                self.cursor.execute(prepstatement,
                                    (tweet["id_str"], tweet["text"],
                                     tweet["created_at"], tweet["lang"],
                                     tweet["retweet_count"], tweet["source"],
                                     tweet["user"]["id_str"], has_coordinates,
                                     longitude, latitude, tweet["id"],
                                     tweet["user"]["id"], is_retweet,
                                     retweet_id_int, retweet_id_str,
                                     is_exact_retweet))

                rawjsonstmt = u"""INSERT INTO `raw_tweets_json`(`id`, `id_int`, `raw_json`)
                    VALUES (%s,%s,%s)
                    ON DUPLICATE KEY UPDATE id=id"""
                self.cursor.execute(rawjsonstmt,
                                    (tweet["id_str"], tweet["id"],
                                     json.dumps(tweet)))

                # Also store the user data
                user = tweet["user"]
                userstatement = u"""INSERT INTO `twitter_users`(`user_id`,
                        `created_at`, `description`, `followers_count`,
                        `friends_count`, `geo_enabled`, `lang`, `location`,
                        `name`, `protected`, `screen_name`, `statuses_count`,
                        `time_zone`, `url`, `utc_offset`, `verified`)
                    VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                    ON DUPLICATE KEY UPDATE
                        `description`=VALUES(`description`),
                        `friends_count`=VALUES(`friends_count`),
                        `geo_enabled`=VALUES(`geo_enabled`),
                        `lang`=VALUES(`lang`),
                        `location`=VALUES(`location`),
                        `name`=VALUES(`name`),
                        `protected`=VALUES(`protected`),
                        `screen_name`=VALUES(`screen_name`),
                        `statuses_count`=VALUES(`statuses_count`),
                        `time_zone`=VALUES(`time_zone`),
                        `url`=VALUES(`url`),
                        `utc_offset`=VALUES(`utc_offset`)"""
                self.cursor.execute(userstatement,
                                    (user["id_str"], user["created_at"],
                                     user["description"], user["followers_count"],
                                     user["friends_count"], user["geo_enabled"],
                                     user["lang"], user["location"],
                                     user["name"], user["protected"],
                                     user["screen_name"], user["statuses_count"],
                                     user["time_zone"], user["url"],
                                     user["utc_offset"], user["verified"]))

                hashtags = tweet["entities"]["hashtags"]
                for hashtag in hashtags:
                    hashtagstatement = u"""INSERT INTO `hashtags`(`tweet_id`,
                            `hashtag`, `start`, `end`)
                        VALUES (%s,%s,%s,%s)
                        ON DUPLICATE KEY UPDATE tweet_id=tweet_id"""
                    self.cursor.execute(hashtagstatement,
                                        (tweet["id_str"], hashtag["text"],
                                         hashtag["indices"][0],
                                         hashtag["indices"][1]))
                    self.add_to_queries_list(hashtag["text"], tweet, new_queries)

                user_mentions = tweet["entities"]["user_mentions"]
                for mention in user_mentions:
                    mentionstatement = u"""INSERT INTO `mentions`(`tweet_id`,
                            `user_id`, `start`, `end`)
                        VALUES (%s,%s,%s,%s)
                        ON DUPLICATE KEY UPDATE tweet_id=tweet_id"""
                    self.cursor.execute(mentionstatement,
                                        (tweet["id_str"], mention["id_str"],
                                         mention["indices"][0],
                                         mention["indices"][1]))

                embedded_urls = tweet["entities"]["urls"]
                for url in embedded_urls:
                    url_statement = u"""INSERT INTO `embedded_urls`(`tweet_id`,
                            `url`, `start`, `end`)
                        VALUES (%s,%s,%s,%s)
                        ON DUPLICATE KEY UPDATE tweet_id=tweet_id"""
                    self.cursor.execute(url_statement,
                                        (tweet["id_str"], url["url"],
                                         url["indices"][0], url["indices"][1]))

                # TODO: Investigate crash due to lack of x-rate-limit-remaining key
                if 'x-rate-limit-remaining' in self.ts.get_metadata():
                    self.log("\nCurrent rate-limiting status: %s"
                             % self.ts.get_metadata()['x-rate-limit-remaining'])
                # self.log('\n@%s tweeted: %s'
                #          % (tweet['user']['screen_name'], tweet['text']))

                self.add_queries_to_DB(new_queries)
                # Execute queries for real
                self.con.commit()

                # if current_num_of_queries is different from last_num_of_queries,
                # then a new page was loaded by TwitterSearch
                current_num_of_queries = self.ts.get_statistics()[0]
                if not last_num_of_queries == current_num_of_queries:
                    # Calculate the current fetch rate
                    now = int(time.time())
                    if 'x-rate-limit-remaining' in self.ts.get_metadata():
                        reset = int(self.ts.get_metadata()['x-rate-limit-reset'])
                    else:
                        reset = 0
                    if (now - start) == 0:
                        rate = this_query_total_tweets
                    else:
                        rate = this_query_total_tweets / (now - start)
                    # (the original log line said "queries / s", but the value
                    #  computed above is tweets per second)
                    self.log("\nCurrent fetch rate: %s tweets / s" % rate)
                    self.log("\nCurrent tweets processed: %s" % self.total_tweets_grabbed)
                    # Stop with 3 queries left before hitting the limit,
                    # just to be safe
                    if ('x-rate-limit-remaining' in self.ts.get_metadata()
                            and int(self.ts.get_metadata()['x-rate-limit-remaining']) <= 3
                            and now < reset):
                        longsleeptime = reset - now + 60
                        self.log('\nSleeping %s seconds'
                                 '\n(x-rate-limit-remaining=%s,'
                                 '\nx-rate-limit-reset=%s,'
                                 '\ntime now=%s)'
                                 % (longsleeptime,
                                    self.ts.get_metadata()['x-rate-limit-remaining'],
                                    self.ts.get_metadata()['x-rate-limit-reset'],
                                    now))
                        self.ReportStatus(2)
                        time.sleep(longsleeptime)
                        if this_query_total_tweets >= depth:
                            break
                    elif ('x-rate-limit-remaining' in self.ts.get_metadata()
                            and int(self.ts.get_metadata()['x-rate-limit-remaining']) <= 3
                            and now >= reset):
                        # Wait a minute just in case there is a discrepancy
                        # between the rate limit we've been given and the actual one
                        self.log('\nSleeping 60 seconds (x-rate-limit-remaining=%s,'
                                 ' x-rate-limit-reset=%s, time now=%s)'
                                 % (self.ts.get_metadata()['x-rate-limit-remaining'],
                                    self.ts.get_metadata()['x-rate-limit-reset'],
                                    now))
                        self.ReportStatus(2)
                        time.sleep(60)
                        if this_query_total_tweets >= depth:
                            break
                    elif 'x-rate-limit-remaining' not in self.ts.get_metadata():
                        self.log("\nx-rate-limit-remaining missing! Will sleep.")
                        self.ReportStatus(2)
                        time.sleep(900)
                    else:
                        last_num_of_queries = self.ts.get_statistics()[0]
                        # Wait between queries
                        self.log('\nSleeping'
                                 '\n(current_num_of_queries=%s,'
                                 '\nlast_num_of_queries=%s,'
                                 '\nx-rate-limit-remaining=%s,'
                                 '\nx-rate-limit-reset=%s)'
                                 % (current_num_of_queries,
                                    last_num_of_queries,
                                    self.ts.get_metadata()['x-rate-limit-remaining'],
                                    self.ts.get_metadata()['x-rate-limit-reset']))
                        self.ReportStatus(2)
                        time.sleep(sleep_time)
                        if this_query_total_tweets >= depth:
                            break
                    # Update query counter
                    last_num_of_queries = self.ts.get_statistics()[0]

            self.log("\nThis query total tweets processed: %s" % this_query_total_tweets)
            return this_query_total_tweets

        except TwitterSearchException as e:
            if e.code == 420:
                self.log("\nBlaze it for a hot sec: we're being rate limited. Waiting 15 minutes...")
                self.ReportStatus(2)
                time.sleep(900)
                self.log("\nAttempting to restart scrape process...")
                return self.scrape_search_query(search_query, depth, min_id, max_id)
            elif e.code == 403 or e.code == 503:
                self.log("\nTwitter is either experiencing high load or we are being rate limited. Waiting 1 hour...")
                self.ReportStatus(2)
                time.sleep(3600)
                self.log("\nAttempting to restart scrape process...")
                return self.scrape_search_query(search_query, depth, min_id, max_id)
            else:
                # self.log(e)
                logging.exception("TwitterSearch had an error:")
                print(e)
        except pymysql.Error as e:
            self.log("\nError %d: %s" % (e.args[0], e.args[1]))
            if e.args[0] == 2006:
                self.log("\nDetected dead MySQL server, waiting 30 seconds for server to restart...")
                self.ReportStatus(2)
                time.sleep(30)
                self.log("\nAttempting to restart scrape process...")
                return self.scrape_search_query(search_query, depth, min_id, max_id)
            else:
                self.log("\nQuitting")
                if self.con:  # original referenced a bare `con` here
                    self.con.close()
                if self.logfile:
                    self.logfile.close()
                return this_query_total_tweets
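# A minimal sketch of driving JCrawler above; the seed keyword, placeholder
# credentials, and limits are hypothetical, and the class expects a local MySQL
# database named `jcrawler` plus a log/ directory to already exist.
if __name__ == '__main__':
    crawler = JCrawler('<consumer_key>', '<consumer_secret>',
                       '<access_token>', '<access_token_secret>',
                       seed_keywords_array=[u'地震'], depth=1000,
                       max_tweets=50000)
    crawler.start()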