def search(search):
    twitter_utils = twitter.Twitter()
    credentials = session.get('credentials')
    search = [search]
    try:
        tso = TwitterSearchOrder()
        tso.set_language('en')
        tso.set_keywords(search)
        tso.set_include_entities(False)  # remove later if you want to use images
        query = TwitterSearch(consumer_key=session['consumer_key'],
                              consumer_secret=session['consumer_secret'],
                              access_token=session['token'],
                              access_token_secret=session['token_secret'])
        response = query.search_tweets(tso)
        t_range = datetime.now(pytz.utc) - timedelta(minutes=TIME_RANGE)
        tweets = [
            t for t in response['content']['statuses']
            if twitter_utils.get_date(t) >= t_range
        ]
        print("Current rate-limiting status: " +
              str(query.get_metadata()['x-rate-limit-reset']))
        return render_template(
            "page.html",
            search=search,
            tweets=tweets,
            music_config=twitter_utils.get_music_config(tweets))
    except TwitterSearchException as e:
        return str(e)
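
The snippet above assumes a Flask app and a few names defined elsewhere; a minimal sketch of the imports and constants it relies on (module and value choices are guesses, not confirmed by the example):

from datetime import datetime, timedelta

import pytz
from flask import render_template, session
from TwitterSearch import (TwitterSearch, TwitterSearchOrder,
                           TwitterSearchException)

import twitter  # local helper exposing Twitter().get_date() / .get_music_config()

TIME_RANGE = 60  # minutes of history to keep; hypothetical value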
Example #2
def twitSearch(tweetLastSeen):
    #print("Debug: In function twitSearch()")
    tweetSearchCount = 0
    try:
        tso = TwitterSearchOrder()
        tso.set_keywords(['disaster', 'poverty', 'banking', 'homeless'], or_operator=True)
        tso.set_language('en')
        tso.set_include_entities(False)
        tso.set_result_type('recent')

        if tweetLastSeen > 0:
            print("Debug: I have a previous value for lastseen_id, setting since_id() to: %s and asking for 100 results" % tweetLastSeen)
            tso.set_since_id(tweetLastSeen)
            tso.set_count(100)
        else:
            print("Debug: No value for lastseen_id, asking for one result")
            tso.set_count(1)

        print("Debug: The tso search string looks like this")
        print(tso.create_search_url())

        ts = TwitterSearch(
            consumer_key = '',
            consumer_secret = '',
            access_token = '',
            access_token_secret = '')

        # Optional: search_tweets_iterable() also accepts a callback that is
        # handed the TwitterSearch instance after each query; it can read
        # ts.get_statistics() and sleep every Nth query to throttle requests.

        tweets_seen = 0        
        currentTweetID = 0
        lastTweetID = 0

        for tweet in ts.search_tweets_iterable(tso):    
            queries, tweets_seen_by_stats = ts.get_statistics()
            print("Debug: stats: %s queries & %s tweets seen" %(queries, tweets_seen_by_stats))
            rateLimitRemaining = ts.get_metadata()['x-rate-limit-remaining']
            rateLimitReset = ts.get_metadata()['x-rate-limit-reset']
            print("Debug: Rate limit resets at %s and has %s queries remaining" %(datetime.datetime.fromtimestamp(int(rateLimitReset)), rateLimitRemaining))
            currentTweetID = tweet['id']
            print("Debug: Current tweetID %s" % currentTweetID)
            if currentTweetID > lastTweetID:
                print("Debug: Seen a more recent tweetID, updating lastTweetID")
                lastTweetID = currentTweetID
                tweets_seen = tweets_seen_by_stats
                break
            print( 'Debug: In tweet iter @%s tweet id: %s' % ( tweet['user']['screen_name'], tweet['id'] ) )
            tweets_seen = tweets_seen + 1
            print("Debug: tweets_seens: %s" % tweets_seen)
            
        print('Debug: about to return tweet ID %s' % lastTweetID)
        global twitterSearchCount
        twitterSearchCount = twitterSearchCount + 1
        print("Debug: This is twitter search number: %s" % twitterSearchCount)
        
        return lastTweetID, tweets_seen

    except TwitterSearchException as e:
        print(e)
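
A minimal driver for the function above (a sketch; the polling interval and the counter initialisation are assumptions, not part of the original):

import time

lastseen_id = 0
twitterSearchCount = 0  # module-level counter that twitSearch() increments

while True:
    result = twitSearch(lastseen_id)
    if result:  # twitSearch() returns None when TwitterSearchException is caught
        lastseen_id, tweets_seen = result
    time.sleep(60)  # poll gently to stay under the search API rate limit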
Example #3
class JCrawler(object):
    'Twitter crawler for Japanese tweets'

    tso = TwitterSearchOrder()
    ts = None
    con = None
    cursor = None
    DAILY_THRESHOLD = 50
    IDLE_SLEEP_TIME = 60
    total_api_queries = 0
    queries_processed = 0
    total_tweets_grabbed = 0
    max_tweets = -1
    global_min_id = -1
    global_max_id = -1
    depth = 10
    seed_keywords_array = None
    logfile = None
    exact_tweet_re = re.compile('^RT @')
    queries_to_update = 0

    consumer_key = ''
    consumer_secret = ''
    access_token = ''
    access_token_secret = ''

    def __init__(self, consumer_key, consumer_secret, access_token, access_token_secret, seed_keywords_array=None, depth=1000,
                 global_min_id=-1, global_max_id=-1, max_tweets=-1):
        # self.load_settings()
        self.depth = depth
        self.seed_keywords_array = seed_keywords_array
        self.global_max_id = global_max_id
        self.global_min_id = global_min_id
        self.max_tweets = max_tweets
        self.consumer_key = consumer_key
        self.consumer_secret = consumer_secret
        self.access_token = access_token
        self.access_token_secret = access_token_secret

        # Open the logfile
        self.logfile = codecs.open("log/tweetlog", 'a', encoding='utf-8')

        if self.max_tweets == -1:
            self.max_tweets = float("inf")

    # def load_settings(self, filepath=""):
        # if(filepath == ""):
        #    filepath = "settings.cfg"

        # config = ConfigParser.RawConfigParser()
        # config.read(filepath)

    def log(self, message):
        # print(message)
        self.logfile.write(message)

    def get_old_queries_from_DB(self):
        self.checkConnection()
        db_query = u"""SELECT * FROM `queries` WHERE
                         `next_update` <= %s AND total_updates > 0"""
        dict_cursor = self.con.cursor(pymysql.cursors.DictCursor)
        dict_cursor.execute(db_query,
                            [int(time.time())])

        self.queries_to_update += dict_cursor.rowcount
        # Get the results
        rows = dict_cursor.fetchall()

        return rows

    def update_queries(self):
        query_array = self.get_old_queries_from_DB()

        for query in query_array:
            if(self.total_tweets_grabbed >= self.max_tweets):
                break
            if "min_id" in query:
                min_id = query["min_id"]
            else:
                min_id = None

            if "max_id" in query:
                max_id = query["max_id"]
            else:
                max_id = None

            num_tweets_fetched = self.scrape_search_query(query, self.depth, min_id, max_id)
            self.calculate_next_update(query, num_tweets_fetched)

    def get_new_queries_from_DB(self):
        self.checkConnection()
        db_query = u"""SELECT * FROM `queries`
                         WHERE `next_update` <= %s AND total_updates = 0"""
        dict_cursor = self.con.cursor(pymysql.cursors.DictCursor)
        dict_cursor.execute(db_query,
                            [int(time.time())])
        self.queries_to_update += dict_cursor.rowcount
        # Get the results
        rows = dict_cursor.fetchall()

        return rows

    def process_new_queries(self, query_string_array=None):
        if(query_string_array is None):
            query_array = self.get_new_queries_from_DB()
        else:
            query_array = []
            for string in query_string_array:
                query_array.append({'query': string})

        for query in query_array:
            if(self.total_tweets_grabbed >= self.max_tweets):
                break
            num_tweets_fetched = self.scrape_search_query(query, self.depth)
            self.calculate_next_update(query, num_tweets_fetched)

    def connect_DB(self, host, dbuser, dbpass, dbname):
        try:
            conn = pymysql.connect(host=host,
                                   user=dbuser,
                                   password=dbpass,
                                   database=dbname,
                                   charset='utf8mb4',
                                   use_unicode=True,
                                   init_command="set names utf8mb4")

            testcursor = conn.cursor()
            testcursor.execute("SELECT VERSION()")
            results = testcursor.fetchone()
            # Check if anything at all is returned
            if results:
                self.log('Connected to DB, version: %s' % results)
            else:
                self.log('Connection to DB failed')

            # Disable binary logging to save space
            testcursor.execute("SET sql_log_bin = 0")
            results = testcursor.fetchone()
            testcursor.execute("SET NET_WRITE_TIMEOUT = 2147480")
            results = testcursor.fetchone()
            testcursor.execute("SET WAIT_TIMEOUT = 2147480")
            results = testcursor.fetchone()

            self.con = conn

            return conn

        except pymysql.Error as e:
            self.log("\nError %d: %s" % (e.args[0], e.args[1]))

    def start(self):
        try:
            # self.log('Connected to DB: %s' % self.con.info())
            # while True:
            while self.total_tweets_grabbed < self.max_tweets:
                self.ReportStatus(1)
                self.log("\nUpdating old queries")
                self.update_queries()
                self.log("\nProcessing new queries")
                if(self.seed_keywords_array is not None):
                    self.process_new_queries(self.seed_keywords_array)
                    # Only process seed keywords once
                    self.seed_keywords_array = None
                else:
                    self.process_new_queries()
                    # Keep script from wasting DB resources continually checking for queries to update
                    if(self.queries_to_update == 0):
                        self.ReportStatus(0)
                        time.sleep(self.IDLE_SLEEP_TIME)
            # while AnalysisIsRunning():
                # self.ReportStatus(0)
                # time.sleep(self.IDLE_SLEEP_TIME)

                if (not self.con.open):
                    self.con = self.connect_DB('localhost',
                                               'root',
                                               'preyfromslyfluxlinus',
                                               'jcrawler')

                #self.queries_to_update = 0
        except TwitterSearchException as e:
            if(e.code == 420):
                self.log("\nBlaze it for a hot sec: we're being rate limited. Waiting 15 minutes...")
                self.ReportStatus(2)
                time.sleep(900)
                self.log("\nAttempting to restart the crawl...")
                # start() has no scrape arguments in scope; re-enter the main loop
                return self.start()
            elif(e.code == 403 or e.code == 503):
                self.log("\nTwitter is either experiencing high load or we are being rate limited. Waiting 1 hour...")
                self.ReportStatus(2)
                time.sleep(3600)
                self.log("\nAttempting to restart the crawl...")
                return self.start()
            else:
                logging.exception("TwitterSearch had an error:")
                print(e)
        except pymysql.Error as e:
            self.log("\nError %d: %s" % (e.args[0], e.args[1]))
        else:
            self.log("\nQuitting")
            if self.con:
                self.con.close()
            if self.logfile:
                self.logfile.close()

    def restart(self):
        if(self.con.open):
            self.con.close()
        self.cursor = None
        self.total_tweets_grabbed = 0
        self.logfile = codecs.open("log/tweetlog", 'a', encoding='utf-8')
        self.start()

    def checkConnection(self):
        if(self.con is None or not self.con.open):
            self.con = self.connect_DB('localhost',
                                       'root',
                                       'preyfromslyfluxlinus',
                                       'jcrawler')
            self.cursor = self.con.cursor()

    def ReportStatus(self, status, msg=""):
        'Posts info about queries processed, currently processing, etc.'
        self.checkConnection()
        app = "jcrawler"
        # status codes: 1 = currently running, 0 = done running, 2 = sleeping
        variable = "processing"
        value = str(status)

        controlQuery = u"""INSERT INTO `control`(`app`, `variable`, `value`) VALUES(%s,%s,%s) 
        ON DUPLICATE KEY UPDATE value=VALUES(value)"""
        progressQuery = u"""INSERT INTO `progress`(`app`, `processed`) VALUES(%s, %s)
        """
        reportcursor = self.con.cursor()
        reportcursor.execute(controlQuery, (app, variable, value))
        reportcursor.execute(progressQuery, (app, self.total_tweets_grabbed))
        self.con.commit()
        reportcursor.close()

    def AnalysisIsRunning(self):
        self.checkConnection()
        statusQuery = u"""
        SELECT `value` FROM `control` WHERE `app`='lexicalyzer' AND `variable`='processing'
        """
        statuscursor = self.con.cursor()
        statuscursor.execute(statusQuery)
        value = statuscursor.fetchone()
        isRunning = value is not None and int(value[0]) > 0
        statuscursor.close()
        return isRunning

    def add_queries_to_DB(self, query_array):
        self.checkConnection()
        for query_hash in query_array:
            # exist_stmt = u"""
            # SELECT * FROM `queries`
            # WHERE `sha256_hash`
            # =%s
            # """
            query = query_array[query_hash]
            prepstatement = u"""INSERT INTO `queries`(`max_id`,
            `min_id`,`query`, `last_update`, `sha256_hash`,
            `next_update`, `penalty_level`,
            `total_tweets`) VALUES (
                    %s,%s,%s,%s,%s,%s,%s,
                    %s) ON DUPLICATE KEY UPDATE max_id=VALUES(max_id),
                    min_id=VALUES(min_id), last_update=VALUES(last_update)
                    , next_update=VALUES(next_update), penalty_level=
                    VALUES(penalty_level), total_tweets=VALUES(total_tweets)"""
            # self.log("\nQuery: %s" % query)
            self.cursor.execute(prepstatement,
                                (query["max_id"],
                                 query["min_id"],
                                 query["query"],
                                 query["last_update"],
                                 query["sha256_hash"],
                                 query["next_update"],
                                 query["penalty_level"],
                                 query["total_tweets"]))

    def add_query_to_DB(self, query):
        prepstatement = u"""INSERT INTO `queries`(`max_id`,
        `min_id`,`query`, `last_update`, `sha256_hash`,
        `next_update`, `penalty_level`,
        `total_tweets`) VALUES (
                %s,%s,%s,%s,%s,%s,%s,
                %s) ON DUPLICATE KEY UPDATE max_id=VALUES(max_id),
                min_id=VALUES(min_id), last_update=VALUES(last_update)
                , next_update=VALUES(next_update), penalty_level=
                VALUES(penalty_level), total_tweets=VALUES(total_tweets)"""

        self.cursor.execute(prepstatement,
                            (query["max_id"],
                             query["min_id"],
                             query["query"],
                             query["last_update"],
                             query["sha256_hash"],
                             query["next_update"],
                             query["penalty_level"],
                             query["total_tweets"]))

    def is_exact_retweet(self, tweet):
        return self.exact_tweet_re.match(tweet["text"]) or tweet["retweeted_status"]["text"] == tweet["text"]

    def gen_query_hash(self, query_text):
        return hashlib.sha256(query_text.encode("utf8")).hexdigest()

    def add_to_queries_list(self, query_text, tweet, queries_array):
        query_hash = self.gen_query_hash(query_text)
        if query_hash not in queries_array or queries_array[query_hash] is None:
            self.log("\nAdding new query to list")
            queries_array[query_hash] = {"query": query_text,
                                         "sha256_hash": query_hash,
                                         "max_id": tweet["id"],
                                         "min_id": tweet["id"],
                                         "next_update": int(time.time()),
                                         "penalty_level": 0,
                                         "total_tweets": 1,
                                         "last_update": int(time.time())}
        else:
            if(tweet["id"] >= queries_array[query_hash]["max_id"]):
                queries_array[query_hash]["max_id"] = tweet["id"]
            elif(tweet["id"] <= queries_array[query_hash]["min_id"]):
                queries_array[query_hash]["min_id"] = tweet["id"]
            queries_array[query_hash]["total_tweets"] += 1
            queries_array[query_hash]["last_update"] = int(time.time())

    def calculate_next_update(self, query, num_tweets_fetched):
        if num_tweets_fetched < self.DAILY_THRESHOLD:
            if("penalty_level" in query and query["penalty_level"] is not None):
                query["penalty_level"] += 1
            else:
                query["penalty_level"] = 1
            # Wait penalty_level days until next update
            query["next_update"] = int(time.time()) + (query["penalty_level"] * 24 * 60 * 60)
            self.log("\nPenalizing query by %s days. Next update: %s" % (query["penalty_level"], query["next_update"]))
        else:
            query["penalty_level"] = 0
            # Wait one day
            query["next_update"] = int(time.time()) + (24 * 60 * 60)

    def scrape_search_query(self, search_query, depth, min_id=-1, max_id=-1):
        try:
            self.log("\nScraping...")
            # self.con = connect_DB('localhost',
            #                       'root',
            #                       'preyfromslyfluxlinus',
            #                       'jcrawler')
            self.cursor = self.con.cursor()
            self.tso.set_keywords([search_query["query"]])
            self.tso.set_language('ja')  # search for japanese tweets only
            self.tso.set_locale('ja')

            if(self.global_min_id != -1):
                self.tso.set_since_id(self.global_min_id)
            elif(min_id != -1):
                self.tso.set_since_id(min_id)

            if(self.global_max_id != -1):
                self.tso.set_max_id(self.global_max_id)
            elif(max_id != -1):
                self.tso.set_max_id(max_id)

            sleep_time = 1
            last_num_of_queries = 0
            this_query_total_tweets = 0

            new_queries = {}

            # it's about time to create a TwitterSearch object with our secret
            # tokens
            if self.ts is None:
                self.ts = TwitterSearch(
                    consumer_key='lHw0Fte6wfJnzxMrA9nRxqJJN',
                    consumer_secret='UQmsX0hC9wvuzhaMLZ4OpB'
                    'FqfI4vYdMrHSNt0FYEmFGcsYU0iK',
                    access_token='4920693635-jk0qpriUrztwA2'
                    'a7dOwp4EhQI86qHt4xLbq7uPU',
                    access_token_secret='3mTFWM8leIXQzaiqnH'
                    'hwbspN6BzB5O8qMYKDnrgqHCKBz'
                )

            start = int(time.time())

            for tweet in self.ts.search_tweets_iterable(self.tso):
                self.ReportStatus(1)
                if (('coordinates' in tweet and
                     tweet["coordinates"] is not None and
                     'coordinates' in tweet["coordinates"] and
                     tweet["coordinates"]["coordinates"] and
                     tweet["coordinates"]["coordinates"][0] is not None)):
                    has_coordinates = 1
                    longitude = tweet["coordinates"]["coordinates"][0]
                    latitude = tweet["coordinates"]["coordinates"][1]
                else:
                    has_coordinates = 0
                    longitude = 181
                    latitude = 91

                if (('retweeted_status' in tweet and
                     tweet["retweeted_status"] is not None and
                     'id' in tweet["retweeted_status"] and
                     tweet["retweeted_status"]["id"] is not None)):
                    is_retweet = 1
                    retweet_id_int = tweet["retweeted_status"]["id"]
                    retweet_id_str = tweet["retweeted_status"]["id_str"]
                    if(self.is_exact_retweet(tweet)):
                        is_exact_retweet = 1
                        # Don't bother to save exact retweets anymore
                        continue
                    else:
                        is_exact_retweet = 0
                else:
                    is_retweet = 0
                    retweet_id_int = -1
                    retweet_id_str = ""
                    is_exact_retweet = 0

                this_query_total_tweets += 1
                self.total_tweets_grabbed += 1

                # Write to DB

                # Prepare and execute raw_tweet query
                prepstatement = u"""INSERT INTO `raw_tweets`(`id`, `text`
                , `created_at`, `lang`, `retweet_count`, `source`
                , `user_id`, `has_coordinates`, `longitude`
                , `latitude`, `id_int`, `user_id_int`, `is_retweet`
                , `retweet_id_int`, `retweet_id`, `is_exact_retweet`) VALUES (
                %s,%s,%s,%s,%s,%s,%s,%s,
                %s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE id=id"""

                self.cursor.execute(prepstatement,
                                    (tweet["id_str"],
                                     tweet["text"],
                                     tweet["created_at"],
                                     tweet["lang"],
                                     tweet["retweet_count"],
                                     tweet["source"],
                                     tweet["user"]["id_str"],
                                     has_coordinates,
                                     longitude,
                                     latitude,
                                     tweet["id"],
                                     tweet["user"]["id"],
                                     is_retweet,
                                     retweet_id_int,
                                     retweet_id_str,
                                     is_exact_retweet))

                rawjsonstmt = u"""INSERT INTO `raw_tweets_json`(`id`, `id_int`
                , `raw_json`) VALUES (
                %s,%s,%s) ON DUPLICATE KEY UPDATE id=id"""

                self.cursor.execute(rawjsonstmt,
                                    (tweet["id_str"],
                                     tweet["id"],
                                     json.dumps(tweet)))

                # Also store the user data
                user = tweet["user"]
                userstatement = u"""INSERT INTO `twitter_users`(`user_id`, `created_at`, `description`
                , `followers_count`, `friends_count`, `geo_enabled`, `lang`
                , `location`
                , `name`, `protected`, `screen_name`, `statuses_count`
                , `time_zone`
                , `url`, `utc_offset`
                , `verified`) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s
                ,%s,%s) ON DUPLICATE KEY UPDATE `description`=
                VALUES(`description`)
                , `friends_count`=VALUES(`friends_count`)
                , `geo_enabled`=VALUES(`geo_enabled`)
                , `lang`=VALUES(`lang`)
                , `location`=VALUES(`location`)
                , `name`=VALUES(`name`)
                , `protected`=VALUES(`protected`)
                , `screen_name`=VALUES(`screen_name`)
                , `statuses_count`=VALUES(`statuses_count`)
                , `time_zone`=VALUES(`time_zone`)
                , `url`=VALUES(`url`)
                , `utc_offset`=VALUES(`utc_offset`)
                """

                self.cursor.execute(userstatement,
                                    (user["id_str"],
                                     user["created_at"],
                                     user["description"],
                                     user["followers_count"],
                                     user["friends_count"],
                                     user["geo_enabled"],
                                     user["lang"],
                                     user["location"],
                                     user["name"],
                                     user["protected"],
                                     user["screen_name"],
                                     user["statuses_count"],
                                     user["time_zone"],
                                     user["url"],
                                     user["utc_offset"],
                                     user["verified"]))

                hashtags = tweet["entities"]["hashtags"]

                for hashtag in hashtags:
                    hashtagstatement = u""" INSERT INTO `hashtags`(`tweet_id`
                    , `hashtag`, `start`
                    , `end`) VALUES (%s,%s,%s,%s)
                     ON DUPLICATE KEY UPDATE tweet_id=tweet_id
                    """
                    self.cursor.execute(hashtagstatement, (
                        tweet["id_str"],
                        hashtag["text"],
                        hashtag["indices"][0],
                        hashtag["indices"][1]
                    ))
                    self.add_to_queries_list(hashtag["text"], tweet, new_queries)

                user_mentions = tweet["entities"]["user_mentions"]
                for mention in user_mentions:
                    mentionstatement = u""" INSERT INTO `mentions`(`tweet_id`
                    , `user_id`
                    , `start`
                    , `end`) VALUES (%s,%s,%s,%s)
                     ON DUPLICATE KEY UPDATE tweet_id=tweet_id
                     """
                    self.cursor.execute(mentionstatement, (
                        tweet["id_str"],
                        mention["id_str"],
                        mention["indices"][0],
                        mention["indices"][1]
                    ))

                embedded_urls = tweet["entities"]["urls"]
                for url in embedded_urls:
                    url_statement = u"""INSERT INTO `embedded_urls`(
                    `tweet_id`
                    , `url`
                    , `start`
                    , `end`)
                     VALUES (%s,%s,%s,%s)
                     ON DUPLICATE KEY UPDATE tweet_id=tweet_id
                    """
                    self.cursor.execute(url_statement, (
                        tweet["id_str"],
                        url["url"],
                        url["indices"][0],
                        url["indices"][1]
                    ))

                # TODO: Investigate crash due to lack of x-rate-limit-remaining key
                if 'x-rate-limit-remaining' in self.ts.get_metadata():
                    self.log("\nCurrent rate-limiting status: %s" %
                             self.ts.get_metadata()['x-rate-limit-remaining'])

                # self.log('\n@%s tweeted: %s'
                # % (tweet['user']['screen_name'], tweet['text']))

                self.add_queries_to_DB(new_queries)

                # Execute queries for real
                self.con.commit()

                # if current_num_of_queries is different
                # from last_num_of_queries,
                # then a new page was loaded by TwitterSearch
                current_num_of_queries = self.ts.get_statistics()[0]
                if last_num_of_queries != current_num_of_queries:
                    # Calculate current query rate
                    now = int(time.time())
                    if('x-rate-limit-remaining' in self.ts.get_metadata()):
                        reset = int(self.ts.get_metadata()['x-rate-limit-reset'])
                    else:
                        reset = 0

                    if((now - start) == 0):
                        rate = this_query_total_tweets
                    else:
                        rate = this_query_total_tweets / (now - start)

                    self.log("\nCurrent API query rate: %s queries / s" % rate)
                    self.log("\nCUrrent tweets processed: %s" % self.total_tweets_grabbed)

                    # Stop with 3 queries left before hitting the limit
                    # just to be safe
                    if ('x-rate-limit-remaining' in self.ts.get_metadata() and
                            int(self.ts.get_metadata()['x-rate-limit-remaining']) <= 3 and
                            now < reset):
                        longsleeptime = reset - now + 60
                        self.log(
                            '\nSleeping %s seconds \n(x-rate-limit-remaining= %s,'
                            ' \nx-rate-limit-reset=%s,'
                            ' \ntime now=%s)'
                            % (
                                longsleeptime,
                                self.ts.get_metadata()[
                                    'x-rate-limit-remaining'],
                                self.ts.get_metadata()['x-rate-limit-reset'],
                                now
                            )
                        )
                        self.ReportStatus(2)
                        time.sleep(longsleeptime)
                        if(this_query_total_tweets >= depth):
                            break
                    elif ('x-rate-limit-remaining' in self.ts.get_metadata() and
                          int(self.ts.get_metadata()['x-rate-limit-remaining']) <= 3 and
                          now >= reset):
                        # Wait a minute just in case there is a discrepancy
                        # between the rate limit we've been given and the
                        # actual one
                        self.log(
                            '\nSleeping 60 seconds (x-rate-limit-remaining= %s,'
                            ' x-rate-limit-reset=%s,'
                            ' time now=%s)'
                            % (
                                self.ts.get_metadata()[
                                    'x-rate-limit-remaining'],
                                self.ts.get_metadata()['x-rate-limit-reset'],
                                now
                            )
                        )
                        self.ReportStatus(2)
                        time.sleep(60)
                        if(this_query_total_tweets >= depth):
                            break
                    elif('x-rate-limit-remaining' not in self.ts.get_metadata()):
                        self.log("\nx-rate-limit-remaining missing! Will sleep.")
                        self.ReportStatus(2)
                        time.sleep(900)
                    else:
                        last_num_of_queries = self.ts.get_statistics()[0]
                        # Wait between queries
                        self.log(
                            '\nSleeping \n(current_num_of_queries= %s,'
                            ' \nlast_num_of_queries=%s'
                            ', \nx-rate-limit-remaining= %s,'
                            ' \nx-rate-limit-reset=%s)'
                            % (
                                current_num_of_queries,
                                last_num_of_queries,
                                self.ts.get_metadata()[
                                    'x-rate-limit-remaining'],
                                self.ts.get_metadata()['x-rate-limit-reset']
                            )
                        )
                        self.ReportStatus(2)
                        time.sleep(sleep_time)
                        if(this_query_total_tweets >= depth):
                            break

                # Update query counter
                last_num_of_queries = self.ts.get_statistics()[0]

            self.log("\nThis query total tweets processed: %s" % this_query_total_tweets)
            return this_query_total_tweets
        except TwitterSearchException as e:
            if(e.code == 420):
                self.log("\nBlaze it for a hot sec: we're being rate limited. Waiting 15 minutes...")
                self.ReportStatus(2)
                time.sleep(900)
                self.log("\nAttempting to restart scrape process...")
                return self.scrape_search_query(search_query, depth, min_id, max_id)
            elif(e.code == 403 or e.code == 503):
                self.log("\nTwitter is either experiencing high load or we are being rate limited. Waiting 1 hour...")
                self.ReportStatus(2)
                time.sleep(3600)
                self.log("\nAttempting to restart scrape process...")
                return self.scrape_search_query(search_query, depth, min_id, max_id)
            else:
                # self.log(e)
                logging.exception("TwitterSearch had an error:")
                print(e)
        except pymysql.Error as e:
            self.log("\nError %d: %s" % (e.args[0], e.args[1]))
            if(e.args[0] == 2006):
                self.log("\nDetected dead MySQL server, waiting 30 seconds for server to restart...")
                self.ReportStatus(2)
                time.sleep(30)
                self.log("\nAttempting to restart scrape process...")
                return self.scrape_search_query(search_query, depth, min_id, max_id)
        else:
            self.log("\nQuitting")
            if self.con:
                self.con.close()
            if self.logfile:
                self.logfile.close()
            return this_query_total_tweets
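
A hypothetical driver for JCrawler (a sketch: the credentials, DB settings, and seed keywords are placeholders, not values from the original):

crawler = JCrawler(
    consumer_key='<consumer_key>',
    consumer_secret='<consumer_secret>',
    access_token='<access_token>',
    access_token_secret='<access_token_secret>',
    seed_keywords_array=[u'地震', u'台風'],  # example Japanese seed keywords
    depth=1000,
    max_tweets=100000)
crawler.connect_DB('localhost', '<db_user>', '<db_password>', 'jcrawler')
crawler.start()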
Example #4
# (this snippet begins mid-script: keys, iters, tso, and args are assumed to
#  be defined earlier, e.g. API keys, an iteration count, a configured
#  TwitterSearchOrder, and parsed command-line arguments)
ts = TwitterSearch(consumer_key=keys[0],
                   consumer_secret=keys[1],
                   access_token=keys[2],
                   access_token_secret=keys[3])
pp = pprint.PrettyPrinter(indent=4)

latest_id = 0
num_tweets = 0
next_max_id = 0
try:
    for i in range(iters):
        # first query the Twitter API
        response = ts.search_tweets(tso)

        # print rate limiting status
        print "Current api calls remaining: %s" % ts.get_metadata(
        )['x-rate-limit-remaining']
        old_num_tweets = num_tweets
        # check all tweets according to their ID
        for tweet in response['content']['statuses']:
            num_tweets += 1
            tweet_id = tweet['id']

            with open(args.out_file, 'a') as fout:
                json.dump(tweet, fout)
                fout.write('\n')
                #pprint.pprint(tweet,stream=fout)

            if (latest_id == 0):
                latest_id = tweet_id
            # current ID is lower than current next_max_id?
            if (tweet_id < next_max_id) or (next_max_id == 0):
                next_max_id = tweet_id
                next_max_id -= 1  # step past this tweet so it isn't fetched twice

        # no new tweets in this pass: stop paging; otherwise page further back
        if num_tweets == old_num_tweets:
            break
        tso.set_max_id(next_max_id)

except TwitterSearchException as e:
    print(e)
Example #5
class RestController:
    def __init__(self, config):
        self.config = config
        self.ts = TwitterSearch(
            consumer_key=config['TwitterAuth']['consumer_key'],
            consumer_secret=config['TwitterAuth']['consumer_secret'],
            access_token=config['TwitterAuth']['access_token'],
            access_token_secret=config['TwitterAuth']['access_token_secret'])
        self.searchParametersLog = []
        self.tsqList = []
        self.DBController = SearchDBController(config)

    #---Main public methods----
    def addNewSearchParams(self, searchParams):
        # if search parameters are unique and complete, add them to the database search controller
        self.DBController.addSearchParams(searchParams)

    def basicSearch(self, collection_names):
        self._clearTsqList()
        self._readParamsFromDatabase(collection_names)
        self._updateQueriesAllowed()
        for tsq in self.tsqList:
            if tsq.queriesAllowed == 0:
                self.searchParametersLog.append("Failure")
                print("Not enough queries remaining")
                break
            try:
                tsq.performSearch()
                self._moveResultsToDatabase(tsq)
            except TwitterSearchException as e:
                self.searchParametersLog.append("TwitterSearch Failure")
                self.writeSearchLog('./')
                raise e
            except:
                self.searchParametersLog.append("Write Failure")
                self.writeSearchLog('./')
                raise
            self.searchParametersLog.append(
                "Success - %s: %d Tweets Written at %s" %
                (tsq.collection_name, len(tsq.buffer), str(datetime.now())))

    #---Debug methods----
    def getTweetsFromCollection(self, collection_name):
        tsq = self._findTsqFromCollectionName(collection_name)
        return tsq.buffer

    def firstTweetFromCollectionName(self, collection_name):
        tsq = self._findTsqFromCollectionName(collection_name)
        if (tsq):
            return tsq.buffer[0]

    def clearDBCollections(self):
        for col in self.DBController.getAllCollectionNames():
            if (col == "searchlookup"):
                self.DBController.clearCollection(col)
            else:
                self.DBController.dropCollection(col)

    def writeSearchLog(self, path):
        with open(path + 'searches.log', mode='a',
                  encoding='utf-8') as logfile:
            logfile.write('\n'.join(
                str(entry) for entry in self.searchParametersLog))
            logfile.write('\n')
        self.searchParametersLog = []

    #---Private methods----
    def _readParamsFromDatabase(self, collection_names):
        # add the unique twitter search from database
        for name in collection_names:
            params = self.DBController.readSearchParamsFromCollectionName(name)
            self.tsqList.append(TwitterSearchQuery(self.ts, params))
            self.searchParametersLog.append(params)

    def _moveResultsToDatabase(self, tsq):
        if (tsq.buffer):
            self.DBController.writeTweets(tsq.collection_name, tsq.buffer)
            self.DBController.writeSinceId(tsq.collection_name,
                                           tsq.getSinceId())

    def _clearTsqList(self):
        self.tsqList = []

    def _findTsqFromCollectionName(self, collection_name):
        for tsq in self.tsqList:
            if (tsq.collection_name == collection_name):
                return tsq
        return None

    def _updateQueriesAllowed(self):
        for tsq in self.tsqList:
            try:
                tsq.queriesAllowed = int(
                    self.ts.get_metadata()['x-rate-limit-remaining']) // len(
                        self.tsqList)
            except TwitterSearchException as e:
                if (e.code == 1012):
                    tsq.queriesAllowed = 180 // len(self.tsqList)
                else:
                    raise e
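
A hypothetical driver for RestController (a sketch; the config layout mirrors the keys the constructor reads, and the collection name is a placeholder):

config = {
    'TwitterAuth': {
        'consumer_key': '<consumer_key>',
        'consumer_secret': '<consumer_secret>',
        'access_token': '<access_token>',
        'access_token_secret': '<access_token_secret>',
    },
    # ... plus whatever SearchDBController(config) expects for the database
}

controller = RestController(config)
controller.basicSearch(['my_collection'])  # run every stored search for these collections
controller.writeSearchLog('./')            # flush the per-search status log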
Example #6
 # (this snippet begins mid-script: tso, UNTIL, the four OAuth constants,
 #  DB_URL, PROJECT_ID, and START_DATE/END_DATE are assumed to be defined
 #  earlier; needs_sleep() is a helper sketched after the snippet)
 tso.set_include_entities(True)
 if UNTIL:
     tso.set_until(UNTIL)
 # it's about time to create a TwitterSearch object with our secret tokens
 ts = TwitterSearch(consumer_key=CONSUMER_KEY,
                    consumer_secret=CONSUMER_SECRET,
                    access_token=ACCESS_TOKEN,
                    access_token_secret=ACCESS_TOKEN_SECRET,
                    verify=True)
 #ts.authenticate()
 count = 0
 # connect to mongo
 connection = pymongo.MongoClient("mongodb://{0}".format(DB_URL))  # acknowledged writes are the default
 db = connection.twitter
 users = db.users
 tweets = db.tweets
 new_users = 0
 response = ts.search_tweets_iterable(tso)
 for tweet in response: # this is where the fun actually starts :)
     limit_remaining = ts.get_metadata()['x-rate-limit-remaining']
     limit_reset = ts.get_metadata()['x-rate-limit-reset']
     limit = ts.get_metadata()['x-rate-limit-limit']
     sleep = needs_sleep(limit_remaining,limit_reset)
     if sleep:
         print('Sleeping {0} seconds to avoid reaching rate limit.'.format(sleep))
         time.sleep(sleep)
     tweet['twitteranalytics_project_id'] = PROJECT_ID
     if (tweets.find({"id":tweet['id'],"twitteranalytics_project_id":PROJECT_ID}).count() == 0):
         dt = datetime.datetime.strptime(tweet['created_at'],'%a %b %d %H:%M:%S +0000 %Y')
         tweet['created_at_dt'] = dt
         if (START_DATE and END_DATE and dt >= START_DATE and dt <= END_DATE) or (not (START_DATE and END_DATE)):
             tweets.insert(tweet)   
     else:
         print 'We reached our newest stored tweet: {0}'.format(tweet['text'].encode('utf-8')) 
         #break
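
The loop above calls needs_sleep() without defining it; a plausible implementation (an assumption, not the original helper) returns how long to pause once few API calls remain in the current rate-limit window:

import time

def needs_sleep(limit_remaining, limit_reset, threshold=5):
    # Plenty of budget left: no pause needed.
    if int(limit_remaining) > threshold:
        return 0
    # Otherwise wait until the rate-limit window resets (plus a small buffer).
    return max(0, int(limit_reset) - int(time.time())) + 5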