def __init__(self, start_date, hashtags, db_name, collection_name):
    """
    Initialise the thread base class, build the Twitter API client and
    remember the query / storage settings on the instance.

    :param start_date: stored as ``self.start_date``.
    :param hashtags: stored as ``self.hashtags``.
    :param db_name: stored as ``self.db_name``.
    :param collection_name: stored as ``self.collection_name``.
    """
    threading.Thread.__init__(self)

    # Credentials are read from the module-level ``conf`` mapping.
    tools = functional_tools.FunctionalTools()
    self.auth = tools.authenticate_twitter_app(conf['consumer_key'],
                                               conf['consumer_secret'])

    # Wait (and notify) when the rate limit is hit rather than failing.
    api_options = {
        'wait_on_rate_limit': True,
        'wait_on_rate_limit_notify': True,
        'timeout': 200,
    }
    self.twitter_api = API(self.auth, **api_options)

    self.hashtags = hashtags
    self.db_name = db_name
    self.collection_name = collection_name
    self.start_date = start_date
Ejemplo n.º 2
0
 def __init__(self, screen_name, db_name, collection_name):
     """
     Initialise the thread base class, build the Twitter API client and
     remember the target account and storage settings on the instance.

     :param screen_name: stored as ``self.SCREEN_NAME``.
     :param db_name: stored as ``self.db_name``.
     :param collection_name: stored as ``self.collection_name``.
     """
     threading.Thread.__init__(self)

     # Credentials are read from the module-level ``conf`` mapping.
     tools = functional_tools.FunctionalTools()
     consumer_key = conf['consumer_key']
     consumer_secret = conf['consumer_secret']
     self.auth = tools.authenticate_twitter_app(consumer_key, consumer_secret)

     # Wait (and notify) when the rate limit is hit rather than failing.
     api_options = dict(wait_on_rate_limit=True,
                        wait_on_rate_limit_notify=True,
                        timeout=200)
     self.twitter_api = API(self.auth, **api_options)

     self.SCREEN_NAME = screen_name
     self.db_name = db_name
     self.collection_name = collection_name
Ejemplo n.º 3
0
    def stream_tweets(self, db_address, database, collection, search_list):
        """
        Open a Twitter stream filtered by ``search_list`` and keep it running.

        A new listener and stream are created on every iteration, so when the
        stream stops or raises, the loop reconnects. This method never returns.

        :param db_address: passed through to ``MyStreamListener``.
        :param database: passed through to ``MyStreamListener``.
        :param collection: passed through to ``MyStreamListener``.
        :param search_list: passed to ``MyStreamListener`` and used as the
            ``track`` argument of ``stream.filter``.

        :return: None
        """
        while True:
            try:
                # Connect to Twitter Streaming API
                listener = MyStreamListener(db_address, database, collection,
                                            functional_tools.FunctionalTools(),
                                            search_list)
                stream = Stream(self.auth, listener)

                # Filter Twitter Streams to capture data by the keywords:
                stream.filter(track=search_list)
            except Exception as e:
                # The exception must be bound to a %s placeholder; passing it
                # as a bare extra argument makes the logging module fail to
                # format the record and the error text is lost.
                logger.error("Error in stream_tweets: %s", e)
Ejemplo n.º 4
0
    def calculate_coordinates(self, new_loc_list):
        """
        Resolve every location to coordinates and store the pairs.

        Each entry of ``new_loc_list`` is passed to
        ``self.location_to_coordinate``; the resulting Location/Coordinates
        records are saved with ``save_data`` into database ``'backup'``,
        collection ``'locToCoo'``, using the ``'insert_many'`` mode.

        :param new_loc_list: list
            A list containing all new locations.

        :return: None
        """
        logger.info('Thread to calculate coordinates from location starts.')

        pairs = pd.DataFrame()
        pairs['Location'] = new_loc_list
        coordinates = []
        for location in new_loc_list:
            coordinates.append(self.location_to_coordinate(location))
        pairs['Coordinates'] = coordinates

        records = pairs.to_dict('records')
        storage = functional_tools.FunctionalTools()
        storage.save_data(records, 'backup', 'locToCoo', 'insert_many')

        logger.info('Thread to calculate coordinates from location ends.')

        # Release the temporary frame and trigger a collection right away.
        del pairs
        gc.collect()
    def run(self):
        """
        Page backwards through the timeline of ``self.SCREEN_NAME`` and store
        each page.

        Each page is fetched with ``user_timeline``, converted with
        ``pol_tweets_to_dataframe`` and saved with ``save_data`` in
        ``'update'`` mode. The loop stops when a page is empty, when the last
        tweet of a page was created before ``self.start_date``, or when any
        exception is raised (the exception is printed).

        :return: None
        """
        page_size = 60
        stored_total = 0
        next_max_id = None
        tools = functional_tools.FunctionalTools()

        # Every failure ends the harvest, so one try block wraps the loop.
        try:
            while True:
                page = self.twitter_client_api.user_timeline(
                    screen_name=self.SCREEN_NAME,
                    tweet_mode='extended',
                    count=page_size,
                    max_id=next_max_id)

                if len(page) == 0:
                    print("No more tweets found.")
                    print('In total {} tweets are stored in DB.'.format(
                        stored_total))
                    print('-----')
                    break

                # Next request starts just below the oldest id seen so far.
                next_max_id = page[-1].id - 1

                frame = tools.pol_tweets_to_dataframe(page,
                                                      self.state_name,
                                                      self.electorate_name,
                                                      self.party_name)
                row_count = frame.shape[0]
                if row_count != 0:
                    stored_total += row_count
                    tools.save_data(frame.to_dict('records'), self.db_name,
                                    self.collection_name, 'update')

                if page[-1].created_at < self.start_date:
                    print('Date boundary reached.')
                    print('In total {} tweets are stored in DB.'.format(
                        stored_total))
                    print('-----')
                    break

        except TweepError as api_error:
            print('Restful tweets error:')
            print(api_error)

        except Exception as other_error:
            print(other_error)
Ejemplo n.º 6
0
    def run(self):
        """
        Page backwards through search results for ``'@' + self.SCREEN_NAME``
        and store each page.

        Each page is fetched with ``search``, converted with
        ``tweets_to_dataframe`` and saved with ``save_data`` in ``'update'``
        mode. The loop stops when a page is empty or when any exception is
        raised (the exception is printed).

        :return: None
        """
        page_size = 100
        stored_total = 0
        next_max_id = None
        tools = functional_tools.FunctionalTools()

        # Every failure ends the harvest, so one try block wraps the loop.
        try:
            while True:
                # A geocode filter could be added here,
                # e.g. geocode="-33.854,151.216,180.00km".
                mention_query = '@' + self.SCREEN_NAME
                page = self.twitter_api.search(q=mention_query,
                                               tweet_mode='extended',
                                               count=page_size,
                                               max_id=next_max_id)

                if not page:
                    print("No more mentioned tweets found.")
                    print('In total {} tweets are stored in DB.'.format(
                        stored_total))
                    print('-----')
                    break

                # Next request starts just below the oldest id seen so far.
                next_max_id = page[-1].id - 1

                frame = tools.tweets_to_dataframe(page)
                row_count = frame.shape[0]
                if row_count != 0:
                    stored_total += row_count
                    tools.save_data(frame.to_dict('records'), self.db_name,
                                    self.collection_name, 'update')

        except TweepError as api_error:
            print('Restful by mentioned error:')
            print(api_error)

        except Exception as other_error:
            print(other_error)
    def run(self):
        """
        Page backwards through search results for ``'to:' + self.SCREEN_NAME``
        and store each page.

        Each page is fetched with ``search``, converted with
        ``tweets_to_dataframe`` and saved with ``save_data`` in ``'update'``
        mode. The loop stops when a page is empty or when any exception is
        raised (the exception is printed).

        :return: None
        """
        page_size = 100
        stored_total = 0
        next_max_id = None
        tools = functional_tools.FunctionalTools()

        # Every failure ends the harvest, so one try block wraps the loop.
        try:
            while True:
                reply_query = 'to:' + self.SCREEN_NAME
                page = self.twitter_api.search(q=reply_query,
                                               tweet_mode='extended',
                                               max_id=next_max_id,
                                               count=page_size)

                if len(page) == 0:
                    print("No more replies found.")
                    print('In total {} replies are stored in DB.'.format(
                        stored_total))
                    print('-----')
                    break

                # Next request starts just below the oldest id seen so far.
                next_max_id = page[-1].id - 1

                frame = tools.tweets_to_dataframe(page)
                row_count = frame.shape[0]
                if row_count != 0:
                    tools.save_data(frame.to_dict('records'), self.db_name,
                                    self.collection_name, 'update')
                    stored_total += row_count

        except TweepError as api_error:
            print('Restful reply error:')
            print(api_error)

        except Exception as other_error:
            print(other_error)
Ejemplo n.º 8
0
    def run(self):
        """
        Fetch the profile of ``self.SCREEN_NAME`` and store it.

        The profile is fetched with ``get_user``, converted with
        ``politician_info_to_dataframe`` (together with the instance's state,
        electorate and party names) and, when the resulting frame has rows,
        saved with ``save_data`` in ``'update'`` mode. Any exception is
        printed rather than propagated.

        :return: None
        """
        f_tools = functional_tools.FunctionalTools()
        try:
            user_info = self.twitter_api.get_user(self.SCREEN_NAME)
            if not user_info:
                print("No user information found.")
                print('-----')
                # Nothing to convert or store; stop here instead of handing
                # an empty result to the dataframe conversion.
                return

            df = f_tools.politician_info_to_dataframe(user_info,
                                                      self.state_name,
                                                      self.electorate_name,
                                                      self.party_name)

            if df.shape[0] != 0:
                f_tools.save_data(df.to_dict('records'), self.db_name,
                                  self.collection_name, 'update')

        except TweepError as e1:
            print('Restful by user info error:')
            print(e1)

        except Exception as e2:
            print(e2)
    def run(self):
        """
        Page backwards through search results for the instance's hashtags
        and store each page.

        The query is the entries of ``self.hashtags`` joined with ``' OR '``.
        Each page is fetched with ``search``, converted with
        ``tweets_to_dataframe`` and saved with ``save_data`` in ``'update'``
        mode. The loop stops when a page is empty, when the last tweet of a
        page was created before ``self.start_date``, or when any exception is
        raised (the exception is printed).

        :return: None
        """
        page_size = 100
        stored_total = 0
        next_max_id = None
        tools = functional_tools.FunctionalTools()

        # Every failure ends the harvest, so one try block wraps the loop.
        try:
            while True:
                hashtag_query = ' OR '.join(self.hashtags)
                page = self.twitter_api.search(q=hashtag_query,
                                               result_type='mixed',
                                               tweet_mode='extended',
                                               max_id=next_max_id,
                                               count=page_size)

                if len(page) == 0:
                    print("No more hashtag tweets found.")
                    print('In total {} tweets are stored in DB.'.format(stored_total))
                    print('-----')
                    break

                # Next request starts just below the oldest id seen so far.
                next_max_id = page[-1].id - 1

                frame = tools.tweets_to_dataframe(page)
                row_count = frame.shape[0]
                if row_count != 0:
                    tools.save_data(frame.to_dict('records'), self.db_name,
                                    self.collection_name, 'update')
                    stored_total += row_count

                if page[-1].created_at < self.start_date:
                    print('Date boundary reached.')
                    print('In total {} tweets are stored in DB.'.format(stored_total))
                    print('-----')
                    break

        except TweepError as api_error:
            print('Restful hashtag error:')
            print(api_error)

        except Exception as other_error:
            print(other_error)
Ejemplo n.º 10
0
            if df.shape[0] != 0:
                f_tools.save_data(df.to_dict('records'), self.db_name,
                                  self.collection_name, 'update')

        except TweepError as e1:
            print('Restful by user info error:')
            print(e1)

        except Exception as e2:
            print(e2)


""" testing """
if __name__ == "__main__":
    # Manual test driver: builds one RestfulUserInfo per politician listed
    # in the CSV, writing to the 'test' database / 'test' collection.
    f_tools = functional_tools.FunctionalTools()
    temp_df = pd.read_csv(
        '../data/full_politician_list.csv',
        usecols=['Name', 'State', 'Electorate', 'Party', 'Screen_Name'])

    # Drop whole rows that lack a screen name. Dropping NaNs column by
    # column would shorten each list independently, so index i would no
    # longer refer to the same politician in every list (or would raise
    # IndexError). Missing values in the other columns are passed as-is.
    temp_df = temp_df.dropna(subset=['Screen_Name'])
    politician_list = temp_df['Screen_Name'].tolist()
    state_list = temp_df['State'].tolist()
    ele_list = temp_df['Electorate'].tolist()
    party_list = temp_df['Party'].tolist()
    result_dict = {}

    total = len(politician_list)
    rows = zip(politician_list, state_list, ele_list, party_list)
    for i, (screen_name, state, electorate, party) in enumerate(rows):
        print('============================================')
        print('Process: {}/{}'.format(i + 1, total))
        restful_user_info = RestfulUserInfo(screen_name, 'test', 'test',
                                            state, electorate, party)
        print("Crawling information of {}.".format(screen_name))
Ejemplo n.º 11
0
                stream = Stream(self.auth, listener)

                # Filter Twitter Streams to capture data by the keywords:
                stream.filter(track=search_list)
            except Exception as e:
                logger.error("Error in stream_tweets: ", e)
                continue


if __name__ == '__main__':
    # Script entry point: authenticate, build the list of politician
    # screen names to track, and read the MongoDB settings for the harvester.

    # Authenticate using config.py and connect to Twitter Streaming API.
    logger.info("Start crawling.")
    # Use processor 7/8
    # The first command-line argument selects which entry of ``config``
    # (credentials and MongoDB settings) this process uses.
    harvester_id = int(sys.argv[1])
    conf = config[harvester_id]
    auth = functional_tools.FunctionalTools().authenticate_twitter_app(
        conf['consumer_key'], conf['consumer_secret'])

    # Run Streaming to get real time tweets
    twitter_streamer = TwitterStreamer(auth)

    # Get Politician list
    temp_df = pd.read_csv('../data/full_politician_list.csv',
                          usecols=['ScreenName'])
    # Skip rows whose screen name is the placeholder 'NF' (presumably
    # "not found" - TODO confirm) and rows with no screen name at all.
    temp_df = temp_df[temp_df['ScreenName'] != 'NF']['ScreenName'].dropna()
    # Prefix each name with '@' to form the terms in the list below.
    temp_df = temp_df.apply(lambda x: '@' + x)
    politician_screen_name_list = temp_df.tolist()
    print(politician_screen_name_list)
    # MongoDB connection settings for the selected harvester configuration.
    db_address = conf['mongodb_address']
    database = conf['mongodb_db_name']
    collection = conf['mongodb_collection_name']
    # Start harvester