def __init__(self, start_date, hashtags, db_name, collection_name):
    """Set up the hashtag-harvester thread.

    :param start_date: datetime; stop harvesting once tweets older than
        this date are reached.
    :param hashtags: list of hashtag strings to search for.
    :param db_name: target MongoDB database name.
    :param collection_name: target MongoDB collection name.
    """
    threading.Thread.__init__(self)
    # Authenticate once up front; the API handle is reused for every query.
    tools = functional_tools.FunctionalTools()
    self.auth = tools.authenticate_twitter_app(conf['consumer_key'],
                                               conf['consumer_secret'])
    self.twitter_api = API(self.auth,
                           wait_on_rate_limit=True,
                           wait_on_rate_limit_notify=True,
                           timeout=200)
    self.start_date = start_date
    self.hashtags = hashtags
    self.db_name = db_name
    self.collection_name = collection_name
def __init__(self, screen_name, db_name, collection_name):
    """Set up a harvester thread bound to a single Twitter account.

    :param screen_name: Twitter handle (without '@') to harvest.
    :param db_name: target MongoDB database name.
    :param collection_name: target MongoDB collection name.
    """
    threading.Thread.__init__(self)
    # One authenticated, rate-limit-aware API client per thread.
    tools = functional_tools.FunctionalTools()
    self.auth = tools.authenticate_twitter_app(conf['consumer_key'],
                                               conf['consumer_secret'])
    self.twitter_api = API(self.auth,
                           wait_on_rate_limit=True,
                           wait_on_rate_limit_notify=True,
                           timeout=200)
    self.SCREEN_NAME = screen_name
    self.db_name = db_name
    self.collection_name = collection_name
def stream_tweets(self, db_address, database, collection, search_list):
    """Run the Twitter Streaming API indefinitely, restarting on any error.

    :param db_address: MongoDB address the stream listener writes to.
    :param database: target database name.
    :param collection: target collection name.
    :param search_list: keywords/hashtags to track on the live stream.
    :return: None -- loops forever; only an external kill stops it.
    """
    while True:
        try:
            # Connect to Twitter Streaming API.
            listener = MyStreamListener(db_address, database, collection,
                                        functional_tools.FunctionalTools(),
                                        search_list)
            stream = Stream(self.auth, listener)
            # Filter Twitter Streams to capture data by the keywords.
            stream.filter(track=search_list)
        except Exception as e:
            # BUG FIX: the original logger.error("Error in stream_tweets: ", e)
            # passed the exception as a %-format argument with no placeholder,
            # so the exception text was never logged. Use lazy %s formatting.
            logger.error("Error in stream_tweets: %s", e)
            # Best-effort restart: drop the broken stream and reconnect.
            continue
def calculate_coordinates(self, new_loc_list):
    """Save the coordinate and location pair to database.

    :param new_loc_list: list
        A list contains all new locations.
    :return: None
    """
    logger.info('Thread to calculate coordinates from location starts.')
    # Build the records directly instead of round-tripping through a
    # pandas DataFrame: the frame existed only to call .to_dict('records'),
    # and dropping it also removes the manual del/gc.collect() cleanup.
    records = [{'Location': loc,
                'Coordinates': self.location_to_coordinate(loc)}
               for loc in new_loc_list]
    functional_tools.FunctionalTools().save_data(records, 'backup',
                                                 'locToCoo', 'insert_many')
    logger.info('Thread to calculate coordinates from location ends.')
def run(self):
    """Harvest a politician's timeline, paging backwards until no tweets
    remain, the configured start date is passed, or the API errors out.

    Reads self.SCREEN_NAME, self.state_name, self.electorate_name,
    self.party_name, self.start_date, self.db_name, self.collection_name.
    """
    max_id = None  # None on the first page; then the oldest seen id - 1
    TWEETS_PER_QUERY = 60
    records_count = 0  # running total of rows written to the DB
    f_tools = functional_tools.FunctionalTools()
    while True:
        try:
            # NOTE(review): siblings use self.twitter_api; this one reads
            # self.twitter_client_api -- confirm the attribute is actually
            # set by this class's __init__.
            raw_tweets = self.twitter_client_api.user_timeline(
                screen_name=self.SCREEN_NAME,
                tweet_mode='extended',
                count=TWEETS_PER_QUERY,
                max_id=max_id)
            # An empty page means the timeline is exhausted.
            if len(raw_tweets) == 0:
                print("No more tweets found.")
                print('In total {} tweets are stored in DB.'.format(
                    records_count))
                print('-----')
                break
            max_id = raw_tweets[
                -1].id - 1  # update max_id to harvester earlier data
            df = f_tools.pol_tweets_to_dataframe(raw_tweets,
                                                 self.state_name,
                                                 self.electorate_name,
                                                 self.party_name)
            # Only persist (and count) pages that produced rows.
            if df.shape[0] != 0:
                records_count += df.shape[0]
                f_tools.save_data(df.to_dict('records'), self.db_name,
                                  self.collection_name, 'update')
            # Pages arrive newest-first, so the last tweet is the oldest;
            # once it predates start_date we are done.
            if raw_tweets[-1].created_at < self.start_date:
                print('Date boundary reached.')
                print('In total {} tweets are stored in DB.'.format(
                    records_count))
                print('-----')
                break
        except TweepError as e1:
            print('Restful tweets error:')
            print(e1)
            break
        except Exception as e2:
            print(e2)
            break
def run(self):
    """Harvest tweets mentioning @SCREEN_NAME, paging backwards until the
    search results are exhausted or the API errors out.
    """
    PAGE_SIZE = 100
    oldest_id = None  # max_id for the next page; None fetches the newest
    stored_total = 0
    tools = functional_tools.FunctionalTools()
    while True:
        try:
            page = self.twitter_api.search(q='@' + self.SCREEN_NAME,
                                           tweet_mode='extended',
                                           count=PAGE_SIZE,
                                           max_id=oldest_id)
            if not page:
                print("No more mentioned tweets found.")
                print('In total {} tweets are stored in DB.'.format(
                    stored_total))
                print('-----')
                break
            # Results are newest-first: step max_id below the oldest id
            # seen so the next page fetches strictly earlier tweets.
            oldest_id = page[-1].id - 1
            frame = tools.tweets_to_dataframe(page)
            if frame.shape[0] != 0:
                stored_total += frame.shape[0]
                tools.save_data(frame.to_dict('records'), self.db_name,
                                self.collection_name, 'update')
        except TweepError as e1:
            print('Restful by mentioned error:')
            print(e1)
            break
        except Exception as e2:
            print(e2)
            break
def run(self):
    """Harvest replies directed at SCREEN_NAME ('to:' search), paging
    backwards until the results run out or the API errors out.
    """
    PAGE_SIZE = 100
    cursor_id = None  # max_id for the next page; None means newest first
    saved_count = 0
    tools = functional_tools.FunctionalTools()
    while True:
        try:
            page = self.twitter_api.search(q='to:' + self.SCREEN_NAME,
                                           tweet_mode='extended',
                                           max_id=cursor_id,
                                           count=PAGE_SIZE)
            if len(page) == 0:
                print("No more replies found.")
                print('In total {} replies are stored in DB.'.format(
                    saved_count))
                print('-----')
                break
            # Move the cursor just below the oldest tweet on this page so
            # the next request returns strictly earlier replies.
            cursor_id = page[-1].id - 1
            frame = tools.tweets_to_dataframe(page)
            if frame.shape[0] != 0:
                tools.save_data(frame.to_dict('records'), self.db_name,
                                self.collection_name, 'update')
                saved_count += frame.shape[0]
        except TweepError as e1:
            print('Restful reply error:')
            print(e1)
            break
        except Exception as e2:
            print(e2)
            break
def run(self):
    """Fetch profile information for one politician account and upsert
    it into the configured database/collection.
    """
    tools = functional_tools.FunctionalTools()
    try:
        profile = self.twitter_api.get_user(self.SCREEN_NAME)
        if not profile:
            # Informational only -- the conversion below still runs,
            # matching the original control flow.
            print("No user information found.")
            print('-----')
        frame = tools.politician_info_to_dataframe(profile,
                                                   self.state_name,
                                                   self.electorate_name,
                                                   self.party_name)
        if frame.shape[0] != 0:
            tools.save_data(frame.to_dict('records'), self.db_name,
                            self.collection_name, 'update')
    except TweepError as e1:
        print('Restful by user info error:')
        print(e1)
    except Exception as e2:
        print(e2)
def run(self):
    """Harvest tweets matching any of self.hashtags, paging backwards
    until the results run out, self.start_date is passed, or the API
    errors out.
    """
    max_id = None  # None on the first page; then the oldest seen id - 1
    NUM_PER_QUERY = 100
    records_count = 0
    f_tools = functional_tools.FunctionalTools()
    # Hoisted out of the loop: the query string never changes between
    # pages, so there is no reason to rebuild it on every iteration.
    query = ' OR '.join(self.hashtags)
    while True:
        try:
            raw_tweets = self.twitter_api.search(q=query,
                                                 result_type='mixed',
                                                 tweet_mode='extended',
                                                 max_id=max_id,
                                                 count=NUM_PER_QUERY)
            if len(raw_tweets) == 0:
                print("No more hashtag tweets found.")
                print('In total {} tweets are stored in DB.'.format(
                    records_count))
                print('-----')
                break
            # Page backwards: only fetch tweets older than this page.
            max_id = raw_tweets[-1].id - 1
            df = f_tools.tweets_to_dataframe(raw_tweets)
            if df.shape[0] != 0:
                f_tools.save_data(df.to_dict('records'), self.db_name,
                                  self.collection_name, 'update')
                records_count += df.shape[0]
            # Results are newest-first; once the oldest tweet on the page
            # predates start_date the harvest is complete.
            if raw_tweets[-1].created_at < self.start_date:
                print('Date boundary reached.')
                print('In total {} tweets are stored in DB.'.format(
                    records_count))
                print('-----')
                break
        except TweepError as e:
            print('Restful hashtag error:')
            print(e)
            break
        except Exception as e2:
            print(e2)
            break
if df.shape[0] != 0: f_tools.save_data(df.to_dict('records'), self.db_name, self.collection_name, 'update') except TweepError as e1: print('Restful by user info error:') print(e1) except Exception as e2: print(e2) """ testing """ if __name__ == "__main__": f_tools = functional_tools.FunctionalTools() temp_df = pd.read_csv( '../data/full_politician_list.csv', usecols=['Name', 'State', 'Electorate', 'Party', 'Screen_Name']) politician_list = temp_df['Screen_Name'].dropna().tolist() state_list = temp_df['State'].dropna().tolist() ele_list = temp_df['Electorate'].dropna().tolist() party_list = temp_df['Party'].dropna().tolist() result_dict = {} for i in range(len(politician_list)): print('============================================') print('Process: {}/{}'.format(i + 1, len(politician_list))) restful_user_info = RestfulUserInfo(politician_list[i], 'test', 'test', state_list[i], ele_list[i], party_list[i]) print("Crawling information of {}.".format(politician_list[i]))
stream = Stream(self.auth, listener) # Filter Twitter Streams to capture data by the keywords: stream.filter(track=search_list) except Exception as e: logger.error("Error in stream_tweets: ", e) continue if __name__ == '__main__': # Authenticate using config.py and connect to Twitter Streaming API. logger.info("Start crawling.") # Use processor 7/8 harvester_id = int(sys.argv[1]) conf = config[harvester_id] auth = functional_tools.FunctionalTools().authenticate_twitter_app( conf['consumer_key'], conf['consumer_secret']) # Run Streaming to get real time tweets twitter_streamer = TwitterStreamer(auth) # Get Politician list temp_df = pd.read_csv('../data/full_politician_list.csv', usecols=['ScreenName']) temp_df = temp_df[temp_df['ScreenName'] != 'NF']['ScreenName'].dropna() temp_df = temp_df.apply(lambda x: '@' + x) politician_screen_name_list = temp_df.tolist() print(politician_screen_name_list) db_address = conf['mongodb_address'] database = conf['mongodb_db_name'] collection = conf['mongodb_collection_name'] # Start harvester