def on_status(self, status):
    # type: (SimpleFireStreamListener, Status) -> None
    if SimpleFireStreamListener.is_relevant(status, verbose=True):
        text = get_status_text(status)
        print("Relevant: " + text.encode("UTF-8"))
def on_status(self, status):
    # type: (MongoDBStreamListener, Status) -> None

    # Make text safe to print in console.
    safe_text = get_status_text(status).encode('UTF-8')

    # If the status is relevant,
    if MongoDBStreamListener.is_relevant(status):
        # Insert it into our database.
        self.mongodatabase[self.TWEETS_TABLE].insert_one(status._json)
        print("Hit: " + safe_text)
    else:
        print("Ign: " + safe_text)
@staticmethod
def is_relevant(status, verbose=False):
    # type: (Status, bool) -> bool
    """
    Tells you if a Status is relevant.

    :param status: The status.
    :param verbose: Should we print irrelevant statuses?
    :return: Whether or not the status is relevant.
    """
    if 'fire' in get_status_text(status):
        return True

    return False
@staticmethod
def is_relevant(status, verbose=False):
    # type: (Status, bool) -> bool
    """
    Tells you if a Status is relevant.

    :param status: The status.
    :param verbose: Should we print irrelevant statuses?
    :return: Whether or not the status is relevant.
    """
    text = get_status_text(status)

    if 'fire' in text:
        print("Relevant:")
        return True

    if verbose:
        # Show a snapshot of the irrelevant tweet.
        print("Not relevant: {}".format(text))

    return False
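To see how these on_status handlers actually get invoked, here is a minimal sketch of wiring a listener to Twitter's streaming API, assuming tweepy 3.x's Stream/StreamListener interface. The listener's real constructor arguments (such as its MongoDB connection) are project-specific, so the no-argument call here is purely illustrative.

import tweepy

from twitter_fire_scraper.twitter import TwitterAuthentication

# Assumed wiring for the listeners above, using tweepy 3.x's streaming API.
twauth = TwitterAuthentication.autodetect_twitter_auth()

listener = MongoDBStreamListener()  # Hypothetical constructor call.
stream = tweepy.Stream(auth=twauth.oauth_handler, listener=listener)

# Track tweets containing 'fire'; on_status() runs once per matching tweet.
stream.filter(track=['fire'])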
""" Demonstrates the ability to search for fire regardless of location. """ # noinspection PyUnresolvedReferences import __init__ import tweepy from pprint import pprint from twitter_fire_scraper.twitter import TwitterAuthentication from twitter_fire_scraper.util import get_status_text if __name__ == "__main__": # Set up twitter auth. twauth = TwitterAuthentication.autodetect_twitter_auth() api = tweepy.API(twauth.oauth_handler) print( "Just searching for 'fire'... Probably not going to get us Chicago fire incidents." ) for status in api.search("fire"): print(get_status_text(status))
# Retrieve however many tweets we want, and store that in our dictionary.
all_tweets[search_term] = [status for status in cursor.items(MAX_TWEETS)]

print("{n} hits for {ht}:".format(n=len(all_tweets[search_term]), ht=search_term))

# For the first few statuses that we retrieved,
for status in all_tweets[search_term][0:5]:  # type: Status
    print(" " * 4),  # Prefix with four spaces to show hierarchy.
    print("<{}> :".format(status_to_url(status)))  # URL of tweet.

    print(" " * 4),  # Status belonging to URL.
    print(get_status_text(status).encode("UTF-8"))  # Status text.

print

print("Final results of scraping {n} tweets each from these search terms:".format(n=MAX_TWEETS))
print(", ".join(search_terms))

# Determine unique statuses found from all keywords.

# Unique statuses. Sets can contain no duplicate elements.
unique_status_ids = set()  # type: set[long]

total_statuses = 0  # Total amount of statuses found.

for keyword, statuses in all_tweets.items():
    for status in statuses:
        total_statuses += 1
        unique_status_ids.add(status.id)
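For context, the excerpt above assumes a tweepy paging cursor and a few surrounding names. A minimal sketch of that setup, with placeholder values for MAX_TWEETS and search_terms (the real script defines its own), might look like this:

import tweepy

from twitter_fire_scraper.twitter import TwitterAuthentication

# Assumed context for the excerpt above; values here are placeholders.
twauth = TwitterAuthentication.autodetect_twitter_auth()
api = tweepy.API(twauth.oauth_handler)

MAX_TWEETS = 50                        # Hypothetical cap per search term.
search_terms = ["fire", "house fire"]  # Hypothetical search terms.
all_tweets = {}                        # Filled in by the excerpt's loop.

search_term = search_terms[0]

# A paging cursor over the search endpoint, drained by cursor.items(MAX_TWEETS).
cursor = tweepy.Cursor(api.search, q=search_term)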
def save_statusdict_to_csv(self, statusdict, filepath, overwrite=False):
    # type: (Scraper, Dict[str, List[Status]], str, bool) -> str
    """Save a status dict to a CSV file.

    :param statusdict: A {str: [Status, Status, ...]} dictionary.
    :param filepath: A path to the file to output to.
    :param overwrite: Whether to overwrite the file if it already exists.
    :return: The path of the file that was written."""

    dirname = os.path.dirname(filepath)

    if not os.path.exists(dirname):
        raise NotADirectoryError("Directory {} does not exist!".format(dirname))

    if os.path.isfile(filepath):
        if not overwrite:
            raise FileExistsError("File at '{}' already exists!".format(filepath))

    fieldnames = [
        'category', 'tweet_id', 'text', 'date', 'retweet_count', 'geo',
        'coordinates', 'place_id', 'place_centroid', 'place_country',
        'place_country_code', 'place_full_name', 'place_name', 'place_type'
    ]

    with open(filepath, 'w', encoding='utf-16', newline='') as file:
        fileWriter = csv.DictWriter(file, delimiter=Scraper.CSV_DELIMITER,
                                    quotechar='"', quoting=csv.QUOTE_ALL,
                                    fieldnames=fieldnames)

        fileWriter.writeheader()

        for keyword, statuses in statusdict.items():
            for status in statuses:  # type: Status

                data = {
                    "category": keyword,
                    "tweet_id": status.id,
                    "text": get_status_text(status),
                    "date": status.created_at,
                    'retweet_count': status.retweet_count,
                }

                if status.geo:
                    data.update({
                        "geo": ','.join(str(x) for x in status.geo['coordinates']),
                    })

                if status.coordinates:
                    data.update({
                        'coordinates': ','.join(str(x) for x in status.coordinates['coordinates']),
                    })

                # If the status has a place, then add its data!
                if status.place:
                    data.update({
                        'place_id': status.place.id,

                        # We have to use the API for this one to look it up.
                        'place_centroid': ','.join(str(x) for x in self.api.geo_id(status.place.id).centroid),

                        'place_country': status.place.country,
                        'place_country_code': status.place.country_code,
                        'place_full_name': status.place.full_name,
                        'place_name': status.place.name,
                        'place_type': status.place.place_type,
                    })

                # Write all the data we've collected so far.
                fileWriter.writerow(data)

    return filepath
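A hedged usage sketch: it assumes 'scraper' is an authenticated instance of the Scraper class this method belongs to, and reuses the search call from the demo above to build a small status dict. The output filename is a placeholder.

import os

# Hypothetical usage; 'scraper' is assumed to be an authenticated Scraper.
statusdict = {'fire': [status for status in scraper.api.search('fire')]}

csv_path = scraper.save_statusdict_to_csv(
    statusdict,
    os.path.join(os.getcwd(), 'fire_tweets.csv'),  # Placeholder output path.
    overwrite=True)

print("Wrote " + csv_path)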