def hydrate(idlist_file="data/example_dataset_tweet_ids.txt"):
    """
    Read a file of tweet IDs (one per line) and load the corresponding
    tweets through the API into the database.

    Tweets already present in the database are skipped. Prepare to wait
    quite a bit, depending on the size of the dataset.

    :param idlist_file: path to a text file with one numeric tweet ID
        per line.
    """
    # NOTE(review): this function is defined twice in this file with an
    # identical body; the later definition shadows this one — consider
    # deleting one copy.
    ids_to_fetch = set()
    # Use a context manager so the file is closed deterministically
    # instead of relying on garbage collection.
    with open(idlist_file, "r") as id_file:
        for line in id_file:
            # Strip the trailing newline and convert to int, since
            # that's what the database uses for IDs.
            ids_to_fetch.add(int(line.strip()))
    # Collect the IDs of tweets we already have.
    ids_in_db = set(t.id for t in database.Tweet.select(database.Tweet.id))
    # Sets have an efficient .difference() method that returns IDs only
    # present in the first set, but not in the second.
    ids_to_fetch = ids_to_fetch.difference(ids_in_db)
    logging.warning(
        "\nLoaded a list of {0} tweet IDs to hydrate".format(len(ids_to_fetch)))
    # Set up a progress bar with an ETA, advanced one API page at a time.
    bar = Bar('Fetching tweets', max=len(ids_to_fetch), suffix='%(eta)ds')
    for page in rest.fetch_tweet_list(ids_to_fetch):
        bar.next(len(page))
        for tweet in page:
            database.create_tweet_from_dict(tweet)
    bar.finish()
    logging.warning("Done hydrating!")
def hydrate(idlist_file="data/example_dataset_tweet_ids.txt"):
    """
    Read a file of tweet IDs (one per line) and load the corresponding
    tweets through the API into the database.

    Tweets already present in the database are skipped. Prepare to wait
    quite a bit, depending on the size of the dataset.

    :param idlist_file: path to a text file with one numeric tweet ID
        per line.
    """
    # NOTE(review): an earlier, near-identical definition of hydrate()
    # exists in this file; this one shadows it — consider deleting one.
    ids_to_fetch = set()
    # Context manager closes the file deterministically (the original
    # left the handle open until garbage collection).
    with open(idlist_file, "r") as id_file:
        for line in id_file:
            # Remove the newline via .strip() and convert to int, since
            # that's what the database uses for IDs.
            ids_to_fetch.add(int(line.strip()))
    # IDs of tweets we already have — no need to fetch those again.
    ids_in_db = set(t.id for t in database.Tweet.select(database.Tweet.id))
    # set.difference() keeps IDs present in the file but not in the DB.
    ids_to_fetch = ids_to_fetch.difference(ids_in_db)
    logging.warning("\nLoaded a list of {0} tweet IDs to hydrate".format(
        len(ids_to_fetch)))
    # Progress bar with an ETA, advanced one API page at a time.
    bar = Bar('Fetching tweets', max=len(ids_to_fetch), suffix='%(eta)ds')
    for page in rest.fetch_tweet_list(ids_to_fetch):
        bar.next(len(page))
        for tweet in page:
            database.create_tweet_from_dict(tweet)
    bar.finish()
    logging.warning("Done hydrating!")
def import_json(fi):
    """
    Load newline-delimited JSON tweet data from a file into the database.

    :param fi: path to a file containing one JSON document (one tweet)
        per line.
    """
    logging.warning("Loading tweets from json file {0}".format(fi))
    # Binary mode plus an explicit utf-8 decode keeps behaviour identical
    # across platforms; the context manager guarantees the file is closed
    # (the original left the handle open until garbage collection).
    with open(fi, "rb") as json_file:
        for line in json_file:
            data = json.loads(line.decode('utf-8'))
            database.create_tweet_from_dict(data)
def save_user_archive_to_database():
    """
    Download every available tweet from @lessig's archive and write each
    one to the database.
    """
    # The REST helper yields the archive one page (a list of tweets) at
    # a time; iterate it directly instead of naming the generator.
    for tweet_page in rest.fetch_user_archive("lessig"):
        for tweet_dict in tweet_page:
            database.create_tweet_from_dict(tweet_dict)
    logging.warning(u"Wrote tweets from @lessig to database")
def load_from_files(files, searchterm):
    """
    Load tweets from a list of JSON files into the database.

    :param files: iterable of paths to JSON files.
    :param searchterm: the search term the files were collected with;
        stored alongside each tweet.
    """
    # Renamed the loop variable from ``file`` to avoid shadowing the
    # (Python 2) builtin of the same name.
    for path in files:
        print("File ", path)
        logger.info("file %s", path)
        for tweet in iterate_file(path):
            # Skip falsy entries — presumably iterate_file yields None
            # for lines it could not parse (TODO confirm its contract).
            if tweet:
                database.create_tweet_from_dict(tweet, searchterm)
def add_to_database(tweets, searchterm):
    """
    Save a batch of tweet objects to the database.

    :param tweets: iterable of tweet objects exposing ``AsJsonString()``
        (presumably python-twitter Status objects — TODO confirm); falsy
        entries are skipped.
    :param searchterm: search term stored alongside each tweet.
    :return: the number of tweets successfully written.
    """
    counter = 0
    # Replaced a stray debug print(len(tweets)) with a debug-level log.
    logging.debug("Adding %d tweets to the database", len(tweets))
    for tweet in tweets:
        # Guard clause instead of a redundant ``else: continue``.
        if not tweet:
            continue
        data = json.loads(tweet.AsJsonString())
        t = mytools.create_tweet_from_dict(data, searchterm)
        if t:
            counter += 1
        else:
            # Lazy %-args: the message is only rendered if it is emitted.
            logging.error("Did not save tweet %s", data["id"])
    return counter