import warnings
from json import loads

from pymongo import MongoClient
from pymongo.errors import ConnectionFailure, DuplicateKeyError

# add_random_to_tweet, add_timestamp_to_tweet, ConcatJSONDecoder,
# NonListStreamJsonListLoader, and logger are project-local helpers assumed to
# be defined or imported elsewhere in this module.


def save_tweet(tweet_collection, tweet):
    """
    Saves a tweet to a mongo collection, adding a random number (for sampling)
    and a timestamp. Returns the saved tweet's "id" field, or None if the
    tweet is already in the collection.
    """
    json_tweet = tweet._json
    add_random_to_tweet(json_tweet)
    add_timestamp_to_tweet(json_tweet)
    try:
        tweet_collection.save(json_tweet)
    except DuplicateKeyError:
        logger.warn("Tweet {0} duplicate in DB".format(tweet.id))
        return None
    return json_tweet["id"]
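# Usage sketch for save_tweet: saving a single tweepy Status object into a
# local collection. The host/port and the "tweet_db"/"tweets" names below are
# illustrative assumptions, not values defined by this module.
def _example_save_one_tweet(status):
    """Saves one tweepy Status, reporting whether it was a duplicate."""
    client = MongoClient("localhost", 27017)
    collection = client["tweet_db"]["tweets"]
    tweet_id = save_tweet(collection, status)
    if tweet_id is None:
        print "Duplicate tweet, nothing saved"
    else:
        print "Saved tweet {0}".format(tweet_id)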
def import_tweets(host, port, user, password, database, collection, infile,
                  transform=True, stream_json=False):
    """
    Reads tweets from the given infile (as a stream of concatenated JSON
    objects, or via a line-by-line stream loader if stream_json is True) and
    inserts them directly into the given database and collection.
    NOTE: on failure, DOES NOT ROLL BACK (records must be removed manually if
    a clean import is necessary).
    """
    print "Importing tweets from '{0}' into {1}:{2}".format(infile, database, collection)

    # Create connection and authenticate
    client = MongoClient(host, int(port))
    dbh = client[database]
    assert dbh.connection == client
    if not dbh.authenticate(user, password):
        raise ConnectionFailure(
            "Mongo DB authentication for user {0}, DB {1} failed".format(user, database))
    col = dbh[collection]

    # Ensure there is a unique ID index on the tweet collection
    print "Ensuring indexes on {0}:{1}".format(database, collection)
    col.ensure_index("id", name="unique_id", unique=True, drop_dups=True,
                     background=True)

    # Read tweets via the custom JSON decoder (made for reading streams
    # containing multiple tweets)
    print "Importing tweets in {0}".format(infile)
    imported = 0
    skipped = 0
    with open(infile) as inhandle:
        if stream_json:
            tweets = NonListStreamJsonListLoader(infile)
            warnings.warn("Using NonListStreamJsonListLoader (SLOW).")
        else:
            tweets = loads(inhandle.read(), cls=ConcatJSONDecoder)
        for tweet in tweets:
            if "id_str" not in tweet:
                warnings.warn("Data read from file\n\t{0}\nis not a valid tweet".format(tweet))
                continue
            if transform:
                add_random_to_tweet(tweet)
                add_timestamp_to_tweet(tweet)
            try:
                col.insert(tweet, safe=True)
            except DuplicateKeyError:
                warnings.warn("Tweet already exists in DB. Skipping..")
                skipped += 1
                continue
            imported += 1
            print "Imported {0}\r".format(imported),
    print "Importing complete. Inserted {0} documents in {1}:{2}, skipped {3}".format(
        imported, database, collection, skipped)
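# Usage sketch for import_tweets: importing a file of concatenated tweet JSON
# into an authenticated MongoDB instance. Host, port, credentials,
# database/collection names, and the file path below are illustrative
# assumptions, not values defined by this module.
def _example_import_file():
    """Imports an example dump, transforming each tweet and skipping dupes."""
    import_tweets(host="localhost",
                  port=27017,
                  user="db_user",
                  password="db_pass",
                  database="tweet_db",
                  collection="tweets",
                  infile="tweets.json",
                  transform=True,
                  stream_json=False)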