Example 1
import logging

from pymongo.errors import DuplicateKeyError

logger = logging.getLogger(__name__)


def save_tweet(tweet_collection, tweet):
    """
    Saves a tweet to a Mongo collection, adding a random number for sampling and a
    timestamp. Returns the saved tweet's id, or None if it was already in the DB.
    """
    json_tweet = tweet._json  # raw dict behind the tweepy Status object
    add_random_to_tweet(json_tweet)
    add_timestamp_to_tweet(json_tweet)
    try:
        tweet_collection.save(json_tweet)
    except DuplicateKeyError:
        logger.warning("Tweet {0} duplicate in DB".format(tweet.id))
        return None
    return json_tweet["id"]
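As a usage sketch, this is one way save_tweet might be called; the connection details, database/collection names, credentials, and the tweet id are all placeholders, not from the original:

import tweepy
from pymongo import MongoClient

client = MongoClient("localhost", 27017)          # assumed host/port
tweet_collection = client["twitterdb"]["tweets"]  # hypothetical db/collection names

auth = tweepy.OAuthHandler("CONSUMER_KEY", "CONSUMER_SECRET")  # placeholder credentials
auth.set_access_token("ACCESS_TOKEN", "ACCESS_SECRET")
api = tweepy.API(auth)

status = api.get_status(123456789)  # hypothetical tweet id
if save_tweet(tweet_collection, status) is None:
    print "Duplicate tweet; nothing saved"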
Example 2
import warnings
from json import loads

from pymongo import MongoClient
from pymongo.errors import ConnectionFailure, DuplicateKeyError

# ConcatJSONDecoder, NonListStreamJsonListLoader, add_random_to_tweet and
# add_timestamp_to_tweet are project-local helpers (sketched further below).


def import_tweets(host, port, user, password, database, collection, infile, transform=True, stream_json=False):
    """
    Loads each tweet from the given infile (one or more concatenated JSON documents)
    and inserts it directly into the given database and collection.
    NOTE: On failure, DOES NOT ROLL BACK (records must be removed manually if a
    clean state is needed).
    """
    print "Importing tweets from '{0}' into {1}:{2}".format(infile, database, collection)

    # Create connection and authenticate
    client = MongoClient(host, int(port))
    dbh = client[database]
    assert dbh.connection == client
    if not dbh.authenticate(user, password):
        raise ConnectionFailure("Mongo DB Authentication for User {0}, DB {1} failed".format(user, database))
    col = dbh[collection]

    # Ensure there is a unique ID index on tweet collection
    print "Ensuring indexes on {0}:{1}".format(database, collection)
    col.ensure_index("id", name="unique_id", unique=True, drop_dups=True, background=True)

    # Read tweets via the custom JSON decoder (made for reading streams with multiple tweets)
    print "Importing tweets in {0}".format(infile)
    imported = 0
    skipped = 0
    with open(infile) as inhandle:
        if stream_json:
            tweets = NonListStreamJsonListLoader(infile)
            warnings.warn("Using NonListStreamJsonListLoader (SLOW).")
        else:
            tweets = loads(inhandle.read(), cls=ConcatJSONDecoder)
        for tweet in tweets:
            if "id_str" not in tweet:
                warnings.warn("Data read from file\n\t{0}\nnot a valid tweet".format(tweet))
                continue

            if transform:
                add_random_to_tweet(tweet)
                add_timestamp_to_tweet(tweet)

            try:
                col.insert(tweet, safe=True)
            except DuplicateKeyError:
                warnings.warn("Tweet already exists in DB. Skipping..")
                skipped += 1
                continue
            imported += 1
            print "Imported {0}\r".format(imported), 
    
    print "Importing complete. Inserted {0} documents in {1}:{2}, skipped {3}".format(
            imported, database, collection, skipped)
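The import path relies on a project-local ConcatJSONDecoder, which is not shown on this page. A minimal sketch of the idea, assuming the real decoder works the same way, is to walk the input with json.JSONDecoder.raw_decode, which parses one document at a time and reports where it stopped:

import json


class ConcatJSONDecoder(json.JSONDecoder):
    """Decodes a string holding several back-to-back JSON documents into a list.

    A minimal sketch; json.loads(data, cls=ConcatJSONDecoder) constructs this
    decoder and calls decode() below on the whole string.
    """

    def decode(self, s, _w=None):
        objs = []
        idx, end = 0, len(s)
        while idx < end:
            # Skip whitespace between documents
            while idx < end and s[idx].isspace():
                idx += 1
            if idx >= end:
                break
            # raw_decode parses one JSON document and returns (object, end index)
            obj, idx = self.raw_decode(s, idx)
            objs.append(obj)
        return objs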
Example 3
import warnings
from json import loads

from pymongo import MongoClient
from pymongo.errors import ConnectionFailure, DuplicateKeyError

# Project-local helpers as in Example 2.


def import_tweets(host, port, user, password, database, collection, infile, transform=True):
    """
    Loads each tweet from the given infile (one or more concatenated JSON documents)
    and inserts it directly into the given database and collection. This variant
    enforces uniqueness on "id_str" rather than "id".
    NOTE: On failure, DOES NOT ROLL BACK (records must be removed manually if a
    clean state is needed).
    """
    print "Importing tweets from '{0}' into {1}:{2}".format(infile, database, collection)

    # Create connection and authenticate
    client = MongoClient(host, int(port))
    dbh = client[database]
    assert dbh.connection == client
    if not dbh.authenticate(user, password):
        raise ConnectionFailure("Mongo DB Authentication for User {0}, DB {1} failed".format(user, database))
    col = dbh[collection]

    # Ensure there is a unique ID index on tweet collection
    print "Ensuring indexes on {0}:{1}".format(database, collection)
    col.ensure_index("id_str", name="unique_id", unique=True, drop_dups=True, background=True)

    # Read tweets via the custom JSON decoder (made for reading streams with multiple tweets)
    print "Importing tweets in {0}".format(infile)
    imported = 0
    skipped = 0
    with open(infile) as inhandle:
        tweets = loads(inhandle.read(), cls=ConcatJSONDecoder)
        for tweet in tweets:
            if "id_str" not in tweet:
                warnings.warn("Data read from file\n\t{0}\nnot a valid tweet".format(tweet))
                continue

            if transform:
                add_random_to_tweet(tweet)
                add_timestamp_to_tweet(tweet)

            try:
                col.insert(tweet, safe=True)
            except DuplicateKeyError:
                warnings.warn("Tweet already exists in DB. Skipping..")
                skipped += 1
                continue
            imported += 1
            print "Imported {0}\r".format(imported), 
    
    print "Importing complete. Inserted {0} documents in {1}:{2}, skipped {3}".format(
            imported, database, collection, skipped)
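The examples also call add_random_to_tweet and add_timestamp_to_tweet, which are not shown on this page. Based on the docstrings, they might look roughly like the sketch below; the field names "random_number" and "timestamp" are guesses, not from the original:

import random
from datetime import datetime


def add_random_to_tweet(tweet):
    # Hypothetical field name: a uniform draw lets you sample tweets later
    # with a simple range query on "random_number".
    tweet["random_number"] = random.random()


def add_timestamp_to_tweet(tweet):
    # Hypothetical field name: parse Twitter's created_at string so Mongo
    # stores a real date instead of text. The +0000 offset is matched
    # literally because Python 2 strptime has no %z support.
    created_at = tweet.get("created_at")
    if created_at:
        tweet["timestamp"] = datetime.strptime(created_at, "%a %b %d %H:%M:%S +0000 %Y")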