def main():
    """Import the tweets of a data directory into MongoDB.

    Reads the options returned by ``parseArgs()``, configures the module
    ``logger``, connects to the database ``args.database`` on
    ``args.mongoServerHost:args.mongoServerPort`` and then, for every
    ``*.data`` file in ``args.data_dir``:

    * loads one JSON document per line and passes it through ``date_hook``;
    * unless ``args.skip_tokenization`` is set, tokenizes
      ``tweet['twitter']['text']``, tags the tokens with TreeTagger and
      stores the result under ``tweet['tagged_tweet']``;
    * inserts the documents into the ``tweets`` collection.
    """
    args = parseArgs()

    if args.log_level == 'debug':
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)
    if args.log_destination == 'file':
        handler = logging.FileHandler('importSnapshotToMongoDB.log')
    else:
        handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(logging.Formatter("%(asctime)s; %(levelname)s; %(message)s"))
    logger.addHandler(handler)

    # The glob pattern below is built by plain concatenation, so the
    # directory must end with a slash.
    if args.data_dir[-1] != '/':
        args.data_dir += '/'

    uri = "mongodb://%s:%d/%s" % (args.mongoServerHost, args.mongoServerPort, args.database)
    logger.info("Connecting to %s", uri)
    client = pymongo.MongoClient(uri)[args.database]
    logger.info("Connected to %s", uri)

    # The tokenizer and the tagger do not depend on the file being read:
    # build them once instead of once per data file.
    tokenizer = None
    tagger = None
    if not args.skip_tokenization:
        tokenizer = Tokenizer(preserve_case=True)
        tagger = TreeTagger(path_to_bin=args.path_to_treetagger,
                            path_to_param=args.path_to_treetagger_param_file)

    for data_path in glob.glob(args.data_dir + '*.data'):
        logger.info("reading %s", data_path)
        # ``with`` closes the file as soon as it has been read; blank lines
        # (e.g. a trailing newline) are skipped instead of being handed to
        # the JSON parser.
        with open(data_path) as data_file:
            tweets = [date_hook(ujson.loads(line)) for line in data_file if line.strip()]
        logger.info("%d tweets read from %s", len(tweets), data_path)
        if not tweets:
            # Nothing to tag or insert for this file.
            continue

        if not args.skip_tokenization:
            logger.info("Tokenizing tweets")
            tokenized_tweets = [tokenizer.tokenize(tweet['twitter']['text']) for tweet in tweets]
            logger.info("Tagging tweets")
            tagged_tweets = tagger.tag(tokenized_tweets)
            # Indexing (rather than zip) keeps a length mismatch between
            # tweets and tagger output loud instead of silently truncating.
            for index, tweet in enumerate(tweets):
                tweet['tagged_tweet'] = tagged_tweets[index]

        logger.info("Loading tweets into database")
        client['tweets'].insert(tweets)

    logger.info("done.")
import glob
import ujson
from happyfuntokenizing import Tokenizer
from TreeTaggerWrapper import TreeTagger

# Tokenize and tag every tweet file of the snapshot below and write the
# result next to each input file as "<name>-tagged.data", one JSON document
# per line with the tagging stored under the 'tagged_tweet' key.
path_to_data='../data/snapshots/2014-10-20/'
# NOTE(review): the generated "*-tagged.data" files also match this pattern,
# so running the script a second time would tag the tagged files again.
files = glob.glob(path_to_data+'2014-1*.data')

# Built once and reused for every file.
tokenizer = Tokenizer(preserve_case=True)
tagger = TreeTagger(path_to_bin='/Users/jmague/Documents/work/treetagger/bin/tree-tagger', path_to_param='/Users/jmague/Documents/work/treetagger/lib/french-utf8.par')


for fileName in files:
    # Parenthesised form prints the same text under Python 2 and also parses
    # under Python 3.
    print(fileName)
    # Close the input as soon as it has been read; skip blank lines instead
    # of handing them to the JSON parser.
    with open(fileName) as input_file:
        tweets = [ujson.loads(l) for l in input_file if l.strip()]
    tokenized_tweets = [tokenizer.tokenize(tweet['tweet']) for tweet in tweets]
    tagged_tweets = tagger.tag(tokenized_tweets)
    for i, tweet in enumerate(tweets):
        tweet['tagged_tweet'] = tagged_tweets[i]
    # Strip the 5-character ".data" suffix before appending the new one.
    output_file_name = fileName[:-5]+'-tagged.data'
    # The context manager guarantees the output is flushed and closed before
    # the next file is processed.
    with open(output_file_name, 'w') as output_file:
        for tweet in tweets:
            output_file.write("%s\n" % ujson.dumps(tweet))
def main():
    """Import a snapshot (tweets and users) into MongoDB.

    Reads the options returned by ``parseArgs()``, configures the module
    ``logger`` and connects to MongoDB on
    ``args.mongoServerHost:args.mongoServerPort``.  The database is
    ``args.database`` when given, otherwise ``"snapshot_"`` followed by the
    last component of ``args.snapshot_dir``.

    Tweets: for every ``*.data`` file in the snapshot directory, one JSON
    document per line is loaded and passed through ``date_hook``.  Unless
    ``args.skip_tokenization`` is set, ``tweet['tweet']`` is tokenized,
    tagged with TreeTagger and the result stored under
    ``tweet['tagged_tweet']``.  The documents go to the ``tweets``
    collection.

    Users: every row of the ``users`` table of ``users.db`` (SQLite, in the
    snapshot directory) whose ``friends`` column is not NULL is inserted
    into the ``users`` collection as ``{'id': ..., 'friends': ...}``, in
    batches of 25000 documents.
    """
    args = parseArgs()

    if args.log_level == 'debug':
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)
    if args.log_destination == 'file':
        handler = logging.FileHandler('importSnapshotToMongoDB.log')
    else:
        handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(logging.Formatter("%(asctime)s; %(levelname)s; %(message)s"))
    logger.addHandler(handler)

    # Paths below are built by plain concatenation, and the default database
    # name is taken from the component before the trailing slash.
    if args.snapshot_dir[-1] != '/':
        args.snapshot_dir += '/'

    if args.database:
        database = args.database
    else:
        database = "snapshot_" + args.snapshot_dir.split('/')[-2]
    uri = "mongodb://%s:%d/%s" % (args.mongoServerHost, args.mongoServerPort, database)
    logger.info("Connecting to %s", uri)
    client = pymongo.MongoClient(uri)[database]
    logger.info("Connected to %s", uri)

    # --- tweets -----------------------------------------------------------
    # The tokenizer and the tagger do not depend on the file being read:
    # build them once instead of once per data file.
    tokenizer = None
    tagger = None
    if not args.skip_tokenization:
        tokenizer = Tokenizer(preserve_case=True)
        tagger = TreeTagger(path_to_bin=args.path_to_treetagger,
                            path_to_param=args.path_to_treetagger_param_file)

    for data_path in glob.glob(args.snapshot_dir + '*.data'):
        logger.info("reading %s", data_path)
        # ``with`` closes the file as soon as it has been read; blank lines
        # (e.g. a trailing newline) are skipped instead of being handed to
        # the JSON parser.
        with open(data_path) as data_file:
            tweets = [date_hook(ujson.loads(line)) for line in data_file if line.strip()]
        logger.info("%d tweets read from %s", len(tweets), data_path)
        if not tweets:
            # Nothing to tag or insert for this file.
            continue

        if not args.skip_tokenization:
            logger.info("Tokenizing tweets")
            tokenized_tweets = [tokenizer.tokenize(tweet['tweet']) for tweet in tweets]
            logger.info("Tagging tweets")
            tagged_tweets = tagger.tag(tokenized_tweets)
            # Indexing (rather than zip) keeps a length mismatch between
            # tweets and tagger output loud instead of silently truncating.
            for index, tweet in enumerate(tweets):
                tweet['tagged_tweet'] = tagged_tweets[index]

        logger.info("Loading tweets into database")
        client['tweets'].insert(tweets)

    # --- users ------------------------------------------------------------
    users_db_path = "%susers.db" % args.snapshot_dir
    logger.info("Loading users from %s", users_db_path)
    connection = sqlite3.connect(users_db_path)
    try:
        # sqlite3.Row allows the columns to be read by name below.
        connection.row_factory = sqlite3.Row
        cursor = connection.cursor()
        logger.info('fetching users')
        cursor.execute('SELECT id,friends FROM users where friends is not NULL')
        users = cursor.fetchall()
        logger.info('%d users fetched', len(users))

        bulk_size = 25000
        n_users_inserted = 0
        batch = []
        for user in users:
            # The friends column holds a JSON-encoded value.
            batch.append({'id': user['id'], 'friends': ujson.loads(user['friends'])})
            if len(batch) >= bulk_size:
                client['users'].insert(batch)
                n_users_inserted += len(batch)
                batch = []
                logger.info("%d users inserted", n_users_inserted)
        # Flush the last, partial batch.  pymongo rejects an empty bulk
        # insert, so skip it when the user count is zero or an exact
        # multiple of bulk_size.
        if batch:
            client['users'].insert(batch)
        logger.info("all users inserted.")
    finally:
        # Release the SQLite handle even if fetching or inserting fails.
        connection.close()

    logger.info("done.")