def main():
    """Import a snapshot of tweets from '*.data' files into MongoDB.

    Workflow: parse CLI arguments, configure logging, connect to the target
    MongoDB database, then for each '*.data' file under ``args.data_dir``
    parse one JSON tweet per line (via ``date_hook`` to restore dates),
    optionally tokenize and POS-tag the tweet text with TreeTagger, and bulk
    insert the documents into the 'tweets' collection.
    """
    args = parseArgs()

    # Logging level and destination are both CLI-selectable.
    if args.log_level == 'debug':
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)
    if args.log_destination == 'file':
        handler = logging.FileHandler('importSnapshotToMongoDB.log')
    else:
        handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(logging.Formatter("%(asctime)s; %(levelname)s; %(message)s"))
    logger.addHandler(handler)

    # Normalize the data directory so the glob pattern below is well-formed.
    if args.data_dir[-1] != '/':
        args.data_dir += '/'

    uri = "mongodb://%s:%d/%s" % (args.mongoServerHost, args.mongoServerPort, args.database)
    logger.info("Connecting to %s" % uri)
    # NOTE: despite the name, this is a Database handle, not the MongoClient.
    client = pymongo.MongoClient(uri)[args.database]
    logger.info("Connected to %s" % uri)  # fix: original said "Connected to%s"

    files = glob.glob(args.data_dir + '*.data')
    for file in files:
        logger.info("reading %s" % file)
        # fix: use 'with' so the file handle is closed (original leaked it).
        with open(file) as data_file:
            tweets = [date_hook(ujson.loads(l)) for l in data_file]
        logger.info("%d tweets read from %s" % (len(tweets), file))
        if len(tweets) > 0:
            if not args.skip_tokenization:
                logger.info("Tokenizing tweets")
                tokenizer = Tokenizer(preserve_case=True)
                # This variant expects the raw Twitter payload under
                # tweet['twitter']['text'].
                tokenized_tweets = [tokenizer.tokenize(tweet['twitter']['text'])
                                    for tweet in tweets]
                logger.info("Tagging tweets")
                tagger = TreeTagger(path_to_bin=args.path_to_treetagger,
                                    path_to_param=args.path_to_treetagger_param_file)
                tagged_tweets = tagger.tag(tokenized_tweets)
                for i in range(len(tweets)):
                    tweets[i]['tagged_tweet'] = tagged_tweets[i]
            logger.info("Loading tweets into database")
            client['tweets'].insert(tweets)
    logger.info("done.")
"""One-off script: POS-tag tweets from snapshot data files.

For each '2014-1*.data' file in the snapshot directory, tokenize each
tweet's text, tag it with TreeTagger, and write the enriched tweets to a
sibling '<name>-tagged.data' file (one JSON document per line).
"""
import glob

import ujson

from happyfuntokenizing import Tokenizer
from TreeTaggerWrapper import TreeTagger

path_to_data = '../data/snapshots/2014-10-20/'
files = glob.glob(path_to_data + '2014-1*.data')

# Tokenizer and tagger are constructed once and reused across all files.
tokenizer = Tokenizer(preserve_case=True)
tagger = TreeTagger(path_to_bin='/Users/jmague/Documents/work/treetagger/bin/tree-tagger',
                    path_to_param='/Users/jmague/Documents/work/treetagger/lib/french-utf8.par')

for fileName in files:
    print(fileName)
    # fix: 'with' closes the input handle (original left it open).
    with open(fileName) as input_file:
        tweets = [ujson.loads(l) for l in input_file]
    tokenized_tweets = [tokenizer.tokenize(tweet['tweet']) for tweet in tweets]
    tagged_tweets = tagger.tag(tokenized_tweets)
    for i in range(len(tweets)):
        tweets[i]['tagged_tweet'] = tagged_tweets[i]
    # Strip the '.data' suffix and append '-tagged.data'.
    output_file_name = fileName[:-5] + '-tagged.data'
    # fix: 'with' guarantees the output file is flushed and closed (the
    # original never closed it, risking truncated output at interpreter exit).
    with open(output_file_name, 'w') as output_file:
        for tweet in tweets:
            output_file.write("%s\n" % ujson.dumps(tweet))
def main():
    """Import a full snapshot (tweets and users) into MongoDB.

    Workflow: parse CLI arguments, configure logging, derive the database
    name from the snapshot directory unless one is given, then
    (1) load every '*.data' tweet file (optionally tokenizing and POS-tagging
    the text) into the 'tweets' collection, and
    (2) load users with a non-NULL friends list from the snapshot's
    'users.db' SQLite file into the 'users' collection in bulks.
    """
    args = parseArgs()

    # Logging level and destination are both CLI-selectable.
    if args.log_level == 'debug':
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)
    if args.log_destination == 'file':
        handler = logging.FileHandler('importSnapshotToMongoDB.log')
    else:
        handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(logging.Formatter("%(asctime)s; %(levelname)s; %(message)s"))
    logger.addHandler(handler)

    # Normalize the snapshot directory so glob/path joins below are valid.
    if args.snapshot_dir[-1] != '/':
        args.snapshot_dir += '/'

    # Default database name: "snapshot_<last path component>".
    if args.database:
        database = args.database
    else:
        database = "snapshot_" + args.snapshot_dir.split('/')[-2]

    uri = "mongodb://%s:%d/%s" % (args.mongoServerHost, args.mongoServerPort, database)
    logger.info("Connecting to %s" % uri)
    # NOTE: despite the name, this is a Database handle, not the MongoClient.
    client = pymongo.MongoClient(uri)[database]
    logger.info("Connected to %s" % uri)  # fix: original said "Connected to%s"

    files = glob.glob(args.snapshot_dir + '*.data')
    for file in files:
        logger.info("reading %s" % file)
        # fix: use 'with' so the file handle is closed (original leaked it).
        with open(file) as data_file:
            tweets = [date_hook(ujson.loads(l)) for l in data_file]
        logger.info("%d tweets read from %s" % (len(tweets), file))
        if len(tweets) > 0:
            if not args.skip_tokenization:
                logger.info("Tokenizing tweets")
                tokenizer = Tokenizer(preserve_case=True)
                tokenized_tweets = [tokenizer.tokenize(tweet['tweet'])
                                    for tweet in tweets]
                logger.info("Tagging tweets")
                tagger = TreeTagger(path_to_bin=args.path_to_treetagger,
                                    path_to_param=args.path_to_treetagger_param_file)
                tagged_tweets = tagger.tag(tokenized_tweets)
                for i in range(len(tweets)):
                    tweets[i]['tagged_tweet'] = tagged_tweets[i]
            logger.info("Loading tweets into database")
            client['tweets'].insert(tweets)

    logger.info("Loading users from %susers.db" % args.snapshot_dir)
    connection = sqlite3.connect("%susers.db" % args.snapshot_dir)
    # Row factory gives dict-like access (user['id'], user['friends']).
    connection.row_factory = sqlite3.Row
    cursor = connection.cursor()
    logger.info('fetching users')
    cursor.execute('SELECT id,friends FROM users where friends is not NULL')
    users = cursor.fetchall()
    logger.info('%d users fetched' % len(users))
    # fix: release the SQLite handle once all rows are in memory
    # (original never closed the connection).
    connection.close()

    bulk_size = 25000
    nUsersInserted = 0
    usersToBeInserted = []
    for user in users:
        user_id = user['id']  # renamed: 'id' shadowed the builtin
        friends = ujson.loads(user['friends'])
        usersToBeInserted.append({'id': user_id, 'friends': friends})
        if len(usersToBeInserted) >= bulk_size:
            client['users'].insert(usersToBeInserted)
            usersToBeInserted = []
            nUsersInserted += bulk_size
            logger.info("%d users inserted" % nUsersInserted)  # fix: "insered" typo
    # fix: only insert the remainder when non-empty — pymongo's insert raises
    # InvalidOperation on an empty document list, which happened whenever
    # len(users) was an exact multiple of bulk_size.
    if usersToBeInserted:
        client['users'].insert(usersToBeInserted)
    logger.info("all users inserted.")  # fix: "insered" typo
    logger.info("done.")