def geo_code_import(mongodb,sqldb): db = dbConnection() print mongodb,sqldb db.create_mongo_connections(mongo_options=[mongodb]) db.create_sql_connections(sql_options=[sqldb]) written_ids = open('written_ids_proposal.txt','w') #sql db query query = "select id,meta_gps_lat,meta_gps_long from tweets where meta_gps_lat != '' and meta_gps_long != ''" db.sql_connections['boston'].execute(query) for x in db.sql_connections['boston']: query = str(x[0]) value1 = str(x[1]) value2 = str(x[2]) print query,value1,value2 written_ids.write('"%s","%s","%s"\n' % (query,value1,value2)) db.m_connections['new_boston'].update({'user.id':query}, {'$set':{ 'place':{ 'coordinates':{ 'type':'Point', 'coordinates':[value2,value1] } } } })
def load_tweets_from_mongo(db_name):
    """Return a list of all tweets in db_name that contain at least one URL."""
    db = dbConnection()
    db.create_mongo_connections(mongo_options=[db_name])
    # Bug fix: m_connections is a mapping of name -> collection everywhere
    # else in this file, so it must be subscripted before calling find().
    tweets = db.m_connections[db_name].find({'counts.urls': {'$gte': 1}})
    # Materialize the cursor so callers get a reusable list.
    return list(tweets)
def create_instersection_codes(db1,db2,gnip_first=True): db = dbConnection() db.create_mongo_connections(mongo_options=[db1]) db.create_mongo_connections(mongo_options=[db2]) raw_data = db.m_connections[db1].find() if gnip_first is True: for x in raw_data: print x['id'] new_data = db.m_connections[db2].find_one({'id':str(x['id'])}) if new_data != None: db.m_connections[db1].update({'id':long(x['id'])}, {'$set': {'intersect':0} }) db.m_connections[db2].update({'id':str(x['id'])}, {'$set': {'intersect':0} }) else: new_kw = ['watertown','mit'] if (set(new_kw) & set(x['track_kw']['mentions'])) or (set(new_kw) & set(x['track_kw']['hashtags'])) or (set(new_kw) & set(x['track_kw']['text'])): db.m_connections[db1].update({'id':long(x['id'])}, {'$set': {'intersect':3} }) else: db.m_connections[db1].update({'id':long(x['id'])}, {'$set': {'intersect':2} }) else: for x in raw_data: print x['id'] new_data = db.m_connections[db2].find_one({'id':long(x['id'])}) if new_data != None: db.m_connections[db1].update({'id':str(x['id'])}, {'$set': {'intersect':0} }) db.m_connections[db2].update({'id':long(x['id'])}, {'$set': {'intersect':0} }) else: db.m_connections[db1].update({'id':str(x['id'])}, {'$set': {'intersect':1} })
def total_intersection(db1,db2): db = dbConnection() db.create_mongo_connections(mongo_options=[db1]) db.create_mongo_connections(mongo_options=[db2]) raw_data = db.m_connections[db1].find() count = 0 for x in raw_data: new_data = db.m_connections[db2].find_one({'id':str(x['id'])}) if new_data != None: count += 1 result = '%s in %s: %s' % (db1,db2,count) print result
def code_update_mongo_to_sql(mongodb,sqldb,table,rumor): db = dbConnection() print mongodb,sqldb db.create_mongo_connections(mongo_options=[mongodb]) db.create_sql_connections(sql_options=[sqldb]) written_ids = open('girl_running_update_log.txt','w') #sql db query query = "select id,code from %s" % table db.sql_connections[sqldb].execute(query) for x in db.sql_connections[sqldb].fetchall(): query = str(x[0]) value = str(x[1]) print query,value written_ids.write('"%s","%s"\n' % (query,value)) db.m_connections[mongodb].update({'user.id':query, 'codes.rumor':rumor}, {'$set':{'codes.$.code':value,}})
def author_code_import(mongodb,sqldb): db = dbConnection() db.create_mongo_connections(mongo_options=[mongodb]) db.create_sql_connections(sql_options=[sqldb]) written_ids = open('written_ids_proposal.txt','w') #sql db query query = "select id,author from tweets" db.sql_connections['boston'].execute(query) for x in db.sql_connections[sqldb]: query = str(x[0]) value1 = str(x[1]) print query,value1 written_ids.write('"%s","%s"\n' % (query,value1)) db.m_connections[mongodb].update({'user.id':query}, {'$set':{ 'user.screen_name':value1, } })
def single_update(ch, method, properties, body): db_name = [config_info.tweet_db, config_info.processing_errors_db] db = dbConnection() db.create_mongo_connections(mongo_options=db_name) try: tweet = simplejson.loads(body) f = open(config_info.tweet_log, "a") tweet["created_ts"] = to_datetime(tweet["created_at"]) tweet["user"]["created_ts"] = to_datetime(tweet["user"]["created_at"]) db.m_connections[db_name[0]].update({"id": tweet["id"]}, {"$set": {"entities.urls": tweet["entities"]["urls"]}}) f.write(body + "\n") print " [x] inserted tweet ID %s" % tweet["id"] except ValueError, e: # print "tweet not processed: %s" % (line) error = {} error["error"] = str(e) error["tweet"] = body db.m_connections[db_name[1]].insert(error) print " [x] %s : %s" % (e, body) pass
def place_code_import(mongodb,sqldb): db = dbConnection() print mongodb,sqldb db.create_mongo_connections(mongo_options=[mongodb]) db.create_sql_connections(sql_options=[sqldb]) written_ids = open('written_ids_proposal.txt','w') #sql db query query = "select id,place,place_url from tweets where place != '' limit 2" db.sql_connections['boston'].execute(query) for x in db.sql_connections['boston']: query = str(x[0]) value1 = str(x[1]) value2 = str(x[2]) print query,value1,value2 written_ids.write('"%s","%s","%s"\n' % (query,value1,value2)) db.m_connections['new_boston'].update({'user.id':query}, {'$set':{ 'place.full_name':value1, 'place.url':value2, } })
def url_by_domain_counter(db_name, rumor):
    """Write a CSV of the 100 most common URL domains for a rumor, with
    per-domain counts of misinfo-leaning vs correction-leaning codes."""
    db = dbConnection()
    db.create_mongo_connections(mongo_options=[db_name])
    title = "%s_top_domains.csv" % rumor.replace('/', '_').replace(' ', '_')
    fpath = utils.write_to_data(path=title)
    count = Counter()
    raw_data = db.m_connections[db_name].find({
        "counts.urls": {"$gt": 0},
        'codes.rumor': rumor
    })
    for data in raw_data:
        count.update(j['domain'] for j in data['entities']['urls']
                     if 'domain' in j)
    # with-block fixes the leaked output file handle in the original.
    with open(fpath, 'w') as f:
        f.write('domain,total,misinfo hits,correction hits\n')
        for domain, total in count.most_common(100):
            info_type = Counter()
            new_data = db.m_connections[db_name].find({
                'entities.urls.domain': domain,
                'codes.rumor': rumor
            })
            for y in new_data:
                # NOTE(review): only the first code entry is counted --
                # confirm documents carry a single code per rumor.
                info_type.update([y['codes'][0]['code']])
            misinfo = (info_type['misinfo'] + info_type['speculation'] +
                       info_type['hedge'])
            correction = info_type['correction'] + info_type['question']
            result = '"%s","%s","%s","%s"\n' % (domain, total,
                                                misinfo, correction)
            try:
                f.write(result)
            # Narrowed from a bare except: only encoding trouble is
            # expected here; keep the best-effort placeholder row.
            except UnicodeError:
                f.write('decode error!\n')
def __init__(self, db_name=""):
    """Bind a mongo connection for db_name and keep it as self.cache.

    db_name -- mongo connection key; defaults to the empty string
               (TODO: confirm an empty name is actually valid here --
               this is the "fix this" the original author flagged).
    """
    # Remember which database this instance is bound to.
    self.db_name = db_name
    # self.cache is a dbConnection whose mongo pool is keyed by db_name.
    self.cache = dbConnection()
    self.cache.create_mongo_connections(mongo_options=[self.db_name]) # fix this
from datetime import datetime, timedelta
import time
from email.utils import parsedate_tz
from pymongo import Connection
import simplejson
import re
import hashlib
import string
from collections import defaultdict
import MySQLdb
from connection import dbConnection
import pika

# Connect to mongo
db_names = ['gnip_boston', 'gnip_processing_errors']
db = dbConnection()
# Bug fix: the original passed undefined names (db_name, error_log),
# which raises NameError at import time; db_names -- defined just
# above -- is the list that should be handed over.
db.create_mongo_connections(mongo_options=db_names)

# List of punct to remove from string for track keyword matching
punct = re.escape('!"$%&\'()*+,-./:;<=>?@[\\]^`{|}~')

# keep track of line numbers
#line_num = 0
tweet_total = 0

# Stuff 50k tweets into this sucker
tweets_list = []

connection = pika.BlockingConnection(pika.ConnectionParameters(
    host='localhost'))
channel = connection.channel()
def _dataset_from_rumor(rumor, db_name):
    """Export every tweet matching a named rumor's keyword query to a CSV
    (tweet id, text, author, timestamp, retweet flag, original id).

    rumor   -- one of: 'girl running', 'craft/seals', 'sunil',
               'cell phone', 'proposal', 'jfk'
    db_name -- key of the mongo connection to read from

    Raises ValueError for an unknown rumor name (the original fell
    through all branches and died with a NameError on raw_data).
    """
    db = dbConnection()
    db.create_mongo_connections(mongo_options=[db_name])
    title = "%s_tweets.csv" % (rumor.replace('/', '_'))
    fpath = utils.write_to_data(path=title)

    def rex(pattern):
        # All rumor terms are case-insensitive regex substring matches.
        return re.compile(pattern, re.IGNORECASE)

    if rumor == 'girl running':
        query = {'$and': [{'text': rex('girl ')},
                          {'text': rex('running')}]}
    elif rumor == 'craft/seals':
        query = {'$or': [
            {'text': rex('blackwater')},
            {'$and': [{'text': rex('craft')}, {'text': rex('security')}]},
            {'text': rex('navy seal')},
            {'text': rex('black ops')}
        ]}
    elif rumor == 'sunil':
        query = {'$or': [{'text': rex('sunil')},
                         {'text': rex('tripathi')}]}
    elif rumor == 'cell phone':
        query = {'$and': [
            {'$or': [{'text': rex('cell ')}, {'text': rex('mobile')},
                     {'text': rex('phone')}, {'text': rex('wireless')}]},
            {'$or': [{'text': rex('service')}, {'text': rex('network')}]},
            # (sic) original pattern text preserved verbatim
            {'text': {'$not': rex('call our from this site')}}
        ]}
    elif rumor == 'proposal':
        query = {'$and': [
            {'$or': [{'text': rex('propos')}, {'text': rex('marry')}]},
            {'$or': [{'text': rex('girl')}, {'text': rex('woman')}]}
        ]}
    elif rumor == 'jfk':
        query = {'$or': [
            {'text': rex('jfk')},
            {'$and': [{'text': rex('bomb')}, {'text': rex('library')}]}
        ]}
    else:
        raise ValueError('unknown rumor: %s' % rumor)

    raw_data = db.m_connections[db_name].find(query)
    # with-block fixes the leaked output file handle in the original.
    with open(fpath, 'w') as f:
        f.write('tweet_ID,text,author,time,retweeted,original_iD\n')
        for x in raw_data:
            if 'retweeted_status' in x:
                retweeted = 1
                original = x['retweeted_status']['id']
            else:
                retweeted = 0
                original = 0
            result = '"%s","%s","%s","%s","%s","%s"\n' % (
                x['id'], x['text'], x['user']['screen_name'],
                x['created_ts'], retweeted, original)
            f.write(result.encode('utf-8'))