コード例 #1
0
ファイル: db_update.py プロジェクト: emCOMP/boston
def geo_code_import(mongodb,sqldb):
    """Copy GPS coordinates from the SQL ``tweets`` table into MongoDB.

    Every SQL row whose ``meta_gps_lat`` and ``meta_gps_long`` are both
    non-empty is echoed to stdout, logged to ``written_ids_proposal.txt``
    and used to ``$set`` the ``place`` sub-document of the Mongo document
    whose ``user.id`` equals the row id (as a string).

    mongodb -- name passed to ``create_mongo_connections``.
    sqldb   -- name passed to ``create_sql_connections``.

    NOTE(review): after connecting, the connections are looked up under
    the hard-coded keys 'boston' and 'new_boston' instead of ``sqldb`` /
    ``mongodb``.  Left untouched here -- confirm whether that is intended.
    """
    db = dbConnection()
    print('%s %s' % (mongodb, sqldb))
    db.create_mongo_connections(mongo_options=[mongodb])
    db.create_sql_connections(sql_options=[sqldb])

    # The log file is now closed deterministically, even if a query or
    # update raises part-way through the import.
    with open('written_ids_proposal.txt','w') as written_ids:
        #sql db query
        query = "select id,meta_gps_lat,meta_gps_long from tweets where meta_gps_lat != '' and meta_gps_long != ''"
        sql_cursor = db.sql_connections['boston']
        sql_cursor.execute(query)
        for row in sql_cursor:
            tweet_id = str(row[0])
            latitude = str(row[1])
            longitude = str(row[2])
            print('%s %s %s' % (tweet_id, latitude, longitude))
            written_ids.write('"%s","%s","%s"\n' % (tweet_id, latitude, longitude))
            # $set on 'place' replaces the whole sub-document.  The pair is
            # written longitude first, and both values are strings because
            # of the str() conversions above.
            db.m_connections['new_boston'].update(
                {'user.id':tweet_id},
                {'$set':{
                    'place':{
                        'coordinates':{
                            'type':'Point',
                            'coordinates':[longitude,latitude]
                        }
                    }
                }})
コード例 #2
0
ファイル: main.py プロジェクト: jmaddock/url_expander
def load_tweets_from_mongo(db_name):
    """Return all tweets of ``db_name`` that contain at least one URL.

    Connects to the Mongo collection registered under ``db_name`` and
    fetches every document whose ``counts.urls`` field is >= 1.

    db_name -- name passed to ``create_mongo_connections`` and used to
               look the connection up afterwards.

    Returns a list holding all matching documents (the cursor is fully
    consumed before returning).
    """
    db = dbConnection()
    db.create_mongo_connections(mongo_options=[db_name])

    # Fix: query the connection registered for db_name.  The original
    # called .find() on the m_connections container itself.
    # NOTE(review): assumes m_connections is keyed by the option name --
    # confirm against dbConnection.
    tweets = db.m_connections[db_name].find({'counts.urls':{'$gte':1}})
    return list(tweets)
コード例 #3
0
ファイル: db_update.py プロジェクト: emCOMP/boston
def create_instersection_codes(db1,db2,gnip_first=True):
    """Tag documents with an ``intersect`` code describing their overlap.

    Walks every document of ``db1`` and looks for a document with the same
    ``id`` in ``db2``.  Ids are cast before each lookup/update: when
    ``gnip_first is True`` (the default) ``db1`` is addressed with ``long``
    ids and ``db2`` with ``str`` ids, otherwise the casts are swapped
    (presumably the two collections store ids with different types).

    Codes written with ``$set``:
      0 -- id present in both collections (written to both documents)
      1 -- id only in ``db1``, ``gnip_first`` not True
      2 -- id only in ``db1``, ``gnip_first`` True, none of the tracked
           mentions/hashtags/text keywords is 'watertown' or 'mit'
      3 -- id only in ``db1``, ``gnip_first`` True, at least one of them is

    Every visited id is echoed to stdout.  Returns None.
    """
    db = dbConnection()
    for name in (db1, db2):
        db.create_mongo_connections(mongo_options=[name])

    def flag(name, doc_id, code):
        # Write the intersect code on the document(s) matching doc_id.
        db.m_connections[name].update({'id':doc_id},
                                      {'$set':
                                       {'intersect':code}
                                   })

    raw_data = db.m_connections[db1].find()

    strict_gnip = gnip_first is True
    if strict_gnip:
        own_cast, other_cast = long, str
    else:
        own_cast, other_cast = str, long

    wanted = set(['watertown','mit'])
    kw_fields = ('mentions', 'hashtags', 'text')

    for doc in raw_data:
        print(doc['id'])
        twin = db.m_connections[db2].find_one({'id':other_cast(doc['id'])})

        if twin is not None:
            # Present on both sides: mark both documents.
            flag(db1, own_cast(doc['id']), 0)
            flag(db2, other_cast(doc['id']), 0)
            continue

        if not strict_gnip:
            flag(db1, own_cast(doc['id']), 1)
            continue

        # Only in db1: distinguish by the newly tracked keywords.  The
        # fields are checked lazily, in the same order as before.
        has_new_kw = any(wanted & set(doc['track_kw'][field])
                         for field in kw_fields)
        if has_new_kw:
            flag(db1, own_cast(doc['id']), 3)
        else:
            flag(db1, own_cast(doc['id']), 2)
コード例 #4
0
ファイル: db_update.py プロジェクト: emCOMP/boston
def total_intersection(db1,db2):
    """Print how many documents of ``db1`` have a counterpart in ``db2``.

    A counterpart is a ``db2`` document whose ``id`` equals the string
    form of the ``db1`` document's ``id``.  The result is printed as
    ``'<db1> in <db2>: <count>'``; nothing is returned.
    """
    db = dbConnection()
    for name in (db1, db2):
        db.create_mongo_connections(mongo_options=[name])

    matches = 0
    for doc in db.m_connections[db1].find():
        counterpart = db.m_connections[db2].find_one({'id':str(doc['id'])})
        if counterpart is None:
            continue
        matches += 1

    print('%s in %s: %s' % (db1,db2,matches))
コード例 #5
0
ファイル: db_update.py プロジェクト: emCOMP/boston
def code_update_mongo_to_sql(mongodb,sqldb,table,rumor):
    """Copy the ``code`` column of a SQL table into Mongo rumor codes.

    Despite the function name, data flows from SQL to MongoDB: every
    ``(id, code)`` row of ``table`` is echoed to stdout, logged to
    ``girl_running_update_log.txt`` and written to the ``codes`` array
    element whose ``rumor`` equals ``rumor`` on the document whose
    ``user.id`` equals the row id.

    mongodb -- Mongo connection name (connect + lookup key).
    sqldb   -- SQL connection name (connect + lookup key).
    table   -- SQL table to read; interpolated into the query text.
    rumor   -- rumor label selecting which ``codes`` entry is updated.
    """
    db = dbConnection()
    print('%s %s' % (mongodb, sqldb))
    db.create_mongo_connections(mongo_options=[mongodb])
    db.create_sql_connections(sql_options=[sqldb])

    # The log file is now closed deterministically, even on error.
    with open('girl_running_update_log.txt','w') as written_ids:
        #sql db query
        # NOTE(review): `table` is spliced into the SQL string.  Table
        # names cannot be bound as query parameters, so only pass trusted
        # values here.
        query = "select id,code from %s" % table
        sql_cursor = db.sql_connections[sqldb]
        sql_cursor.execute(query)

        for row in sql_cursor.fetchall():
            row_id = str(row[0])
            code = str(row[1])
            print('%s %s' % (row_id, code))
            written_ids.write('"%s","%s"\n' % (row_id, code))
            # 'codes.$.code' targets the array element matched by the
            # 'codes.rumor' condition in the filter.
            db.m_connections[mongodb].update({'user.id':row_id,
                                              'codes.rumor':rumor},
                                             {'$set':{'codes.$.code':code,}})
コード例 #6
0
ファイル: db_update.py プロジェクト: emCOMP/boston
def author_code_import(mongodb,sqldb):
    """Copy tweet authors from the SQL ``tweets`` table into MongoDB.

    Every ``(id, author)`` row is echoed to stdout, logged to
    ``written_ids_proposal.txt`` and used to ``$set``
    ``user.screen_name`` on the Mongo document whose ``user.id`` equals
    the row id (as a string).

    mongodb -- Mongo connection name (connect + lookup key).
    sqldb   -- SQL connection name (connect + lookup key).

    NOTE(review): the SQL ``id`` column is matched against the Mongo
    ``user.id`` field -- confirm these really hold the same identifier.
    """
    db = dbConnection()
    db.create_mongo_connections(mongo_options=[mongodb])
    db.create_sql_connections(sql_options=[sqldb])

    # The log file is now closed deterministically, even on error.
    with open('written_ids_proposal.txt','w') as written_ids:
        #sql db query
        query = "select id,author from tweets"
        # Fix: run the query on the same connection that is iterated.  The
        # original executed on the hard-coded 'boston' connection but read
        # rows from sql_connections[sqldb], which only agreed when
        # sqldb == 'boston'.
        sql_cursor = db.sql_connections[sqldb]
        sql_cursor.execute(query)
        for row in sql_cursor:
            row_id = str(row[0])
            author = str(row[1])
            print('%s %s' % (row_id, author))
            written_ids.write('"%s","%s"\n' % (row_id, author))
            db.m_connections[mongodb].update({'user.id':row_id},
                          {'$set':{
                              'user.screen_name':author,
                              }
                          })
コード例 #7
0
ファイル: step_3.py プロジェクト: jmaddock/url_expander
def single_update(ch, method, properties, body):
    """Store the URL entities of one JSON-encoded tweet in MongoDB.

    ``body`` is parsed as JSON; the ``entities.urls`` value of the parsed
    tweet is ``$set`` on the document with the same ``id`` in the tweet
    collection and the raw body is appended to the tweet log file.  If a
    ``ValueError`` is raised along the way, the error text and raw body
    are inserted into the processing-errors collection instead.

    ch, method, properties -- unused here; the four-argument signature
        presumably matches a message-queue consumer callback (confirm
        against the registration site).
    body -- raw message payload, expected to be a JSON tweet.
    """
    db_name = [config_info.tweet_db, config_info.processing_errors_db]
    db = dbConnection()
    db.create_mongo_connections(mongo_options=db_name)
    try:
        tweet = simplejson.loads(body)
        # Fix: the log handle used to be leaked on every call; the with
        # block closes (and flushes) it as soon as the tweet is handled.
        with open(config_info.tweet_log, "a") as f:
            # The converted timestamps are not persisted (only
            # entities.urls is written below), but a conversion that
            # raises ValueError still sends the message to the error
            # collection.
            tweet["created_ts"] = to_datetime(tweet["created_at"])
            tweet["user"]["created_ts"] = to_datetime(tweet["user"]["created_at"])
            db.m_connections[db_name[0]].update({"id": tweet["id"]}, {"$set": {"entities.urls": tweet["entities"]["urls"]}})
            f.write(body + "\n")
        print(" [x] inserted tweet ID %s" % tweet["id"])

    except ValueError as e:
        # Keep the failing payload together with the reason it failed.
        error = {}
        error["error"] = str(e)
        error["tweet"] = body
        db.m_connections[db_name[1]].insert(error)
        print(" [x] %s : %s" % (e, body))
コード例 #8
0
ファイル: db_update.py プロジェクト: emCOMP/boston
def place_code_import(mongodb,sqldb):
    """Copy place names and URLs from the SQL ``tweets`` table to MongoDB.

    Every selected ``(id, place, place_url)`` row is echoed to stdout,
    logged to ``written_ids_proposal.txt`` and used to ``$set``
    ``place.full_name`` and ``place.url`` on the Mongo document whose
    ``user.id`` equals the row id (as a string).

    mongodb -- name passed to ``create_mongo_connections``.
    sqldb   -- name passed to ``create_sql_connections``.

    NOTE(review): the connections are looked up under the hard-coded keys
    'boston' and 'new_boston', and the query carries ``limit 2`` so at
    most two rows are processed.  Both look like leftovers from a trial
    run and are left untouched -- confirm before relying on this import.
    """
    db = dbConnection()
    print('%s %s' % (mongodb, sqldb))
    db.create_mongo_connections(mongo_options=[mongodb])
    db.create_sql_connections(sql_options=[sqldb])

    # The log file is now closed deterministically, even on error.
    with open('written_ids_proposal.txt','w') as written_ids:
        #sql db query
        query = "select id,place,place_url from tweets where place != '' limit 2"
        sql_cursor = db.sql_connections['boston']
        sql_cursor.execute(query)
        for row in sql_cursor:
            row_id = str(row[0])
            place_name = str(row[1])
            place_url = str(row[2])
            print('%s %s %s' % (row_id, place_name, place_url))
            written_ids.write('"%s","%s","%s"\n' % (row_id, place_name, place_url))
            db.m_connections['new_boston'].update({'user.id':row_id},
                          {'$set':{
                              'place.full_name':place_name,
                              'place.url':place_url,
                              }
                          })
コード例 #9
0
ファイル: url_graphs.py プロジェクト: emCOMP/boston
def url_by_domain_counter(db_name,rumor):
    """Write a CSV of the 100 most linked domains for ``rumor``.

    First counts the ``domain`` of every URL entity in tweets coded with
    ``rumor`` (and ``counts.urls`` > 0).  Then, for each of the 100 most
    common domains, re-queries the tweets linking to it and tallies the
    ``code`` of each tweet's first ``codes`` entry:

      misinfo hits    = misinfo + speculation + hedge
      correction hits = correction + question

    The file name is ``<rumor>_top_domains.csv`` with '/' and ' ' replaced
    by '_'; its location comes from ``utils.write_to_data``.

    db_name -- Mongo connection name (connect + lookup key).
    rumor   -- rumor label matched against ``codes.rumor``.

    NOTE(review): the tally reads ``codes[0]`` rather than the entry that
    matched ``rumor`` -- confirm tweets never carry more than one code.
    """
    db = dbConnection()
    db.create_mongo_connections(mongo_options=[db_name])

    title = "%s_top_domains.csv" % rumor.replace('/','_').replace(' ','_')
    fpath = utils.write_to_data(path=title)

    # The output file is now closed deterministically, even on error.
    with open(fpath, 'w') as f:
        f.write('domain,total,misinfo hits,correction hits\n')

        count = Counter()
        raw_data = db.m_connections[db_name].find({
            "counts.urls":{
                "$gt":0
            },
            'codes.rumor':rumor
        })

        for data in raw_data:
            # URL entities without a 'domain' key are skipped.
            count.update(j['domain'] for j in data['entities']['urls']
                         if 'domain' in j)

        for domain, total in count.most_common(100):
            new_data = db.m_connections[db_name].find({
                'entities.urls.domain':domain,
                'codes.rumor':rumor
            })
            info_type = Counter(y['codes'][0]['code'] for y in new_data)
            misinfo = info_type['misinfo'] + info_type['speculation'] + info_type['hedge']
            correction = info_type['correction'] + info_type['question']
            result = '"%s","%s","%s","%s"\n' % (domain,total,misinfo,correction)
            try:
                f.write(result)
            except UnicodeError:
                # Narrowed from a bare except: a row that cannot be
                # encoded is replaced by a marker line, while unrelated
                # failures now propagate.
                f.write('decode error!\n')
コード例 #10
0
ファイル: expand_url.py プロジェクト: jmaddock/url_expander
 def __init__(self, db_name=""):
     """Remember ``db_name`` and open a Mongo connection for it.

     The connection object is stored on ``self.cache``.
     """
     self.db_name = db_name
     connection = dbConnection()
     self.cache = connection
     # Original author's marker on this call: "fix this".
     connection.create_mongo_connections(mongo_options=[self.db_name])
コード例 #11
0
ファイル: mongo_insert.py プロジェクト: jmaddock/url_expander
from datetime import datetime, timedelta
import time
from email.utils import parsedate_tz
from pymongo import Connection
import simplejson
import re
import hashlib
import string
from collections import defaultdict
import MySQLdb
from connection import dbConnection
import pika

# Connect to mongo: one connection per name listed in db_names.
db_names = ['gnip_boston','gnip_processing_errors']
db = dbConnection()
# Fix: pass the list defined above.  The original referenced `db_name` and
# `error_log`, neither of which is defined or imported in this module, so
# importing it raised NameError.
db.create_mongo_connections(mongo_options=db_names)

# Regex-escaped punctuation to remove from strings for track keyword
# matching.  '#' and '_' are not in the list.
punct = re.escape('!"$%&\'()*+,-./:;<=>?@[\\]^`{|}~')

# Running total of tweets (the per-line counter below is disabled).
#line_num = 0
tweet_total = 0

# Buffer of tweets; per the original note it is filled with 50k tweets at
# a time.
tweets_list = []

# Blocking connection and channel to the message broker on localhost.
connection = pika.BlockingConnection(pika.ConnectionParameters(
        host='localhost'))
channel = connection.channel()
コード例 #12
0
ファイル: create_dataset.py プロジェクト: emCOMP/boston
def _rumor_query(rumor):
    """Return the Mongo filter that selects the tweets of ``rumor``.

    All terms are case-insensitive regex matches on the ``text`` field.
    Raises ValueError when ``rumor`` has no keyword definition.
    """
    def text(pattern):
        # One case-insensitive regex condition on the tweet text.
        return {'text': re.compile(pattern, re.IGNORECASE)}

    if rumor == 'girl running':
        return {'$and': [text('girl '), text('running')]}
    if rumor == 'craft/seals':
        return {'$or': [
            text('blackwater'),
            {'$and': [text('craft'), text('security')]},
            text('navy seal'),
            text('black ops'),
        ]}
    if rumor == 'sunil':
        return {'$or': [text('sunil'), text('tripathi')]}
    if rumor == 'cell phone':
        return {'$and': [
            {'$or': [text('cell '), text('mobile'),
                     text('phone'), text('wireless')]},
            {'$or': [text('service'), text('network')]},
            # Exclusion term: tweets matching this phrase are dropped.
            {'text': {'$not': re.compile('call our from this site',
                                         re.IGNORECASE)}},
        ]}
    if rumor == 'proposal':
        return {'$and': [
            {'$or': [text('propos'), text('marry')]},
            {'$or': [text('girl'), text('woman')]},
        ]}
    if rumor == 'jfk':
        return {'$or': [
            text('jfk'),
            {'$and': [text('bomb'), text('library')]},
        ]}
    raise ValueError('unknown rumor: %r' % (rumor,))


def _dataset_from_rumor(rumor,db_name):
    """Export the tweets matching ``rumor`` to ``<rumor>_tweets.csv``.

    The tweets are selected with the keyword filter defined for ``rumor``
    and written one per line as
    ``tweet_ID,text,author,time,retweeted,original_iD``.  ``retweeted`` is
    1 and ``original_iD`` the retweeted status id when the tweet has a
    ``retweeted_status``; both are 0 otherwise.  '/' in the rumor name is
    replaced by '_' in the file name; the location comes from
    ``utils.write_to_data``.

    rumor   -- one of 'girl running', 'craft/seals', 'sunil',
               'cell phone', 'proposal', 'jfk'.
    db_name -- Mongo connection name (connect + lookup key).

    Raises ValueError for any other rumor.  The original failed on an
    unbound ``raw_data`` only after the CSV file had been created.

    NOTE(review): fields are wrapped in double quotes without escaping,
    so a quote or newline inside the tweet text breaks the CSV row.
    """
    # Resolve the filter first so that an unknown rumor fails before any
    # connection is opened or file is created.
    query = _rumor_query(rumor)

    db = dbConnection()
    db.create_mongo_connections(mongo_options=[db_name])

    title = "%s_tweets.csv" % (rumor.replace('/','_'))
    fpath = utils.write_to_data(path=title)

    # The output file is now closed deterministically, even on error.
    with open(fpath, 'w') as f:
        f.write('tweet_ID,text,author,time,retweeted,original_iD\n')

        for x in db.m_connections[db_name].find(query):
            if 'retweeted_status' in x:
                retweeted = 1
                original = x['retweeted_status']['id']
            else:
                retweeted = 0
                original = 0

            result = '"%s","%s","%s","%s","%s","%s"\n' % (x['id'],
                                                          x['text'],
                                                          x['user']['screen_name'],
                                                          x['created_ts'],
                                                          retweeted,
                                                          original)
            f.write(result.encode('utf-8'))