# Collect geo-filtered tweets from the streaming API and store each one,
# together with its author, in the Singapore MySQL database.

# Note that coordinate pairs are long/lat, not lat/long.
# Singapore coordinates
locations = ["103.61,1.22", "104.01356,1.456674"]
# Johor coordinates
# locations = ["103.55,1.45", "103.87,1.63"]

with tweetstream.FilterStream(
    SECOND_USERNAME, SECOND_PASSWORD, locations=locations
) as stream:
    for tweet in stream:
        # Streaming endpoints can interleave notices that are not tweets
        # (e.g. deletions or rate-limit messages); skip anything that lacks
        # the fields stored below instead of crashing the collector.
        if "user" not in tweet or "text" not in tweet:
            continue

        username = tweet["user"]["screen_name"]
        text = tweet["text"]
        # "place" may be null on a tweet; store NULL rather than raising
        # TypeError.  Assumes tweets.location is nullable -- TODO confirm.
        place = tweet.get("place")
        loc = place["full_name"] if place else None

        # A fresh connection is opened for every tweet, as in the original
        # script; try/finally guarantees it is closed even if a query fails.
        db = Connection(
            host=HOST_NAME,
            database=SG_MYSQL_DB_NAME,
            user=SG_MYSQL_USER_NAME,
            password=SG_MYSQL_PASSWORD,
        )
        try:
            user = db.get("SELECT id FROM users WHERE username=%s", username)
            if not user:
                # First tweet seen from this author: create the user row,
                # then read it back to obtain its id.
                db.execute("INSERT into users (username) VALUES (%s)", username)
                user = db.get("SELECT id FROM users WHERE username=%s", username)
            db.execute(
                "INSERT into tweets (user, tweet, location) VALUES (%s, %s, %s)",
                user["id"], text, loc)
        finally:
            db.close()
import cld
import re

from sgbeat.database import Connection
from details import (
    USERNAME,
    PASSWORD,
    HOST_NAME,
    MYSQL_USER_NAME,
    MYSQL_PASSWORD,
)

# Source database holding the raw tweets.
db = Connection(
    host=HOST_NAME,
    database="jb",
    user=MYSQL_USER_NAME,
    password=MYSQL_PASSWORD,
)
# Second connection to the "jb_pure" database (not used in the visible code).
db2 = Connection(
    host=HOST_NAME,
    database="jb_pure",
    user="******",
    password="******",
)

tweets = db.query("SELECT * FROM tweets")
count = 0

# Compile a regex for urls. We don't want tweets with urls.
# Compiled once here rather than on every loop iteration; the pattern is
# loop-invariant.
r = re.compile(r"(http://[^ ]+)")

for t in tweets:
    c = t["tweet"].encode('utf-8')
    # language detection
    name, code, reliable, bytes_found, details = cld.detect(c)
    urlmatch = r.search(c)
def filter_words(tweets): """Get rid of stop words """ res = [] for (words, loc) in tweets: words_filtered = [w.lower() for w in words.split()] words_filtered = [''.join(c for c in w if c not in string.punctuation) for w in words_filtered] words_filtered = filter(lambda x: x not in stopwords.words('english'), words_filtered) res.append((words_filtered, loc)) return res db = Connection(host = HOST_NAME, database = "jb_pure", user = "******", password = "******" ) sg_users = db.query("SELECT id FROM users WHERE country='SG'") jb_users = db.query("SELECT id FROM users WHERE country='JB'") sg_tweets, jb_tweets = [], [] mtn = re.compile("@\w+") hash = re.compile("#\w+") for u in sg_users: curr = db.query("SELECT tweet FROM tweets WHERE user=%s", u["id"]) curr = [hash.sub("", mtn.sub("", t['tweet'])).strip() for t in curr] # get rid of mentions [sg_tweets.append((t, "SG")) for t in curr] for u in jb_users: