MYSQL_USER_NAME, MYSQL_PASSWORD ) db = Connection(host = HOST_NAME, database = "jb", user = MYSQL_USER_NAME, password = MYSQL_PASSWORD ) db2 = Connection(host = HOST_NAME, database = "jb_pure", user = "******", password = "******" ) tweets = db.query("SELECT * FROM tweets") count = 0 for t in tweets: c = t["tweet"].encode('utf-8') # language detection name, code, reliable, bytes_found, details = cld.detect(c) # compile a regex for urls. We don't want tweets with urls r = re.compile(r"(http://[^ ]+)") urlmatch = r.search(c) # we use a set to save tweets, and check against that to prevent duplicates saved = set() if (code == "en" or code == "un") and not urlmatch and c not in saved: # we allow 'unknown' languages into our database, as these are mostly short singlish sentences db2.execute("INSERT INTO tweets (user, tweet, location) VALUES (%s, %s, %s)", t["user"], c, t["location"]) saved.add(c) else:
res = [] for (words, loc) in tweets: words_filtered = [w.lower() for w in words.split()] words_filtered = [''.join(c for c in w if c not in string.punctuation) for w in words_filtered] words_filtered = filter(lambda x: x not in stopwords.words('english'), words_filtered) res.append((words_filtered, loc)) return res db = Connection(host = HOST_NAME, database = "jb_pure", user = "******", password = "******" ) sg_users = db.query("SELECT id FROM users WHERE country='SG'") jb_users = db.query("SELECT id FROM users WHERE country='JB'") sg_tweets, jb_tweets = [], [] mtn = re.compile("@\w+") hash = re.compile("#\w+") for u in sg_users: curr = db.query("SELECT tweet FROM tweets WHERE user=%s", u["id"]) curr = [hash.sub("", mtn.sub("", t['tweet'])).strip() for t in curr] # get rid of mentions [sg_tweets.append((t, "SG")) for t in curr] for u in jb_users: curr = db.query("SELECT tweet FROM tweets WHERE user=%s", u["id"]) curr = [hash.sub("", mtn.sub("", t['tweet'])).strip() for t in curr] [jb_tweets.append((t, "JB")) for t in curr]