# Consume the location-filtered Twitter stream and store every tweet in MySQL,
# creating the author's row in `users` the first time a screen name is seen.
with tweetstream.FilterStream(SECOND_USERNAME, SECOND_PASSWORD,
                              locations=locations) as stream:
    for tweet in stream:
        # The streaming API can interleave non-tweet messages (e.g. delete or
        # limit notices) that carry no author or text; skip those instead of
        # dying on a KeyError.
        if "user" not in tweet or "text" not in tweet:
            continue

        username = tweet["user"]["screen_name"]
        text = tweet["text"]

        # `place` is null for tweets that carry no place information;
        # indexing it unconditionally raised TypeError and ended the stream.
        # NOTE(review): a missing place is stored as NULL -- confirm that
        # tweets.location is nullable.
        place = tweet.get("place")
        loc = place["full_name"] if place else None

        # A connection is opened per tweet, as before; the try/finally makes
        # sure it is released even when one of the statements below fails.
        db = Connection(host=HOST_NAME,
                        database=SG_MYSQL_DB_NAME,
                        user=SG_MYSQL_USER_NAME,
                        password=SG_MYSQL_PASSWORD)
        try:
            user = db.get("SELECT id FROM users WHERE username=%s", username)
            if not user:
                # First tweet from this author: register them, then re-read
                # the row to obtain the generated id.
                db.execute("INSERT into users (username) VALUES (%s)", username)
                user = db.get("SELECT id FROM users WHERE username=%s", username)

            db.execute(
                "INSERT into tweets (user, tweet, location) VALUES (%s, %s, %s)",
                user["id"], text, loc)
        finally:
            db.close()
# Copy tweets from the raw database (`db`, opened elsewhere) into the cleaned
# "jb_pure" database, keeping only English/unknown-language tweets that contain
# no URL and have not already been copied during this run.
db2 = Connection(host=HOST_NAME,
                 database="jb_pure",
                 user="******",
                 password="******")

tweets = db.query("SELECT * FROM tweets")
count = 0

# Compiled once, outside the loop. We don't want tweets with urls; https links
# are matched as well as plain http ones.
url_re = re.compile(r"(https?://[^ ]+)")

# Tweets already written to db2. This set must live outside the loop: when it
# was re-created on every iteration the duplicate check could never trigger.
saved = set()

for t in tweets:
    c = t["tweet"].encode('utf-8')

    # language detection
    name, code, _reliable, _bytes_found, _details = cld.detect(c)
    urlmatch = url_re.search(c)

    if (code == "en" or code == "un") and not urlmatch and c not in saved:
        # we allow 'unknown' languages into our database, as these are mostly
        # short singlish sentences
        db2.execute(
            "INSERT INTO tweets (user, tweet, location) VALUES (%s, %s, %s)",
            t["user"], c, t["location"])
        saved.add(c)
    else:
        # Everything rejected lands here (wrong language, contains a URL, or a
        # duplicate). A single parenthesised argument prints exactly like the
        # Python 2 print statement.
        print("Not English: " + c + " lang:" + name)
        count = count + 1

db.close()
db2.close()

print("Not English: ")
print(count)
database = "sgb_pure", user = "******", password = "******" ) tweets = db.query("SELECT * FROM tweets") count = 0 for t in tweets: c = t["tweet"].encode('utf-8') # language detection name, code, reliable, bytes_found, details = cld.detect(c) # compile a regex for urls. We don't want tweets with urls r = re.compile(r"(http://[^ ]+)") urlmatch = r.search(c) # we use a set to save tweets, and check against that to prevent duplicates saved = set() if (code == "en" or code == "un") and not urlmatch and c not in saved: # we allow 'unknown' languages into our database, as these are mostly short singlish sentences db2.execute("INSERT INTO tweets (user, tweet) VALUES (%s, %s)", t["user"], c) saved.add(c) else: print "Not English: " + c + " lang:" + name count = count + 1 db.close() db2.close() print "Not English: " print count