def im_webpage(srcs): """ Import web pages from file to database. """ # Connect to MySQL database warnings.simplefilter("error", MySQLdb.Warning) cur = CONN_POOL.get_cur(GEOTWEET) i, k = 0, 0 for line in fileinput.input(srcs, openhook=fileinput.hook_compressed): try: k += 1 tjson = json.loads(line) text = tjson['web'] if not isreadable(text): print text[:80].encode('cp1252', errors='ignore') continue item = (tjson['place_id'], \ html_filter(text).encode('utf-8', errors='ignore')) cur.execute( 'INSERT INTO web ( \ place_id, \ web) \ VALUES(%s,%s)', item) i += 1 except StandardError: print 'Fail at line {0}'.format(k) print traceback.print_exc(file=sys.stdout) logging.info('Import web pages::{0} out of {1} imported.'.format(i, k)) logging.info('------------------------------------------')
def filter_tweet():
    """Copy tweets with usable text from `tweet` into `sample`.

    Tweets whose text yields no tokens (e.g. square-game noise, per the
    original comment) are dropped; the transfer ratio is logged.
    """
    scur = CONN_POOL.get_cur(GEOTWEET)
    dcur = CONN_POOL.get_cur(GEOTWEET)
    scur.execute('select id, text from tweet')
    total, kept = 0, 0
    for tweet in scur:
        total += 1
        if len(get_tokens(tweet['text'])) > 0:
            # FIX: DB-API execute() expects a parameter *sequence*; the
            # original passed a bare scalar, which only works by accident.
            dcur.execute('insert into `sample` \
                select * from `tweet`\
                where `tweet`.`id` = %s', (tweet['id'],))
            kept += 1
    logging.info('{0} out of {1} tweets are transferred'.format(kept, total))
def place_name(pid, dbconf=GEOTWEET):
    """Return place name given a pid"""
    # Strings that do not look like a place id are returned unchanged.
    if IDPTN.match(pid) is None:
        return pid
    cursor = CONN_POOL.get_cur(dbconf)
    cursor.execute("select name from place where id=%s", (pid,))
    row = cursor.fetchone()
    return row['name']
def im_webpage(srcs): """ Import web pages from file to database. """ # Connect to MySQL database warnings.simplefilter("error", MySQLdb.Warning) cur = CONN_POOL.get_cur(GEOTWEET) i, k = 0, 0 for line in fileinput.input(srcs, openhook=fileinput.hook_compressed): try: k += 1 tjson = json.loads(line) text = tjson["web"] if not isreadable(text): print text[:80].encode("cp1252", errors="ignore") continue item = (tjson["place_id"], html_filter(text).encode("utf-8", errors="ignore")) cur.execute( "INSERT INTO web ( \ place_id, \ web) \ VALUES(%s,%s)", item, ) i += 1 except StandardError: print "Fail at line {0}".format(k) print traceback.print_exc(file=sys.stdout) logging.info("Import web pages::{0} out of {1} imported.".format(i, k)) logging.info("------------------------------------------")
def sparsitysetup(nums): """ This setup considers the tweets from the places in the list and select some number of tweets from those places as testing tweets, the query is a block of tweets @arg city the place_id of the city @arg num the number of tweets generated @return a list() of tuple (text, cadidates) """ lsts = linestyles() for num in nums: with open('chicago10.lst') as fin: twt = Dataset() places = [p.strip() for p in fin] lmplc = dict() lmtwt = Dataset() for pid in places: cur = CONN_POOL.get_cur(GEOTWEET) cur.execute('select text from sample' \ ' where place_id = \'{0}\' order by rand() limit {1}'.format(pid, 160)) text = [row['text'] for row in cur] lmplc[pid] = lmfromtext(text[:num]) for txt in text[150:160]: lmtwt.append({'pid': pid, 'lm': lmfromtext([txt,])}) ranks = list() for item in lmtwt: ranks.append(ranke(lmplc, item['lm'])) gch = batcheval(lmtwt['pid'], len(places), ranks) plt.plot(gch['pos'], gch['rate'], lsts.next(), label='t={0}'.format(num)) plt.xlabel('First $n$ places') plt.ylabel('Probability') plt.legend(loc='lower right') plt.show()
def confusionmatrix(places): """ Show the matrix of confusion between LMs by KL-divergence """ lmtwt1 = dict() lmtwt2 = dict() for pid in places: cur = CONN_POOL.get_cur(GEOTWEET) cur.execute('select text from sample' \ ' where place_id = \'{0}\' order by rand() limit {1}'.format(pid, 200)) text = [row['text'] for row in cur] lmtwt1[pid] = lmfromtext(text[:80]) lmtwt2[pid] = lmfromtext(text[81:160]) confmat = list() for lm_i in places: confmat.append( [kl_divergence(lmtwt1[lm_i], lmtwt2[lm_j]) for lm_j in places]) selfavg = sum([confmat[i][i] for i in range(len(places))]) mutavg = sum([sum(confmat[i]) for i in range(len(places))]) - selfavg selfavg /= float(len(places)) mutavg /= float(len(places) * len(places) - len(places)) print selfavg, mutavg plt.imshow(np.array(confmat), cmap=cm.gray, interpolation='nearest') plt.yticks(range(len(places)), \ ['{0}: {1}'.format(place_name(places[i]), i) for i in range(len(places))]) plt.xticks(range(len(places))) plt.subplots_adjust(left=0.4) plt.colorbar(shrink=0.66) plt.savefig('sf_confm.eps') plt.show()
def confusionmatrix(places):
    """Show the matrix of confusion between LMs by KL-divergence.

    Builds two language models per place from halves of up to 200 sampled
    tweets, computes the pairwise KL-divergence matrix, prints the average
    self- and mutual divergence, and saves the heatmap to 'sf_confm.eps'.
    """
    lmtwt1 = dict()
    lmtwt2 = dict()
    for pid in places:
        cur = CONN_POOL.get_cur(GEOTWEET)
        cur.execute('select text from sample' \
            ' where place_id = \'{0}\' order by rand() limit {1}'.format(pid, 200))
        text = [row['text'] for row in cur]
        lmtwt1[pid] = lmfromtext(text[:80])
        # NOTE(review): text[81:160] yields 79 tweets and skips index 80,
        # whereas the first half has 80 — looks like an off-by-one; confirm
        # whether text[80:160] was intended.
        lmtwt2[pid] = lmfromtext(text[81:160])
    confmat = list()
    for lm_i in places:
        confmat.append([kl_divergence(lmtwt1[lm_i], lmtwt2[lm_j]) for lm_j in places])
    # Diagonal holds self-divergence; everything else is mutual divergence.
    selfavg = sum([confmat[i][i] for i in range(len(places))])
    mutavg = sum([sum(confmat[i]) for i in range(len(places))]) - selfavg
    selfavg /= float(len(places))
    mutavg /= float(len(places)*len(places) - len(places))
    print selfavg, mutavg
    plt.imshow(np.array(confmat), cmap = cm.gray, interpolation='nearest')
    plt.yticks(range(len(places)), \
        ['{0}: {1}'.format(place_name(places[i]), i) for i in range(len(places))])
    plt.xticks(range(len(places)))
    plt.subplots_adjust(left=0.4)
    plt.colorbar(shrink=0.66)
    plt.savefig('sf_confm.eps')
    plt.show()
def tweet_count(dbconf, table): """get count of tweets in database""" cur = CONN_POOL.get_cur(dbconf) cur.execute('select count(*) as cnt from {0}'.format(table)) row = cur.fetchone() print 'Count:{0}'.format(row['cnt']) return row['cnt']
def qloadrows(config, query): """Load tweets to list on conditions""" cur = CONN_POOL.get_cur(config) print query cur.execute(query) print 'Count: {0}'.format(cur.rowcount) return Dataset().extend([row for row in cur])
def place_name(pid, dbconf=GEOTWEET):
    """Return place name given a pid.

    Strings that do not match IDPTN are assumed to already be plain names
    and are returned unchanged.
    """
    if IDPTN.match(pid) is None:
        return pid
    cur = CONN_POOL.get_cur(dbconf)
    cur.execute("select name from place where id=%s", (pid, ))
    # NOTE(review): fetchone() yields None for an unknown id, which would
    # raise TypeError here — presumably ids are always valid; confirm.
    return cur.fetchone()['name']
def qloadrows(config, query): """Load tweets to list on conditions""" cur = CONN_POOL.get_cur(config) print query cur.execute(query) print 'Count: {0}'.format(cur.rowcount) return Dataset().extend([row for row in cur])
def admin_dist(dbconf, pid):
    """Get the distribution of tweets per admin-level place.

    For each direct child of `pid`, collects its POIs (all transitively
    contained places via sub_place()) and counts the tweets in `sample`
    posted at those POIs.

    @return dict mapping child place id -> {'poi': <#sub-places>,
                                            'cnt': <#tweets>}
    """
    cur = CONN_POOL.get_cur(dbconf)
    # FIX: parameters are passed as a tuple as DB-API cursors expect.
    cur.execute("select id from place where superior_id=%s", (pid,))
    dist = dict()
    curx = CONN_POOL.get_cur(dbconf)
    for row in cur:
        plcs = sub_place(row['id'], dbconf)
        # FIX: 'cnt' starts at 0 so children without sub-places still get
        # a count (the original left the key missing in that case).
        dist[row['id']] = {'poi': len(plcs), 'cnt': 0}
        for plc in plcs:
            curx.execute("select count(id) as cnt from sample where place_id=%s",
                         (plc,))
            dist[row['id']]['cnt'] += curx.fetchone()['cnt']
    return dist
def im_place(srcs): """ Import places from file to database. """ # Connect to MySQL database cur = CONN_POOL.get_cur(GEOTWEET) k, i = 0, 0 fin = fileinput.FileInput(openhook=fileinput.hook_compressed) for line in fin.input(srcs): try: tjson = json.loads(line) k += 1 lat = 0 lng = 0 if tjson["place_type"] != "country": lat = tjson["bounding_box"]["coordinates"][0][0][1] lng = tjson["bounding_box"]["coordinates"][0][0][0] item = ( tjson["id"], tjson["name"], tjson["place_type"], tjson["contained_within"][0]["id"], tjson["contained_within"][0]["name"], tjson["contained_within"][0]["place_type"], lat, lng, tjson["country_code"], ) else: item = (tjson["id"], tjson["name"], None, None, None, None, None, tjson["country_code"]) cur.execute( "INSERT INTO place (" "`id`, " "`name`, " "`type`, " "`superior_id`, " "`superior_name`, " "`superior_type`, " "`lat`, " "`lng`, " "`country`, " "`geo`)" "VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s," "GeomFromText('Point({0} {1})'))".format(lat, lng), item, ) cur.execute("INSERT INTO place_json (id, json) VALUES(%s,%s)", (tjson["id"], line)) i += 1 except _mysql_exceptions.IntegrityError: print "Import Places::Place ID {0} ignored for duplication.".format(tjson["id"]) except StandardError: logging.error("Fail at line {0}".format(k)) logging.info("Import Places::{0} out of {1} imported.".format(i, k)) logging.info("------------------------------------------")
def filter_tweet():
    """Get rid of square game text.

    Scans every row of `tweet`; rows whose text yields at least one token
    are copied into `sample` via INSERT..SELECT.  Logs the transfer ratio.
    """
    scur = CONN_POOL.get_cur(GEOTWEET)
    dcur = CONN_POOL.get_cur(GEOTWEET)
    scur.execute("select id, text from tweet")
    # i counts all tweets scanned, k counts tweets transferred.
    i, k = 0, 0
    for tweet in scur:
        i += 1
        if len(get_tokens(tweet["text"])) > 0:
            dcur.execute(
                "insert into `sample` \
                select * from `tweet`\
                where `tweet`.`id` = %s",
                tweet["id"],
            )
            k += 1
    logging.info("{0} out of {1} tweets are transferred".format(k, i))
def sub_place(pid, dbconf):
    """Return a set of ids of all places transitively contained in `pid`.

    Recurses down the `superior_id` hierarchy; `pid` itself is excluded.
    """
    cur = CONN_POOL.get_cur(dbconf)
    # FIX: parameters are passed as a tuple as DB-API cursors expect.
    cur.execute("select id from place where superior_id=%s", (pid,))
    # NOTE(review): the recursive call fetches another cursor from the
    # pool while this one is still being iterated — assumes the pool
    # hands out distinct cursors; confirm.
    plc = set()
    for sub_pid in cur:
        plc.add(sub_pid['id'])
        plc.update(sub_place(sub_pid['id'], dbconf))
    return plc
def im_place_genre(srcs): """Update the genre of a place""" cur = CONN_POOL.get_cur(GEOTWEET) k = 0 fi = fileinput.FileInput(openhook=fileinput.hook_compressed) for line in fi.input(srcs): k += 1 try: rst = json.loads(line) pid = rst["t_place_id"] cur.execute( r"insert into foursquare_json(id, json) " "values(%s,%s)", (str(pid), json.dumps(rst["response"]["groups"])), ) except _mysql_exceptions.IntegrityError: print "Import place genre::Place {0} ignored for duplication.".format(rst["t_place_id"]) except StandardError as err: logging.error("Fail at line {0}, {1}".format(k, err.message)) CONN_POOL.get_conn(GEOTWEET).commit()
def foursq_count(dbconf, table): """get count that contains http://4sq.com""" foursq_pattern = re.compile(r'http://4sq.com') cur = CONN_POOL.get_cur(dbconf) cur.execute('select text from {0}'.format(table)) k = 0 for row in cur: if foursq_pattern.search(row['text']) != None: k += 1 print 'Foursqure.com tweet count:{0}'.format(k) return k
def im_place_genre(srcs):
    """Update the genre of a place.

    Reads JSON lines (optionally compressed) of Foursquare responses keyed
    by 't_place_id' and stores the serialized 'groups' payload into
    `foursquare_json`.  Duplicates are reported and skipped; everything is
    committed once at the end.
    """
    cur = CONN_POOL.get_cur(GEOTWEET)
    k = 0
    fi = fileinput.FileInput(openhook=fileinput.hook_compressed)
    for line in fi.input(srcs):
        k += 1
        try:
            rst = json.loads(line)
            pid = rst['t_place_id']
            cur.execute(r'insert into foursquare_json(id, json) '
                        'values(%s,%s)', \
                        (str(pid), json.dumps(rst['response']['groups'])))
        except _mysql_exceptions.IntegrityError:
            # Duplicate ids are expected on re-runs; report and continue.
            print 'Import place genre::Place {0} ignored for duplication.'\
                .format(rst['t_place_id'])
        except StandardError as err:
            logging.error('Fail at line {0}, {1}'.format(k, err.message))
    # Single commit after the whole input has been processed.
    CONN_POOL.get_conn(GEOTWEET).commit()
def im_tweet(srcs): """ Import tweet from file to database. """ # Connect to MySQL database cur = CONN_POOL.get_cur(GEOTWEET) i = 0 k = 0 for line in fileinput.input(srcs, openhook=fileinput.hook_compressed): try: tjson = json.loads(line) lat = tjson['place']['bounding_box'] \ ['coordinates'][0][0][1] lng = tjson['place']['bounding_box'] \ ['coordinates'][0][0][0] timestr = tjson['created_at'] timestru = time.strptime(timestr, '%a %b %d %H:%M:%S +0000 %Y') #Wed Apr 14 18:51:32 +0000 2010 timex = time.strftime('%Y-%m-%d %H:%M:%S', timestru) item = (tjson['id'], \ tjson['place']['id'], \ tjson['user']['id'], \ tjson['text'], \ lat, \ lng, \ timex) k += 1 if len(get_tokens(tjson['text'])) > 0: cur.execute('INSERT INTO sample (' 'id, ' 'place_id, ' 'user_id, ' 'text, ' 'lat, ' 'lng, ' 'geo, ' 'created_at) ' 'VALUES(%s,%s,%s,%s,%s,%s,' 'GeomFromText(\'POINT({0} {1})\'),%s)'. \ format(lat, lng), item) #cur.execute('INSERT INTO tweet_json(id, json) VALUES(%s,%s)', #(tjson['id'], line)) i += 1 except _mysql_exceptions.IntegrityError: print 'Import Tweets::Tweet ID {0} ignored for duplication.'\ .format(tjson['id']) except StandardError: print 'Fail at line {0}'.format(k) logging.info('Import Tweet::{0} out of {1} imported.'.format(i, k)) logging.info('------------------------------------------')
def loadrows(config, cols, wheres=None, table='sample', other=''): """Load tweets to list on conditions""" query = 'SELECT ' + \ ((', '.join(cols)) if cols!='*' else '*') \ + ' FROM ' + table + \ ((' WHERE ' + ' AND '.join(wheres)) if wheres else '') \ + ' ' + other cur = CONN_POOL.get_cur(config) print query cur.execute(query) res = Dataset() for row in cur: twt = DataItem() for key in cols: twt[key] = row[key] res.append(twt) print 'Count: {0}'.format(cur.rowcount) return res
def loadrows(config, cols, wheres=None, table='sample', other=''): """Load tweets to list on conditions""" query = 'SELECT ' + \ ((', '.join(cols)) if cols!='*' else '*') \ + ' FROM ' + table + \ ((' WHERE ' + ' AND '.join(wheres)) if wheres else '') \ + ' ' + other cur = CONN_POOL.get_cur(config) print query cur.execute(query) res = Dataset() for row in cur: twt = DataItem() for key in cols: twt[key] = row[key] res.append(twt) print 'Count: {0}'.format(cur.rowcount) return res
def im_tweet(srcs): """ Import tweet from file to database. """ # Connect to MySQL database cur = CONN_POOL.get_cur(GEOTWEET) i = 0 k = 0 for line in fileinput.input(srcs, openhook=fileinput.hook_compressed): try: tjson = json.loads(line) lat = tjson["place"]["bounding_box"]["coordinates"][0][0][1] lng = tjson["place"]["bounding_box"]["coordinates"][0][0][0] timestr = tjson["created_at"] timestru = time.strptime(timestr, "%a %b %d %H:%M:%S +0000 %Y") # Wed Apr 14 18:51:32 +0000 2010 timex = time.strftime("%Y-%m-%d %H:%M:%S", timestru) item = (tjson["id"], tjson["place"]["id"], tjson["user"]["id"], tjson["text"], lat, lng, timex) k += 1 if len(get_tokens(tjson["text"])) > 0: cur.execute( "INSERT INTO sample (" "id, " "place_id, " "user_id, " "text, " "lat, " "lng, " "geo, " "created_at) " "VALUES(%s,%s,%s,%s,%s,%s," "GeomFromText('POINT({0} {1})'),%s)".format(lat, lng), item, ) # cur.execute('INSERT INTO tweet_json(id, json) VALUES(%s,%s)', # (tjson['id'], line)) i += 1 except _mysql_exceptions.IntegrityError: print "Import Tweets::Tweet ID {0} ignored for duplication.".format(tjson["id"]) except StandardError: print "Fail at line {0}".format(k) logging.info("Import Tweet::{0} out of {1} imported.".format(i, k)) logging.info("------------------------------------------")
def sparsitysetup(nums):
    """ This setup considers the tweets from the places in the list and
    select some number of tweets from those places as testing tweets,
    the query is a block of tweets
    @arg city the place_id of the city
    @arg num the number of tweets generated
    @return a list() of tuple (text, cadidates)
    """
    lsts = linestyles()
    for num in nums:
        with open('chicago10.lst') as fin:
            # NOTE(review): `twt` is never used afterwards — dead local?
            twt = Dataset()
            places = [p.strip() for p in fin]
            lmplc = dict()
            lmtwt = Dataset()
            for pid in places:
                cur = CONN_POOL.get_cur(GEOTWEET)
                cur.execute('select text from sample' \
                    ' where place_id = \'{0}\' order by rand() limit {1}'.format(pid, 160))
                text = [row['text'] for row in cur]
                # Place model trained on the first `num` sampled tweets.
                lmplc[pid] = lmfromtext(text[:num])
                # Tweets 150..159 are held out as single-tweet queries.
                for txt in text[150:160]:
                    lmtwt.append({
                        'pid': pid,
                        'lm': lmfromtext([
                            txt,
                        ])
                    })
            ranks = list()
            for item in lmtwt:
                ranks.append(ranke(lmplc, item['lm']))
            gch = batcheval(lmtwt['pid'], len(places), ranks)
            plt.plot(gch['pos'], gch['rate'], lsts.next(), label='t={0}'.format(num))
    plt.xlabel('First $n$ places')
    plt.ylabel('Probability')
    plt.legend(loc='lower right')
    plt.show()
def im_place(srcs): """ Import places from file to database. """ # Connect to MySQL database cur = CONN_POOL.get_cur(GEOTWEET) k, i = 0, 0 fin = fileinput.FileInput(openhook=fileinput.hook_compressed) for line in fin.input(srcs): try: tjson = json.loads(line) k += 1 lat = 0 lng = 0 if tjson['place_type'] != 'country': lat = tjson['bounding_box'] \ ['coordinates'][0][0][1] lng = tjson['bounding_box'] \ ['coordinates'][0][0][0] item = (tjson['id'], \ tjson['name'], \ tjson['place_type'], \ tjson['contained_within'][0]['id'], \ tjson['contained_within'][0]['name'], \ tjson['contained_within'][0]['place_type'], \ lat, \ lng, \ tjson['country_code']) else: item = (tjson['id'], \ tjson['name'], \ None, None, None, None, None, tjson['country_code']) cur.execute('INSERT INTO place (' '`id`, ' '`name`, ' '`type`, ' '`superior_id`, ' '`superior_name`, ' '`superior_type`, ' '`lat`, ' '`lng`, ' '`country`, ' '`geo`)' 'VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,' 'GeomFromText(\'Point({0} {1})\'))'.\ format(lat, lng), item) cur.execute('INSERT INTO place_json (id, json) VALUES(%s,%s)', \ (tjson['id'], line)) i += 1 except _mysql_exceptions.IntegrityError: print 'Import Places::Place ID {0} ignored for duplication.'.format( tjson['id']) except StandardError: logging.error('Fail at line {0}'.format(k)) logging.info('Import Places::{0} out of {1} imported.'.format(i, k)) logging.info('------------------------------------------')