def confusionmatrix(places):
    """ Show the matrix of confusion between LMs by KL-divergence

    For each place id in `places`, tweets are read from the `sample` table
    and two language models are built from disjoint slices of that sample
    (the first 80 tweets and the following 80).  Entry [i][j] of the matrix
    is kl_divergence(first-slice LM of place i, second-slice LM of place j).

    The function prints the average of the diagonal entries followed by the
    average of the off-diagonal entries, draws the matrix as a gray-scale
    image, saves it to 'sf_confm.eps' and shows it.

    places -- sequence of place ids; it is iterated, indexed by position and
              passed to len().  An empty sequence makes the function return
              without querying or plotting anything.
    """
    count = len(places)
    if count == 0:
        # Nothing to compare; the averages below would divide by zero.
        return

    lmtwt1 = dict()
    lmtwt2 = dict()
    for pid in places:
        cur = CONN_POOL.get_cur(GEOTWEET)
        # NOTE(review): the query is assembled with str.format; if place ids
        # can come from untrusted input, switch to the driver's parameter
        # binding -- confirm the paramstyle of CONN_POOL cursors first.
        cur.execute(
            'select text from sample'
            ' where place_id = \'{0}\' order by rand() limit {1}'.format(
                pid, 200))
        text = [row['text'] for row in cur]
        # Two disjoint, equally sized slices.  The second one starts at 80
        # (not 81) so that no sampled tweet is dropped between the halves.
        lmtwt1[pid] = lmfromtext(text[:80])
        lmtwt2[pid] = lmfromtext(text[80:160])

    confmat = [
        [kl_divergence(lmtwt1[lm_i], lmtwt2[lm_j]) for lm_j in places]
        for lm_i in places]

    # Diagonal = a place against itself; everything else = mutual confusion.
    selfavg = sum(confmat[i][i] for i in range(count))
    mutavg = sum(sum(row) for row in confmat) - selfavg
    selfavg /= float(count)
    if count > 1:
        mutavg /= float(count * count - count)
    else:
        # A single place has no off-diagonal entries to average.
        mutavg = float('nan')
    # Single-argument call: prints "selfavg mutavg" on Python 2 and 3 alike.
    print('{0} {1}'.format(selfavg, mutavg))

    plt.imshow(np.array(confmat), cmap=cm.gray, interpolation='nearest')
    plt.yticks(range(count),
               ['{0}: {1}'.format(place_name(pid), idx)
                for idx, pid in enumerate(places)])
    plt.xticks(range(count))
    # Leave room on the left for the place-name tick labels.
    plt.subplots_adjust(left=0.4)
    plt.colorbar(shrink=0.66)
    plt.savefig('sf_confm.eps')
    plt.show()
def confusionmatrix(places):
    """ Show the matrix of confusion between LMs by KL-divergence

    Two language models are built per place id from disjoint slices of the
    tweets read from the `sample` table: the first 80 rows and the next 80
    rows.  The confusion matrix holds, at [i][j], the value of
    kl_divergence(first-slice LM of place i, second-slice LM of place j).

    Side effects: prints the mean of the diagonal and the mean of the
    off-diagonal entries, renders the matrix in gray scale, writes the
    figure to 'sf_confm.eps' and opens the plot window.

    places -- sequence of place ids (must support len(), iteration and
              positional order).  When it is empty the function returns
              immediately.
    """
    nplaces = len(places)
    if nplaces == 0:
        # Avoid the division by zero in the averages; there is no matrix.
        return

    lmtwt1 = dict()
    lmtwt2 = dict()
    for pid in places:
        cur = CONN_POOL.get_cur(GEOTWEET)
        # NOTE(review): string-built SQL; prefer bound parameters if `pid`
        # may be untrusted -- verify what CONN_POOL cursors accept.
        cur.execute(
            'select text from sample'
            ' where place_id = \'{0}\' order by rand() limit {1}'.format(
                pid, 200))
        text = [row['text'] for row in cur]
        lmtwt1[pid] = lmfromtext(text[:80])
        # Start exactly where the first slice ended; beginning at 81 would
        # discard the tweet at index 80 and leave only 79 tweets here.
        lmtwt2[pid] = lmfromtext(text[80:160])

    confmat = list()
    for lm_i in places:
        confmat.append([kl_divergence(lmtwt1[lm_i], lmtwt2[lm_j])
                        for lm_j in places])

    selfavg = sum(confmat[i][i] for i in range(nplaces))
    mutavg = sum(sum(row) for row in confmat) - selfavg
    selfavg /= float(nplaces)
    offdiag = nplaces * nplaces - nplaces
    # With one place there are no off-diagonal cells, so the mutual average
    # is undefined rather than a ZeroDivisionError.
    mutavg = mutavg / float(offdiag) if offdiag else float('nan')
    # One formatted string keeps the output identical on Python 2 and 3.
    print('{0} {1}'.format(selfavg, mutavg))

    plt.imshow(np.array(confmat), cmap=cm.gray, interpolation='nearest')
    labels = ['{0}: {1}'.format(place_name(pid), idx)
              for idx, pid in enumerate(places)]
    plt.yticks(range(nplaces), labels)
    plt.xticks(range(nplaces))
    # Widen the left margin so the place-name labels fit.
    plt.subplots_adjust(left=0.4)
    plt.colorbar(shrink=0.66)
    plt.savefig('sf_confm.eps')
    plt.show()
def kldiff(places):
    """ compare the difference of kl-divergence between tweets and web pages
    for each place in places

    For every place id, rows are loaded from the 'sample' table (tweets)
    and from the 'web' table (web pages).  The first 50 tweets form the
    reference language model; the remaining tweets and the web pages each
    form a model that is compared against the reference with
    kl_divergence.  One line per place is printed in the form
    "<place name> & <tweet KLD> & <web KLD>".

    places -- iterable of place ids
    """
    diff = Dataset()
    for pid in places:
        twt = loadrows(GEOTWEET, ('place_id', 'text'),
                       ('place_id=\'{0}\''.format(pid),), 'sample',
                       'order by rand() limit {0}'.format(100))
        web = loadrows(GEOTWEET, ('place_id', 'web'),
                       ('place_id=\'{0}\''.format(pid),), 'web',
                       'limit 25')
        lmref = lmfromtext(twt['text'][:50])
        # Continue at index 50, right after the reference slice; starting
        # at 51 would silently drop one sampled tweet.
        lmtwt = lmfromtext(twt['text'][50:])
        lmweb = lmfromtext(web['web'])
        diff.append({'pid': pid,
                     'twtkld': kl_divergence(lmtwt, lmref),
                     'webkld': kl_divergence(lmweb, lmref)})
    for item in diff:
        # Single-argument print: same output on Python 2 and Python 3.
        print('{0} & {1} & {2}'.format(place_name(item['pid']),
                                       item['twtkld'], item['webkld']))
def kldiff(places):
    """ compare the difference of kl-divergence between tweets and web pages
    for each place in places

    Per place id this loads tweet rows from 'sample' and web rows from
    'web', builds a reference language model from the first 50 tweets, and
    measures kl_divergence of (a) the rest of the tweets and (b) the web
    pages against that reference.  Results are collected in a Dataset and
    then printed, one "<place name> & <tweet KLD> & <web KLD>" line each.

    places -- iterable of place ids
    """
    diff = Dataset()
    for pid in places:
        place_filter = ('place_id=\'{0}\''.format(pid), )
        twt = loadrows(GEOTWEET, ('place_id', 'text'), place_filter,
                       'sample', 'order by rand() limit {0}'.format(100))
        web = loadrows(GEOTWEET, ('place_id', 'web'), place_filter,
                       'web', 'limit 25')
        tweets = twt['text']
        lmref = lmfromtext(tweets[:50])
        # The comparison slice begins where the reference slice stops, so
        # the tweet at index 50 is no longer skipped (was [51:]).
        lmtwt = lmfromtext(tweets[50:])
        lmweb = lmfromtext(web['web'])
        diff.append({
            'pid': pid,
            'twtkld': kl_divergence(lmtwt, lmref),
            'webkld': kl_divergence(lmweb, lmref)
        })
    for item in diff:
        # Formatted into one string so the call behaves identically under
        # the Python 2 print statement and the Python 3 print function.
        print('{0} & {1} & {2}'.format(
            place_name(item['pid']), item['twtkld'], item['webkld']))