Exemple #1
0
def confusionmatrix(places):
    """Show the matrix of confusion between LMs by KL-divergence.

    For every place id in ``places`` up to 200 random rows are selected from
    the ``sample`` table. Rows 0-79 are passed to ``lmfromtext`` to build a
    first model and rows 80-159 to build a second one, so entry ``[i][j]``
    of the matrix is ``kl_divergence(first[places[i]], second[places[j]])``.

    The mean of the diagonal entries and the mean of the off-diagonal
    entries are printed, then the matrix is drawn as a grey-scale image,
    saved to ``sf_confm.eps`` and shown on screen.

    Args:
        places: indexable sequence of place ids.

    Raises:
        ZeroDivisionError: if ``places`` is empty (no diagonal to average).
    """
    lmtwt1 = dict()
    lmtwt2 = dict()
    for pid in places:
        cur = CONN_POOL.get_cur(GEOTWEET)
        # NOTE(review): pid is interpolated directly into the SQL text; use a
        # parameterized query if place ids can ever come from untrusted input.
        cur.execute(
            "select text from sample where place_id = '{0}'"
            " order by rand() limit {1}".format(pid, 200))
        text = [row['text'] for row in cur]
        lmtwt1[pid] = lmfromtext(text[:80])
        # Start at 80 (not 81) so the row at index 80 is not silently dropped
        # and both models are built from 80 rows each.
        lmtwt2[pid] = lmfromtext(text[80:160])

    confmat = [
        [kl_divergence(lmtwt1[lm_i], lmtwt2[lm_j]) for lm_j in places]
        for lm_i in places
    ]

    count = len(places)
    selfavg = sum(confmat[i][i] for i in range(count))
    mutavg = sum(sum(row) for row in confmat) - selfavg
    selfavg /= float(count)
    offdiag = count * count - count
    # A single place has no off-diagonal entries, so there is nothing to
    # average; report NaN instead of dividing by zero.
    mutavg = mutavg / float(offdiag) if offdiag else float('nan')
    # Single pre-formatted argument: prints the same text under the Python 2
    # print statement and the Python 3 print function.
    print('{0} {1}'.format(selfavg, mutavg))

    plt.imshow(np.array(confmat), cmap=cm.gray, interpolation='nearest')
    plt.yticks(
        range(count),
        ['{0}: {1}'.format(place_name(pid), i)
         for i, pid in enumerate(places)])
    plt.xticks(range(count))
    plt.subplots_adjust(left=0.4)
    plt.colorbar(shrink=0.66)
    plt.savefig('sf_confm.eps')
    plt.show()
Exemple #2
0
def confusionmatrix(places):
    """Show the matrix of confusion between LMs by KL-divergence.

    Per place id, up to 200 random ``text`` rows are read from the ``sample``
    table and split into two disjoint groups of 80 (indices 0-79 and
    80-159). Each group is turned into a model with ``lmfromtext``; the
    matrix cell for row place ``a`` and column place ``b`` holds
    ``kl_divergence(model_1[a], model_2[b])``.

    Prints the average diagonal value followed by the average off-diagonal
    value, then renders the matrix in grey scale, writes it to
    ``sf_confm.eps`` and displays it.

    Args:
        places: indexable sequence of place ids.

    Raises:
        ZeroDivisionError: if ``places`` is empty (no diagonal to average).
    """
    lmtwt1 = dict()
    lmtwt2 = dict()
    for pid in places:
        cur = CONN_POOL.get_cur(GEOTWEET)
        # NOTE(review): the place id is formatted into the statement text;
        # prefer bound parameters if ids may originate outside the program.
        query = ("select text from sample where place_id = '{0}'"
                 " order by rand() limit {1}").format(pid, 200)
        cur.execute(query)
        text = [row['text'] for row in cur]
        lmtwt1[pid] = lmfromtext(text[:80])
        # The second group begins at index 80; beginning at 81 would leave
        # one sampled row unused and make the groups uneven (80 vs 79).
        lmtwt2[pid] = lmfromtext(text[80:160])

    confmat = list()
    for lm_i in places:
        confmat.append(
            [kl_divergence(lmtwt1[lm_i], lmtwt2[lm_j]) for lm_j in places])

    size = len(places)
    selfavg = sum(confmat[i][i] for i in range(size))
    mutavg = sum(sum(row) for row in confmat) - selfavg
    selfavg /= float(size)
    n_pairs = size * size - size
    if n_pairs:
        mutavg /= float(n_pairs)
    else:
        # Only one place: the matrix has no off-diagonal cells to average.
        mutavg = float('nan')
    # One formatted string keeps the output identical whether print is the
    # Python 2 statement or the Python 3 function.
    print('{0} {1}'.format(selfavg, mutavg))

    plt.imshow(np.array(confmat), cmap=cm.gray, interpolation='nearest')
    labels = ['{0}: {1}'.format(place_name(pid), idx)
              for idx, pid in enumerate(places)]
    plt.yticks(range(size), labels)
    plt.xticks(range(size))
    plt.subplots_adjust(left=0.4)
    plt.colorbar(shrink=0.66)
    plt.savefig('sf_confm.eps')
    plt.show()
Exemple #3
0
def kldiff(places):
    """Compare the KL-divergence of tweets and of web pages for each place.

    For every place id, ``loadrows`` is asked for 100 random rows from
    ``sample`` and for up to 25 rows from ``web``. The first 50 tweet texts
    build a reference model, the remaining tweet texts build a tweet model,
    and the web texts build a web model (all via ``lmfromtext``). The
    divergence of the tweet model and of the web model against the
    reference is recorded per place.

    One line per place is printed in the form
    ``<place name> & <tweet divergence> & <web divergence>``.

    Args:
        places: iterable of place ids.
    """
    diff = Dataset()
    for pid in places:
        # NOTE(review): pid is formatted into the condition string; confirm
        # that place ids never come from untrusted input.
        condition = ('place_id=\'{0}\''.format(pid),)
        twt = loadrows(GEOTWEET, ('place_id', 'text'), condition, 'sample',
                       'order by rand() limit {0}'.format(100))
        web = loadrows(GEOTWEET, ('place_id', 'web'), condition, 'web',
                       'limit 25')
        lmref = lmfromtext(twt['text'][:50])
        # Resume at index 50 (not 51) so every sampled tweet lands in exactly
        # one of the two models.
        lmtwt = lmfromtext(twt['text'][50:])
        lmweb = lmfromtext(web['web'])
        diff.append({
            'pid': pid,
            'twtkld': kl_divergence(lmtwt, lmref),
            'webkld': kl_divergence(lmweb, lmref),
        })
    for item in diff:
        # Parenthesised single argument: valid and identical in output under
        # both the Python 2 print statement and the Python 3 function.
        print('{0} & {1} & {2}'.format(
            place_name(item['pid']), item['twtkld'], item['webkld']))
Exemple #4
0
def kldiff(places):
    """Compare the KL-divergence of tweets and of web pages for each place.

    Per place id this requests 100 randomly ordered rows from ``sample`` and
    at most 25 rows from ``web`` through ``loadrows``. Tweet texts 0-49
    become the reference model; tweet texts from index 50 onward become the
    tweet model; the web texts become the web model. Both the tweet and the
    web model are compared against the reference with ``kl_divergence``.

    Output is one ``name & twtkld & webkld`` line per place.

    Args:
        places: iterable of place ids.
    """
    diff = Dataset()
    for pid in places:
        # NOTE(review): the id is spliced into the filter text; verify that
        # callers only pass trusted place ids.
        where = ('place_id=\'{0}\''.format(pid), )
        twt = loadrows(GEOTWEET, ('place_id', 'text'), where, 'sample',
                       'order by rand() limit {0}'.format(100))
        web = loadrows(GEOTWEET, ('place_id', 'web'), where, 'web', 'limit 25')
        tweets = twt['text']
        lmref = lmfromtext(tweets[:50])
        # Slice from 50: starting at 51 would drop the tweet at index 50 from
        # both the reference and the comparison model.
        lmtwt = lmfromtext(tweets[50:])
        lmweb = lmfromtext(web['web'])
        diff.append({
            'pid': pid,
            'twtkld': kl_divergence(lmtwt, lmref),
            'webkld': kl_divergence(lmweb, lmref)
        })
    for item in diff:
        # A single formatted string prints identically with the Python 2
        # print statement and the Python 3 print function.
        line = '{0} & {1} & {2}'.format(place_name(item['pid']),
                                        item['twtkld'], item['webkld'])
        print(line)