Beispiel #1
0
def sparsitysetup(nums):
    """ Plot how ranking quality changes with the number of training tweets.

        The place_ids are read from 'chicago10.lst' (one per line).  For every
        training size in nums, up to 160 random tweets are fetched per place:
        the first `num` of them build the language model of the place and the
        tweets at positions 150-159 are each turned into a one-tweet query.
        Every query is ranked against the place models with ranke(), the
        ranks are evaluated with batcheval() and one curve per training size
        is drawn.

        NOTE(review): training (text[:num]) and testing (text[150:160]) share
        tweets when num > 150 -- confirm callers keep num <= 150.

        @arg nums an iterable of training sizes (tweets per place)
        @return None, the figure is shown with plt.show()
    """
    lsts = linestyles()

    # The place list does not depend on the training size, so it is read
    # once and the file is closed before any database work starts.
    with open('chicago10.lst') as fin:
        stripped = [line.strip() for line in fin]
    # Blank lines would otherwise become empty place_ids.
    places = [pid for pid in stripped if pid]

    query = ('select text from sample'
             ' where place_id = \'{0}\' order by rand() limit {1}')
    for num in nums:
        lmplc = dict()
        lmtwt = Dataset()
        for pid in places:
            cur = CONN_POOL.get_cur(GEOTWEET)
            cur.execute(query.format(pid, 160))
            text = [row['text'] for row in cur]
            lmplc[pid] = lmfromtext(text[:num])
            for txt in text[150:160]:
                lmtwt.append({'pid': pid, 'lm': lmfromtext([txt,])})
        ranks = [ranke(lmplc, item['lm']) for item in lmtwt]
        gch = batcheval(lmtwt['pid'], len(places), ranks)
        # next(lsts) works for both Python 2 and Python 3 iterators.
        plt.plot(gch['pos'], gch['rate'],
                 next(lsts), label='t={0}'.format(num))
    plt.xlabel('First $n$ places')
    plt.ylabel('Probability')
    plt.legend(loc='lower right')
    plt.show()
Beispiel #2
0
def confusionmatrix(places):
    """ Show the matrix of confusion between LMs by KL-divergence

        For each place up to 200 random tweets are fetched and two language
        models are built from disjoint slices of them (tweets 0-79 and tweets
        80-159).  Entry (i, j) of the matrix is
        kl_divergence(first LM of place i, second LM of place j).
        The average of the diagonal and the average of the off-diagonal
        entries are printed, then the matrix is drawn, saved to
        'sf_confm.eps' and shown.

        @arg places a sequence of place_ids; must not be empty
        @return None
    """
    query = ('select text from sample'
             ' where place_id = \'{0}\' order by rand() limit {1}')
    lmtwt1 = dict()
    lmtwt2 = dict()
    for pid in places:
        cur = CONN_POOL.get_cur(GEOTWEET)
        cur.execute(query.format(pid, 200))
        text = [row['text'] for row in cur]
        lmtwt1[pid] = lmfromtext(text[:80])
        # Start at 80 so the second slice continues where the first one
        # ends; starting at 81 dropped one tweet.
        lmtwt2[pid] = lmfromtext(text[80:160])

    confmat = [[kl_divergence(lmtwt1[pid_i], lmtwt2[pid_j]) for pid_j in places]
               for pid_i in places]

    size = len(places)
    selfsum = sum(confmat[i][i] for i in range(size))
    mutsum = sum(sum(row) for row in confmat) - selfsum
    selfavg = selfsum / float(size)
    # With a single place there are no off-diagonal entries to average.
    pairs = size * size - size
    mutavg = mutsum / float(pairs) if pairs else float('nan')
    # Single pre-formatted argument: same output on Python 2 and Python 3.
    print('{0} {1}'.format(selfavg, mutavg))

    plt.imshow(np.array(confmat), cmap=cm.gray, interpolation='nearest')
    plt.yticks(range(size),
               ['{0}: {1}'.format(place_name(pid), i)
                for i, pid in enumerate(places)])
    plt.xticks(range(size))
    plt.subplots_adjust(left=0.4)
    plt.colorbar(shrink=0.66)
    plt.savefig('sf_confm.eps')
    plt.show()
Beispiel #3
0
def confusionmatrix(places):
    """ Show the matrix of confusion between LMs by KL-divergence

        For each place up to 200 random tweets are fetched and two language
        models are built from disjoint slices of them (tweets 0-79 and tweets
        80-159).  Entry (i, j) of the matrix is
        kl_divergence(first LM of place i, second LM of place j).
        The average of the diagonal and the average of the off-diagonal
        entries are printed, then the matrix is drawn, saved to
        'sf_confm.eps' and shown.

        @arg places a sequence of place_ids; must not be empty
        @return None
    """
    query = ('select text from sample'
             ' where place_id = \'{0}\' order by rand() limit {1}')
    lmtwt1 = dict()
    lmtwt2 = dict()
    for pid in places:
        cur = CONN_POOL.get_cur(GEOTWEET)
        cur.execute(query.format(pid, 200))
        text = [row['text'] for row in cur]
        lmtwt1[pid] = lmfromtext(text[:80])
        # Start at 80 so the second slice continues where the first one
        # ends; starting at 81 dropped one tweet.
        lmtwt2[pid] = lmfromtext(text[80:160])

    confmat = [[kl_divergence(lmtwt1[pid_i], lmtwt2[pid_j]) for pid_j in places]
               for pid_i in places]

    size = len(places)
    selfsum = sum(confmat[i][i] for i in range(size))
    mutsum = sum(sum(row) for row in confmat) - selfsum
    selfavg = selfsum / float(size)
    # With a single place there are no off-diagonal entries to average.
    pairs = size * size - size
    mutavg = mutsum / float(pairs) if pairs else float('nan')
    # Single pre-formatted argument: same output on Python 2 and Python 3.
    print('{0} {1}'.format(selfavg, mutavg))

    plt.imshow(np.array(confmat), cmap=cm.gray, interpolation='nearest')
    plt.yticks(range(size),
               ['{0}: {1}'.format(place_name(pid), i)
                for i, pid in enumerate(places)])
    plt.xticks(range(size))
    plt.subplots_adjust(left=0.4)
    plt.colorbar(shrink=0.66)
    plt.savefig('sf_confm.eps')
    plt.show()
Beispiel #4
0
def onesetup(places, numtwts, numtest, balance):
    """ Compare tweet-only, web-only and joint rankings, the query is just one tweet

        For every place max(numtwts) + numtest random tweets are loaded: the
        first max(numtwts) are kept for training, all remaining ones become
        test queries.  Up to 30 random web pages per place are loaded for the
        web language models.  For each training size in numtwts every test
        tweet is ranked with joint_ranking() (tweet and web LMs) and with
        kl_ranking() (tweet LMs only), and both results are plotted.  A
        web-only ranking and a random baseline are plotted once.

        @arg places a list of place_ids
        @arg numtwts a non-empty list of training sizes (tweets per place)
        @arg numtest the number of extra tweets loaded per place for testing
        @arg balance passed unchanged to joint_ranking()
        @return None, the figure is shown with plt.show()
    """
    lsts = linestyles()
    maxtwt = max(numtwts)

    # prepare for data
    twtmodel = dict()
    webmodel = dict()
    twttest = Dataset()
    for pid in places:
        twtp = loadrows(GEOTWEET, ('place_id', 'text'),
                ('place_id=\'{0}\''.format(pid),), 'sample',
                'order by rand() limit {0}'.format(maxtwt + numtest))
        webmodel[pid] = loadrows(GEOTWEET, ('place_id', 'web'),
                ('place_id=\'{0}\''.format(pid),), 'web',
                'order by rand() limit 30')['web']
        twtmodel[pid] = twtp['text'][:maxtwt]
        # Training rows are 0 .. maxtwt - 1, so testing starts at maxtwt;
        # starting at maxtwt + 1 skipped one held-out tweet per place.
        for idx in range(maxtwt, twtp.size()):
            twttest.append(twtp.item(idx))

    # The web models and the query models do not depend on the training
    # size, so they are built once instead of once per iteration.
    lmweb = dict((pid, lmfromtext(webmodel[pid])) for pid in twtmodel)
    queries = [lmfromtext([item['text'],]) for item in twttest]

    # ranking by twt and twt+web
    for numtwt in numtwts:
        lmtwt = dict((pid, lmfromtext(twtmodel[pid][:numtwt]))
                     for pid in twtmodel)
        jointranks = [joint_ranking(qlm, lmtwt, lmweb, balance)
                      for qlm in queries]
        twtranks = [kl_ranking(lmtwt, qlm) for qlm in queries]
        gjoint = batcheval(twttest['place_id'], len(places), jointranks)
        gtwt = batcheval(twttest['place_id'], len(places), twtranks)
        # next(lsts) works for both Python 2 and Python 3 iterators.
        plt.plot(gjoint['pos'], gjoint['rate'], marker='^',
                label='JOINT($t={0}$)'.format(numtwt), linestyle=next(lsts))
        plt.plot(gtwt['pos'], gtwt['rate'], marker='o',
                label='TWEET($t={0}$)'.format(numtwt), linestyle=next(lsts))

    webranks = [kl_ranking(lmweb, qlm) for qlm in queries]
    gweb = batcheval(twttest['place_id'], len(places), webranks)
    plt.plot(gweb['pos'], gweb['rate'], label='WEB', linestyle='dotted')

    # Random baseline drawn over the positions of the web evaluation; no
    # other evaluation result is in scope at this point.
    pos = gweb['pos']
    maxpos = max(pos)
    plt.plot(pos, [float(r) / maxpos for r in pos],
             ls='-.', marker='s',
             label='Random Baseline')
    plt.xlabel('First $n$ Places')
    plt.ylabel('Probability')
    plt.legend(loc='lower right')
    plt.show()
Beispiel #5
0
def kldiff(places):
    """ compare the difference of kl-divergence between tweets and web pages
        for each place in places

        For every place up to 100 random tweets and up to 25 web pages are
        loaded.  The first 50 tweets build the reference LM, the remaining
        tweets build the tweet LM and the web pages build the web LM.  After
        all places are processed one line 'name & twtkld & webkld' is printed
        per place, where the two values are kl_divergence(tweet LM, reference)
        and kl_divergence(web LM, reference).

        @arg places an iterable of place_ids
        @return None
    """
    diff = list()
    for pid in places:
        twt = loadrows(GEOTWEET, ('place_id', 'text'),
                ('place_id=\'{0}\''.format(pid),), 'sample',
                'order by rand() limit {0}'.format(100))
        web = loadrows(GEOTWEET, ('place_id', 'web'),
                ('place_id=\'{0}\''.format(pid),), 'web',
                'limit 25')
        tweets = twt['text']
        lmref = lmfromtext(tweets[:50])
        # Continue at 50, right after the reference slice; starting at 51
        # dropped one tweet.
        lmtwt = lmfromtext(tweets[50:])
        lmweb = lmfromtext(web['web'])
        diff.append({'pid': pid, 'twtkld': kl_divergence(lmtwt, lmref),
            'webkld': kl_divergence(lmweb, lmref)})
    for item in diff:
        # Single pre-formatted argument: same output on Python 2 and 3.
        print('{0} & {1} & {2}'.format(
            place_name(item['pid']), item['twtkld'], item['webkld']))
Beispiel #6
0
def kldiff(places):
    """ compare the difference of kl-divergence between tweets and web pages
        for each place in places

        For every place up to 100 random tweets and up to 25 web pages are
        loaded.  The first 50 tweets build the reference LM, the remaining
        tweets build the tweet LM and the web pages build the web LM.  After
        all places are processed one line 'name & twtkld & webkld' is printed
        per place, where the two values are kl_divergence(tweet LM, reference)
        and kl_divergence(web LM, reference).

        @arg places an iterable of place_ids
        @return None
    """
    diff = list()
    for pid in places:
        twt = loadrows(GEOTWEET, ('place_id', 'text'),
                       ('place_id=\'{0}\''.format(pid), ), 'sample',
                       'order by rand() limit {0}'.format(100))
        web = loadrows(GEOTWEET, ('place_id', 'web'),
                       ('place_id=\'{0}\''.format(pid), ), 'web', 'limit 25')
        tweets = twt['text']
        lmref = lmfromtext(tweets[:50])
        # Continue at 50, right after the reference slice; starting at 51
        # dropped one tweet.
        lmtwt = lmfromtext(tweets[50:])
        lmweb = lmfromtext(web['web'])
        diff.append({
            'pid': pid,
            'twtkld': kl_divergence(lmtwt, lmref),
            'webkld': kl_divergence(lmweb, lmref)
        })
    for item in diff:
        # Single pre-formatted argument: same output on Python 2 and 3.
        print('{0} & {1} & {2}'.format(place_name(item['pid']), item['twtkld'],
                                       item['webkld']))
Beispiel #7
0
def sparsitysetup(nums):
    """ Plot how ranking quality changes with the number of training tweets.

        The place_ids are read from 'chicago10.lst' (one per line).  For every
        training size in nums, up to 160 random tweets are fetched per place:
        the first `num` of them build the language model of the place and the
        tweets at positions 150-159 are each turned into a one-tweet query.
        Every query is ranked against the place models with ranke(), the
        ranks are evaluated with batcheval() and one curve per training size
        is drawn.

        NOTE(review): training (text[:num]) and testing (text[150:160]) share
        tweets when num > 150 -- confirm callers keep num <= 150.

        @arg nums an iterable of training sizes (tweets per place)
        @return None, the figure is shown with plt.show()
    """
    lsts = linestyles()

    # The place list does not depend on the training size, so it is read
    # once and the file is closed before any database work starts.
    with open('chicago10.lst') as fin:
        stripped = [line.strip() for line in fin]
    # Blank lines would otherwise become empty place_ids.
    places = [pid for pid in stripped if pid]

    query = ('select text from sample'
             ' where place_id = \'{0}\' order by rand() limit {1}')
    for num in nums:
        lmplc = dict()
        lmtwt = Dataset()
        for pid in places:
            cur = CONN_POOL.get_cur(GEOTWEET)
            cur.execute(query.format(pid, 160))
            text = [row['text'] for row in cur]
            lmplc[pid] = lmfromtext(text[:num])
            for txt in text[150:160]:
                lmtwt.append({
                    'pid': pid,
                    'lm': lmfromtext([
                        txt,
                    ])
                })
        ranks = [ranke(lmplc, item['lm']) for item in lmtwt]
        gch = batcheval(lmtwt['pid'], len(places), ranks)
        # next(lsts) works for both Python 2 and Python 3 iterators.
        plt.plot(gch['pos'],
                 gch['rate'],
                 next(lsts),
                 label='t={0}'.format(num))
    plt.xlabel('First $n$ places')
    plt.ylabel('Probability')
    plt.legend(loc='lower right')
    plt.show()
Beispiel #8
0
def onesetup(places, numtwts, numtest, balance):
    """ Compare tweet-only, web-only and joint rankings, the query is just one tweet

        For every place max(numtwts) + numtest random tweets are loaded: the
        first max(numtwts) are kept for training, all remaining ones become
        test queries.  Up to 30 random web pages per place are loaded for the
        web language models.  For each training size in numtwts every test
        tweet is ranked with joint_ranking() (tweet and web LMs) and with
        kl_ranking() (tweet LMs only), and both results are plotted.  A
        web-only ranking and a random baseline are plotted once.

        @arg places a list of place_ids
        @arg numtwts a non-empty list of training sizes (tweets per place)
        @arg numtest the number of extra tweets loaded per place for testing
        @arg balance passed unchanged to joint_ranking()
        @return None, the figure is shown with plt.show()
    """
    lsts = linestyles()
    maxtwt = max(numtwts)

    # prepare for data
    twtmodel = dict()
    webmodel = dict()
    twttest = Dataset()
    for pid in places:
        twtp = loadrows(
            GEOTWEET, ('place_id', 'text'), ('place_id=\'{0}\''.format(pid), ),
            'sample',
            'order by rand() limit {0}'.format(maxtwt + numtest))
        webmodel[pid] = loadrows(GEOTWEET, ('place_id', 'web'),
                                 ('place_id=\'{0}\''.format(pid), ), 'web',
                                 'order by rand() limit 30')['web']
        twtmodel[pid] = twtp['text'][:maxtwt]
        # Training rows are 0 .. maxtwt - 1, so testing starts at maxtwt;
        # starting at maxtwt + 1 skipped one held-out tweet per place.
        for idx in range(maxtwt, twtp.size()):
            twttest.append(twtp.item(idx))

    # The web models and the query models do not depend on the training
    # size, so they are built once instead of once per iteration.
    lmweb = dict((pid, lmfromtext(webmodel[pid])) for pid in twtmodel)
    queries = [lmfromtext([item['text'], ]) for item in twttest]

    # ranking by twt and twt+web
    for numtwt in numtwts:
        lmtwt = dict((pid, lmfromtext(twtmodel[pid][:numtwt]))
                     for pid in twtmodel)
        jointranks = [
            joint_ranking(qlm, lmtwt, lmweb, balance) for qlm in queries
        ]
        twtranks = [kl_ranking(lmtwt, qlm) for qlm in queries]
        gjoint = batcheval(twttest['place_id'], len(places), jointranks)
        gtwt = batcheval(twttest['place_id'], len(places), twtranks)
        # next(lsts) works for both Python 2 and Python 3 iterators.
        plt.plot(gjoint['pos'],
                 gjoint['rate'],
                 marker='^',
                 label='JOINT($t={0}$)'.format(numtwt),
                 linestyle=next(lsts))
        plt.plot(gtwt['pos'],
                 gtwt['rate'],
                 marker='o',
                 label='TWEET($t={0}$)'.format(numtwt),
                 linestyle=next(lsts))

    webranks = [kl_ranking(lmweb, qlm) for qlm in queries]
    gweb = batcheval(twttest['place_id'], len(places), webranks)
    plt.plot(gweb['pos'], gweb['rate'], label='WEB', linestyle='dotted')

    # Random baseline drawn over the positions of the web evaluation; no
    # other evaluation result is in scope at this point.
    pos = gweb['pos']
    maxpos = max(pos)
    plt.plot(pos,
             [float(r) / maxpos for r in pos],
             ls='-.',
             marker='s',
             label='Random Baseline')
    plt.xlabel('First $n$ Places')
    plt.ylabel('Probability')
    plt.legend(loc='lower right')
    plt.show()