Beispiel #1
0
def cmptimeweb(cities, numtwts, numtest):
    """ compare the time model + web model to original pure text model
    """
    lmranks = [list() for i in range(len(numtwts))]
    tmranks = [list() for i in range(len(numtwts))]
    wmranks = list()
    randranks = list()
    lmtmranks = [list() for i in range(len(numtwts))]
    wmlmranks = [list() for i in range(len(numtwts))]
    wmlmtmranks = [list() for i in range(len(numtwts))]
    test = Dataset()

    for places in cities:
        lms = [dict() for i in range(len(numtwts))]
        tms = [dict() for i in range(len(numtwts))]
        wms = dict()
        tst = Dataset()
        for pid in places:
            twtp = loadrows(GEOTWEET, ('place_id', 'text', 'created_at'),
                    ('place_id=\'{0}\''.format(pid),), 'sample',
                    'order by rand() limit {0}'.format(max(numtwts) + numtest))
            for i in range(len(numtwts)):
                lms[i][pid] = LanguageModel(twtp['text'][:numtwts[i]])
                tms[i][pid] = TimeModel(twtp['created_at'][:numtwts[i]])
            web = loadrows(GEOTWEET, ('place_id', 'web'),
                    ('place_id=\'{0}\''.format(pid),), 'web',
                    'order by rand() limit 30')
            wms[pid] = LanguageModel(web['web'])

            # test data
            for i in range(max(numtwts), max(numtwts) + numtest):
                tst.append({'label': pid,
                    'lm': LanguageModel([twtp['text'][i],]),
                    'tm': TimeModel([twtp['created_at'][i],])})

        test.extend(tst)
        # rank
        for item in tst:
            for i in range(len(numtwts)):
                lmranks[i].append(ranke(lms[i], item['lm']))
                tmranks[i].append(ranke(tms[i], item['tm']))
            wmranks.append(ranke(wms, item['lm']))
            randranks.append(randranke(places))

    for i in range(len(numtwts)):
        for ranklm, ranktm in zip(lmranks[i], tmranks[i]):
            lmtmranks[i].append(linearjoin([ranklm, ranktm], [0.5, 0.5]))
        for ranklm, rankwm in zip(lmranks[i], wmranks):
            wmlmranks[i].append(linearjoin([ranklm, rankwm], [0.5, 0.5]))
        for ranklm, ranktm, rankwm in zip(lmranks[i], tmranks[i], wmranks):
            wmlmtmranks[i].append(\
                    linearjoin([ranklm, ranktm, rankwm], [0.33, 0.33, 0.33]))

    # plot
    candls = ['-', '--']
    mks = ['o', '^', '*', 'v', 's']

    #for i in range(len(numtwts)):
        #lmeval = batcheval(lmranks[i], test['label'])
        #plt.plot(lmeval['pos'], lmeval['rate'],
                #label='tweet(s={0})'.format(numtwts[i]),
                #ls=candls[i%2], marker=mks[i/2])
    #for i in range(len(numtwts)):
        #for plc in placetotalrank(lmranks[i], test)['label'][-10:]:
            #print place_name(plc), plc
        #print placetotalrank(lmranks[i], test)['totalrank'][-10:]
        #print wilcoxontest(lmranks[i], lmranks[i-1], test)
    #plt.legend(loc='lower right')
#---------------------------------------------------------------
    for i in range(len(numtwts)):
        lmeval = batcheval(lmranks[i], test['label'])
        plt.plot(lmeval['pos'], lmeval['rate'],
                label='tweet(s={0})'.format(numtwts[i]),
                ls=candls[i], marker='o')
        wmlmeval = batcheval(wmlmranks[i], test['label'])
        plt.plot(wmlmeval['pos'], wmlmeval['rate'],
                label='tweet(s={0})+web'.format(numtwts[i]),
                ls=candls[i], marker='^')
        print wilcoxontest(lmranks[i], wmlmranks[i], test)
        for plc in placetotalrank(wmlmranks[i], test)['label'][-10:]:
            print place_name(plc), plc
        print placetotalrank(wmlmranks[i], test)['totalrank'][-10:]
    wmeval = batcheval(wmranks, test['label'])
    for plc in placetotalrank(wmranks, test)['label'][-10:]:
        print place_name(plc), plc
    print placetotalrank(wmranks, test)['totalrank'][-10:]
    plt.plot(wmeval['pos'], wmeval['rate'],
            label='web',
            ls=':')

    plt.plot(lmeval['pos'], [float(r) / max(lmeval['pos']) for r in lmeval['pos']],
             ls='-.', marker='s',
             label='Random Baseline')
#---------------------------------------------------------------


    #for i in range(len(numtwts)):
        #plt.subplot(121 + i)
        #plt.title('$s={0}$'.format(numtwts[i]))
        #lmeval = batcheval(lmranks[i], test['label'])
        #plt.plot(lmeval['pos'], lmeval['rate'],
                #label='tweet',
                #ls=candls[i], marker='o')
        #lmtmeval = batcheval(lmtmranks[i], test['label'])
        #plt.plot(lmtmeval['pos'], lmtmeval['rate'],
                #label='tweet+time',
                #ls=candls[i], marker='^')
        #wmlmtmeval = batcheval(wmlmtmranks[i], test['label'])
        #plt.plot(wmlmtmeval['pos'], wmlmtmeval['rate'],
                #label='tweet+time+web',
                #ls=candls[i], marker='*')
        #plt.legend(loc='lower right')
        #plt.ylabel('Rate containing Reference POI')
        #plt.xlabel('Top $p$ places')
    #plt.show()
#---------------------------------------------------------------
    #i=0
    #plt.subplot(121 + i)
    #plt.title('$s={0}$'.format(numtwts[i]))
    #tmeval = batcheval(tmranks[i], test['label'])
    #plt.plot(tmeval['pos'], tmeval['rate'],
            #label='time',
            #ls=candls[i], marker='o')
    #lmeval = batcheval(lmranks[i], test['label'])
    #plt.plot(lmeval['pos'], lmeval['rate'],
            #label='tweet',
            #ls=candls[i], marker='^')
    #lmtmeval = batcheval(lmtmranks[i], test['label'])
    #plt.plot(lmtmeval['pos'], lmtmeval['rate'],
            #label='tweet+time',
            #ls=candls[i], marker='*')
    #for plc in placetotalrank(tmranks[i], test)['label'][-10:]:
        #print place_name(plc), plc
    #print placetotalrank(tmranks[i], test)['totalrank'][-10:]
    #for plc in placetotalrank(lmtmranks[i], test)['label'][-10:]:
        #print place_name(plc), plc
    #print placetotalrank(lmtmranks[i], test)['totalrank'][-10:]
    #print wilcoxontest(lmranks[i], lmtmranks[i], test)

    #plt.legend(loc='lower right')
    #plt.ylabel('Rate containing Reference POI')
    #plt.xlabel('Top $p$ places')


    #i=1
    #plt.subplot(121 + i)
    #plt.title('$s={0}$'.format(numtwts[i]))
    #tmeval = batcheval(tmranks[i], test['label'])
    #plt.plot(tmeval['pos'], tmeval['rate'],
            #label='time',
            #ls=candls[i], marker='o')
    #wmlmeval = batcheval(wmlmranks[i], test['label'])
    #plt.plot(wmlmeval['pos'], wmlmeval['rate'],
            #label='tweet + web',
            #ls=candls[i], marker='^')
    #wmlmtmeval = batcheval(wmlmtmranks[i], test['label'])
    #plt.plot(wmlmtmeval['pos'], wmlmtmeval['rate'],
            #label='tweet+time+web',
            #ls=candls[i], marker='*')

    #for plc in placetotalrank(wmlmranks[i], test)['label'][-10:]:
        #print place_name(plc), plc
    #print placetotalrank(wmlmranks[i], test)['totalrank'][-10:]
    #for plc in placetotalrank(wmlmtmranks[i], test)['label'][-10:]:
        #print place_name(plc), plc
    #print placetotalrank(wmlmtmranks[i], test)['totalrank'][-10:]
    #print wilcoxontest(wmlmranks[i], wmlmtmranks[i], test)

    plt.legend(loc='lower right')
    plt.ylabel('Rate containing Reference POI')
    plt.xlabel('Top $p$ places')

    plt.show()
Beispiel #2
0
def cmptimeweb(cities, numtwts, numtest):
    """ compare the time model + web model to original pure text model
    """
    lmranks = [list() for i in range(len(numtwts))]
    tmranks = [list() for i in range(len(numtwts))]
    wmranks = list()
    randranks = list()
    lmtmranks = [list() for i in range(len(numtwts))]
    wmlmranks = [list() for i in range(len(numtwts))]
    wmlmtmranks = [list() for i in range(len(numtwts))]
    test = Dataset()

    for places in cities:
        lms = [dict() for i in range(len(numtwts))]
        tms = [dict() for i in range(len(numtwts))]
        wms = dict()
        tst = Dataset()
        for pid in places:
            twtp = loadrows(
                GEOTWEET, ('place_id', 'text', 'created_at'),
                ('place_id=\'{0}\''.format(pid), ), 'sample',
                'order by rand() limit {0}'.format(max(numtwts) + numtest))
            for i in range(len(numtwts)):
                lms[i][pid] = LanguageModel(twtp['text'][:numtwts[i]])
                tms[i][pid] = TimeModel(twtp['created_at'][:numtwts[i]])
            web = loadrows(GEOTWEET, ('place_id', 'web'),
                           ('place_id=\'{0}\''.format(pid), ), 'web',
                           'order by rand() limit 30')
            wms[pid] = LanguageModel(web['web'])

            # test data
            for i in range(max(numtwts), max(numtwts) + numtest):
                tst.append({
                    'label': pid,
                    'lm': LanguageModel([
                        twtp['text'][i],
                    ]),
                    'tm': TimeModel([
                        twtp['created_at'][i],
                    ])
                })

        test.extend(tst)
        # rank
        for item in tst:
            for i in range(len(numtwts)):
                lmranks[i].append(ranke(lms[i], item['lm']))
                tmranks[i].append(ranke(tms[i], item['tm']))
            wmranks.append(ranke(wms, item['lm']))
            randranks.append(randranke(places))

    for i in range(len(numtwts)):
        for ranklm, ranktm in zip(lmranks[i], tmranks[i]):
            lmtmranks[i].append(linearjoin([ranklm, ranktm], [0.5, 0.5]))
        for ranklm, rankwm in zip(lmranks[i], wmranks):
            wmlmranks[i].append(linearjoin([ranklm, rankwm], [0.5, 0.5]))
        for ranklm, ranktm, rankwm in zip(lmranks[i], tmranks[i], wmranks):
            wmlmtmranks[i].append(\
                    linearjoin([ranklm, ranktm, rankwm], [0.33, 0.33, 0.33]))

    # plot
    candls = ['-', '--']
    mks = ['o', '^', '*', 'v', 's']

    #for i in range(len(numtwts)):
    #lmeval = batcheval(lmranks[i], test['label'])
    #plt.plot(lmeval['pos'], lmeval['rate'],
    #label='tweet(s={0})'.format(numtwts[i]),
    #ls=candls[i%2], marker=mks[i/2])
    #for i in range(len(numtwts)):
    #for plc in placetotalrank(lmranks[i], test)['label'][-10:]:
    #print place_name(plc), plc
    #print placetotalrank(lmranks[i], test)['totalrank'][-10:]
    #print wilcoxontest(lmranks[i], lmranks[i-1], test)
    #plt.legend(loc='lower right')
    #---------------------------------------------------------------
    for i in range(len(numtwts)):
        lmeval = batcheval(lmranks[i], test['label'])
        plt.plot(lmeval['pos'],
                 lmeval['rate'],
                 label='tweet(s={0})'.format(numtwts[i]),
                 ls=candls[i],
                 marker='o')
        wmlmeval = batcheval(wmlmranks[i], test['label'])
        plt.plot(wmlmeval['pos'],
                 wmlmeval['rate'],
                 label='tweet(s={0})+web'.format(numtwts[i]),
                 ls=candls[i],
                 marker='^')
        print wilcoxontest(lmranks[i], wmlmranks[i], test)
        for plc in placetotalrank(wmlmranks[i], test)['label'][-10:]:
            print place_name(plc), plc
        print placetotalrank(wmlmranks[i], test)['totalrank'][-10:]
    wmeval = batcheval(wmranks, test['label'])
    for plc in placetotalrank(wmranks, test)['label'][-10:]:
        print place_name(plc), plc
    print placetotalrank(wmranks, test)['totalrank'][-10:]
    plt.plot(wmeval['pos'], wmeval['rate'], label='web', ls=':')

    plt.plot(lmeval['pos'],
             [float(r) / max(lmeval['pos']) for r in lmeval['pos']],
             ls='-.',
             marker='s',
             label='Random Baseline')
    #---------------------------------------------------------------

    #for i in range(len(numtwts)):
    #plt.subplot(121 + i)
    #plt.title('$s={0}$'.format(numtwts[i]))
    #lmeval = batcheval(lmranks[i], test['label'])
    #plt.plot(lmeval['pos'], lmeval['rate'],
    #label='tweet',
    #ls=candls[i], marker='o')
    #lmtmeval = batcheval(lmtmranks[i], test['label'])
    #plt.plot(lmtmeval['pos'], lmtmeval['rate'],
    #label='tweet+time',
    #ls=candls[i], marker='^')
    #wmlmtmeval = batcheval(wmlmtmranks[i], test['label'])
    #plt.plot(wmlmtmeval['pos'], wmlmtmeval['rate'],
    #label='tweet+time+web',
    #ls=candls[i], marker='*')
    #plt.legend(loc='lower right')
    #plt.ylabel('Rate containing Reference POI')
    #plt.xlabel('Top $p$ places')
    #plt.show()
    #---------------------------------------------------------------
    #i=0
    #plt.subplot(121 + i)
    #plt.title('$s={0}$'.format(numtwts[i]))
    #tmeval = batcheval(tmranks[i], test['label'])
    #plt.plot(tmeval['pos'], tmeval['rate'],
    #label='time',
    #ls=candls[i], marker='o')
    #lmeval = batcheval(lmranks[i], test['label'])
    #plt.plot(lmeval['pos'], lmeval['rate'],
    #label='tweet',
    #ls=candls[i], marker='^')
    #lmtmeval = batcheval(lmtmranks[i], test['label'])
    #plt.plot(lmtmeval['pos'], lmtmeval['rate'],
    #label='tweet+time',
    #ls=candls[i], marker='*')
    #for plc in placetotalrank(tmranks[i], test)['label'][-10:]:
    #print place_name(plc), plc
    #print placetotalrank(tmranks[i], test)['totalrank'][-10:]
    #for plc in placetotalrank(lmtmranks[i], test)['label'][-10:]:
    #print place_name(plc), plc
    #print placetotalrank(lmtmranks[i], test)['totalrank'][-10:]
    #print wilcoxontest(lmranks[i], lmtmranks[i], test)

    #plt.legend(loc='lower right')
    #plt.ylabel('Rate containing Reference POI')
    #plt.xlabel('Top $p$ places')

    #i=1
    #plt.subplot(121 + i)
    #plt.title('$s={0}$'.format(numtwts[i]))
    #tmeval = batcheval(tmranks[i], test['label'])
    #plt.plot(tmeval['pos'], tmeval['rate'],
    #label='time',
    #ls=candls[i], marker='o')
    #wmlmeval = batcheval(wmlmranks[i], test['label'])
    #plt.plot(wmlmeval['pos'], wmlmeval['rate'],
    #label='tweet + web',
    #ls=candls[i], marker='^')
    #wmlmtmeval = batcheval(wmlmtmranks[i], test['label'])
    #plt.plot(wmlmtmeval['pos'], wmlmtmeval['rate'],
    #label='tweet+time+web',
    #ls=candls[i], marker='*')

    #for plc in placetotalrank(wmlmranks[i], test)['label'][-10:]:
    #print place_name(plc), plc
    #print placetotalrank(wmlmranks[i], test)['totalrank'][-10:]
    #for plc in placetotalrank(wmlmtmranks[i], test)['label'][-10:]:
    #print place_name(plc), plc
    #print placetotalrank(wmlmtmranks[i], test)['totalrank'][-10:]
    #print wilcoxontest(wmlmranks[i], wmlmtmranks[i], test)

    plt.legend(loc='lower right')
    plt.ylabel('Rate containing Reference POI')
    plt.xlabel('Top $p$ places')

    plt.show()