def richrank(cities, names):
    """ Plot, per city, the rate of test tweets whose reference POI is
        within the top p places ranked by the tweet language models

        For every place 110 randomly ordered tweets are loaded; the first
        100 build the LanguageModel of the place and the remaining 10
        become single-tweet test items.
        @arg cities a list() of cities, each a list() of place ids
        @arg names a list() of display names, parallel to cities
    """
    candls = ['-', '--']
    mks = ['o', '^', '*', 'v', 's']
    for idx in range(len(cities)):
        lms = dict()
        test = Dataset()
        for pid in cities[idx]:
            twtp = loadrows(GEOTWEET, ('place_id', 'text', 'created_at'),
                            ('place_id=\'{0}\''.format(pid), ),
                            'sample_switch_place_cate',
                            'order by rand() limit 110')
            lms[pid] = LanguageModel(twtp['text'][:100])
            for cnt in range(100, 110):
                test.append({
                    'label': twtp['place_id'][cnt],
                    'lm': LanguageModel([twtp['text'][cnt], ]),
                })
        lmranks = list()
        for twtlm in test:
            lmranks.append(ranke(lms, twtlm['lm']))
        lmeval = batcheval(lmranks, test['label'])
        # Line style alternates per city and the marker changes every second
        # city.  '//' keeps the index an integer under any division mode and
        # the modulo lets the markers wrap instead of raising IndexError.
        plt.plot(lmeval['pos'], lmeval['rate'],
                 ls=candls[idx % len(candls)],
                 marker=mks[(idx // len(candls)) % len(mks)],
                 label='{0}($s=100$)'.format(names[idx]))
    plt.legend(loc='lower right')
    plt.ylabel('Rate containing reference POI')
    plt.xlabel('Top $p$ places')
    plt.show()
def richrank(cities, names): candls = ['-', '--'] mks = ['o', '^', '*'] for idx in range(len(cities)): lms = dict() test = Dataset() for pid in cities[idx]: twtp = loadrows(GEOTWEET, ('place_id', 'text', 'created_at'), ('place_id=\'{0}\''.format(pid),), 'sample', 'order by rand() limit 110') lms[pid] = LanguageModel(twtp['text'][:100]) for cnt in range(100, 110): test.append({'label': twtp['place_id'][cnt], 'lm': LanguageModel([twtp['text'][cnt],])}) lmranks = list() randranks = list() for twtlm in test: lmranks.append(ranke(lms, twtlm['lm'])) randranks.append(randranke(cities[idx])) lmeval = batcheval(lmranks, test['label']) print names[idx], 'P@1', (lmeval['rate'][1] - 0.1) plt.plot(lmeval['pos'], lmeval['rate'], ls=candls[idx%2], marker=mks[idx/2], label='{0}($s=100$)'.format(names[idx])) plt.plot(lmeval['pos'], [float(r) / max(lmeval['pos']) for r in lmeval['pos']], ls='-.', marker='s', label='Random Baseline') plt.legend(loc='lower right') plt.ylabel('Rate containing referece POI') plt.xlabel('Top $p$ places') plt.show()
def linearjoin(ranks, balance):
    """ Combine several ranks into one by a weighted sum of their scores

        The scores of every input rank are replaced by unitscore() of
        themselves, i.e. the inputs are modified.  The ranks are then
        aligned by sorting each on 'label' and summed position by position.
        @arg ranks a list() of ranks that need to combine
        @arg balance a list() of floats indicating combine weights
        @return a combined rank, ordered by ascending combined score
    """
    for rank in ranks:
        rank["score"] = unitscore(rank["score"])
    aligned = [list(rank.sorted_items("label")) for rank in ranks]
    combined = list()
    for pos in range(len(aligned[0])):
        # labels are taken from the first rank
        label = aligned[0][pos]["label"]
        scores = [rows[pos]["score"] for rows in aligned]
        weighted = sum(score * weight
                       for score, weight in zip(scores, balance))
        combined.append((label, weighted))
    combined.sort(key=itemgetter(1))
    joined = Dataset()
    for label, score in combined:
        joined.append({"label": label, "score": score})
    return joined
def f_tf(text):
    """build the term frequency feature of every line
    @arg text list() of str()
    @return Dataset() holding the line2tf() result of each line, in order
    """
    vectors = Dataset()
    for sentence in text:
        termvec = line2tf(sentence)
        vectors.append(termvec)
    return vectors
def randranke(cand):
    """ Rank the candidates in random order

        Each label in cand gets a random.random() score and the result is
        ordered by ascending score.
        @arg cand an iterable of labels
        @return a Dataset() of {'label', 'score'}
    """
    scored = sorted(((label, random.random()) for label in cand),
                    key=itemgetter(1))
    ranking = Dataset()
    for label, score in scored:
        ranking.append({"label": label, "score": score})
    return ranking
def randranke(cand):
    """ Build a rank of cand with random scores

        @arg cand an iterable of labels
        @return a Dataset() of {'label', 'score'} ordered by ascending
                random score
    """
    pairs = [(label, random.random()) for label in cand]
    pairs.sort(key=itemgetter(1))
    shuffled = Dataset()
    for pair in pairs:
        label, score = pair
        shuffled.append({'label': label, 'score': score})
    return shuffled
def batcheval(ranks, expects, maxpos=-1):
    """ Run evaluate() for every position from 0 up to maxpos (inclusive)
        @arg ranks list() of ranks returned by webguess()
        @arg expects list() of expected labels
        @arg maxpos the max position; -1 means the size of the first rank
        @return a Dataset() of {'pos', 'rate'}, one entry per position
    """
    lastpos = ranks[0].size() if maxpos == -1 else maxpos
    curve = Dataset()
    for pos in range(lastpos + 1):
        rate = evaluate(ranks, expects, pos)
        curve.append({'pos': pos, 'rate': rate})
    return curve
def norm_v2(dset):
    """normalize values in vector wise, row normalization (L_2)
    @arg dset Dataset() of vectors
    @return Dataset() of vectors normalized; a row whose components are
            all zero is returned as all zeros
    """
    # Take the column keys explicitly.  Elsewhere in this code base iterating
    # a Dataset directly yields rows, so 'for key in dset' is not a reliable
    # way to obtain the keys.
    keys = list(dset.iterkeys())
    ndset = Dataset()
    for idx in range(dset.size()):
        sqrval = math.sqrt(sum(dset[key][idx] ** 2 for key in keys))
        item = DataItem()
        for key in keys:
            # A zero norm means every component is zero; emit zeros instead
            # of dividing by zero.
            item[key] = dset[key][idx] / sqrval if sqrval else 0.0
        ndset.append(item)
    return ndset
def gen_crs_arff(self, dst, fold, key_lst=None,
                 typemap=None,
                 default_type='NUMERIC'):
    """generate dataset for cross validation

    The instances of self are grouped by their '__CLASS__' value; every
    group is shuffled and divided by list_split() into `fold` parts.  For
    each fold i the i-th part of every class forms the test set and all
    other parts form the training set, which are written through gen_arff()
    to '<dst>.test.<i>.arff' and '<dst>.train.<i>.arff'.
    @arg dst prefix of the generated file names
    @arg fold the number of folds
    @arg key_lst passed through to gen_arff()
    @arg typemap passed through to gen_arff(); defaults to
         {'__CLASS__': 'DISC'}
    @arg default_type passed through to gen_arff()
    """
    # A fresh default per call: a dict default in the signature is shared by
    # all calls, so a callee changing it would leak into later calls.
    if typemap is None:
        typemap = {'__CLASS__': 'DISC'}
    clses = dict()
    for i in range(len(self)):
        clses.setdefault(self[i]['__CLASS__'], {'list': list()})
        clses[self[i]['__CLASS__']]['list'].append(i)
    for cls in clses:
        random.shuffle(clses[cls]['list'])
        clses[cls]['fold'] = list_split(clses[cls]['list'], fold)
    for i in range(fold):
        test = Dataset()
        train = Dataset()
        for cls in clses:
            parts = clses[cls]['fold']
            test.extend([self[f] for f in parts[i]])
            for j in range(fold):
                if j != i:
                    train.extend([self[f] for f in parts[j]])
        gen_arff(test, '{0}.test.{1}.arff'.format(dst, i), key_lst,
                 typemap, default_type)
        gen_arff(train, '{0}.train.{1}.arff'.format(dst, i), key_lst,
                 typemap, default_type)
def ranke(cand, ref, **kargs):
    """ Score ref against every model in cand and order the labels by score

        The sort is ascending when ref.isasc() is true and descending
        otherwise.
        @arg cand dict() of (label, model)
        @arg ref model for testing
        @arg kargs extra keyword arguments handed to each model's score()
        @return an ordered Dataset() of {'label', 'score'}
    """
    scored = [(label, cand[label].score(ref, **kargs))
              for label in cand.iterkeys()]
    scored.sort(key=itemgetter(1), reverse=not ref.isasc())
    ranking = Dataset()
    for label, score in scored:
        ranking.append({"label": label, "score": score})
    return ranking
def ranke(cand, ref, **kargs):
    """ Order the candidate labels by how their models score ref

        @arg cand dict() of (label, model)
        @arg ref model for testing; ref.isasc() selects ascending order
        @arg kargs extra keyword arguments handed to each model's score()
        @return an ordered Dataset() of {'label', 'score'}
    """
    pairs = list()
    for label in cand.iterkeys():
        model = cand[label]
        pairs.append((label, model.score(ref, **kargs)))
    descending = not ref.isasc()
    ordered = Dataset()
    for pair in sorted(pairs, key=itemgetter(1), reverse=descending):
        ordered.append({'label': pair[0], 'score': pair[1]})
    return ordered
def cmpsparse(cities, numtwts, numtest):
    """ Compare the model performance trained with different amount of tweets

        For every place max(numtwts) + numtest randomly ordered tweets are
        loaded.  One LanguageModel per size in numtwts is built from the
        leading tweets and the numtest tweets after position max(numtwts)
        become single-tweet test items.  One curve per size and a linear
        random baseline are plotted.
        @arg cities a list() of cities, each a list() of place ids
        @arg numtwts a list() of training sizes (tweets per place)
        @arg numtest the number of test tweets per place
    """
    mks = ['o', '^', '*', 'v', 's']
    trainmax = max(numtwts)
    lmranks = [list() for _ in numtwts]
    test = Dataset()
    for places in cities:
        lms = [dict() for _ in numtwts]
        tst = Dataset()
        for pid in places:
            twtp = loadrows(
                GEOTWEET, ('place_id', 'text', 'created_at'),
                ('place_id=\'{0}\''.format(pid), ), 'sample',
                'order by rand() limit {0}'.format(trainmax + numtest))
            for i, n in enumerate(numtwts):
                lms[i][pid] = LanguageModel(twtp['text'][:n])
            # test data: the tweets no model has been trained on
            for i in range(trainmax, trainmax + numtest):
                tst.append({
                    'label': pid,
                    'lm': LanguageModel([twtp['text'][i], ]),
                })
        test.extend(tst)
        # rank each test item against the places of its own city only
        for item in tst:
            for i in range(len(numtwts)):
                lmranks[i].append(ranke(lms[i], item['lm']))

    # plot
    for i, n in enumerate(numtwts):
        lmeval = batcheval(lmranks[i], test['label'])
        # wrap around so that more sizes than markers do not raise IndexError
        plt.plot(lmeval['pos'], lmeval['rate'],
                 label='tweet(s={0})'.format(n),
                 marker=mks[i % len(mks)])
    plt.plot(lmeval['pos'],
             [float(r) / max(lmeval['pos']) for r in lmeval['pos']],
             ls='-.', marker='s', label='Random Baseline')
    plt.legend(loc='lower right')
    plt.ylabel('Rate containing Reference POI')
    plt.xlabel('Top $p$ places')
    plt.show()
def log_parse(src):
    """parse prediction output from WEKA

    Every line of src has the characters matched by _SYMBOL blanked and is
    split on _SPACE; the columns are then read by position.
    @arg src path of the prediction log
    @return Dataset() of DataItem() with the keys
            'ref'/'refN'  number and name of the reference class (column 2)
            'prd'/'prdN'  number and name of the predicted class (column 3)
            'err'         True when column 4 is the '+' marker
            'score'       list() of floats preceding the id column
            'id'          int taken from the last but one column
    """
    ins_lst = Dataset()
    with open(src) as fsrc:
        for line in fsrc:
            col = _SPACE.split(_SYMBOL.sub(' ', line))
            ref = _CLSNO.split(col[2])
            prd = _CLSNO.split(col[3])
            ins = DataItem()
            ins['ref'] = int(ref[0])
            ins['refN'] = ref[1]
            ins['prd'] = int(prd[0])
            ins['prdN'] = prd[1]
            ins['err'] = col[4] == '+'
            # When column 4 holds the '+' marker the scores start one column
            # later; feeding '+' to float() raises ValueError.
            first = 5 if ins['err'] else 4
            ins['score'] = [float(val) for val in col[first:-2]]
            ins['id'] = int(_PARATH.sub('', col[-2]))
            ins_lst.append(ins)
    return ins_lst
def placetotalrank(ranks, expects):
    """ Sum up, per reference POI, the positions it takes in its ranks

        Positions are zero based.  A test item whose label does not occur
        in its rank adds nothing to the total of that label.
        @arg ranks a list() of ranks(dataset(){'label', 'score'})
        @arg expects a list() of testitems(dataset(){'label', ...})
        @return a dataset(){'label', 'totalrank'} sorted by ascending
                totalrank
    """
    totals = dict()
    for i in range(expects.size()):
        target = expects['label'][i]
        totals.setdefault(target, 0)
        rank = ranks[i]
        for pos in range(rank.size()):
            if rank['label'][pos] == target:
                totals[target] += pos
                break
    ordered = sorted(totals.items(), key=itemgetter(1))
    res = Dataset()
    for label, total in ordered:
        res.append({'label': label, 'totalrank': total})
    return res
def norm_e(dset):
    """normalize values in element wise, i.e., column normalization

    Every value is divided by the maximum of its column.
    @arg dset Dataset() of vectors
    @return Dataset() of vectors normalized
    @raise ZeroDivisionError when the maximum of a column is 0
    """
    ndset = Dataset()
    for key in dset.iterkeys():
        # float() forces true division; with integer columns Python 2 would
        # otherwise truncate every ratio to a whole number.
        maxval = float(max(dset[key]))
        ndset[key] = [val / maxval for val in dset[key]]
    return ndset
def linearjoin(ranks, balance):
    """ Join a set of ranks by the weights in balance

        Each rank's 'score' column is overwritten with unitscore() of
        itself before joining, so the given ranks are changed.  Ranks are
        lined up by sorting each on 'label'; labels come from the first one.
        @arg ranks a list() of ranks that need to combine
        @arg balance a list() of floats indicating combine weights
        @return a combined rank with ascending weighted scores
    """
    for rank in ranks:
        rank['score'] = unitscore(rank['score'])
    bylabel = [list(rank.sorted_items('label')) for rank in ranks]
    merged = list()
    for pos in range(len(bylabel[0])):
        label = bylabel[0][pos]['label']
        scores = [items[pos]['score'] for items in bylabel]
        total = sum(score * weight
                    for score, weight in zip(scores, balance))
        merged.append((label, total))
    res = Dataset()
    for label, total in sorted(merged, key=itemgetter(1)):
        res.append({'label': label, 'score': total})
    return res
def cmpsparse(cities, numtwts, numtest):
    """ Compare the model performance trained with different amount of tweets

        For every place max(numtwts) + numtest randomly ordered tweets are
        loaded.  One LanguageModel per size in numtwts is built from the
        leading tweets and the numtest tweets after position max(numtwts)
        become single-tweet test items.  One curve per size and a linear
        random baseline are plotted.
        @arg cities a list() of cities, each a list() of place ids
        @arg numtwts a list() of training sizes (tweets per place)
        @arg numtest the number of test tweets per place
    """
    mks = ['o', '^', '*', 'v', 's']
    trainmax = max(numtwts)
    lmranks = [list() for _ in numtwts]
    test = Dataset()
    for places in cities:
        lms = [dict() for _ in numtwts]
        tst = Dataset()
        for pid in places:
            twtp = loadrows(GEOTWEET, ('place_id', 'text', 'created_at'),
                            ('place_id=\'{0}\''.format(pid),), 'sample',
                            'order by rand() limit {0}'.format(
                                trainmax + numtest))
            for i, n in enumerate(numtwts):
                lms[i][pid] = LanguageModel(twtp['text'][:n])
            # test data: the tweets no model has been trained on
            for i in range(trainmax, trainmax + numtest):
                tst.append({'label': pid,
                            'lm': LanguageModel([twtp['text'][i],]),
                            })
        test.extend(tst)
        # rank each test item against the places of its own city only
        for item in tst:
            for i in range(len(numtwts)):
                lmranks[i].append(ranke(lms[i], item['lm']))

    # plot
    for i, n in enumerate(numtwts):
        lmeval = batcheval(lmranks[i], test['label'])
        # wrap around so that more sizes than markers do not raise IndexError
        plt.plot(lmeval['pos'], lmeval['rate'],
                 label='tweet(s={0})'.format(n),
                 marker=mks[i % len(mks)])
    plt.plot(lmeval['pos'],
             [float(r) / max(lmeval['pos']) for r in lmeval['pos']],
             ls='-.', marker='s', label='Random Baseline')
    plt.legend(loc='lower right')
    plt.ylabel('Rate containing Reference POI')
    plt.xlabel('Top $p$ places')
    plt.show()
def cmptimeweb(cities, numtwts, numtest):
    """ compare the time model + web model to original pure text model

        For every place max(numtwts) + numtest randomly ordered tweets and
        up to 30 web rows are loaded.  Language and time models are built
        per size in numtwts, a web language model per place, and the
        numtest tweets after position max(numtwts) become test items.
        Only the pure tweet curves and a linear random baseline are
        plotted; the time, web and joined ranks are computed but not drawn.
        @arg cities a list() of cities, each a list() of place ids
        @arg numtwts a list() of training sizes (tweets per place)
        @arg numtest the number of test tweets per place
    """
    lmranks = [list() for _ in numtwts]
    tmranks = [list() for _ in numtwts]
    wmranks = list()
    lmtmranks = [list() for _ in numtwts]
    wmlmranks = [list() for _ in numtwts]
    wmlmtmranks = [list() for _ in numtwts]
    test = Dataset()

    for city in cities:
        lms = [dict() for _ in numtwts]
        tms = [dict() for _ in numtwts]
        wms = dict()
        tst = Dataset()
        for pid in city:
            twtp = loadrows(
                GEOTWEET, ('place_id', 'text', 'created_at'),
                ('place_id=\'{0}\''.format(pid), ),
                'sample_switch_place_cate',
                'order by rand() limit {0}'.format(max(numtwts) + numtest))
            for i, n in enumerate(numtwts):
                lms[i][pid] = LanguageModel(twtp['text'][:n])
                tms[i][pid] = TimeModel(twtp['created_at'][:n])
            web = loadrows(GEOTWEET, ('place_id', 'web'),
                           ('place_id=\'{0}\''.format(pid), ), 'web',
                           'order by rand() limit 30')
            try:
                wms[pid] = LanguageModel(web['web'])
            except KeyError:
                # no 'web' column came back: fall back to an empty model
                wms[pid] = LanguageModel('')

            # Prepare test data by the tail part of the data retrieved from db
            test_pos = max(numtwts)
            for i in range(test_pos, test_pos + numtest):
                tst.append({
                    'label': pid,
                    'lm': LanguageModel([twtp['text'][i], ]),
                    'tm': TimeModel([twtp['created_at'][i], ]),
                })
        test.extend(tst)

        # rank each test item against the places of its own city only
        for item in tst:
            for i, _ in enumerate(numtwts):
                lmranks[i].append(ranke(lms[i], item['lm']))
                tmranks[i].append(ranke(tms[i], item['tm']))
            wmranks.append(ranke(wms, item['lm']))

    # NOTE(review): the joined ranks are not plotted below.  The calls are
    # kept because linearjoin() appears to rewrite the scores of the ranks it
    # is given, so dropping them could change what is evaluated afterwards.
    for i in range(len(numtwts)):
        for ranklm, ranktm in zip(lmranks[i], tmranks[i]):
            lmtmranks[i].append(linearjoin([ranklm, ranktm], [0.5, 0.5]))
        for ranklm, rankwm in zip(lmranks[i], wmranks):
            wmlmranks[i].append(linearjoin([ranklm, rankwm], [0.5, 0.5]))
        for ranklm, ranktm, rankwm in zip(lmranks[i], tmranks[i], wmranks):
            wmlmtmranks[i].append(
                linearjoin([ranklm, ranktm, rankwm], [0.33, 0.33, 0.33]))

    # plot
    candls = ['-', '--', '-.']
    for i in range(len(numtwts)):
        lmeval = batcheval(lmranks[i], test['label'])
        # wrap around so that more sizes than styles do not raise IndexError
        plt.plot(lmeval['pos'], lmeval['rate'],
                 label='tweet(s={0})'.format(numtwts[i]),
                 ls=candls[i % len(candls)], marker='o')
    plt.plot(lmeval['pos'],
             [float(r) / max(lmeval['pos']) for r in lmeval['pos']],
             ls='-.', marker='s', label='Random Baseline')
    plt.legend(loc='lower right')
    plt.tight_layout()
    plt.show()
def cmpsparsecombine(cities, numtwts, numtest): """ the combined model performance under the influence of sparseness """ lmranks = [list() for i in range(len(numtwts))] tmranks = [list() for i in range(len(numtwts))] wmranks = list() randranks = list() lmtmranks = [list() for i in range(len(numtwts))] wmlmranks = [list() for i in range(len(numtwts))] wmlmtmranks = [list() for i in range(len(numtwts))] test = Dataset() for places in cities: lms = [dict() for i in range(len(numtwts))] tms = [dict() for i in range(len(numtwts))] wms = dict() tst = Dataset() for pid in places: twtp = loadrows( GEOTWEET, ('place_id', 'text', 'created_at'), ('place_id=\'{0}\''.format(pid), ), 'sample', 'order by rand() limit {0}'.format(max(numtwts) + numtest)) for i in range(len(numtwts)): lms[i][pid] = LanguageModel(twtp['text'][:numtwts[i]]) tms[i][pid] = TimeModel(twtp['created_at'][:numtwts[i]]) web = loadrows(GEOTWEET, ('place_id', 'web'), ('place_id=\'{0}\''.format(pid), ), 'web', 'order by rand() limit 30') wms[pid] = LanguageModel(web['web']) # test data for i in range(max(numtwts), max(numtwts) + numtest): tst.append({ 'label': pid, 'lm': LanguageModel([ twtp['text'][i], ]), 'tm': TimeModel([ twtp['created_at'][i], ]) }) test.extend(tst) # rank for item in tst: for i in range(len(numtwts)): lmranks[i].append(ranke(lms[i], item['lm'])) tmranks[i].append(ranke(tms[i], item['tm'])) wmranks.append(ranke(wms, item['lm'])) randranks.append(randranke(places)) for i in range(len(numtwts)): for ranklm, ranktm in zip(lmranks[i], tmranks[i]): lmtmranks[i].append(linearjoin([ranklm, ranktm], [0.5, 0.5])) for ranklm, rankwm in zip(lmranks[i], wmranks): wmlmranks[i].append(linearjoin([ranklm, rankwm], [0.5, 0.5])) for ranklm, ranktm, rankwm in zip(lmranks[i], tmranks[i], wmranks): wmlmtmranks[i].append(\ linearjoin([ranklm, ranktm, rankwm], [0.33, 0.33, 0.33])) # plot candls = ['-', '--'] mks = ['o', '^', '*', 'v', 's'] i = 0 plt.subplot(121 + i) plt.title('$s={0}$'.format(numtwts[i])) 
tmeval = batcheval(tmranks[i], test['label']) plt.plot(tmeval['pos'], tmeval['rate'], label='time', ls=candls[i], marker='o') lmeval = batcheval(lmranks[i], test['label']) plt.plot(lmeval['pos'], lmeval['rate'], label='tweet', ls=candls[i], marker='^') lmtmeval = batcheval(lmtmranks[i], test['label']) plt.plot(lmtmeval['pos'], lmtmeval['rate'], label='tweet+time', ls=candls[i], marker='*') for plc in placetotalrank(tmranks[i], test)['label'][-10:]: print place_name(plc), plc print placetotalrank(tmranks[i], test)['totalrank'][-10:] for plc in placetotalrank(lmtmranks[i], test)['label'][-10:]: print place_name(plc), plc print placetotalrank(lmtmranks[i], test)['totalrank'][-10:] print wilcoxontest(lmranks[i], lmtmranks[i], test) plt.plot(lmeval['pos'], [float(r) / max(lmeval['pos']) for r in lmeval['pos']], ls='-.', marker='s', label='Random Baseline') plt.legend(loc='lower right') plt.ylabel('Rate containing Reference POI') plt.xlabel('Top $p$ places') i = 1 plt.subplot(121 + i) plt.title('$s={0}$'.format(numtwts[i])) tmeval = batcheval(tmranks[i], test['label']) plt.plot(tmeval['pos'], tmeval['rate'], label='time', ls=candls[i], marker='o') wmlmeval = batcheval(wmlmranks[i], test['label']) plt.plot(wmlmeval['pos'], wmlmeval['rate'], label='tweet + web', ls=candls[i], marker='^') wmlmtmeval = batcheval(wmlmtmranks[i], test['label']) plt.plot(wmlmtmeval['pos'], wmlmtmeval['rate'], label='tweet+time+web', ls=candls[i], marker='*') for plc in placetotalrank(wmlmranks[i], test)['label'][-10:]: print place_name(plc), plc print placetotalrank(wmlmranks[i], test)['totalrank'][-10:] for plc in placetotalrank(wmlmtmranks[i], test)['label'][-10:]: print place_name(plc), plc print placetotalrank(wmlmtmranks[i], test)['totalrank'][-10:] print wilcoxontest(wmlmranks[i], wmlmtmranks[i], test) plt.plot(lmeval['pos'], [float(r) / max(lmeval['pos']) for r in lmeval['pos']], ls='-.', marker='s', label='Random Baseline') plt.legend(loc='lower right') plt.ylabel('Rate containing 
Reference POI') plt.xlabel('Top $p$ places') plt.show()
def cmptimeweb(cities, numtwts, numtest): """ compare the time model + web model to original pure text model """ lmranks = [list() for i in range(len(numtwts))] tmranks = [list() for i in range(len(numtwts))] wmranks = list() randranks = list() lmtmranks = [list() for i in range(len(numtwts))] wmlmranks = [list() for i in range(len(numtwts))] wmlmtmranks = [list() for i in range(len(numtwts))] test = Dataset() for places in cities: lms = [dict() for i in range(len(numtwts))] tms = [dict() for i in range(len(numtwts))] wms = dict() tst = Dataset() for pid in places: twtp = loadrows(GEOTWEET, ('place_id', 'text', 'created_at'), ('place_id=\'{0}\''.format(pid),), 'sample', 'order by rand() limit {0}'.format(max(numtwts) + numtest)) for i in range(len(numtwts)): lms[i][pid] = LanguageModel(twtp['text'][:numtwts[i]]) tms[i][pid] = TimeModel(twtp['created_at'][:numtwts[i]]) web = loadrows(GEOTWEET, ('place_id', 'web'), ('place_id=\'{0}\''.format(pid),), 'web', 'order by rand() limit 30') wms[pid] = LanguageModel(web['web']) # test data for i in range(max(numtwts), max(numtwts) + numtest): tst.append({'label': pid, 'lm': LanguageModel([twtp['text'][i],]), 'tm': TimeModel([twtp['created_at'][i],])}) test.extend(tst) # rank for item in tst: for i in range(len(numtwts)): lmranks[i].append(ranke(lms[i], item['lm'])) tmranks[i].append(ranke(tms[i], item['tm'])) wmranks.append(ranke(wms, item['lm'])) randranks.append(randranke(places)) for i in range(len(numtwts)): for ranklm, ranktm in zip(lmranks[i], tmranks[i]): lmtmranks[i].append(linearjoin([ranklm, ranktm], [0.5, 0.5])) for ranklm, rankwm in zip(lmranks[i], wmranks): wmlmranks[i].append(linearjoin([ranklm, rankwm], [0.5, 0.5])) for ranklm, ranktm, rankwm in zip(lmranks[i], tmranks[i], wmranks): wmlmtmranks[i].append(\ linearjoin([ranklm, ranktm, rankwm], [0.33, 0.33, 0.33])) # plot candls = ['-', '--'] mks = ['o', '^', '*', 'v', 's'] #for i in range(len(numtwts)): #lmeval = batcheval(lmranks[i], test['label']) 
#plt.plot(lmeval['pos'], lmeval['rate'], #label='tweet(s={0})'.format(numtwts[i]), #ls=candls[i%2], marker=mks[i/2]) #for i in range(len(numtwts)): #for plc in placetotalrank(lmranks[i], test)['label'][-10:]: #print place_name(plc), plc #print placetotalrank(lmranks[i], test)['totalrank'][-10:] #print wilcoxontest(lmranks[i], lmranks[i-1], test) #plt.legend(loc='lower right') #--------------------------------------------------------------- for i in range(len(numtwts)): lmeval = batcheval(lmranks[i], test['label']) plt.plot(lmeval['pos'], lmeval['rate'], label='tweet(s={0})'.format(numtwts[i]), ls=candls[i], marker='o') wmlmeval = batcheval(wmlmranks[i], test['label']) plt.plot(wmlmeval['pos'], wmlmeval['rate'], label='tweet(s={0})+web'.format(numtwts[i]), ls=candls[i], marker='^') print wilcoxontest(lmranks[i], wmlmranks[i], test) for plc in placetotalrank(wmlmranks[i], test)['label'][-10:]: print place_name(plc), plc print placetotalrank(wmlmranks[i], test)['totalrank'][-10:] wmeval = batcheval(wmranks, test['label']) for plc in placetotalrank(wmranks, test)['label'][-10:]: print place_name(plc), plc print placetotalrank(wmranks, test)['totalrank'][-10:] plt.plot(wmeval['pos'], wmeval['rate'], label='web', ls=':') plt.plot(lmeval['pos'], [float(r) / max(lmeval['pos']) for r in lmeval['pos']], ls='-.', marker='s', label='Random Baseline') #--------------------------------------------------------------- #for i in range(len(numtwts)): #plt.subplot(121 + i) #plt.title('$s={0}$'.format(numtwts[i])) #lmeval = batcheval(lmranks[i], test['label']) #plt.plot(lmeval['pos'], lmeval['rate'], #label='tweet', #ls=candls[i], marker='o') #lmtmeval = batcheval(lmtmranks[i], test['label']) #plt.plot(lmtmeval['pos'], lmtmeval['rate'], #label='tweet+time', #ls=candls[i], marker='^') #wmlmtmeval = batcheval(wmlmtmranks[i], test['label']) #plt.plot(wmlmtmeval['pos'], wmlmtmeval['rate'], #label='tweet+time+web', #ls=candls[i], marker='*') #plt.legend(loc='lower right') #plt.ylabel('Rate 
containing Reference POI') #plt.xlabel('Top $p$ places') #plt.show() #--------------------------------------------------------------- #i=0 #plt.subplot(121 + i) #plt.title('$s={0}$'.format(numtwts[i])) #tmeval = batcheval(tmranks[i], test['label']) #plt.plot(tmeval['pos'], tmeval['rate'], #label='time', #ls=candls[i], marker='o') #lmeval = batcheval(lmranks[i], test['label']) #plt.plot(lmeval['pos'], lmeval['rate'], #label='tweet', #ls=candls[i], marker='^') #lmtmeval = batcheval(lmtmranks[i], test['label']) #plt.plot(lmtmeval['pos'], lmtmeval['rate'], #label='tweet+time', #ls=candls[i], marker='*') #for plc in placetotalrank(tmranks[i], test)['label'][-10:]: #print place_name(plc), plc #print placetotalrank(tmranks[i], test)['totalrank'][-10:] #for plc in placetotalrank(lmtmranks[i], test)['label'][-10:]: #print place_name(plc), plc #print placetotalrank(lmtmranks[i], test)['totalrank'][-10:] #print wilcoxontest(lmranks[i], lmtmranks[i], test) #plt.legend(loc='lower right') #plt.ylabel('Rate containing Reference POI') #plt.xlabel('Top $p$ places') #i=1 #plt.subplot(121 + i) #plt.title('$s={0}$'.format(numtwts[i])) #tmeval = batcheval(tmranks[i], test['label']) #plt.plot(tmeval['pos'], tmeval['rate'], #label='time', #ls=candls[i], marker='o') #wmlmeval = batcheval(wmlmranks[i], test['label']) #plt.plot(wmlmeval['pos'], wmlmeval['rate'], #label='tweet + web', #ls=candls[i], marker='^') #wmlmtmeval = batcheval(wmlmtmranks[i], test['label']) #plt.plot(wmlmtmeval['pos'], wmlmtmeval['rate'], #label='tweet+time+web', #ls=candls[i], marker='*') #for plc in placetotalrank(wmlmranks[i], test)['label'][-10:]: #print place_name(plc), plc #print placetotalrank(wmlmranks[i], test)['totalrank'][-10:] #for plc in placetotalrank(wmlmtmranks[i], test)['label'][-10:]: #print place_name(plc), plc #print placetotalrank(wmlmtmranks[i], test)['totalrank'][-10:] #print wilcoxontest(wmlmranks[i], wmlmtmranks[i], test) plt.legend(loc='lower right') plt.ylabel('Rate containing Reference 
POI') plt.xlabel('Top $p$ places') plt.show()