def region2arff(dst, region):
    """Generate data in the region in ARFF format"""
    twt_lst = dataset.loadrows(GEOTWEET, ('place_id', 'text'),
            ('MBRContains({0}, geo)'.format(geo_rect(*region)),))
    vec_lst = dataset.Dataset([line2tf(comma_filter(fourq_filter(twt['text'])))
            for twt in twt_lst])
    bgdist = vec_lst.bgdist()

    # cut all keys that appear 3 times or fewer
    keylist = [key for key in bgdist.iterkeys() if bgdist[key] > 3]
    keylist.append('__CLASS__')
    for i in range(len(vec_lst)):
        vec_lst[i]['__CLASS__'] = twt_lst[i]['place_id']
    vec_lst.gen_arff(dst, keylist)

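# Example usage (hypothetical values): `region` is unpacked into geo_rect(),
# so it is assumed here to be a 4-tuple describing the bounding box, and the
# GEOTWEET database must be reachable for the query to run.
#
#   region2arff('region.arff', (40.70, -74.02, 40.80, -73.93))
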
def top_poi_100_crs(dst, city, col):
    """Select all POIs from the given city as the candidates"""
    plst = dataset.loadrows(GEOTWEET, ('id',),
            ('superior_name=\'{0}\''.format(city),), 'sample_dist_100')
    twt_lst = dataset.load_by_place([plc['id'] for plc in plst])

    places = dataset.DataItem()
    vec_lst = dataset.Dataset()
    for twt in twt_lst:
        vec_lst.append(line2tf(comma_filter(fourq_filter(twt['text']))))
        if twt['place_id'] not in places:
            place = dataset.DataItem()
            place['id'] = twt['place_id']
            place['label'] = str(len(places))
            place['name'] = twt['name']
            place['category'] = twt['category']
            place['super_category'] = twt['super_category']
            places[twt['place_id']] = place

    # output the places as JSON objects
    with open(dst + '.place', 'w') as fplc:
        for key in places:
            print >> fplc, json.dumps(places[key])

    # output the tweets as JSON objects
    with open(dst + '.tweet', 'w') as ftwt:
        for twt in twt_lst:
            print >> ftwt, json.dumps(twt)

    # cut all keys that appear 3 times or fewer
    bgdist = vec_lst.bgdist()
    keylist = list()
    keylist.append('__NO__')
    keylist.extend([key for key in bgdist.iterkeys() if bgdist[key] > 3])
    keylist.append('__CLASS__')

    for i in range(len(vec_lst)):
        vec_lst[i]['__CLASS__'] = places[twt_lst[i]['place_id']][col]
        vec_lst[i]['__NO__'] = i
    vec_lst.gen_crs_arff(dst, 5, keylist)

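# Example usage (hypothetical values): `col` picks which of the place
# attributes stored above ('label', 'name', 'category', 'super_category')
# is used as the class label in the generated ARFF output.
#
#   top_poi_100_crs('nyc_top100', 'New York', 'category')
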
def cate_smoothed_100(dst, city):
    """Select all POIs from the given city as the candidates,
    with category smoothing applied to the tweets"""
    plst = dataset.loadrows(GEOTWEET, ('id',),
            ('superior_name=\'{0}\''.format(city),), 'sample_dist_100')
    twt_lst = dataset.load_by_place([plc['id'] for plc in plst])
    twt_lst = smoothing.cate_smooth(twt_lst, 1, lambda x, y: True)

    vec_lst = dataset.Dataset(
            [line2tf(comma_filter(fourq_filter(twt['text']))) for twt in twt_lst])
    bgdist = vec_lst.bgdist()

    # cut all keys that appear 3 times or fewer
    keylist = [key for key in bgdist.iterkeys() if bgdist[key] > 3]
    keylist.append('__CLASS__')
    for i in range(len(vec_lst)):
        vec_lst[i]['__CLASS__'] = name_filter(
                dataset.place_name(twt_lst[i]['place_id'], GEOTWEET))
    # report the class distribution of the dataset
    statistics.class_dist(vec_lst)
    vec_lst.gen_arff(dst, keylist)

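# Example usage (hypothetical values):
#
#   cate_smoothed_100('nyc_cate_smoothed', 'New York')
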
def all_poi_100(dst, col):
    """Select all POIs from the sample_dist_100 table as the candidates"""
    plst = dataset.loadrows(GEOTWEET, ('id',), None, 'sample_dist_100')
    twt_lst = dataset.load_by_place([plc['id'] for plc in plst])

    places = dataset.DataItem()
    vec_lst = dataset.Dataset()
    for twt in twt_lst:
        vec_lst.append(line2tf(comma_filter(fourq_filter(twt['text']))))
        if twt['place_id'] not in places:
            place = dataset.DataItem()
            place['id'] = twt['place_id']
            place['label'] = str(len(places))
            place['name'] = twt['name']
            place['category'] = twt['category']
            place['super_category'] = twt['super_category']
            places[twt['place_id']] = place

    # output the places as JSON objects
    with open(dst + '.place', 'w') as fplc:
        for key in places:
            print >> fplc, json.dumps(places[key])

    # output the tweets as JSON objects
    with open(dst + '.tweet', 'w') as ftwt:
        for twt in twt_lst:
            print >> ftwt, json.dumps(twt)

    # cut all keys that appear 3 times or fewer
    bgdist = vec_lst.bgdist()
    keylist = list()
    keylist.append('__NO__')
    keylist.extend([key for key in bgdist.iterkeys() if bgdist[key] > 3])
    keylist.append('__CLASS__')

    # add idf divisor
    #idf = vec_lst.idf()
    #for vec in vec_lst:
    #    for key in vec.iterkeys():
    #        vec[key] = vec[key]/math.log(float(idf[key])+1)

    for i in range(len(vec_lst)):
        vec_lst[i]['__CLASS__'] = name_filter(
                places[twt_lst[i]['place_id']][col])
        vec_lst[i]['__NO__'] = i

    #def wdist(vec_lst):
    #    """get the background distribution"""
    #    bgdist = dict()
    #    for vec in vec_lst:
    #        for key in vec.iterkeys():
    #            if key in bgdist:
    #                bgdist[key] += vec[key]
    #            else:
    #                bgdist[key] = vec[key]
    #    return bgdist
    #wpdist = vec_lst.groupfunc('__CLASS__', wdist)
    #for key in wpdist.iterkeys():
    #    print (sorted(wpdist[key], key=itemgetter(1), reverse=True))[0:10]

    vec_lst.gen_arff(dst, keylist)

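if __name__ == '__main__':
    # Minimal driver sketch, assuming the GEOTWEET database and the
    # sample_dist_100 table are available; the output prefix and the
    # 'category' column choice are hypothetical.
    all_poi_100('all_poi_100', 'category')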