Example #1
def region2arff(dst, region):
    """Generate data in the region in arff format"""
    twt_lst = dataset.loadrows(GEOTWEET, ("place_id", "text"), ("MBRContains({0}, geo)".format(geo_rect(*region)),))
    vec_lst = dataset.Dataset([line2tf(comma_filter(fourq_filter(twt["text"]))) for twt in twt_lst])
    bgdist = vec_lst.bgdist()
    # drop keys that appear 3 times or fewer in total
    keylist = [key for key in bgdist.iterkeys() if bgdist[key] > 3]

    keylist.append("__CLASS__")
    for i in range(len(vec_lst)):
        vec_lst[i]["__CLASS__"] = twt_lst[i]["place_id"]
    vec_lst.gen_arff(dst, keylist)
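The helpers line2tf, comma_filter, and fourq_filter are not shown on this page. As a rough sketch only, a line2tf-style term-frequency mapper might look like the following (lower-casing and whitespace tokenization are assumptions, not behavior confirmed by these examples):

def line2tf_sketch(line):
    """Hypothetical stand-in for line2tf: map a line of text to a
    {token: term frequency} dict (tokenization rules assumed)."""
    tf = {}
    for token in line.lower().split():
        tf[token] = tf.get(token, 0) + 1
    return tf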
Example #2
def region2arff(dst, region):
    """Generate data in the region in arff format"""
    twt_lst = dataset.loadrows(
        GEOTWEET, ('place_id', 'text'),
        ('MBRContains({0}, geo)'.format(geo_rect(*region)), ))
    vec_lst = dataset.Dataset(
        [line2tf(comma_filter(fourq_filter(twt['text'])))
         for twt in twt_lst])
    bgdist = vec_lst.bgdist()
    # drop keys that appear 3 times or fewer in total
    keylist = [key for key in bgdist.iterkeys() if bgdist[key] > 3]

    keylist.append('__CLASS__')
    for i in range(len(vec_lst)):
        vec_lst[i]['__CLASS__'] = twt_lst[i]['place_id']
    vec_lst.gen_arff(dst, keylist)
Example #3
def top_poi_100_crs(dst, city, col):
    """Select all POIs from New York as the candidates"""
    plst = dataset.loadrows(GEOTWEET, ('id',),
            ("superior_name='{0}'".format(city),), 'sample_dist_100')
    twt_lst = dataset.load_by_place([plc['id'] for plc in plst])

    places = dataset.DataItem()
    vec_lst = dataset.Dataset()
    for twt in twt_lst:
        vec_lst.append(line2tf(comma_filter(fourq_filter(twt['text']))))
        if twt['place_id'] not in places:
            place = dataset.DataItem()
            place['id'] = twt['place_id']
            place['label'] = str(len(places))
            place['name'] = twt['name']
            place['category'] = twt['category']
            place['super_category'] = twt['super_category']
            places[twt['place_id']] = place

    # output the places as JSON objects
    with open(dst + '.place', 'w') as fplc:
        for key in places:
            print >> fplc, json.dumps(places[key])

    # output the tweets as JSON objects
    with open(dst + '.tweet', 'w') as ftwt:
        for twt in twt_lst:
            print >> ftwt, json.dumps(twt)

    # drop keys that appear 3 times or fewer in total
    bgdist = vec_lst.bgdist()

    keylist = list()
    keylist.append('__NO__')
    keylist.extend([key for key in bgdist.iterkeys() if bgdist[key] > 3])

    keylist.append('__CLASS__')
    for i in range(len(vec_lst)):
        vec_lst[i]['__CLASS__'] = places[twt_lst[i]['place_id']][col]
        vec_lst[i]['__NO__'] = i
    vec_lst.gen_crs_arff(dst, 5, keylist)
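A hypothetical invocation (the destination prefix and city are made up; judging by the places records built above, col would be one of 'id', 'label', 'name', 'category', or 'super_category'):

top_poi_100_crs('ny_top100', 'New York', 'category')

This would write ny_top100.place and ny_top100.tweet as line-delimited JSON, and gen_crs_arff(dst, 5, keylist) presumably emits ARFF output split for 5-fold cross-validation.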
Example #4
def cate_smoothed_100(dst, city):
    """Select all POIs from New York as the candidates"""
    plst = dataset.loadrows(GEOTWEET, ("id",), ("superior_name='{0}'".format(city),), "sample_dist_100")
    twt_lst = dataset.load_by_place([plc["id"] for plc in plst])

    twt_lst = smoothing.cate_smooth(twt_lst, 1, lambda x, y: True)

    vec_lst = dataset.Dataset([line2tf(comma_filter(fourq_filter(twt["text"]))) for twt in twt_lst])

    bgdist = vec_lst.bgdist()

    # drop keys that appear 3 times or fewer in total
    keylist = [key for key in bgdist.iterkeys() if bgdist[key] > 3]

    keylist.append("__CLASS__")

    for i in range(len(vec_lst)):
        vec_lst[i]["__CLASS__"] = name_filter(dataset.place_name(twt_lst[i]["place_id"], GEOTWEET))
    statistics.class_dist(vec_lst)
    vec_lst.gen_arff(dst, keylist)
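statistics.class_dist is not shown; it presumably reports how the __CLASS__ labels are distributed over the vectors. A minimal sketch of such a counter (the printed format is an assumption):

from collections import Counter

def class_dist_sketch(vec_lst):
    """Hypothetical stand-in for statistics.class_dist: tally and
    print how many vectors carry each __CLASS__ label."""
    counts = Counter(vec['__CLASS__'] for vec in vec_lst)
    for label, n in counts.most_common():
        print('%s\t%d' % (label, n))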
Example #5
def top_poi_100_crs(dst, city, col):
    """Select all POIs from New York as the candidates"""
    plst = dataset.loadrows(GEOTWEET, ("id",), ("superior_name='{0}'".format(city),), "sample_dist_100")
    twt_lst = dataset.load_by_place([plc["id"] for plc in plst])

    places = dataset.DataItem()
    vec_lst = dataset.Dataset()
    for twt in twt_lst:
        vec_lst.append(line2tf(comma_filter(fourq_filter(twt["text"]))))
        if twt["place_id"] not in places:
            place = dataset.DataItem()
            place["id"] = twt["place_id"]
            place["label"] = str(len(places))
            place["name"] = twt["name"]
            place["category"] = twt["category"]
            place["super_category"] = twt["super_category"]
            places[twt["place_id"]] = place

    # output the places as json objects
    with open(dst + ".place", "w") as fplc:
        for key in places:
            print >> fplc, json.dumps(places[key])

    # output the tweets as json objects
    with open(dst + ".tweet", "w") as ftwt:
        for twt in twt_lst:
            print >> ftwt, json.dumps(twt)

    # drop keys that appear 3 times or fewer in total
    bgdist = vec_lst.bgdist()

    keylist = list()
    keylist.append("__NO__")
    keylist.extend([key for key in bgdist.iterkeys() if bgdist[key] > 3])

    keylist.append("__CLASS__")
    for i in range(len(vec_lst)):
        vec_lst[i]["__CLASS__"] = places[twt_lst[i]["place_id"]][col]
        vec_lst[i]["__NO__"] = i
    vec_lst.gen_crs_arff(dst, 5, keylist)
Example #6
def cate_smoothed_100(dst, city):
    """Select all POIs from New York as the candidates"""
    plst = dataset.loadrows(GEOTWEET, ('id',),
            ("superior_name='{0}'".format(city),), 'sample_dist_100')
    twt_lst = dataset.load_by_place([plc['id'] for plc in plst])

    twt_lst = smoothing.cate_smooth(twt_lst, 1, lambda x, y: True)

    vec_lst = dataset.Dataset(
        [line2tf(comma_filter(fourq_filter(twt['text']))) for twt in twt_lst])

    bgdist = vec_lst.bgdist()

    # drop keys that appear 3 times or fewer in total
    keylist = [key for key in bgdist.iterkeys() if bgdist[key] > 3]

    keylist.append('__CLASS__')

    for i in range(len(vec_lst)):
        vec_lst[i]['__CLASS__'] = name_filter(
            dataset.place_name(twt_lst[i]['place_id'], GEOTWEET))
    statistics.class_dist(vec_lst)
    vec_lst.gen_arff(dst, keylist)
Example #7
def all_poi_100(dst, col):
    """Select all POIs from New York as the candidates"""
    plst = dataset.loadrows(GEOTWEET, ("id",), None, "sample_dist_100")
    twt_lst = dataset.load_by_place([plc["id"] for plc in plst])

    places = dataset.DataItem()
    vec_lst = dataset.Dataset()
    for twt in twt_lst:
        vec_lst.append(line2tf(comma_filter(fourq_filter(twt["text"]))))
        if twt["place_id"] not in places:
            place = dataset.DataItem()
            place["id"] = twt["place_id"]
            place["label"] = str(len(places))
            place["name"] = twt["name"]
            place["category"] = twt["category"]
            place["super_category"] = twt["super_category"]
            places[twt["place_id"]] = place

    # output the places as json objects
    with open(dst + ".place", "w") as fplc:
        for key in places:
            print >> fplc, json.dumps(places[key])

    # output the tweets as json objects
    with open(dst + ".tweet", "w") as ftwt:
        for twt in twt_lst:
            print >> ftwt, json.dumps(twt)

    # drop keys that appear 3 times or fewer in total
    bgdist = vec_lst.bgdist()

    keylist = list()
    keylist.append("__NO__")
    keylist.extend([key for key in bgdist.iterkeys() if bgdist[key] > 3])
    keylist.append("__CLASS__")

    # add idf divisor
    # idf = vec_lst.idf()
    # for vec in vec_lst:
    #     for key in vec.iterkeys():
    #         vec[key] = vec[key] / math.log(float(idf[key]) + 1)

    for i in range(len(vec_lst)):
        vec_lst[i]["__CLASS__"] = name_filter(places[twt_lst[i]["place_id"]][col])
        vec_lst[i]["__NO__"] = i

    # def wdist(vec_lst):
    #     """get the background distribution"""
    #     bgdist = dict()
    #     for vec in vec_lst:
    #         for key in vec.iterkeys():
    #             if key in bgdist:
    #                 bgdist[key] += vec[key]
    #             else:
    #                 bgdist[key] = vec[key]
    #     return bgdist
    # wpdist = vec_lst.groupfunc('__CLASS__', wdist)
    # for key in wpdist.iterkeys():
    #     print (sorted(wpdist[key], key=itemgetter(1), reverse=True))[0:10]
    vec_lst.gen_arff(dst, keylist)
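For reference, gen_arff presumably writes a Weka ARFF file over keylist. A file with the __NO__/term/__CLASS__ layout built above could start along these lines (the 'coffee' attribute and class values are invented for illustration; Weka's sparse {index value, ...} rows suit term vectors like these):

@RELATION tweets

@ATTRIBUTE __NO__ NUMERIC
@ATTRIBUTE coffee NUMERIC
@ATTRIBUTE __CLASS__ {Park,Museum,Restaurant}

@DATA
{0 0, 1 2, 2 Park}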
Example #8
def all_poi_100(dst, col):
    """Select all POIs from New York as the candidates"""
    plst = dataset.loadrows(GEOTWEET, ('id',), None, 'sample_dist_100')
    twt_lst = dataset.load_by_place([plc['id'] for plc in plst])

    places = dataset.DataItem()
    vec_lst = dataset.Dataset()
    for twt in twt_lst:
        vec_lst.append(line2tf(comma_filter(fourq_filter(twt['text']))))
        if twt['place_id'] not in places:
            place = dataset.DataItem()
            place['id'] = twt['place_id']
            place['label'] = str(len(places))
            place['name'] = twt['name']
            place['category'] = twt['category']
            place['super_category'] = twt['super_category']
            places[twt['place_id']] = place

    # output the places as JSON objects
    with open(dst + '.place', 'w') as fplc:
        for key in places:
            print >> fplc, json.dumps(places[key])

    # output the tweets as JSON objects
    with open(dst + '.tweet', 'w') as ftwt:
        for twt in twt_lst:
            print >> ftwt, json.dumps(twt)

    # drop keys that appear 3 times or fewer in total
    bgdist = vec_lst.bgdist()

    keylist = list()
    keylist.append('__NO__')
    keylist.extend([key for key in bgdist.iterkeys() if bgdist[key] > 3])
    keylist.append('__CLASS__')

    # add idf divisor
    # idf = vec_lst.idf()
    # for vec in vec_lst:
    #     for key in vec.iterkeys():
    #         vec[key] = vec[key]/math.log(float(idf[key])+1)

    for i in range(len(vec_lst)):
        vec_lst[i]['__CLASS__'] = name_filter(
            places[twt_lst[i]['place_id']][col])
        vec_lst[i]['__NO__'] = i

    # def wdist(vec_lst):
    #     """get the background distribution"""
    #     bgdist = dict()
    #     for vec in vec_lst:
    #         for key in vec.iterkeys():
    #             if key in bgdist:
    #                 bgdist[key] += vec[key]
    #             else:
    #                 bgdist[key] = vec[key]
    #     return bgdist
    # wpdist = vec_lst.groupfunc('__CLASS__', wdist)
    # for key in wpdist.iterkeys():
    #     print (sorted(wpdist[key], key=itemgetter(1), reverse=True))[0:10]
    vec_lst.gen_arff(dst, keylist)