Example #1
def err_analyze(dst, mat, twtf, plcf, col):
    """output csv for mat"""
    twt_lst = dataset.Dataset()
    with open(twtf) as ftwt:
        for line in ftwt:
            twt_lst.append(json.loads(line))

    places = dataset.DataItem()
    with open(plcf) as fplc:
        for line in fplc:
            place = json.loads(line)
            places[place[col]] = place

    with open(dst, 'w') as fdst:
        print >>fdst, '"Ref POI", "Hyp POI", "Text", "Ref Genre", "Hyp Genre", "Ref SGenre", "Hyp SGenre"'
        for i in mat:
            for j in mat:
                #if i != j:
                    for item in mat[i][j]:
                        #              ref    hyp  text  rcat  hcat   rsc   hsc
                        try:
                            print >>fdst, '"{0}","{1}","{2}","{3}","{4}","{5}","{6}"' \
                                .format(csv_filter(places[i]['name']),csv_filter(places[j]['name']), \
                                fourq_filter(csv_filter(twt_lst[item]['text'])), \
                                places[i]['category'],places[j]['category'], \
                                places[i]['super_category'], places[j]['super_category'])
                        except:
                            # silently skip pairs whose place or tweet record is incomplete
                            pass
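The example above builds each CSV row by hand and hides every failure behind a bare except. Below is a minimal sketch of the same output step using Python's csv module, which takes care of quoting and escaping (so a csv_filter-style helper is not needed); the structure of mat, places and twt_lst is taken from the example, while err_analyze_csv and the omission of fourq_filter are assumptions of this sketch, not part of the original project.

import csv

def err_analyze_csv(dst, mat, places, twt_lst):
    """Hypothetical variant: write reference/hypothesis POI pairs to dst as CSV."""
    with open(dst, 'w') as fdst:
        writer = csv.writer(fdst, quoting=csv.QUOTE_ALL)
        writer.writerow(['Ref POI', 'Hyp POI', 'Text',
                         'Ref Genre', 'Hyp Genre', 'Ref SGenre', 'Hyp SGenre'])
        for i in mat:
            for j in mat[i]:
                for item in mat[i][j]:
                    ref, hyp = places.get(i), places.get(j)
                    if ref is None or hyp is None:
                        continue  # skip pairs whose place record is missing
                    writer.writerow([ref['name'], hyp['name'], twt_lst[item]['text'],
                                     ref['category'], hyp['category'],
                                     ref['super_category'], hyp['super_category']])
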
Example #2
def region2arff(dst, region):
    """Generate data in the region in arff format"""
    twt_lst = dataset.loadrows(GEOTWEET, ("place_id", "text"), ("MBRContains({0}, geo)".format(geo_rect(*region)),))
    vec_lst = dataset.Dataset([line2tf(comma_filter(fourq_filter(twt["text"]))) for twt in twt_lst])
    bgdist = vec_lst.bgdist()
    # drop keys whose total count over all tweets is 3 or less
    keylist = [key for key in bgdist.iterkeys() if bgdist[key] > 3]

    keylist.append("__CLASS__")
    for i in range(len(vec_lst)):
        vec_lst[i]["__CLASS__"] = twt_lst[i]["place_id"]
    vec_lst.gen_arff(dst, keylist)
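line2tf, comma_filter and fourq_filter come from the surrounding project and are not shown in these examples. Assuming line2tf maps a tweet's text to a term-frequency dictionary and Dataset.bgdist() sums those dictionaries over the whole dataset, the feature extraction and the frequency cutoff above reduce to roughly the following sketch (all names here are hypothetical stand-ins, not the project's API).

from collections import Counter

def line2tf_sketch(line):
    """Assumed behaviour of line2tf: text -> {token: count}."""
    return dict(Counter(line.lower().split()))

def bgdist_sketch(vec_lst):
    """Assumed behaviour of bgdist(): sum per-tweet counts into one distribution."""
    bgdist = Counter()
    for vec in vec_lst:
        bgdist.update(vec)
    return bgdist

vecs = [line2tf_sketch(text) for text in ("pizza at joe's", "pizza pizza pizza", "coffee downtown")]
bg = bgdist_sketch(vecs)
keylist = [key for key in bg if bg[key] > 3]  # keep only tokens seen more than 3 times
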
Example #3
def region2arff(dst, region):
    """Generate data in the region in arff format"""
    twt_lst = dataset.loadrows(
        GEOTWEET, ('place_id', 'text'),
        ('MBRContains({0}, geo)'.format(geo_rect(*region)), ))
    vec_lst = dataset.Dataset([line2tf(comma_filter(fourq_filter(twt['text']))) \
            for twt in twt_lst])
    bgdist = vec_lst.bgdist()
    # drop keys whose total count over all tweets is 3 or less
    keylist = [key for key in bgdist.iterkeys() if bgdist[key] > 3]

    keylist.append('__CLASS__')
    for i in range(len(vec_lst)):
        vec_lst[i]['__CLASS__'] = twt_lst[i]['place_id']
    vec_lst.gen_arff(dst, keylist)
Example #4
def top_poi_100_crs(dst, city, col):
    """Select all POIs from New York as the candidates"""
    plst = dataset.loadrows(GEOTWEET, ('id',), \
            ('superior_name=\'{0}\''.format(city),), 'sample_dist_100')
    twt_lst = dataset.load_by_place([plc['id'] for plc in plst])

    places = dataset.DataItem()
    vec_lst = dataset.Dataset()
    for twt in twt_lst:
        vec_lst.append(line2tf(comma_filter(fourq_filter(twt['text']))))
        if twt['place_id'] not in places:
            place = dataset.DataItem()
            place['id'] = twt['place_id']
            place['label'] = str(len(places))
            place['name'] = twt['name']
            place['category'] = twt['category']
            place['super_category'] = twt['super_category']
            places[twt['place_id']] = place

    #output the places as json objects
    with open(dst + '.place', 'w') as fplc:
        for key in places:
            print >> fplc, json.dumps(places[key])

    #output the tweets as json objects
    with open(dst + '.tweet', 'w') as ftwt:
        for twt in twt_lst:
            print >> ftwt, json.dumps(twt)

    # drop keys whose total count over all tweets is 3 or less
    bgdist = vec_lst.bgdist()

    keylist = list()
    keylist.append('__NO__')
    keylist.extend([key for key in bgdist.iterkeys() if bgdist[key] > 3])

    keylist.append('__CLASS__')
    for i in range(len(vec_lst)):
        vec_lst[i]['__CLASS__'] = places[twt_lst[i]['place_id']][col]
        vec_lst[i]['__NO__'] = i
    vec_lst.gen_crs_arff(dst, 5, keylist)
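gen_arff and gen_crs_arff belong to the project's dataset module, so their exact output format is not visible here (nor is the 5-fold split done by gen_crs_arff). As a rough, self-contained sketch of what writing these feature vectors in Weka's ARFF format could look like (dense rows, keylist as attribute order, the last key treated as a nominal class; the function name and details are assumptions):

def write_arff_sketch(dst, vec_lst, keylist, relation='tweets'):
    """Write the vectors as a dense ARFF file; the last key is the nominal class."""
    feat_keys, class_key = keylist[:-1], keylist[-1]
    class_vals = sorted(set(str(vec[class_key]) for vec in vec_lst))
    with open(dst, 'w') as f:
        f.write('@relation {0}\n\n'.format(relation))
        for key in feat_keys:
            f.write('@attribute "{0}" numeric\n'.format(key))
        f.write('@attribute "{0}" {{{1}}}\n\n'.format(
            class_key, ','.join('"{0}"'.format(v) for v in class_vals)))
        f.write('@data\n')
        for vec in vec_lst:
            row = [str(vec.get(key, 0)) for key in feat_keys]
            row.append('"{0}"'.format(vec[class_key]))
            f.write(','.join(row) + '\n')
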
Example #5
def cate_smoothed_100(dst, city):
    """Select all POIs from New York as the candidates"""
    plst = dataset.loadrows(GEOTWEET, ("id",), ("superior_name='{0}'".format(city),), "sample_dist_100")
    twt_lst = dataset.load_by_place([plc["id"] for plc in plst])

    twt_lst = smoothing.cate_smooth(twt_lst, 1, lambda x, y: True)

    vec_lst = dataset.Dataset([line2tf(comma_filter(fourq_filter(twt["text"]))) for twt in twt_lst])

    bgdist = vec_lst.bgdist()

    # drop keys whose total count over all tweets is 3 or less
    keylist = [key for key in bgdist.iterkeys() if bgdist[key] > 3]

    keylist.append("__CLASS__")

    for i in range(len(vec_lst)):
        vec_lst[i]["__CLASS__"] = name_filter(dataset.place_name(twt_lst[i]["place_id"], GEOTWEET))
    statistics.class_dist(vec_lst)
    vec_lst.gen_arff(dst, keylist)
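statistics.class_dist(vec_lst) presumably reports how the __CLASS__ labels are distributed after smoothing; its implementation is not part of these examples. A self-contained sketch of such a class-distribution count (the function name and return type are assumptions):

from collections import Counter

def class_dist_sketch(vec_lst, class_key='__CLASS__'):
    """Count how many vectors carry each class label."""
    return Counter(vec.get(class_key) for vec in vec_lst)

# e.g. class_dist_sketch([{'__CLASS__': 'cafe'}, {'__CLASS__': 'cafe'}, {'__CLASS__': 'park'}])
# -> Counter({'cafe': 2, 'park': 1})
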
Example #6
def top_poi_100_crs(dst, city, col):
    """Select all POIs from New York as the candidates"""
    plst = dataset.loadrows(GEOTWEET, ("id",), ("superior_name='{0}'".format(city),), "sample_dist_100")
    twt_lst = dataset.load_by_place([plc["id"] for plc in plst])

    places = dataset.DataItem()
    vec_lst = dataset.Dataset()
    for twt in twt_lst:
        vec_lst.append(line2tf(comma_filter(fourq_filter(twt["text"]))))
        if twt["place_id"] not in places:
            place = dataset.DataItem()
            place["id"] = twt["place_id"]
            place["label"] = str(len(places))
            place["name"] = twt["name"]
            place["category"] = twt["category"]
            place["super_category"] = twt["super_category"]
            places[twt["place_id"]] = place

    # output the places as json objects
    with open(dst + ".place", "w") as fplc:
        for key in places:
            print >> fplc, json.dumps(places[key])

    # output the tweets as json objects
    with open(dst + ".tweet", "w") as ftwt:
        for twt in twt_lst:
            print >> ftwt, json.dumps(twt)

    # drop keys whose total count over all tweets is 3 or less
    bgdist = vec_lst.bgdist()

    keylist = list()
    keylist.append("__NO__")
    keylist.extend([key for key in bgdist.iterkeys() if bgdist[key] > 3])

    keylist.append("__CLASS__")
    for i in range(len(vec_lst)):
        vec_lst[i]["__CLASS__"] = places[twt_lst[i]["place_id"]][col]
        vec_lst[i]["__NO__"] = i
    vec_lst.gen_crs_arff(dst, 5, keylist)
Example #7
def cate_smoothed_100(dst, city):
    """Select all POIs from New York as the candidates"""
    plst = dataset.loadrows(GEOTWEET, ('id',), \
            ('superior_name=\'{0}\''.format(city),), 'sample_dist_100')
    twt_lst = dataset.load_by_place([plc['id'] for plc in plst])

    twt_lst = smoothing.cate_smooth(twt_lst, 1, lambda x, y: True)

    vec_lst = dataset.Dataset(
        [line2tf(comma_filter(fourq_filter(twt['text']))) for twt in twt_lst])

    bgdist = vec_lst.bgdist()

    # drop keys whose total count over all tweets is 3 or less
    keylist = [key for key in bgdist.iterkeys() if bgdist[key] > 3]

    keylist.append('__CLASS__')

    for i in range(len(vec_lst)):
        vec_lst[i]['__CLASS__'] = name_filter(
            dataset.place_name(twt_lst[i]['place_id'], GEOTWEET))
    statistics.class_dist(vec_lst)
    vec_lst.gen_arff(dst, keylist)
Example #8
def all_poi_100(dst, col):
    """Select all POIs from New York as the candidates"""
    plst = dataset.loadrows(GEOTWEET, ("id",), None, "sample_dist_100")
    twt_lst = dataset.load_by_place([plc["id"] for plc in plst])

    places = dataset.DataItem()
    vec_lst = dataset.Dataset()
    for twt in twt_lst:
        vec_lst.append(line2tf(comma_filter(fourq_filter(twt["text"]))))
        if twt["place_id"] not in places:
            place = dataset.DataItem()
            place["id"] = twt["place_id"]
            place["label"] = str(len(places))
            place["name"] = twt["name"]
            place["category"] = twt["category"]
            place["super_category"] = twt["super_category"]
            places[twt["place_id"]] = place

    # output the places as json objects
    with open(dst + ".place", "w") as fplc:
        for key in places:
            print >> fplc, json.dumps(places[key])

    # output the tweets as json objects
    with open(dst + ".tweet", "w") as ftwt:
        for twt in twt_lst:
            print >> ftwt, json.dumps(twt)

    # drop keys whose total count over all tweets is 3 or less
    bgdist = vec_lst.bgdist()

    keylist = list()
    keylist.append("__NO__")
    keylist.extend([key for key in bgdist.iterkeys() if bgdist[key] > 3])
    keylist.append("__CLASS__")

    # add idf divisor
    # idf = vec_lst.idf()
    # for vec in vec_lst:
    #     for key in vec.iterkeys():
    #         vec[key] = vec[key]/math.log(float(idf[key])+1)

    for i in range(len(vec_lst)):
        vec_lst[i]["__CLASS__"] = name_filter(places[twt_lst[i]["place_id"]][col])
        vec_lst[i]["__NO__"] = i

    # def wdist(vec_lst):
    #     """get the background distribution"""
    #     bgdist = dict()
    #     for vec in vec_lst:
    #         for key in vec.iterkeys():
    #             if key in bgdist:
    #                 bgdist[key] += vec[key]
    #             else:
    #                 bgdist[key] = vec[key]
    #     return bgdist
    # wpdist = vec_lst.groupfunc('__CLASS__', wdist)
    # for key in wpdist.iterkeys():
    #     print (sorted(wpdist[key], key=itemgetter(1), reverse=True))[0:10]
    vec_lst.gen_arff(dst, keylist)
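The commented-out "idf divisor" block in the example above divides each term frequency by a log-scaled document frequency before the vectors are written out. Assuming vec_lst.idf() returns, for every key, the number of vectors it occurs in, the weighting amounts to the following sketch (function names are hypothetical):

import math

def doc_freq_sketch(vec_lst):
    """Assumed behaviour of idf(): number of vectors in which each key appears."""
    df = {}
    for vec in vec_lst:
        for key in vec:
            df[key] = df.get(key, 0) + 1
    return df

def apply_idf_divisor(vec_lst):
    """Divide each term frequency by log(document frequency + 1), as in the commented block."""
    df = doc_freq_sketch(vec_lst)
    for vec in vec_lst:
        for key in vec:
            vec[key] = vec[key] / math.log(df[key] + 1.0)
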
Example #9
def all_poi_100(dst, col):
    """Select all POIs from New York as the candidates"""
    plst = dataset.loadrows(GEOTWEET, ('id',), \
            None, 'sample_dist_100')
    twt_lst = dataset.load_by_place([plc['id'] for plc in plst])

    places = dataset.DataItem()
    vec_lst = dataset.Dataset()
    for twt in twt_lst:
        vec_lst.append(line2tf(comma_filter(fourq_filter(twt['text']))))
        if twt['place_id'] not in places:
            place = dataset.DataItem()
            place['id'] = twt['place_id']
            place['label'] = str(len(places))
            place['name'] = twt['name']
            place['category'] = twt['category']
            place['super_category'] = twt['super_category']
            places[twt['place_id']] = place

    #output the places as json objects
    with open(dst + '.place', 'w') as fplc:
        for key in places:
            print >> fplc, json.dumps(places[key])

    #output the tweets as json objects
    with open(dst + '.tweet', 'w') as ftwt:
        for twt in twt_lst:
            print >> ftwt, json.dumps(twt)

    # drop keys whose total count over all tweets is 3 or less
    bgdist = vec_lst.bgdist()

    keylist = list()
    keylist.append('__NO__')
    keylist.extend([key for key in bgdist.iterkeys() if bgdist[key] > 3])
    keylist.append('__CLASS__')

    # add idf divisor
    # idf = vec_lst.idf()
    # for vec in vec_lst:
    #     for key in vec.iterkeys():
    #         vec[key] = vec[key]/math.log(float(idf[key])+1)

    for i in range(len(vec_lst)):
        vec_lst[i]['__CLASS__'] = name_filter(
            places[twt_lst[i]['place_id']][col])
        vec_lst[i]['__NO__'] = i

    # def wdist(vec_lst):
    #     """get the background distribution"""
    #     bgdist = dict()
    #     for vec in vec_lst:
    #         for key in vec.iterkeys():
    #             if key in bgdist:
    #                 bgdist[key] += vec[key]
    #             else:
    #                 bgdist[key] = vec[key]
    #     return bgdist
    # wpdist = vec_lst.groupfunc('__CLASS__', wdist)
    # for key in wpdist.iterkeys():
    #     print (sorted(wpdist[key], key=itemgetter(1), reverse=True))[0:10]
    vec_lst.gen_arff(dst, keylist)