Example #1
0
def evaluate():
    fw = open("../result/eval.results", "w")
    flag = True
    counter = 0
    for df in data_files:
        counter += 1
        df = os.path.join(data_dir, df)
        jstr = open(df).read()
        res_dict = proxy.geolocate_cli(jstr, True)
        err = res_dict["error"]
        if err:
            print err
            continue
        sname = res_dict["sname"].lower()
        oconf = res_dict["oconf"]
        # here to ensure 10 geotagged tweets, and 50% hurdle
        if oconf != 2:
            continue
        pc = res_dict["pc"]
        oc = res_dict["oc"]
        plat, plon = lib_grid_search.lookup_coords(pc)
        olat, olon = lib_grid_search.lookup_coords(oc)

        error = gcd_dist.calc_dist_degree(plat, plon, olat, olon)
        error_list.append(error)
        if oc[-2:] == pc[-2:]:
            cc_list.append(True)
        text_pred = res_dict["text_pred"]
        loc_pred = res_dict["loc_pred"]
        tz_pred = res_dict["tz_pred"]
        tweet_num = len(res_dict["tweets"])
        rec = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\n".format(
            sname, oc, pc, text_pred, loc_pred, tz_pred, error, tweet_num)
        print counter, rec,
        fw.write(rec)

    divisor = float(len(error_list))
    acc = len([error for error in error_list if error == 0.0]) / divisor
    cacc = sum(1.0 for c in cc_list if c) / divisor
    acc161 = len([error for error in error_list if error <= 161.0]) / divisor
    mean = sum(error_list) / divisor
    median = sorted(error_list)[int(divisor) / 2]
    fw.write("Acc: {0}\n".format(acc))
    print "Acc: {0}\n".format(acc),
    fw.write("Acc161: {0}\n".format(acc161))
    print "Acc161: {0}\n".format(acc161),
    fw.write("AccCountry: {0}\n".format(cacc))
    print "AccCountry: {0}\n".format(cacc),
    fw.write("Median: {0}\n".format(median))
    print "Median: {0}\n".format(median),
    fw.write("Mean: {0}\n".format(mean))
    print "Mean: {0}\n".format(mean),
    fw.close()
    print "All finished"
Example #2
0
def evaluate():
    fw = open("../result/eval.results", "w")
    flag = True
    counter = 0
    for df in data_files:
        counter += 1
        df = os.path.join(data_dir, df)
        jstr = open(df).read()
        res_dict = proxy.geolocate_cli(jstr, True)
        err = res_dict["error"]
        if err:
            print err
            continue
        sname = res_dict["sname"].lower()
        oconf = res_dict["oconf"]
        # here to ensure 10 geotagged tweets, and 50% hurdle
        if oconf != 2:
            continue
        pc = res_dict["pc"]
        oc = res_dict["oc"]
        plat, plon = lib_grid_search.lookup_coords(pc)
        olat, olon = lib_grid_search.lookup_coords(oc)

        error = gcd_dist.calc_dist_degree(plat, plon, olat, olon)
        error_list.append(error)
        if oc[-2:] == pc[-2:]:
            cc_list.append(True)
        text_pred = res_dict["text_pred"]
        loc_pred = res_dict["loc_pred"]
        tz_pred = res_dict["tz_pred"]
        tweet_num = len(res_dict["tweets"])
        rec = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\n".format(sname, oc, pc, text_pred, loc_pred, tz_pred, error, tweet_num)
        print counter, rec,
        fw.write(rec)

    divisor = float(len(error_list))
    acc = len([error for error in error_list if error == 0.0]) / divisor
    cacc = sum(1.0 for c in cc_list if c) / divisor
    acc161 = len([error for error in error_list if error <= 161.0]) / divisor
    mean = sum(error_list) / divisor
    median = sorted(error_list)[int(divisor)/2]
    fw.write("Acc: {0}\n".format(acc))
    print "Acc: {0}\n".format(acc),
    fw.write("Acc161: {0}\n".format(acc161))
    print "Acc161: {0}\n".format(acc161),
    fw.write("AccCountry: {0}\n".format(cacc))
    print "AccCountry: {0}\n".format(cacc),
    fw.write("Median: {0}\n".format(median))
    print "Median: {0}\n".format(median),
    fw.write("Mean: {0}\n".format(mean))
    print "Mean: {0}\n".format(mean),
    fw.close()
    print "All finished"
Example #3
0
def distill_data(statuses, distill_func):
    """Collapse a sequence of statuses into a single ground-truth dict.

    ``distill_func`` is applied to every status and must return a
    ``(jobj, err_msg)`` pair.  ``jobj`` is read through the keys "text",
    "loc", "tz", "desc", "lat", "lon", and, for the first status only,
    "rname" and "sname".

    Returns ``(gt_dict, err_msg)``.  As soon as ``distill_func`` reports an
    error without an object, ``(None, err_msg)`` is returned.  On success
    ``gt_dict`` holds:

    * "tweets"      -- anonymised text of every status
    * "footprints"  -- ``(city, lat, lon, text)`` for each status that
                       resolved to a city
    * "loc", "tz", "desc" -- most frequent non-empty value, or "" if none
    * "oc"          -- most frequent resolved city (oracle city), or None
    * "oconf"       -- result of ``evaluate_oracle_confidence``
    * "olat", "olon" -- coordinates of "oc", or None when there is no "oc"
    * "rname", "sname" -- taken from the first status (absent when
                       ``statuses`` is empty)
    * "error"       -- the last ``err_msg`` seen (asserted to be falsy)
    """
    err_msg = None
    gt_dict = dict()
    # "oc" is short for oracle city.
    for key in ("oc", "footprints", "loc", "tz", "desc", "tweets"):
        gt_dict[key] = []

    for status in statuses:
        jobj, err_msg = distill_func(status)
        if err_msg and not jobj:
            return (None, err_msg)
        tweet = anonymise_text(jobj["text"])
        loc, tz, desc, lat, lon = [
            jobj[field] for field in ("loc", "tz", "desc", "lat", "lon")]

        # A latitude or longitude that is falsy (including exactly 0) is
        # treated as "no coordinates".
        city = lib_grid_search.lookup_city(lat, lon) if lat and lon else None
        if city:
            gt_dict["oc"].append(city)
            gt_dict["footprints"].append((city, lat, lon, tweet))

        for key, value in (("loc", loc), ("tz", tz), ("desc", desc)):
            if value:
                gt_dict[key].append(value)

        # The names are copied once, from the first status processed.
        if "rname" not in gt_dict:
            gt_dict["rname"] = jobj["rname"]
            gt_dict["sname"] = jobj["sname"]
        gt_dict["tweets"].append(tweet)

    # Reduce each collected list to its most frequent item.
    for key, fallback in (("loc", ""), ("tz", ""), ("desc", ""), ("oc", None)):
        collected = gt_dict[key]
        if collected:
            gt_dict[key] = lib_util.most_freq_item(collected)
        else:
            gt_dict[key] = fallback

    oc = gt_dict["oc"]
    gt_dict["oconf"] = evaluate_oracle_confidence(oc, gt_dict["footprints"])
    olat, olon = lib_grid_search.lookup_coords(oc) if oc else (None, None)
    gt_dict["olat"] = olat
    gt_dict["olon"] = olon
    gt_dict["error"] = err_msg
    assert not err_msg
    return (gt_dict, err_msg)
Example #4
0
def distill_data(statuses, distill_func):
    """Aggregate per-status fields into one ground-truth dict.

    Each status is converted with ``distill_func``, which returns
    ``(jobj, err_msg)``.  If it yields an error and no object the function
    stops and returns ``(None, err_msg)``.  Otherwise the anonymised text,
    location, time zone, description and (when both coordinates are truthy
    and resolve to a city) a footprint are collected.

    After the loop "loc", "tz", "desc" and "oc" are replaced by their most
    frequent collected value ("" / "" / "" / None when nothing was
    collected), the oracle confidence is stored under "oconf", and the
    coordinates of the oracle city under "olat" / "olon" (None without an
    oracle city).  "rname" and "sname" come from the first status.

    Returns ``(gt_dict, err_msg)``; ``err_msg`` is asserted to be falsy.
    """
    err_msg = None
    cities, footprints, locs, tzs, descs, tweets = [], [], [], [], [], []
    gt_dict = dict()
    gt_dict["oc"] = cities  # short for oracle city
    gt_dict["footprints"] = footprints
    gt_dict["loc"] = locs
    gt_dict["tz"] = tzs
    gt_dict["desc"] = descs
    gt_dict["tweets"] = tweets

    for position, status in enumerate(statuses):
        jobj, err_msg = distill_func(status)
        if err_msg and not jobj:
            return (None, err_msg)
        text = anonymise_text(jobj["text"])
        loc = jobj["loc"]
        tz = jobj["tz"]
        desc = jobj["desc"]
        lat = jobj["lat"]
        lon = jobj["lon"]

        # Only look up a city when both coordinates are truthy.
        if lat and lon:
            city = lib_grid_search.lookup_city(lat, lon)
            if city:
                cities.append(city)
                footprints.append((city, lat, lon, text))
        if loc:
            locs.append(loc)
        if tz:
            tzs.append(tz)
        if desc:
            descs.append(desc)

        # Real name and screen name are taken from the first status only.
        if position == 0:
            gt_dict["rname"] = jobj["rname"]
            gt_dict["sname"] = jobj["sname"]
        tweets.append(text)

    def pick(values, fallback):
        # Most frequent collected value, or the fallback for an empty list.
        return lib_util.most_freq_item(values) if values else fallback

    gt_dict["loc"] = pick(locs, "")
    gt_dict["tz"] = pick(tzs, "")
    gt_dict["desc"] = pick(descs, "")
    oc = pick(cities, None)
    gt_dict["oc"] = oc
    gt_dict["oconf"] = evaluate_oracle_confidence(oc, footprints)
    if not oc:
        gt_dict["olat"] = None
        gt_dict["olon"] = None
    else:
        olat, olon = lib_grid_search.lookup_coords(oc)
        gt_dict["olat"] = olat
        gt_dict["olon"] = olon
    gt_dict["error"] = err_msg
    assert not err_msg
    return (gt_dict, err_msg)
Example #5
0
def predict_by_text(text):
    features = feature_adapter.extract_text_features([text])
    print features
    text_pred = text_decoder.predict(features)
    print text_pred
    gt_dict = dict()
    slr_lat, slr_lon = lib_grid_search.lookup_coords(text_pred)
    gt_dict["liw"] = features
    gt_dict["pc"] = text_pred
    gt_dict["plat"] = slr_lat
    gt_dict["plon"] = slr_lon
    gt_dict["errdist"] = ""
    gt_dict["error"] = None
    gt_dict["text_pred"] = text_pred
    gt_dict["loc_pred"] = ""
    gt_dict["tz_pred"] = ""
    return gt_dict
Example #6
0
def predict_by_text(text):
    features = feature_adapter.extract_text_features([text])
    print features
    text_pred = text_decoder.predict(features)
    print text_pred
    gt_dict = dict()
    slr_lat, slr_lon = lib_grid_search.lookup_coords(text_pred)
    gt_dict["liw"] = features
    gt_dict["pc"] = text_pred
    gt_dict["plat"] = slr_lat
    gt_dict["plon"] = slr_lon
    gt_dict["errdist"] = "";
    gt_dict["error"] = None;
    gt_dict["text_pred"] = text_pred
    gt_dict["loc_pred"] = ""
    gt_dict["tz_pred"] = ""
    return gt_dict
Example #7
0
def geolocate(data, enable_cache=True):
    """
    Input:
        1. user screen name
        2. user timeline JSON data (assuming the same user name)
    Ouptut:
        GT-dict
    Supported Options: caching;
    """
    # identify input format
    sname = None  #user screen name
    gt_dict = None  #JSON format result
    err_msg = None
    parsed_data, err_msg = parse_input(data)

    if isinstance(parsed_data, basestring):
        sname = parsed_data
    elif isinstance(parsed_data, dict):
        gt_dict = parsed_data
        sname = parsed_data["sname"]
    else:
        gt_dict = {"error": err_msg}
        err_msg = "{0} {1}".format(err_msg, data)
        lib_log.error(err_msg)
        return gt_dict

    sname = sname.lower()
    print "Predict:", sname

    # using cache?
    if enable_cache:
        cached = seek_cache(sname)
        if cached:
            return cached

    # crawl data if input is user name
    if not gt_dict:
        gt_dict, err_msg = twitter_adapter.parse_user_timeline(sname)
        if err_msg:
            gt_dict = {"error": err_msg}
            err_msg = "{0} {1}".format(err_msg, sname)
            lib_log.error(err_msg)
            return gt_dict

    # sequential classifier
    text_pred = text_decoder.predict(
        feature_adapter.extract_text_features(gt_dict["tweets"]))
    loc_pred = loc_decoder.predict(
        feature_adapter.extract_ngram_features(gt_dict["loc"]))
    tz_pred = tz_decoder.predict(
        feature_adapter.extract_ngram_features(gt_dict["tz"]))
    #desc_pred = desc_decoder.predict(feature_adapter.extract_ngram_features(gt_dict["desc"]))
    #rname_pred = rname_decoder.predict(feature_adapter.extract_ngram_features(gt_dict["rname"]))
    print "L0 predictions:", text_pred, loc_pred, tz_pred
    #print text_pred, loc_pred, tz_pred, desc_pred, rname_pred

    text_id = city_adapter.get_id_by_city(text_pred) + fea_num * 0
    loc_id = city_adapter.get_id_by_city(loc_pred) + fea_num * 1
    tz_id = city_adapter.get_id_by_city(tz_pred) + fea_num * 2
    #desc_id = city_adapter.get_id_by_city(desc_pred) + fea_num * 3
    #rname_id = city_adapter.get_id_by_city(rname_pred) + fea_num * 4
    #libsvm_rec = "{0}:1 {1}:1 {2}:1\n".format(text_id, loc_id, tz_id)
    #libsvm_rec = "{0}:1 {1}:1 {2}:1 {3}:1 {4}:1\n".format(text_id, loc_id, tz_id, desc_id, rname_id)
    #print libsvm_rec
    fea_vec_liblinear = [{text_id: 1, loc_id: 1, tz_id: 1}]
    #fea_vec_liblinear = [{text_id:1, loc_id:1, tz_id:1, desc_id:1, rname_id:1}]
    print "L1 data:", fea_vec_liblinear
    p_label, p_acc, p_val = liblinearutil.predict([1], fea_vec_liblinear,
                                                  stacked_model)
    slr_id = int(p_label[0])  # stacked logistic regression prediction label
    slr_pred = city_adapter.get_city_by_id(slr_id)
    print "L1 prediction:", slr_pred
    gt_dict["pc"] = slr_pred
    slr_lat, slr_lon = lib_grid_search.lookup_coords(slr_pred)
    print "L1 prediction coordinates:", slr_lat, slr_lon
    gt_dict["plat"] = slr_lat
    gt_dict["plon"] = slr_lon
    gt_dict["errdist"] = int(
        gcd_dist.calc_dist_degree(slr_lat, slr_lon, gt_dict["olat"],
                                  gt_dict["olon"])) if gt_dict["oc"] else None
    assert (not err_msg)
    gt_dict["error"] = err_msg
    gt_dict["text_pred"] = text_pred
    gt_dict["loc_pred"] = loc_pred
    gt_dict["tz_pred"] = tz_pred

    if enable_cache:
        open("{0}/cache/{1}".format(pkg_path, sname),
             "w").write("{0}\n".format(json.dumps(gt_dict)))

    return gt_dict
Example #8
0
def geolocate(data, enable_cache = True):
    """
    Input:
        1. user screen name
        2. user timeline JSON data (assuming the same user name)
    Ouptut:
        GT-dict
    """
    # identify input format
    sname = None #user screen name
    gt_dict = None #JSON format result
    err_msg = None
    parsed_data, err_msg = parse_input(data)

    if isinstance(parsed_data, basestring):
        sname = parsed_data
    elif isinstance(parsed_data, dict):
        gt_dict = parsed_data
        sname = parsed_data["sname"]
    else:
        gt_dict = {"error":err_msg}
        err_msg = "{0} {1}".format(err_msg, data)
        lib_log.error(err_msg)
        return gt_dict
    
    sname = sname.lower()
    print "Predict:", sname

    # crawl data if input is user name
    if not gt_dict:
        gt_dict, err_msg = twitter_adapter.parse_user_timeline(sname)
        if err_msg:
            gt_dict = {"error":err_msg}
            err_msg = "{0} {1}".format(err_msg, sname)
            lib_log.error(err_msg)
            return gt_dict
    

    # sequential classifier
    text_features = feature_adapter.extract_text_features(gt_dict["tweets"])
    text_pred = text_decoder.predict(text_features)
    loc_pred = loc_decoder.predict(feature_adapter.extract_ngram_features(gt_dict["loc"]))
    tz_pred = tz_decoder.predict(feature_adapter.extract_ngram_features(gt_dict["tz"]))
    print "L0 predictions:", text_pred, loc_pred, tz_pred
    #print text_pred, loc_pred, tz_pred, desc_pred, rname_pred

    text_id = city_adapter.get_id_by_city(text_pred) + fea_num * 0
    loc_id = city_adapter.get_id_by_city(loc_pred) + fea_num * 1
    tz_id = city_adapter.get_id_by_city(tz_pred) + fea_num * 2
    fea_vec_liblinear = [{text_id:1, loc_id:1, tz_id:1}]

    print "L1 data:", fea_vec_liblinear 
    p_label, p_acc, p_val = liblinearutil.predict([1], fea_vec_liblinear, stacked_model)
    slr_id = int(p_label[0]) # stacked logistic regression prediction label
    slr_pred = city_adapter.get_city_by_id(slr_id)
    print "L1 prediction:", slr_pred
    gt_dict["pc"] = slr_pred
    slr_lat, slr_lon = lib_grid_search.lookup_coords(slr_pred)

    print "L1 prediction coordinates:", slr_lat, slr_lon
    gt_dict["liw"] = text_features
    gt_dict["plat"] = slr_lat
    gt_dict["plon"] = slr_lon
    gt_dict["errdist"] = int(gcd_dist.calc_dist_degree(slr_lat, slr_lon, gt_dict["olat"], gt_dict["olon"])) if gt_dict["oc"] else None
    assert(not err_msg)
    gt_dict["error"] = err_msg
    gt_dict["text_pred"] = text_pred
    gt_dict["loc_pred"] = loc_pred
    gt_dict["tz_pred"] = tz_pred
    if enable_cache:
        open("{0}/cache/{1}".format(pkg_path, sname), "w").write("{0}\n".format(json.dumps(gt_dict)))
    return gt_dict