def evaluate(): fw = open("../result/eval.results", "w") flag = True counter = 0 for df in data_files: counter += 1 df = os.path.join(data_dir, df) jstr = open(df).read() res_dict = proxy.geolocate_cli(jstr, True) err = res_dict["error"] if err: print err continue sname = res_dict["sname"].lower() oconf = res_dict["oconf"] # here to ensure 10 geotagged tweets, and 50% hurdle if oconf != 2: continue pc = res_dict["pc"] oc = res_dict["oc"] plat, plon = lib_grid_search.lookup_coords(pc) olat, olon = lib_grid_search.lookup_coords(oc) error = gcd_dist.calc_dist_degree(plat, plon, olat, olon) error_list.append(error) if oc[-2:] == pc[-2:]: cc_list.append(True) text_pred = res_dict["text_pred"] loc_pred = res_dict["loc_pred"] tz_pred = res_dict["tz_pred"] tweet_num = len(res_dict["tweets"]) rec = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\n".format( sname, oc, pc, text_pred, loc_pred, tz_pred, error, tweet_num) print counter, rec, fw.write(rec) divisor = float(len(error_list)) acc = len([error for error in error_list if error == 0.0]) / divisor cacc = sum(1.0 for c in cc_list if c) / divisor acc161 = len([error for error in error_list if error <= 161.0]) / divisor mean = sum(error_list) / divisor median = sorted(error_list)[int(divisor) / 2] fw.write("Acc: {0}\n".format(acc)) print "Acc: {0}\n".format(acc), fw.write("Acc161: {0}\n".format(acc161)) print "Acc161: {0}\n".format(acc161), fw.write("AccCountry: {0}\n".format(cacc)) print "AccCountry: {0}\n".format(cacc), fw.write("Median: {0}\n".format(median)) print "Median: {0}\n".format(median), fw.write("Mean: {0}\n".format(mean)) print "Mean: {0}\n".format(mean), fw.close() print "All finished"
def evaluate(): fw = open("../result/eval.results", "w") flag = True counter = 0 for df in data_files: counter += 1 df = os.path.join(data_dir, df) jstr = open(df).read() res_dict = proxy.geolocate_cli(jstr, True) err = res_dict["error"] if err: print err continue sname = res_dict["sname"].lower() oconf = res_dict["oconf"] # here to ensure 10 geotagged tweets, and 50% hurdle if oconf != 2: continue pc = res_dict["pc"] oc = res_dict["oc"] plat, plon = lib_grid_search.lookup_coords(pc) olat, olon = lib_grid_search.lookup_coords(oc) error = gcd_dist.calc_dist_degree(plat, plon, olat, olon) error_list.append(error) if oc[-2:] == pc[-2:]: cc_list.append(True) text_pred = res_dict["text_pred"] loc_pred = res_dict["loc_pred"] tz_pred = res_dict["tz_pred"] tweet_num = len(res_dict["tweets"]) rec = "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\n".format(sname, oc, pc, text_pred, loc_pred, tz_pred, error, tweet_num) print counter, rec, fw.write(rec) divisor = float(len(error_list)) acc = len([error for error in error_list if error == 0.0]) / divisor cacc = sum(1.0 for c in cc_list if c) / divisor acc161 = len([error for error in error_list if error <= 161.0]) / divisor mean = sum(error_list) / divisor median = sorted(error_list)[int(divisor)/2] fw.write("Acc: {0}\n".format(acc)) print "Acc: {0}\n".format(acc), fw.write("Acc161: {0}\n".format(acc161)) print "Acc161: {0}\n".format(acc161), fw.write("AccCountry: {0}\n".format(cacc)) print "AccCountry: {0}\n".format(cacc), fw.write("Median: {0}\n".format(median)) print "Median: {0}\n".format(median), fw.write("Mean: {0}\n".format(mean)) print "Mean: {0}\n".format(mean), fw.close() print "All finished"
def distill_data(statuses, distill_func):
    """Aggregate per-status metadata into one ground-truth dict for a user.

    Each status is reduced by distill_func to a flat dict (text, loc, tz,
    desc, lat, lon, rname, sname). City votes from geotags, profile-field
    votes, and anonymised tweet texts are collected across all statuses,
    then collapsed by majority vote.

    Returns (gt_dict, err_msg); returns (None, err_msg) as soon as
    distill_func reports an error with no usable payload.
    """
    err_msg = None
    gt_dict = dict()
    gt_dict["oc"] = [] #short for oracle city
    gt_dict["footprints"] = []  # (city, lat, lon, text) per geotagged status
    gt_dict["loc"] = []
    gt_dict["tz"] = []
    gt_dict["desc"] = []
    gt_dict["tweets"] = []
    one_off_flag = True  # capture rname/sname from the first status only
    for status in statuses:
        jobj, err_msg = distill_func(status)
        # hard failure (error and no payload): abort the whole user
        if err_msg and not jobj:
            return (None, err_msg)
        text = anonymise_text(jobj["text"])
        loc = jobj["loc"]
        tz = jobj["tz"]
        desc = jobj["desc"]
        lat = jobj["lat"]
        lon = jobj["lon"]
        city = None
        # geotag -> city vote, only when both coordinates are present
        if lat and lon:
            city = lib_grid_search.lookup_city(lat, lon)
        if city:
            gt_dict["oc"].append(city)
            gt_dict["footprints"].append((city, lat, lon, text))
        # profile fields vote only when non-empty
        if loc:
            gt_dict["loc"].append(loc)
        if tz:
            gt_dict["tz"].append(tz)
        if desc:
            gt_dict["desc"].append(desc)
        if one_off_flag:
            gt_dict["rname"] = jobj["rname"]
            gt_dict["sname"] = jobj["sname"]
            one_off_flag = False
        gt_dict["tweets"].append(text)
    # collapse each vote list to its most frequent element ("" / None when empty)
    gt_dict["loc"] = lib_util.most_freq_item(
        gt_dict["loc"]) if gt_dict["loc"] else ""
    gt_dict["tz"] = lib_util.most_freq_item(
        gt_dict["tz"]) if gt_dict["tz"] else ""
    gt_dict["desc"] = lib_util.most_freq_item(
        gt_dict["desc"]) if gt_dict["desc"] else ""
    gt_dict["oc"] = lib_util.most_freq_item(
        gt_dict["oc"]) if gt_dict["oc"] else None
    oc = gt_dict["oc"]
    gt_dict["oconf"] = evaluate_oracle_confidence(oc, gt_dict["footprints"])
    if oc:
        olat, olon = lib_grid_search.lookup_coords(oc)
        gt_dict["olat"] = olat
        gt_dict["olon"] = olon
    else:
        gt_dict["olat"] = None
        gt_dict["olon"] = None
    # err_msg here is whatever the LAST distill_func call returned; the
    # assert documents the invariant that it must be falsy on this path
    gt_dict["error"] = err_msg
    assert (not err_msg)
    return (gt_dict, err_msg)
def distill_data(statuses, distill_func):
    """Reduce a user's statuses to a single ground-truth dict.

    distill_func flattens each status; city votes (from geotags), profile
    fields, and anonymised tweet texts are accumulated, then majority-voted
    into scalar values. Returns (gt_dict, err_msg), or (None, err_msg) when
    distill_func fails hard on any status.
    """
    err_msg = None
    gt_dict = {
        "oc": [],          # oracle-city votes, one per geotagged status
        "footprints": [],  # (city, lat, lon, text) for confidence scoring
        "loc": [],
        "tz": [],
        "desc": [],
        "tweets": [],
    }
    first_status = True
    for status in statuses:
        jobj, err_msg = distill_func(status)
        if err_msg and not jobj:
            # hard failure: abort the whole user
            return (None, err_msg)
        tweet_text = anonymise_text(jobj["text"])
        lat = jobj["lat"]
        lon = jobj["lon"]
        # map the geotag to a city vote when both coordinates are present
        city = lib_grid_search.lookup_city(lat, lon) if lat and lon else None
        if city:
            gt_dict["oc"].append(city)
            gt_dict["footprints"].append((city, lat, lon, tweet_text))
        # non-empty profile fields each contribute one vote
        for field in ("loc", "tz", "desc"):
            value = jobj[field]
            if value:
                gt_dict[field].append(value)
        if first_status:
            # profile names come from the first status only
            gt_dict["rname"] = jobj["rname"]
            gt_dict["sname"] = jobj["sname"]
            first_status = False
        gt_dict["tweets"].append(tweet_text)
    # collapse each vote list to its most frequent element
    for field in ("loc", "tz", "desc"):
        gt_dict[field] = lib_util.most_freq_item(gt_dict[field]) if gt_dict[field] else ""
    gt_dict["oc"] = lib_util.most_freq_item(gt_dict["oc"]) if gt_dict["oc"] else None
    oc = gt_dict["oc"]
    gt_dict["oconf"] = evaluate_oracle_confidence(oc, gt_dict["footprints"])
    if oc:
        gt_dict["olat"], gt_dict["olon"] = lib_grid_search.lookup_coords(oc)
    else:
        gt_dict["olat"] = None
        gt_dict["olon"] = None
    # err_msg carries whatever the last distill_func call returned; it must
    # be falsy on this path (sanity-checked below)
    gt_dict["error"] = err_msg
    assert (not err_msg)
    return (gt_dict, err_msg)
def predict_by_text(text): features = feature_adapter.extract_text_features([text]) print features text_pred = text_decoder.predict(features) print text_pred gt_dict = dict() slr_lat, slr_lon = lib_grid_search.lookup_coords(text_pred) gt_dict["liw"] = features gt_dict["pc"] = text_pred gt_dict["plat"] = slr_lat gt_dict["plon"] = slr_lon gt_dict["errdist"] = "" gt_dict["error"] = None gt_dict["text_pred"] = text_pred gt_dict["loc_pred"] = "" gt_dict["tz_pred"] = "" return gt_dict
def predict_by_text(text): features = feature_adapter.extract_text_features([text]) print features text_pred = text_decoder.predict(features) print text_pred gt_dict = dict() slr_lat, slr_lon = lib_grid_search.lookup_coords(text_pred) gt_dict["liw"] = features gt_dict["pc"] = text_pred gt_dict["plat"] = slr_lat gt_dict["plon"] = slr_lon gt_dict["errdist"] = ""; gt_dict["error"] = None; gt_dict["text_pred"] = text_pred gt_dict["loc_pred"] = "" gt_dict["tz_pred"] = "" return gt_dict
def geolocate(data, enable_cache=True):
    """Geolocate a Twitter user via a two-level stacked classifier.

    Input:
        1. user screen name, or
        2. user timeline JSON data (assuming the same user name)
    Output: GT-dict (prediction fields merged into the parsed timeline dict)
    Supported Options: caching (read via seek_cache, write to pkg_path/cache)

    On unparseable input or a failed timeline crawl, returns {"error": msg}.
    """
    # identify input format
    sname = None #user screen name
    gt_dict = None #JSON format result
    err_msg = None
    parsed_data, err_msg = parse_input(data)
    if isinstance(parsed_data, basestring):
        # a bare screen name: timeline must be crawled below
        sname = parsed_data
    elif isinstance(parsed_data, dict):
        # pre-parsed timeline data
        gt_dict = parsed_data
        sname = parsed_data["sname"]
    else:
        # neither form parsed: log and return the error dict
        gt_dict = {"error": err_msg}
        err_msg = "{0} {1}".format(err_msg, data)
        lib_log.error(err_msg)
        return gt_dict
    sname = sname.lower()
    print "Predict:", sname
    # using cache?
    if enable_cache:
        cached = seek_cache(sname)
        if cached:
            return cached
    # crawl data if input is user name
    if not gt_dict:
        gt_dict, err_msg = twitter_adapter.parse_user_timeline(sname)
        if err_msg:
            gt_dict = {"error": err_msg}
            err_msg = "{0} {1}".format(err_msg, sname)
            lib_log.error(err_msg)
            return gt_dict
    # sequential classifier: L0 = three independent base predictors
    text_pred = text_decoder.predict(
        feature_adapter.extract_text_features(gt_dict["tweets"]))
    loc_pred = loc_decoder.predict(
        feature_adapter.extract_ngram_features(gt_dict["loc"]))
    tz_pred = tz_decoder.predict(
        feature_adapter.extract_ngram_features(gt_dict["tz"]))
    #desc_pred = desc_decoder.predict(feature_adapter.extract_ngram_features(gt_dict["desc"]))
    #rname_pred = rname_decoder.predict(feature_adapter.extract_ngram_features(gt_dict["rname"]))
    print "L0 predictions:", text_pred, loc_pred, tz_pred
    #print text_pred, loc_pred, tz_pred, desc_pred, rname_pred
    # offset each base prediction into its own fea_num-wide id range so the
    # stacked model can tell which classifier produced which city id
    text_id = city_adapter.get_id_by_city(text_pred) + fea_num * 0
    loc_id = city_adapter.get_id_by_city(loc_pred) + fea_num * 1
    tz_id = city_adapter.get_id_by_city(tz_pred) + fea_num * 2
    #desc_id = city_adapter.get_id_by_city(desc_pred) + fea_num * 3
    #rname_id = city_adapter.get_id_by_city(rname_pred) + fea_num * 4
    #libsvm_rec = "{0}:1 {1}:1 {2}:1\n".format(text_id, loc_id, tz_id)
    #libsvm_rec = "{0}:1 {1}:1 {2}:1 {3}:1 {4}:1\n".format(text_id, loc_id, tz_id, desc_id, rname_id)
    #print libsvm_rec
    fea_vec_liblinear = [{text_id: 1, loc_id: 1, tz_id: 1}]
    #fea_vec_liblinear = [{text_id:1, loc_id:1, tz_id:1, desc_id:1, rname_id:1}]
    print "L1 data:", fea_vec_liblinear
    # L1: stacked logistic regression; [1] is a dummy label (true label unknown)
    p_label, p_acc, p_val = liblinearutil.predict([1], fea_vec_liblinear, stacked_model)
    slr_id = int(p_label[0]) # stacked logistic regression prediction label
    slr_pred = city_adapter.get_city_by_id(slr_id)
    print "L1 prediction:", slr_pred
    gt_dict["pc"] = slr_pred
    slr_lat, slr_lon = lib_grid_search.lookup_coords(slr_pred)
    print "L1 prediction coordinates:", slr_lat, slr_lon
    gt_dict["plat"] = slr_lat
    gt_dict["plon"] = slr_lon
    # error distance is only computable when an oracle city is known
    gt_dict["errdist"] = int(
        gcd_dist.calc_dist_degree(slr_lat, slr_lon, gt_dict["olat"],
                                  gt_dict["olon"])) if gt_dict["oc"] else None
    # on this path err_msg must be falsy (errors returned earlier)
    assert (not err_msg)
    gt_dict["error"] = err_msg
    gt_dict["text_pred"] = text_pred
    gt_dict["loc_pred"] = loc_pred
    gt_dict["tz_pred"] = tz_pred
    if enable_cache:
        # persist the result as one JSON line under pkg_path/cache/<sname>
        open("{0}/cache/{1}".format(pkg_path, sname), "w").write("{0}\n".format(json.dumps(gt_dict)))
    return gt_dict
def geolocate(data, enable_cache = True): """ Input: 1. user screen name 2. user timeline JSON data (assuming the same user name) Ouptut: GT-dict """ # identify input format sname = None #user screen name gt_dict = None #JSON format result err_msg = None parsed_data, err_msg = parse_input(data) if isinstance(parsed_data, basestring): sname = parsed_data elif isinstance(parsed_data, dict): gt_dict = parsed_data sname = parsed_data["sname"] else: gt_dict = {"error":err_msg} err_msg = "{0} {1}".format(err_msg, data) lib_log.error(err_msg) return gt_dict sname = sname.lower() print "Predict:", sname # crawl data if input is user name if not gt_dict: gt_dict, err_msg = twitter_adapter.parse_user_timeline(sname) if err_msg: gt_dict = {"error":err_msg} err_msg = "{0} {1}".format(err_msg, sname) lib_log.error(err_msg) return gt_dict # sequential classifier text_features = feature_adapter.extract_text_features(gt_dict["tweets"]) text_pred = text_decoder.predict(text_features) loc_pred = loc_decoder.predict(feature_adapter.extract_ngram_features(gt_dict["loc"])) tz_pred = tz_decoder.predict(feature_adapter.extract_ngram_features(gt_dict["tz"])) print "L0 predictions:", text_pred, loc_pred, tz_pred #print text_pred, loc_pred, tz_pred, desc_pred, rname_pred text_id = city_adapter.get_id_by_city(text_pred) + fea_num * 0 loc_id = city_adapter.get_id_by_city(loc_pred) + fea_num * 1 tz_id = city_adapter.get_id_by_city(tz_pred) + fea_num * 2 fea_vec_liblinear = [{text_id:1, loc_id:1, tz_id:1}] print "L1 data:", fea_vec_liblinear p_label, p_acc, p_val = liblinearutil.predict([1], fea_vec_liblinear, stacked_model) slr_id = int(p_label[0]) # stacked logistic regression prediction label slr_pred = city_adapter.get_city_by_id(slr_id) print "L1 prediction:", slr_pred gt_dict["pc"] = slr_pred slr_lat, slr_lon = lib_grid_search.lookup_coords(slr_pred) print "L1 prediction coordinates:", slr_lat, slr_lon gt_dict["liw"] = text_features gt_dict["plat"] = slr_lat gt_dict["plon"] = slr_lon 
gt_dict["errdist"] = int(gcd_dist.calc_dist_degree(slr_lat, slr_lon, gt_dict["olat"], gt_dict["olon"])) if gt_dict["oc"] else None assert(not err_msg) gt_dict["error"] = err_msg gt_dict["text_pred"] = text_pred gt_dict["loc_pred"] = loc_pred gt_dict["tz_pred"] = tz_pred if enable_cache: open("{0}/cache/{1}".format(pkg_path, sname), "w").write("{0}\n".format(json.dumps(gt_dict))) return gt_dict