Ejemplo n.º 1
0
def parse_input(data):
    """    Parse input string    """
    err_msg = None
    try:
        assert (isinstance(data, basestring))
        data = json.loads(data.strip())
    except AssertionError:
        err_msg = "Invalid input data format"
        #err_msg = "Invalid input data format: {0}".format(data)
        return (None, err_msg)
    except ValueError:
        try:
            assert (len(data) <= 15)  # maiximum twitter screen name length
        except AssertionError:
            err_msg = "User screen name exceeds 15 characters"
            #err_msg = "User screen name exceeds 15 characters {0}".format(data)
            return (None, err_msg)
        else:
            return (data, err_msg)  # return screen name
    else:
        gt_dict, err_msg = twitter_adapter.parse_user_timeline(
            data)  # return parsed user timeline data
        if err_msg:
            #err_msg = "{0} {1}".format(err_msg, data)
            return (None, err_msg)
        return (gt_dict, err_msg)
Ejemplo n.º 2
0
def parse_input(data):
    """    Parse input string    """
    err_msg = None
    try:
        assert(isinstance(data, basestring))
        data = json.loads(data.strip()) 
    except AssertionError:
        err_msg = "Invalid input data format"
        #err_msg = "Invalid input data format: {0}".format(data)
        return (None, err_msg)
    except ValueError:
        try:
            assert(len(data) <= 15) # maiximum twitter screen name length
        except AssertionError:
            err_msg = "User screen name exceeds 15 characters"
            #err_msg = "User screen name exceeds 15 characters {0}".format(data)
            return (None, err_msg)
        else:
            return (data, err_msg) # return screen name
    else:
        gt_dict, err_msg = twitter_adapter.parse_user_timeline(data) # return parsed user timeline data
        if err_msg:
            #err_msg = "{0} {1}".format(err_msg, data)
            return (None, err_msg)
        return (gt_dict, err_msg)
Ejemplo n.º 3
0
def geolocate(data, enable_cache=True):
    """
    Input:
        1. user screen name
        2. user timeline JSON data (assuming the same user name)
    Ouptut:
        GT-dict
    Supported Options: caching;
    """
    # identify input format
    sname = None  #user screen name
    gt_dict = None  #JSON format result
    err_msg = None
    parsed_data, err_msg = parse_input(data)

    if isinstance(parsed_data, basestring):
        sname = parsed_data
    elif isinstance(parsed_data, dict):
        gt_dict = parsed_data
        sname = parsed_data["sname"]
    else:
        gt_dict = {"error": err_msg}
        err_msg = "{0} {1}".format(err_msg, data)
        lib_log.error(err_msg)
        return gt_dict

    sname = sname.lower()
    print "Predict:", sname

    # using cache?
    if enable_cache:
        cached = seek_cache(sname)
        if cached:
            return cached

    # crawl data if input is user name
    if not gt_dict:
        gt_dict, err_msg = twitter_adapter.parse_user_timeline(sname)
        if err_msg:
            gt_dict = {"error": err_msg}
            err_msg = "{0} {1}".format(err_msg, sname)
            lib_log.error(err_msg)
            return gt_dict

    # sequential classifier
    text_pred = text_decoder.predict(
        feature_adapter.extract_text_features(gt_dict["tweets"]))
    loc_pred = loc_decoder.predict(
        feature_adapter.extract_ngram_features(gt_dict["loc"]))
    tz_pred = tz_decoder.predict(
        feature_adapter.extract_ngram_features(gt_dict["tz"]))
    #desc_pred = desc_decoder.predict(feature_adapter.extract_ngram_features(gt_dict["desc"]))
    #rname_pred = rname_decoder.predict(feature_adapter.extract_ngram_features(gt_dict["rname"]))
    print "L0 predictions:", text_pred, loc_pred, tz_pred
    #print text_pred, loc_pred, tz_pred, desc_pred, rname_pred

    text_id = city_adapter.get_id_by_city(text_pred) + fea_num * 0
    loc_id = city_adapter.get_id_by_city(loc_pred) + fea_num * 1
    tz_id = city_adapter.get_id_by_city(tz_pred) + fea_num * 2
    #desc_id = city_adapter.get_id_by_city(desc_pred) + fea_num * 3
    #rname_id = city_adapter.get_id_by_city(rname_pred) + fea_num * 4
    #libsvm_rec = "{0}:1 {1}:1 {2}:1\n".format(text_id, loc_id, tz_id)
    #libsvm_rec = "{0}:1 {1}:1 {2}:1 {3}:1 {4}:1\n".format(text_id, loc_id, tz_id, desc_id, rname_id)
    #print libsvm_rec
    fea_vec_liblinear = [{text_id: 1, loc_id: 1, tz_id: 1}]
    #fea_vec_liblinear = [{text_id:1, loc_id:1, tz_id:1, desc_id:1, rname_id:1}]
    print "L1 data:", fea_vec_liblinear
    p_label, p_acc, p_val = liblinearutil.predict([1], fea_vec_liblinear,
                                                  stacked_model)
    slr_id = int(p_label[0])  # stacked logistic regression prediction label
    slr_pred = city_adapter.get_city_by_id(slr_id)
    print "L1 prediction:", slr_pred
    gt_dict["pc"] = slr_pred
    slr_lat, slr_lon = lib_grid_search.lookup_coords(slr_pred)
    print "L1 prediction coordinates:", slr_lat, slr_lon
    gt_dict["plat"] = slr_lat
    gt_dict["plon"] = slr_lon
    gt_dict["errdist"] = int(
        gcd_dist.calc_dist_degree(slr_lat, slr_lon, gt_dict["olat"],
                                  gt_dict["olon"])) if gt_dict["oc"] else None
    assert (not err_msg)
    gt_dict["error"] = err_msg
    gt_dict["text_pred"] = text_pred
    gt_dict["loc_pred"] = loc_pred
    gt_dict["tz_pred"] = tz_pred

    if enable_cache:
        open("{0}/cache/{1}".format(pkg_path, sname),
             "w").write("{0}\n".format(json.dumps(gt_dict)))

    return gt_dict
Ejemplo n.º 4
0
def geolocate(data, enable_cache = True):
    """
    Input:
        1. user screen name
        2. user timeline JSON data (assuming the same user name)
    Ouptut:
        GT-dict
    """
    # identify input format
    sname = None #user screen name
    gt_dict = None #JSON format result
    err_msg = None
    parsed_data, err_msg = parse_input(data)

    if isinstance(parsed_data, basestring):
        sname = parsed_data
    elif isinstance(parsed_data, dict):
        gt_dict = parsed_data
        sname = parsed_data["sname"]
    else:
        gt_dict = {"error":err_msg}
        err_msg = "{0} {1}".format(err_msg, data)
        lib_log.error(err_msg)
        return gt_dict
    
    sname = sname.lower()
    print "Predict:", sname

    # crawl data if input is user name
    if not gt_dict:
        gt_dict, err_msg = twitter_adapter.parse_user_timeline(sname)
        if err_msg:
            gt_dict = {"error":err_msg}
            err_msg = "{0} {1}".format(err_msg, sname)
            lib_log.error(err_msg)
            return gt_dict
    

    # sequential classifier
    text_features = feature_adapter.extract_text_features(gt_dict["tweets"])
    text_pred = text_decoder.predict(text_features)
    loc_pred = loc_decoder.predict(feature_adapter.extract_ngram_features(gt_dict["loc"]))
    tz_pred = tz_decoder.predict(feature_adapter.extract_ngram_features(gt_dict["tz"]))
    print "L0 predictions:", text_pred, loc_pred, tz_pred
    #print text_pred, loc_pred, tz_pred, desc_pred, rname_pred

    text_id = city_adapter.get_id_by_city(text_pred) + fea_num * 0
    loc_id = city_adapter.get_id_by_city(loc_pred) + fea_num * 1
    tz_id = city_adapter.get_id_by_city(tz_pred) + fea_num * 2
    fea_vec_liblinear = [{text_id:1, loc_id:1, tz_id:1}]

    print "L1 data:", fea_vec_liblinear 
    p_label, p_acc, p_val = liblinearutil.predict([1], fea_vec_liblinear, stacked_model)
    slr_id = int(p_label[0]) # stacked logistic regression prediction label
    slr_pred = city_adapter.get_city_by_id(slr_id)
    print "L1 prediction:", slr_pred
    gt_dict["pc"] = slr_pred
    slr_lat, slr_lon = lib_grid_search.lookup_coords(slr_pred)

    print "L1 prediction coordinates:", slr_lat, slr_lon
    gt_dict["liw"] = text_features
    gt_dict["plat"] = slr_lat
    gt_dict["plon"] = slr_lon
    gt_dict["errdist"] = int(gcd_dist.calc_dist_degree(slr_lat, slr_lon, gt_dict["olat"], gt_dict["olon"])) if gt_dict["oc"] else None
    assert(not err_msg)
    gt_dict["error"] = err_msg
    gt_dict["text_pred"] = text_pred
    gt_dict["loc_pred"] = loc_pred
    gt_dict["tz_pred"] = tz_pred
    if enable_cache:
        open("{0}/cache/{1}".format(pkg_path, sname), "w").write("{0}\n".format(json.dumps(gt_dict)))
    return gt_dict