Code example #1
File: evaluation_old.py Project: vmishra04/LNEx
def init_using_elasticindex(bb):

    lnex.elasticindex(conn_string='130.108.85.186:9200',
                      index_name="photon_v1")
    #lnex.elasticindex(conn_string='localhost:9201', index_name="photon")

    return lnex.initialize(bb, augment=True)
Code example #2
def init_using_elasticindex(gaz_name):

    lnex.elasticindex(conn_string='173.193.79.31:31169', index_name="photon")
    if gaz_name == "chennai":
        # chennai flood bounding box
        bb = [12.74, 80.066986084, 13.2823848224, 80.3464508057]
    elif gaz_name == "houston":
        # houston flood bounding box
        bb = [29.4778611958, -95.975189209, 30.1463147381, -94.8889160156]
    else:
        # guard against an unbound bb below
        raise ValueError("unsupported gazetteer name: " + gaz_name)
    print(bb)
    return lnex.initialize(bb, augment=True)
Code example #3
def init_using_elasticindex(bb, cache, augmentType, dataset,
                            capital_word_shape):
    lnex.elasticindex(conn_string='localhost:9200', index_name="photon")

    geo_info = lnex.initialize(bb,
                               augmentType=augmentType,
                               cache=cache,
                               dataset_name=dataset,
                               capital_word_shape=capital_word_shape)
    return geo_info
Code example #4
File: evaluation_old.py Project: vmishra04/LNEx
def init_using_files():

    data_folder = os.path.join("..", "_Data")

    with open(os.path.join(data_folder, "chennai_geo_locations.json")) as f:
        geo_locations = json.load(f)

    with open(os.path.join(data_folder, "chennai_geo_info.json")) as f:
        geo_info = json.load(f)

    with open(os.path.join(data_folder, "chennai_extended_words3.json")) as f:
        extended_words3 = json.load(f)

    lnex.initialize_using_files(geo_locations, extended_words3)

    return geo_info
Code example #5
File: LNEx.py Project: sachinvarriar/ditk
    def train(cls, train_dev_set):
        '''
        This method prepares the gazetteer; since LNEx is a statistical
        model, no training is required.

        :param train_dev_set: unused; may be set to None
        :return: the initialized gazetteer (geo_info)
        '''
        bbs = {"chennai": [12.74, 80.066986084, 13.2823848224, 80.3464508057]}

        dataset = "chennai"
        lnex.elasticindex(conn_string='localhost:9200', index_name="photon")
        geo_info = lnex.initialize(bbs[dataset],
                                   augmentType="HP",
                                   cache=False,
                                   dataset_name=dataset,
                                   capital_word_shape=False)
        return geo_info
Code example #6
def init_using_files(dataset, capital_word_shape):

    with open("_Data/Cached_Gazetteers/" + dataset +
              "_geo_locations.json") as f:
        geo_locations = json.load(f)

    with open("_Data/Cached_Gazetteers/" + dataset +
              "_extended_words3.json") as f:
        extended_words3 = json.load(f)

    with open("_Data/Cached_Gazetteers/" + dataset + "_geo_info.json") as f:
        geo_info = json.load(f)

    lnex.initialize_using_files(geo_locations,
                                extended_words3,
                                capital_word_shape=capital_word_shape)

    return geo_info
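
Examples #4 and #6 read gazetteer files that were cached earlier. A minimal sketch (not from the LNEx sources) of writing such a cache, assuming the three structures have already been built and using the paths that init_using_files() above expects:

import json
import os


def cache_gazetteer(dataset, geo_locations, geo_info, extended_words3):
    # hypothetical helper: persist the three structures under the
    # _Data/Cached_Gazetteers layout read by init_using_files()
    folder = os.path.join("_Data", "Cached_Gazetteers")
    os.makedirs(folder, exist_ok=True)
    for suffix, obj in (("geo_locations", geo_locations),
                        ("geo_info", geo_info),
                        ("extended_words3", extended_words3)):
        with open(os.path.join(folder, dataset + "_" + suffix + ".json"),
                  "w") as f:
            json.dump(obj, f)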
Code example #7
File: LNEx.py Project: sachinvarriar/ditk
    def predict(cls, model, test_set):
        '''
        Extracts locations using the statistical model and outputs the
        following results.

        :param model: a trained model
        :param test_set: a list of test data
        :return: a list of 4-item lists:
            tweet_mention, mention_offsets, geo_location, geo_info_id
            tweet_mention:   the location mention in the tweet
                             (substring retrieved from the mention offsets)
            mention_offsets: a tuple of the start and end offsets of the LN
            geo_location:    the matched location name from the gazetteer,
                             e.g., new avadi rd > New Avadi Road
            geo_info_id:     the attached metadata of all the matched
                             location names from the gazetteer
        '''
        output_list = []
        for tweet in test_set:
            for output in lnex.extract(tweet):
                print(output[0], output[1], output[2], output[3]["main"])
                temp = [output[0], output[1], output[2], output[3]["main"]]
                output_list.append(temp)
            print("#" * 50)

        ditk_path = ""
        for path in sys.path:
            if "ditk" in path:
                ditk_path = path

        output_file = ditk_path + "/entity_linkage/normalization/lnex/result/output.txt"
        os.makedirs(os.path.dirname(output_file), exist_ok=True)

        with open(output_file, "w") as f:
            for tweet in test_set:
                for output in lnex.extract(tweet):
                    f.write(
                        str(output[0]) + ", " + str(output[1]) + ", " +
                        str(output[2]) + ", " + str(output[3]["main"]) + "\n")

        return output_list
Code example #8
File: LNEx.py Project: sachinvarriar/ditk
    def evaluate(cls, clf, eval_set):
        '''
        :param clf: a trained model
        :param eval_set: a list of validation data
        :return: (precision, recall, f1 score)
        '''
        bbs = {"chennai": [12.74, 80.066986084, 13.2823848224, 80.3464508057]}

        dataset = "chennai"

        print(dataset)
        lnex.initialize(bbs[dataset],
                        augmentType="FULL",
                        cache=False,
                        dataset_name=dataset,
                        capital_word_shape=False)

        anns = eval_set

        results = eval_main.evaluate(anns)

        return results
Code example #9
def prepare_geo_points(gaz_name, geo_info):
    os.environ['NO_PROXY'] = '127.0.0.1'
    all_geo_points = list()
    es = Elasticsearch([{'host': '173.193.79.31', 'port': 31169}])
    for tweet in get_all_tweets_and_annotations(gaz_name):
        classes = natural_language_classifier.classify('6876e8x557-nlc-635',
                                                       tweet[0])
        r = classes['top_class']
        # r="shelter_matching"
        if r == "shelter_matching":
            cl = "shelter_matching"
            i = '/static/shelter.png'
        elif r == "infrastructure_need":
            cl = "infrastructure_need"
            i = '/static/utility_infrastructure'
        elif r == "rescue_match":
            cl = "rescue_match"
            i = '/static/medical_need.png'
        else:
            cl = "not_related_or_irrelevant"
            i = ''
        for ln in lnex.extract(tweet[0]):
            if ln[0].lower() == gaz_name.lower():
                continue
            ln_offsets = ln[1]
            geoinfo = [geo_info[x] for x in ln[3]]
            if len(geoinfo) == 0:
                continue
            for geopoint in geoinfo:
                lat = geopoint["geo_item"]["point"]["lat"]
                lon = geopoint["geo_item"]["point"]["lon"]
                try:
                    fl = flooded(lat, lon)
                    # flooded() may return a bool or a string; normalize
                    fld = str(fl) == 'True'
                    es.index(index=gaz_name + '-tweetneeds',
                             doc_type='doc',
                             body={
                                 "type": "Feature",
                                 "geometry": {
                                     "type": "Point",
                                     "coordinates": [lon, lat]
                                 },
                                 "properties": {
                                     "locationMention": {
                                         "text": ln[0],
                                         "offsets":
                                         [ln_offsets[0], ln_offsets[1]]
                                     },
                                     "tweetID": tweet[1],
                                     "text": tweet[0],
                                     "createdAt": tweet[2],
                                     "needClass": cl,
                                     "flooded": fld,
                                     "image": tweet[3]
                                 }
                             })
                    all_geo_points.append({
                        "type": "Feature",
                        "geometry": {
                            "type": "Point",
                            "coordinates": [lon, lat]
                        },
                        "properties": {
                            "locationMention": {
                                "text": ln[0],
                                "offsets": [ln_offsets[0], ln_offsets[1]]
                            },
                            "tweetID": tweet[1],
                            "text": tweet[0],
                            "createdAt": tweet[2],
                            "needClass": cl,
                            "flooded": fld,
                            "image": tweet[3]
                        }
                    })
                    # print (all_geo_points)
                except Exception as e:
                    print(e)
                    continue

    print(len(all_geo_points))
    return {"type": "FeatureCollection", "features": all_geo_points}
Code example #10
def init_using_elasticindex(bb, cache, augmentType, dataset,
                            capital_word_shape):
    lnex.elasticindex(conn_string='localhost:9200', index_name="photon")

    geo_info = lnex.initialize(bb,
                               augmentType=augmentType,
                               cache=cache,
                               dataset_name=dataset,
                               capital_word_shape=capital_word_shape)
    return geo_info


bbs = {
    "chennai": [12.74, 80.066986084, 13.2823848224, 80.3464508057],
    "louisiana": [29.4563, -93.3453, 31.4521, -89.5276],
    "houston": [29.4778611958, -95.975189209, 30.1463147381, -94.8889160156]
}

dataset = "chennai"

geo_info = init_using_elasticindex(bbs[dataset],
                                   cache=False,
                                   augmentType="HP",
                                   dataset=dataset,
                                   capital_word_shape=False)

for tweet in read_tweets():
    for output in lnex.extract(tweet):
        print(output[0], output[1], output[2], output[3]["main"])
    print("#" * 50)
Code example #11
File: data_prepare.py Project: shrutikar/d-record
def prepare_geo_points(gaz_name, geo_info):
    os.environ['NO_PROXY'] = '127.0.0.1'
    all_geo_points = list()
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    count = 0
    for tweet in get_all_tweets_and_annotations(gaz_name):
        txt = 'http://127.0.0.1:8089/classify?text="' + tweet[0] + '"'

        r = requests.get(txt)
        print(r.text)
        if r.text == "shelter_matching":
            cl = "shelter_matching"
            i = '/static/shelter.png'
        elif r.text == "infrastructure_need":
            cl = "infrastructure_need"
            i = '/static/utility_infrastructure'
        else:
            cl = "rescue_match"
            i = '/static/medical_need.png'
        for ln in lnex.extract(tweet[0]):
            if ln[0].lower() == gaz_name.lower():
                continue
            ln_offsets = ln[1]
            geoinfo = [geo_info[x] for x in ln[3]]
            if len(geoinfo) == 0:
                continue
            for geopoint in geoinfo:
                lat = geopoint["geo_item"]["point"]["lat"]
                lon = geopoint["geo_item"]["point"]["lon"]
                marked_tweet = tweet[0][:ln_offsets[0]] + "<mark>" + tweet[0][
                    ln_offsets[0]:ln_offsets[1]] + "</mark>" + tweet[0][
                        ln_offsets[1]:]
                try:
                    description = """   <table>
                                            <tr>
                                                <td colspan="2">marked_tweet</td>
                                            </tr>
                                        </table> """
                    description = description.replace("marked_tweet",
                                                      marked_tweet)
                    marker_icon = "marker"
                    fl = flooded(lat, lon)
                    print(str(fl))
                    fld = 'True' if str(fl) == 'True' else 'False'
                    es.index(index=gaz_name + '-tweets',
                             doc_type='people',
                             id=count,
                             body={
                                 "type": "Feature",
                                 "geometry": {
                                     "type": "Point",
                                     "coordinates": [lon, lat]
                                 },
                                 "properties": {
                                     "description": description,
                                     "icon": marker_icon,
                                     "id": tweet[1],
                                     "tweet": tweet[0],
                                     "timestamp": tweet[2],
                                     "class": cl,
                                     "URL": i,
                                     "Flood": fld
                                 }
                             })
                    all_geo_points.append({
                        "type": "Feature",
                        "geometry": {
                            "type": "Point",
                            "coordinates": [lon, lat]
                        },
                        "properties": {
                            "description": description,
                            "icon": marker_icon,
                            "id": tweet[1],
                            "tweet": tweet[0],
                            "timestamp": tweet[2],
                            "class": cl,
                            "URL": i,
                            "Flood": fld
                        }
                    })
                    count = count + 1
                except Exception as e:
                    print(e)
                    print("ERROR:>>> ", marked_tweet)
                    exit()
    #es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    #for i in range(len(all_geo_points)):
    #    es.index(index='some_try', doc_type='people', id=i, body=all_geo_points[i])

    return {"type": "FeatureCollection", "features": all_geo_points}
Code example #12
File: data_prepare.py Project: shrutikar/d-record
def prepare_crowd_source(gaz_name):
    geo_info = init_using_elasticindex(gaz_name)
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    data = xlrd.open_workbook("other_resources.xlsx")
    sheet_no = data.sheet_by_index(0)
    x = []
    y = []
    crowd_data = []
    count = 0
    for i in range(sheet_no.nrows):
        if i >= 1:
            x.append(sheet_no.cell(i, 0).value)
            y.append(sheet_no.cell(i, 1).value)
    for text, cls in zip(x, y):
        text = strip_non_ascii(text)
        print(text, cls)
        for ln in lnex.extract(text):
            if ln[0].lower() == gaz_name.lower():
                continue
            ln_offsets = ln[1]
            geoinfo = [geo_info[x] for x in ln[3]]
            if len(geoinfo) == 0:
                continue
            for geopoint in geoinfo:
                lat = geopoint["geo_item"]["point"]["lat"]
                lon = geopoint["geo_item"]["point"]["lon"]
                marked_tweet = text[:ln_offsets[0]] + "<mark>" + text[
                    ln_offsets[0]:ln_offsets[1]] + "</mark>" + text[
                        ln_offsets[1]:]
                try:
                    description = """   <table>
                                            <tr>
                                                <td colspan="2">marked_tweet</td>
                                            </tr>
                                        </table> """
                    description = description.replace("marked_tweet",
                                                      marked_tweet)
                    marker_icon = "marker"
                    fl = flooded(lat, lon)
                    print(str(fl))
                    fld = 'True' if str(fl) == 'True' else 'False'
                    es.index(index=gaz_name + '-crowd',
                             doc_type='crowd',
                             id=count,
                             body={
                                 "type": "Feature",
                                 "geometry": {
                                     "type": "Point",
                                     "coordinates": [lon, lat]
                                 },
                                 "properties": {
                                     "description": description,
                                     "icon": marker_icon,
                                     "tweet": text,
                                     "class": cls,
                                     "Flood": fld
                                 }
                             })
                    crowd_data.append({
                        "type": "Feature",
                        "geometry": {
                            "type": "Point",
                            "coordinates": [lon, lat]
                        },
                        "properties": {
                            "description": description,
                            "icon": marker_icon,
                            "tweet": text,
                            "class": cls,
                            "Flood": fld
                        }
                    })
                    count = count + 1
                except Exception as e:
                    print(e)
                    print("ERROR:>>> ", marked_tweet)
                    exit()
    # mirror prepare_geo_points(): return the collected features as GeoJSON
    return {"type": "FeatureCollection", "features": crowd_data}
Code example #13
File: evaluation_old.py Project: vmishra04/LNEx
            tweet_lns = set()
            lnex_lns = set()
            tweet_text = ""

            for ann in anns[key]:
                if ann != "text":
                    ln = anns[key][ann]

                    tweet_lns.add(((int(ln['start_idx']), int(ln['end_idx'])),
                                   ln['type']))
                else:
                    tweet_text = anns[key][ann]
                    #print tweet_text

                    r = lnex.extract(tweet_text)

                    # how many are already disambiguated +++++++++++++++++++++++
                    for res in r:
                        if len(res[3]) < 2:
                            one_geolocation += 1

                            #if len(res[3]) == 0:
                            #print res[2]
                        else:
                            geo_codes_length_dist[len(res[3])] += 1

                        all_geolocation += 1
                    # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

                    lnex_lns = set([x[1] for x in r])
Code example #14
def evaluate(anns):
    TPs_count = 0
    FPs_count = 0
    FNs_count = 0
    overlaps_count = 0

    #fns = defaultdict(int)

    count = 0
    one_geolocation = 0
    all_geolocation = 0
    geo_codes_length_dist = defaultdict(int)

    FPs_set = defaultdict(set)
    FNs_set = defaultdict(set)

    for key in list(anns.keys()):

        count += 1

        # skip the development set
        #if dataset != "houston" and count < 500:
        #    continue

        tweet_lns = set()
        lnex_lns = set()
        tweet_text = ""

        for ann in anns[key]:
            if ann != "text":
                ln = anns[key][ann]

                tweet_lns.add(
                    ((int(ln['start_idx']), int(ln['end_idx'])), ln['type']))
            else:
                tweet_text = anns[key][ann]

                r = lnex.extract(tweet_text)

                # how many are already disambiguated +++++++++++++++++++++++
                for res in r:
                    if len(res[3]) < 2:
                        one_geolocation += 1

                        #if len(res[3]) == 0:
                        #print res[2]
                    else:
                        geo_codes_length_dist[len(res[3])] += 1

                    all_geolocation += 1
                # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

                lnex_lns = set([x[1] for x in r])

        tweet_lns = set([x[0] for x in tweet_lns if x[1] == "inLoc"])

        # True Positives +++++++++++++++++++++++++++++++++++++++++++++++++++
        TPs = tweet_lns.intersection(lnex_lns)

        TPs_count += len(TPs)

        # Left in both sets ++++++++++++++++++++++++++++++++++++++++++++++++
        tweet_lns -= TPs
        lnex_lns -= TPs

        # Find Overlapping LNs to be counted as 1/2 FPs and 1/2 FNs++
        overlaps = set()
        for x in tweet_lns:
            for y in lnex_lns:
                if do_they_overlap(x, y):
                    overlaps.add(x)
                    overlaps.add(y)

        overlaps_count += len(overlaps)

        # remove the overlapping lns from lnex_lns and tweet_lns
        lnex_lns -= overlaps
        tweet_lns -= overlaps

        # False Positives ++++++++++++++++++++++++++++++++++++++++++++++++++
        # lnex_lns = all - (TPs and overlaps and !inLoc)
        FPs = lnex_lns - tweet_lns
        FPs_count += len(FPs)

        if len(FPs) > 0:
            for x in FPs:
                FPs_set[tweet_text[x[0]:x[1]]].add(
                    (key, tweet_text[x[0] - 2:x[1] + 2], x))

        # False Negatives ++++++++++++++++++++++++++++++++++++++++++++++++++
        FNs = tweet_lns - lnex_lns
        FNs_count += len(FNs)

        if len(FNs) > 0:
            for x in FNs:
                FNs_set[tweet_text[x[0]:x[1]]].add(
                    (key, tweet_text[x[0] - 2:x[1] + 2], x))
    '''
    Since an overlapping pair adds two LNs to the overlaps set (one from
    lnex_lns and one from tweet_lns), counting each overlap as 1/2 FP and
    1/2 FN works out to:
        overlaps_count x
            1/2 (since we count each overlap twice) x
                1/2 (since we want half of all the errors made)
    '''

    Precision = TPs_count / (TPs_count + FPs_count + 0.5 * .5 * overlaps_count)
    Recall = TPs_count / (TPs_count + FNs_count + 0.5 * .5 * overlaps_count)
    F_Score = (2 * Precision * Recall) / (Precision + Recall)

    #percentage_disambiguated = one_geolocation/all_geolocation

    return {"precision": Precision, "recall": Recall, "f-score": F_Score}