def init_using_elasticindex(bb):
    lnex.elasticindex(conn_string='130.108.85.186:9200', index_name="photon_v1")
    #lnex.elasticindex(conn_string='localhost:9201', index_name="photon")
    return lnex.initialize(bb, augment=True)

def init_using_elasticindex(gaz_name):
    lnex.elasticindex(conn_string='173.193.79.31:31169', index_name="photon")

    if gaz_name == "chennai":
        # chennai flood bounding box
        bb = [12.74, 80.066986084, 13.2823848224, 80.3464508057]
    elif gaz_name == "houston":
        bb = [29.4778611958, -95.975189209, 30.1463147381, -94.8889160156]

    print(bb)
    return lnex.initialize(bb, augment=True)

def init_using_elasticindex(bb, cache, augmentType, dataset, capital_word_shape):
    lnex.elasticindex(conn_string='localhost:9200', index_name="photon")
    geo_info = lnex.initialize(bb, augmentType=augmentType, cache=cache,
                               dataset_name=dataset,
                               capital_word_shape=capital_word_shape)
    return geo_info

def init_using_files():
    data_folder = os.path.join("..", "_Data")

    with open(data_folder + "/chennai_geo_locations.json") as f:
        geo_locations = json.load(f)

    with open(data_folder + "/chennai_geo_info.json") as f:
        geo_info = json.load(f)

    with open(data_folder + "/chennai_extended_words3.json") as f:
        extended_words3 = json.load(f)

    lnex.initialize_using_files(geo_locations, extended_words3)
    return geo_info

def train(cls, train_dev_set):
    '''
    Prepares the gazetteer; since this is a statistical model, no training is required.
    :param train_dev_set: set to None
    :return: the initialized gazetteer (geo_info)
    '''
    bbs = {"chennai": [12.74, 80.066986084, 13.2823848224, 80.3464508057]}
    dataset = "chennai"

    lnex.elasticindex(conn_string='localhost:9200', index_name="photon")
    geo_info = lnex.initialize(bbs[dataset], augmentType="HP", cache=False,
                               dataset_name=dataset, capital_word_shape=False)
    return geo_info

def init_using_files(dataset, capital_word_shape):
    with open("_Data/Cached_Gazetteers/" + dataset + "_geo_locations.json") as f:
        geo_locations = json.load(f)

    with open("_Data/Cached_Gazetteers/" + dataset + "_extended_words3.json") as f:
        extended_words3 = json.load(f)

    with open("_Data/Cached_Gazetteers/" + dataset + "_geo_info.json") as f:
        geo_info = json.load(f)

    lnex.initialize_using_files(geo_locations, extended_words3,
                                capital_word_shape=capital_word_shape)
    return geo_info

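# A minimal usage sketch (not part of the original pipeline): it assumes the cached
# Chennai gazetteer files exist under _Data/Cached_Gazetteers/ and that the LNEx
# package is importable as `lnex`, as in the rest of these snippets. The tweet text
# below is hypothetical.
geo_info = init_using_files("chennai", capital_word_shape=False)
for output in lnex.extract("Heavy flooding reported on new avadi rd"):
    # (mention, offsets, matched gazetteer name, geo info ids)
    print(output[0], output[1], output[2], output[3]["main"])
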
def predict(cls, model, test_set):
    '''
    Extracts location mentions using the statistical model and outputs the results.
    :param model: a trained model
    :param test_set: a list of test data
    :return: a list of 4-item lists: [tweet_mention, mention_offsets, geo_location, geo_info_id]
             tweet_mention:   the location mention in the tweet (substring retrieved from the mention offsets)
             mention_offsets: a tuple of the start and end offsets of the LN
             geo_location:    the matched location name from the gazetteer, e.g., new avadi rd > New Avadi Road
             geo_info_id:     the attached metadata of all the matched location names from the gazetteer
    '''
    output_list = []
    for tweet in test_set:
        for output in lnex.extract(tweet):
            print(output[0], output[1], output[2], output[3]["main"])
            temp = [output[0], output[1], output[2], output[3]["main"]]
            output_list.append(temp)
        print("#" * 50)

    ditk_path = ""
    for path in sys.path:
        if "ditk" in path:
            ditk_path = path

    output_file = ditk_path + "/entity_linkage/normalization/lnex/result/output.txt"
    if not os.path.exists(os.path.dirname(output_file)):
        os.makedirs(os.path.dirname(output_file), exist_ok=True)

    with open(output_file, "w") as f:
        for tweet in test_set:
            for output in lnex.extract(tweet):
                f.write(str(output[0]) + ", " + str(output[1]) + ", " +
                        str(output[2]) + ", " + str(output[3]["main"]) + "\n")

    return output_list

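# A minimal usage sketch for predict() with a hypothetical tweet. It assumes the
# gazetteer has already been initialized (e.g. via train() above) and that a path
# containing "ditk" is on sys.path so the output file can be written. If predict()
# is bound as a classmethod on a wrapper class, call ClassName.predict(model, tweets)
# instead of passing None for cls.
sample_tweets = ["Water level rising near new avadi rd, need rescue"]
predictions = predict(None, None, sample_tweets)
for mention, offsets, geo_location, geo_ids in predictions:
    print(mention, offsets, geo_location, geo_ids)
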
def evaluate(cls, clf, eval_set):
    '''
    :param clf: a trained model
    :param eval_set: a list of validation data
    :return: (precision, recall, f1 score)
    '''
    bbs = {"chennai": [12.74, 80.066986084, 13.2823848224, 80.3464508057]}
    results = dict()
    dataset = "chennai"
    print(dataset)

    lnex.initialize(bbs[dataset], augmentType="FULL", cache=False,
                    dataset_name=dataset, capital_word_shape=False)

    anns = eval_set
    results = eval_main.evaluate(anns)
    return results

def prepare_geo_points(gaz_name, geo_info):
    os.environ['NO_PROXY'] = '127.0.0.1'
    all_geo_points = list()
    es = Elasticsearch([{'host': '173.193.79.31', 'port': 31169}])

    for tweet in get_all_tweets_and_annotations(gaz_name):
        classes = natural_language_classifier.classify('6876e8x557-nlc-635', tweet[0])
        r = classes['top_class']
        # r = "shelter_matching"

        if r == "shelter_matching":
            cl = "shelter_matching"
            i = '/static/shelter.png'
        elif r == "infrastructure_need":
            cl = "infrastructure_need"
            i = '/static/utility_infrastructure'
        elif r == "rescue_match":
            cl = "rescue_match"
            i = '/static/medical_need.png'
        else:
            cl = "not_related_or_irrelevant"
            i = ''

        for ln in lnex.extract(tweet[0]):
            if ln[0].lower() == gaz_name.lower():
                continue

            ln_offsets = ln[1]
            geoinfo = [geo_info[x] for x in ln[3]]

            if len(geoinfo) == 0:
                continue

            for geopoint in geoinfo:
                lat = geopoint["geo_item"]["point"]["lat"]
                lon = geopoint["geo_item"]["point"]["lon"]

                try:
                    fl = flooded(lat, lon)
                    # print(str(fl))
                    if str(fl) == 'True':
                        fld = True
                    else:
                        fld = False

                    # build the GeoJSON feature once, then index it and collect it
                    feature = {
                        "type": "Feature",
                        "geometry": {
                            "type": "Point",
                            "coordinates": [lon, lat]
                        },
                        "properties": {
                            "locationMention": {
                                "text": ln[0],
                                "offsets": [ln_offsets[0], ln_offsets[1]]
                            },
                            "tweetID": tweet[1],
                            "text": tweet[0],
                            "createdAt": tweet[2],
                            "needClass": cl,
                            "flooded": fld,
                            "image": tweet[3]
                        }
                    }

                    es.index(index=gaz_name + '-tweetneeds', doc_type='doc', body=feature)
                    all_geo_points.append(feature)
                    # print(all_geo_points)

                except Exception as e:
                    print(e)
                    continue

    print(len(all_geo_points))
    return {"type": "FeatureCollection", "features": all_geo_points}

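# For reference, prepare_geo_points() only reads the nested path
# geo_info[x]["geo_item"]["point"]["lat" / "lon"]. The key and coordinates below are
# hypothetical (real entries carry more gazetteer metadata); this sketch just shows
# the shape the code above depends on.
example_geo_info = {
    "123": {
        "geo_item": {
            "point": {"lat": 13.0827, "lon": 80.2707}  # roughly Chennai
        }
    }
}
lat = example_geo_info["123"]["geo_item"]["point"]["lat"]
lon = example_geo_info["123"]["geo_item"]["point"]["lon"]
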
def init_using_elasticindex(bb, cache, augmentType, dataset, capital_word_shape):
    lnex.elasticindex(conn_string='localhost:9200', index_name="photon")
    geo_info = lnex.initialize(bb, augmentType=augmentType, cache=cache,
                               dataset_name=dataset,
                               capital_word_shape=capital_word_shape)
    return geo_info


bbs = {
    "chennai": [12.74, 80.066986084, 13.2823848224, 80.3464508057],
    "louisiana": [29.4563, -93.3453, 31.4521, -89.5276],
    "houston": [29.4778611958, -95.975189209, 30.1463147381, -94.8889160156]
}

dataset = "chennai"

geo_info = init_using_elasticindex(bbs[dataset], cache=False, augmentType="HP",
                                   dataset=dataset, capital_word_shape=False)

for tweet in read_tweets():
    for output in lnex.extract(tweet):
        print(output[0], output[1], output[2], output[3]["main"])
    print("#" * 50)

def prepare_geo_points(gaz_name, geo_info):
    os.environ['NO_PROXY'] = '127.0.0.1'
    all_geo_points = list()
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    count = 0

    for tweet in get_all_tweets_and_annotations(gaz_name):
        txt = 'http://127.0.0.1:8089/classify?text="' + tweet[0] + '"'
        r = requests.get(txt)
        print(r.text)

        if r.text == "shelter_matching":
            cl = "shelter_matching"
            i = '/static/shelter.png'
        elif r.text == "infrastructure_need":
            cl = "infrastructure_need"
            i = '/static/utility_infrastructure'
        else:
            cl = "rescue_match"
            i = '/static/medical_need.png'

        for ln in lnex.extract(tweet[0]):
            if ln[0].lower() == gaz_name.lower():
                continue

            ln_offsets = ln[1]
            geoinfo = [geo_info[x] for x in ln[3]]

            if len(geoinfo) == 0:
                continue

            for geopoint in geoinfo:
                lat = geopoint["geo_item"]["point"]["lat"]
                lon = geopoint["geo_item"]["point"]["lon"]

                marked_tweet = (tweet[0][:ln_offsets[0]] + "<mark>" +
                                tweet[0][ln_offsets[0]:ln_offsets[1]] + "</mark>" +
                                tweet[0][ln_offsets[1]:])

                try:
                    description = """
                        <table>
                            <tr>
                                <td colspan="2">marked_tweet</td>
                            </tr>
                        </table>
                        """
                    description = description.replace("marked_tweet", marked_tweet)

                    marker_icon = "marker"

                    fl = flooded(lat, lon)
                    print(str(fl))
                    if str(fl) == 'True':
                        fld = 'True'
                    else:
                        fld = 'False'

                    # build the GeoJSON feature once, then index it and collect it
                    feature = {
                        "type": "Feature",
                        "geometry": {
                            "type": "Point",
                            "coordinates": [lon, lat]
                        },
                        "properties": {
                            "description": description,
                            "icon": marker_icon,
                            "id": tweet[1],
                            "tweet": tweet[0],
                            "timestamp": tweet[2],
                            "class": cl,
                            "URL": i,
                            "Flood": fld
                        }
                    }

                    es.index(index=gaz_name + '-tweets', doc_type='people', id=count, body=feature)
                    all_geo_points.append(feature)

                    count = count + 1

                except Exception as e:
                    print(e)
                    print("ERROR:>>> ", marked_tweet)
                    exit()

    #es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    #for i in range(len(all_geo_points)):
    #    es.index(index='some_try', doc_type='people', id=i, body=all_geo_points[i])

    return {"type": "FeatureCollection", "features": all_geo_points}

def prepare_crowd_source(gaz_name):
    geo_info = init_using_elasticindex(gaz_name)
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

    data = xlrd.open_workbook("other_resources.xlsx")
    sheet_no = data.sheet_by_index(0)

    x = []
    y = []
    crowd_data = []
    count = 0

    # skip the header row, then collect the text and class columns
    for i in range(sheet_no.nrows):
        if i >= 1:
            x.append(sheet_no.cell(i, 0).value)
            y.append(sheet_no.cell(i, 1).value)

    for text, cls in zip(x, y):
        text = strip_non_ascii(text)
        print(text, cls)

        for ln in lnex.extract(text):
            if ln[0].lower() == gaz_name.lower():
                continue

            ln_offsets = ln[1]
            geoinfo = [geo_info[gid] for gid in ln[3]]

            if len(geoinfo) == 0:
                continue

            for geopoint in geoinfo:
                lat = geopoint["geo_item"]["point"]["lat"]
                lon = geopoint["geo_item"]["point"]["lon"]

                marked_tweet = (text[:ln_offsets[0]] + "<mark>" +
                                text[ln_offsets[0]:ln_offsets[1]] + "</mark>" +
                                text[ln_offsets[1]:])

                try:
                    description = """
                        <table>
                            <tr>
                                <td colspan="2">marked_tweet</td>
                            </tr>
                        </table>
                        """
                    description = description.replace("marked_tweet", marked_tweet)

                    marker_icon = "marker"

                    fl = flooded(lat, lon)
                    print(str(fl))
                    if str(fl) == 'True':
                        fld = 'True'
                    else:
                        fld = 'False'

                    feature = {
                        "type": "Feature",
                        "geometry": {
                            "type": "Point",
                            "coordinates": [lon, lat]
                        },
                        "properties": {
                            "description": description,
                            "icon": marker_icon,
                            "tweet": text,
                            "class": cls,
                            "Flood": fld
                        }
                    }

                    es.index(index=gaz_name + '-crowd', doc_type='crowd', id=count, body=feature)
                    crowd_data.append(feature)

                    count = count + 1

                except Exception as e:
                    print(e)
                    print("ERROR:>>> ", marked_tweet)
                    exit()

tweet_lns = set()
lnex_lns = set()
tweet_text = ""

for ann in anns[key]:
    if ann != "text":
        ln = anns[key][ann]
        tweet_lns.add(((int(ln['start_idx']), int(ln['end_idx'])), ln['type']))
    else:
        tweet_text = anns[key][ann]

#print(tweet_text)

r = lnex.extract(tweet_text)

# how many are already disambiguated +++++++++++++++++++++++
for res in r:
    if len(res[3]) < 2:
        one_geolocation += 1
        #if len(res[3]) == 0:
        #    print(res[2])
    else:
        geo_codes_length_dist[len(res[3])] += 1
    all_geolocation += 1
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

lnex_lns = set([x[1] for x in r])

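# Shape of the annotation dictionary consumed by the snippet above and by evaluate()
# below. The tweet key, offsets, and text are hypothetical; each non-"text" entry is
# one annotated location name with character offsets and a type such as "inLoc".
example_anns = {
    "tweet_001": {
        "text": "Flooding near new avadi rd right now",
        "ann_1": {"start_idx": "14", "end_idx": "26", "type": "inLoc"}
    }
}
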
def evaluate(anns):
    TPs_count = 0
    FPs_count = 0
    FNs_count = 0
    overlaps_count = 0
    #fns = defaultdict(int)
    count = 0

    one_geolocation = 0
    all_geolocation = 0
    geo_codes_length_dist = defaultdict(int)

    FPs_set = defaultdict(set)
    FNs_set = defaultdict(set)

    for key in list(anns.keys()):
        count += 1

        # skip the development set
        #if dataset != "houston" and count < 500:
        #    continue

        tweet_lns = set()
        lnex_lns = set()
        tweet_text = ""

        for ann in anns[key]:
            if ann != "text":
                ln = anns[key][ann]
                tweet_lns.add(
                    ((int(ln['start_idx']), int(ln['end_idx'])), ln['type']))
            else:
                tweet_text = anns[key][ann]

        r = lnex.extract(tweet_text)

        # how many are already disambiguated +++++++++++++++++++++++
        for res in r:
            if len(res[3]) < 2:
                one_geolocation += 1
                #if len(res[3]) == 0:
                #    print(res[2])
            else:
                geo_codes_length_dist[len(res[3])] += 1
            all_geolocation += 1
        # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

        lnex_lns = set([x[1] for x in r])
        tweet_lns = set([x[0] for x in tweet_lns if x[1] == "inLoc"])

        # True Positives +++++++++++++++++++++++++++++++++++++++++++++++++++
        TPs = tweet_lns.intersection(lnex_lns)
        TPs_count += len(TPs)

        # Left in both sets ++++++++++++++++++++++++++++++++++++++++++++++++
        tweet_lns -= TPs
        lnex_lns -= TPs

        # Find overlapping LNs to be counted as 1/2 FPs and 1/2 FNs ++++++++
        overlaps = set()
        for x in tweet_lns:
            for y in lnex_lns:
                if do_they_overlap(x, y):
                    overlaps.add(x)
                    overlaps.add(y)

        overlaps_count += len(overlaps)

        # remove the overlapping lns from lnex_lns and tweet_lns
        lnex_lns -= overlaps
        tweet_lns -= overlaps

        # False Positives ++++++++++++++++++++++++++++++++++++++++++++++++++
        # lnex_lns = all - (TPs and overlaps and !inLoc)
        FPs = lnex_lns - tweet_lns
        FPs_count += len(FPs)
        if len(FPs) > 0:
            for x in FPs:
                FPs_set[tweet_text[x[0]:x[1]]].add(
                    (key, tweet_text[x[0] - 2:x[1] + 2], x))

        # False Negatives ++++++++++++++++++++++++++++++++++++++++++++++++++
        FNs = tweet_lns - lnex_lns
        FNs_count += len(FNs)
        if len(FNs) > 0:
            for x in FNs:
                FNs_set[tweet_text[x[0]:x[1]]].add(
                    (key, tweet_text[x[0] - 2:x[1] + 2], x))

    '''
    Since each overlap adds two LNs (one from lnex_lns and one from tweet_lns),
    counting them as 1/2 FPs and 1/2 FNs means multiplying overlaps_count by
    1/2 (because each overlap is counted twice) and by another 1/2 (because we
    want only half of the errors made).
    '''
    Precision = TPs_count / (TPs_count + FPs_count + 0.5 * .5 * overlaps_count)
    Recall = TPs_count / (TPs_count + FNs_count + 0.5 * .5 * overlaps_count)
    F_Score = (2 * Precision * Recall) / (Precision + Recall)

    #percentage_disambiguated = one_geolocation / all_geolocation

    return {"precision": Precision, "recall": Recall, "f-score": F_Score}

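# A worked sketch of the scoring above with hypothetical counts. Every overlapping
# ground-truth/prediction pair adds two entries to overlaps_count, so the
# 0.5 * 0.5 factor charges each pair as 1/2 a false positive and 1/2 a false negative.
TPs_count, FPs_count, FNs_count, overlaps_count = 80, 15, 20, 8
precision = TPs_count / (TPs_count + FPs_count + 0.5 * 0.5 * overlaps_count)  # 80 / 97  ~ 0.825
recall = TPs_count / (TPs_count + FNs_count + 0.5 * 0.5 * overlaps_count)     # 80 / 102 ~ 0.784
f_score = (2 * precision * recall) / (precision + recall)                     # ~ 0.804
print(precision, recall, f_score)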