out = codecs.open(output_file, u"w", encoding=u"utf-8")
for line in codecs.open(input_file, u"r", encoding=u"utf-8"):
    counter += 1
    if counter < start:
        continue
    if counter > finish:
        break
    if counter % frequency == 0:
        if not filtering:
            out.write(line)
            saved_count += 1
        else:
            # Each line is tab-separated: columns 0 and 1 hold the Wikipedia coordinates,
            # column 5 holds the toponym tokens as a Python list literal.
            split = line.split(u"\t")
            wiki_coordinates = (float(split[0]), float(split[1]))
            name = u" ".join(eval(split[5])).strip()
            db_coordinates = get_coordinates(c, name)
            distance = []
            for candidate in db_coordinates:
                distance.append(great_circle(wiki_coordinates, (float(candidate[0]), float(candidate[1]))).kilometers)
            distance = sorted(distance)
            # Drop the sample if even the closest GeoNames candidate is too far from the Wikipedia coordinates.
            if distance[0] > max_distance:
                print(name, distance[0])
                filtered_count += 1
            else:
                out.write(line)
                saved_count += 1
print(u"Saved", saved_count, u"samples.")
if filtering:
    print(u"Filtered:", filtered_count, u"samples.")
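
# Standalone sketch of the distance filter above, not part of the original script:
# the helper name, coordinates and max_distance value below are illustrative assumptions.
# A sample is kept only if the closest GeoNames candidate lies within max_distance km
# of the Wikipedia coordinates, mirroring the check in the loop above.
from geopy.distance import great_circle  # same dependency the script already relies on


def keeps_sample(wiki_coordinates, db_coordinates, max_distance):
    # Sort candidate distances and compare the closest one against the threshold.
    distances = sorted(great_circle(wiki_coordinates, (float(c[0]), float(c[1]))).kilometers
                       for c in db_coordinates)
    return len(distances) > 0 and distances[0] <= max_distance


# Example: Wikipedia says (48.85, 2.35); candidates are Paris, France and Paris, Texas.
# keeps_sample((48.85, 2.35), [(48.8566, 2.3522), (33.66, -95.54)], max_distance=200) -> True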
def geoparse(text, result_string, model, word_to_index):
    """
    This function allows one to geoparse text, i.e. extract toponyms (place names)
    and disambiguate them to coordinates.
    :param text: the text to be parsed
    :param result_string: string to which the formatted toponym records are appended
    :param model: the trained CamCoder Keras model
    :param word_to_index: vocabulary mapping from word to embedding index
    :return: result_string with one "name,,name,,lat,,lon,,start,,end||" record per resolved toponym
    """
    nlp = spacy.load(u'en')  # or spacy.load(u'en_core_web_lg') depending on your Spacy download (simple or full)
    conn = sqlite3.connect(u'/opt/gsda/EUPEG/Geoparsers/camCoder/data/geonames.db').cursor()  # this DB can be downloaded using the GitHub link
    padding = nlp(u"0")[0]  # dummy token used to pad short context windows
    doc = nlp(text)  # NER with Spacy
    for entity in doc.ents:
        if entity.label_ in [u"GPE", u"FACILITY", u"LOC", u"FAC", u"LOCATION"]:
            # Strip a leading "the " from the toponym and shift the character offsets accordingly.
            name = entity.text if not entity.text.startswith('the') else entity.text[4:].strip()
            start = entity.start_char if not entity.text.startswith('the') else entity.start_char + 4
            end = entity.end_char
            # Build the near and far context windows around the entity, padded to CONTEXT_LENGTH // 2 tokens per side.
            near_inp = pad_list(CONTEXT_LENGTH // 2, [x for x in doc[max(0, entity.start - CONTEXT_LENGTH // 2):entity.start]], True, padding) + \
                       pad_list(CONTEXT_LENGTH // 2, [x for x in doc[entity.end: entity.end + CONTEXT_LENGTH // 2]], False, padding)
            far_inp = pad_list(CONTEXT_LENGTH // 2, [x for x in doc[max(0, entity.start - CONTEXT_LENGTH):max(0, entity.start - CONTEXT_LENGTH // 2)]], True, padding) + \
                      pad_list(CONTEXT_LENGTH // 2, [x for x in doc[entity.end + CONTEXT_LENGTH // 2: entity.end + CONTEXT_LENGTH]], False, padding)
            map_vector = text2mapvec(doc=near_inp + far_inp, mapping=ENCODING_MAP_1x1, outliers=OUTLIERS_MAP_1x1,
                                     polygon_size=1, db=conn, exclude=name)

            context_words, entities_strings = [], []
            target_string = pad_list(TARGET_LENGTH, [x.text.lower() for x in entity], True, u'0')
            target_string = [word_to_index[x] if x in word_to_index else word_to_index[UNKNOWN] for x in target_string]
            # Map each context token to its vocabulary index, routing entity tokens and plain words to separate inputs.
            for words in [near_inp, far_inp]:
                for word in words:
                    if word.text.lower() in word_to_index:
                        vec = word_to_index[word.text.lower()]
                    else:
                        vec = word_to_index[UNKNOWN]
                    if word.ent_type_ in [u"GPE", u"FACILITY", u"LOC", u"FAC", u"LOCATION"]:
                        entities_strings.append(vec)
                        context_words.append(word_to_index[u'0'])
                    elif word.is_alpha and not word.is_stop:
                        context_words.append(vec)
                        entities_strings.append(word_to_index[u'0'])
                    else:
                        context_words.append(word_to_index[u'0'])
                        entities_strings.append(word_to_index[u'0'])

            prediction = model.predict([np.array([context_words]), np.array([context_words]),
                                        np.array([entities_strings]), np.array([entities_strings]),
                                        np.array([map_vector]), np.array([target_string])])
            prediction = index_to_coord(REVERSE_MAP_2x2[np.argmax(prediction[0])], 2)
            candidates = get_coordinates(conn, name)

            if len(candidates) == 0:
                # print(u"Don't have an entry for", name, u"in GeoNames")
                continue

            max_pop = candidates[0][2]
            best_candidate = []
            bias = 0.905  # Tweak this parameter depending on the domain you're working with.
            # Less than 0.9 is suitable for ambiguous text, more than 0.9 for less ambiguous locations, see the paper.
            for candidate in candidates:
                err = great_circle(prediction, (float(candidate[0]), float(candidate[1]))).km
                best_candidate.append((err - (err * max(1, candidate[2]) / max(1, max_pop)) * bias,
                                       (float(candidate[0]), float(candidate[1]))))
            # Keep the candidate with the lowest population-biased error.
            best_candidate = sorted(best_candidate, key=lambda t: t[0])[0]
            # England,, England,, 51.5,, -0.11,, 669,, 676 || - use evaluation script to test correctness
            # print(name, start, end)
            # print(u"Coordinates:", best_candidate[1])
            name = name.encode('utf-8')
            one_toponym = "{0},,{1},,{2},,{3},,{4},,{5}||".format(name, name, best_candidate[1][0],
                                                                  best_candidate[1][1], start, end)
            result_string = result_string + one_toponym
    return result_string
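
# Minimal usage sketch, not part of the original file: the weights path and the
# vocabulary pickle below are placeholder names (assumptions), point them at
# whatever your checkout of the repository actually ships.
if __name__ == "__main__":
    import pickle
    from keras.models import load_model

    cam_model = load_model(u"../data/weights")  # placeholder path to the trained CamCoder weights
    with open(u"../data/words2index.pkl", "rb") as f:  # placeholder path to the word -> index mapping
        vocab = pickle.load(f)
    print(geoparse(u"I visited Melbourne and Sydney last year.", u"", cam_model, vocab))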
print(u'Crunching numbers, sit tight...')
# errors = codecs.open(u"errors.tsv", u"w", encoding=u"utf-8")
# Uncomment the above line for error diagnostics, also the section below.
conn = sqlite3.connect(u'../data/geonames.db')
file_name = u"data/eval_" + test_data + u".txt"
final_errors = []
print(u"Processing file... " + file_name)
for prediction, (y, name, context) in zip(
        model.predict_generator(generate_arrays_from_file(file_name, word_to_index, train=False),
                                steps=int(check_output([u"wc", file_name]).split()[0]) // BATCH_SIZE,
                                verbose=True),
        generate_strings_from_file(file_name)):
    prediction = index_to_coord(REVERSE_MAP_2x2[np.argmax(prediction)], 2)
    candidates = get_coordinates(conn.cursor(), name)

    if len(candidates) == 0:
        print(u"Don't have an entry for", name, u"in GeoNames")
        raise Exception(u"Check your database, buddy :-)")

    # candidates = [candidates[0]]  # Uncomment for the population-only baseline implementation.
    # print(u"Prediction... " + u" ".join(y) + u"..." + name + u".." + str(context))
    print(u"Prediction... " + str(y) + u"...." + name + u".." + context)
    best_candidate = []
    max_pop = candidates[0][2]
    bias = 0.905  # the bias parameter in the paper
    for candidate in candidates:
        print(candidate)