def run(arguments):
    """Echo the received arguments, validate them and start the wiki fetch.

    Every entry of ``arguments`` is printed in key order. If no search term
    was supplied a message is logged and nothing is fetched; otherwise a
    ``WikiFetcher`` is built from the ``term``, ``suggest`` and ``n_results``
    entries and its ``fetch()`` method is called.

    Args:
        arguments: Mapping of option names to values. Must contain ``term``;
            ``suggest`` and ``n_results`` are read only when a term is given.
    """
    banner = '****************************'
    print(banner)
    print(' Start Get Wiki')
    print(banner)

    print('\nArguments:')
    for name in sorted(arguments):
        print(' - {:10} : {}'.format(name, arguments[name]))
    print()

    # Validate arguments: without a search term there is nothing to do.
    if not arguments['term']:
        logger.info(
            '[INTERUPTION] No term has been given. Please add a search term using -t or --term. Use -h, --help for help.'
        )
        return

    logger.info('Arguments are valid. Starting to scrap Wikipedia...')
    fetcher = WikiFetcher(
        arguments['term'],
        suggest=arguments['suggest'],
        n_results=arguments['n_results'],
    )
    fetcher.fetch()
def __init__(self, query, geojson_path, place_claim="P19", data_path=None):
    """Run the whole export: fetch items, resolve places, write the output files.

    Args:
        query: Query handed to ``WikiFetcher.get_items_from_query``.
        geojson_path: Destination path of the GeoJSON file.
        place_claim: Property id of the claim that points to an item's place.
        data_path: Destination path of the raw entity dump; passed through to
            ``save_database``, which picks a default when this is ``None``.
    """
    self.wf = WikiFetcher()
    self.pr = PlaceResolver("places.json")
    self.STEP = 50
    self.DEBUG_DIR = "debug"
    self.place_claim = place_claim
    self.db = {"entities": {}}
    self.place_info_cnt = 0

    items = self.wf.get_items_from_query(query)
    print("Fetched " + str(len(items)) + " from query...")

    self.process_items(items)
    self.write_geojson(geojson_path)
    self.save_database(data_path, geojson_path)
    self.pr.close_failed_places()

    print("Written " + str(self.place_info_cnt) + " items with geoinformation to geojson file...")
    # An empty query result must not crash the summary with a division by zero.
    if len(items) > 0:
        percentage = round(100 * (self.place_info_cnt / len(items)), 2)
    else:
        percentage = 0.0
    print("(Equals " + str(percentage) + "%)")
class GeoJSONWriter():
    """Export the items of a query as GeoJSON point features.

    Constructing an instance runs the whole pipeline: the items of ``query``
    are fetched in batches, the places they reference are resolved to
    coordinates, and both a GeoJSON file and a dump of the raw entity data
    are written.

    Attributes:
        wf: ``WikiFetcher`` used to run the query and download entities.
        pr: ``PlaceResolver`` that maps place ids to coordinates.
        STEP: Number of items requested per batch.
        place_claim: Property id of the claim that points to an item's place.
        db: ``{"entities": {...}}`` holding every downloaded entity by id.
        place_info_cnt: Number of features written to the GeoJSON file.
    """

    def __init__(self, query, geojson_path, place_claim="P19", data_path=None):
        """Run the export for ``query``.

        Args:
            query: Query handed to ``WikiFetcher.get_items_from_query``.
            geojson_path: Destination path of the GeoJSON file.
            place_claim: Property id of the claim that points to a place.
            data_path: Destination path of the raw entity dump; see
                ``save_database`` for the default used when ``None``.
        """
        self.wf = WikiFetcher()
        self.pr = PlaceResolver("places.json")
        self.STEP = 50
        self.DEBUG_DIR = "debug"
        self.place_claim = place_claim
        self.db = {"entities": {}}
        self.place_info_cnt = 0

        items = self.wf.get_items_from_query(query)
        print("Fetched " + str(len(items)) + " from query...")

        self.process_items(items)
        self.write_geojson(geojson_path)
        self.save_database(data_path, geojson_path)
        self.pr.close_failed_places()

        print("Written " + str(self.place_info_cnt) + " items with geoinformation to geojson file...")
        percentage = self._coverage_percent(self.place_info_cnt, len(items))
        print("(Equals " + str(percentage) + "%)")

    @staticmethod
    def _coverage_percent(found, total):
        """Return ``found`` as a percentage of ``total``; 0.0 when ``total`` is 0."""
        if not total:
            return 0.0
        return round(100 * (found / total), 2)

    def save_database(self, data_path, geojson_path):
        """Dump ``self.db`` as JSON.

        Args:
            data_path: Target file. When ``None`` the path is derived from
                ``geojson_path`` by replacing ``.geojson`` with ``-data.json``.
            geojson_path: Path of the GeoJSON file, used only for the default.
        """
        if data_path is None:
            data_path = geojson_path.replace(".geojson", "-data.json")
            if data_path == geojson_path:
                # No ".geojson" in the path: without this fallback the dump
                # would overwrite the GeoJSON file that was just written.
                data_path = os.path.splitext(geojson_path)[0] + "-data.json"

        dir_path = os.path.dirname(data_path)
        if dir_path:
            os.makedirs(dir_path, exist_ok=True)

        with open(data_path, "w", encoding="utf-8") as fh:
            json.dump(self.db, fh)

    def write_geojson(self, geojson_path):
        """Write one point feature per locatable entity to ``geojson_path``.

        An entity that carries its own ``P625`` claim is located by that
        claim. Otherwise the place referenced by ``self.place_claim`` is
        looked up in the resolver. Entities that cannot be located are
        skipped. ``self.place_info_cnt`` is increased for every feature.
        """
        geojson = {
            "type": "FeatureCollection",
            "crs": {
                "type": "name",
                "properties": {"name": "urn:ogc:def:crs:EPSG::4326"},
            },
            "features": [],
        }

        # TODO: set properties to be written via cmd-line
        for entity_id, entity in self.db["entities"].items():
            claims = entity["claims"]

            if "P625" in claims:
                # The entity has coordinates of its own, so use them directly.
                place_info, exception = self.pr.get_place_info_from_json(entity)
                if place_info is None:
                    # The claim exists but holds no usable value; skip the
                    # entity instead of failing on ``None["name"]``.
                    print("Exception processing entity: " + entity_id)
                    print(exception)
                    continue
                coords = [place_info["lon"], place_info["lat"]]
                feature = self.build_feature(entity_id, place_info["name"], coords)
                geojson["features"].append(feature)
                self.place_info_cnt += 1

            elif self.place_claim in claims:
                # Best effort per entity: one malformed claim must not stop
                # the export of all the others.
                try:
                    value = claims[self.place_claim][0]["mainsnak"]["datavalue"]["value"]
                    place_id = "Q" + str(value["numeric-id"])
                    if self.pr.has_place(place_id) and self.pr.place_not_none(place_id):
                        name = self.get_name(entity)
                        coords = self.pr.return_lon_lat(place_id)
                        geojson["features"].append(self.build_feature(entity_id, name, coords))
                        self.place_info_cnt += 1
                except Exception as e:
                    print("Exception processing entity: " + entity_id)
                    print(e)

        with open(geojson_path, "w", encoding="utf-8") as fh:
            json.dump(geojson, fh)

    def build_feature(self, wikiid, name, coords):
        """Return a GeoJSON point feature.

        Args:
            wikiid: Entity id stored in the ``wikiid`` property.
            name: Label stored in the ``name`` property.
            coords: ``[lon, lat]`` pair used as the point coordinates.
        """
        return {
            "properties": {"wikiid": wikiid, "name": name},
            "geometry": {"type": "Point", "coordinates": coords},
            "type": "Feature",
        }

    def get_name(self, entity):
        """Return a display name for ``entity``.

        The English label is preferred. Without one, the first non-empty
        sitelink title is used; if there is none either, ``""`` is returned.
        """
        labels = entity["labels"]
        if "en" in labels and "value" in labels["en"]:
            return labels["en"]["value"]

        # No value for en, check for sitelinks
        for link in entity["sitelinks"].values():
            if "title" in link and link["title"] != "":
                return link["title"]
        return ""

    def process_items(self, items):
        """Download ``items`` in batches of ``self.STEP`` and queue their places.

        Every downloaded entity is stored in ``self.db["entities"]``. When an
        entity has a ``self.place_claim`` claim whose place is not yet known
        to the resolver, the place id is queued for resolution.
        """
        for start in range(0, len(items), self.STEP):
            print("Processing items " + str(start) + " to " + str(start + self.STEP))
            batch = items[start:start + self.STEP]
            batch_json = self.wf.get_json_for_id_list(batch)

            for entity_id, entity in batch_json["entities"].items():
                self.db["entities"][entity_id] = entity
                if self.place_claim not in entity["claims"]:
                    continue
                try:
                    value = entity["claims"][self.place_claim][0]["mainsnak"]["datavalue"]["value"]
                    place_id = "Q" + str(value["numeric-id"])
                except (KeyError, IndexError, TypeError) as e:
                    # The claim has no usable value; nothing to queue.
                    print("Exception: ")
                    print(e)
                    continue
                if not self.pr.has_place(place_id):
                    self.pr.append_id_to_resolver(place_id)

            # NOTE(review): the original formatting did not show whether this
            # call sat inside or after the loop. Resolving once per batch
            # reaches the same final state and keeps each lookup small.
            self.pr.resolve_pending_ids()
class PlaceResolver():
    """Resolve place ids to coordinates and cache the results in memory.

    Attributes:
        wf: ``WikiFetcher`` used to download the place entities.
        db: Maps a place id to ``{"lat", "lon", "name"}``, or to ``None`` when
            the place was downloaded but no coordinates could be read.
        resolve_list: Place ids queued for the next ``resolve_pending_ids``.
        failed_places_no_point: Open text file that receives one line per
            place that has no ``P625`` claim (geolocation for a point).
    """

    def __init__(self, db_path):
        """Create an empty resolver and open the ``failed_places.info`` log.

        Args:
            db_path: Passed to ``init_database``.
        """
        self.wf = WikiFetcher()
        self.db = self.init_database(db_path)
        self.resolve_list = []
        # Log of places that cannot be located because the referenced entity
        # has no P625 property (geolocation for a point).
        self.failed_places_no_point = open("failed_places.info", "w+")
        # Ids already written to the log. A file object cannot be used for
        # this membership test, so the ids are tracked separately.
        self._failed_place_ids = set()

    def close_failed_places(self):
        """Close the log of places without coordinates."""
        self.failed_places_no_point.close()

    def append_id_to_resolver(self, place_id):
        """Queue ``place_id`` for the next call to ``resolve_pending_ids``."""
        self.resolve_list.append(place_id)

    def has_place(self, place_id):
        """Return ``True`` if ``place_id`` has an entry in the database."""
        return place_id in self.db

    def place_not_none(self, place_id):
        """Return ``True`` if ``place_id`` is known and has place information."""
        try:
            return self.db[place_id] is not None
        except (KeyError, TypeError):
            return False

    def return_lat_lon(self, place_id):
        """Return ``[lat, lon]`` of a resolved place."""
        place_info = self.db[place_id]
        return [place_info["lat"], place_info["lon"]]

    def return_lon_lat(self, place_id):
        """Return ``[lon, lat]`` of a resolved place."""
        place_info = self.db[place_id]
        return [place_info["lon"], place_info["lat"]]

    def resolve_pending_ids(self):
        """Download and store all queued place ids that are not yet known.

        Places whose entity has no ``P625`` claim are stored as ``None`` and
        written to the failed-places log. Afterwards ``resolve_list`` holds
        only the ids that are still missing from the database.

        Returns:
            ``False`` when there was nothing to resolve, otherwise ``None``.
        """
        # dict.fromkeys drops duplicate ids while keeping their order.
        pending = [pid for pid in dict.fromkeys(self.resolve_list) if pid not in self.db]
        if not pending:
            # Everything queued is already known; skip the pointless request.
            self.resolve_list = []
            return False

        json_resp = self.wf.get_json_for_id_list(pending)
        if "entities" in json_resp:
            for place, entity in json_resp["entities"].items():
                if place in self.db:
                    continue
                place_info, exception = self.get_place_info_from_json(entity)
                self.db[place] = place_info
                if exception is not None and self._is_missing_point(exception):
                    self._record_failed_place(place)

        # Keep only what is still unresolved so the queue cannot grow forever.
        self.resolve_list = [pid for pid in pending if pid not in self.db]

    @staticmethod
    def _is_missing_point(exception):
        """Return ``True`` if ``exception`` reports a missing ``P625`` key."""
        return bool(exception.args) and exception.args[0] == "P625"

    def _record_failed_place(self, place):
        """Write ``place`` to the failed-places log once."""
        if place in self._failed_place_ids:
            return
        self._failed_place_ids.add(place)
        self.failed_places_no_point.write(place + "\n")
        self.failed_places_no_point.flush()

    def get_place_info_from_json(self, json):
        """Extract coordinates and a name from one entity.

        The name is the English label, else the value of the first ``P373``
        claim, else ``""``.

        Args:
            json: Entity dict with ``claims`` and optionally ``labels``. The
                parameter name shadows the ``json`` module inside this method
                and is kept for compatibility with keyword callers.

        Returns:
            ``({"lat", "lon", "name"}, None)`` on success, or ``(None, error)``
            when the coordinates cannot be read; the error is also printed.
        """
        try:
            point = json["claims"]["P625"][0]["mainsnak"]["datavalue"]["value"]
            lat = point["latitude"]
            lon = point["longitude"]
            try:
                name = json["labels"]["en"]["value"]
            except (KeyError, IndexError, TypeError):
                try:
                    name = json["claims"]["P373"][0]["mainsnak"]["datavalue"]["value"]
                except (KeyError, IndexError, TypeError):
                    name = ""
            return {"lat": lat, "lon": lon, "name": name}, None
        except Exception as e:
            # The error is part of the return value, so every failure is
            # reported to the caller instead of being raised.
            print(e)
            return None, e

    def init_database(self, db_path):
        """Return an empty place database; ``db_path`` is currently unused."""
        return {}

    def save_database(self, db_path):
        """Dump the place database as JSON to ``db_path``.

        The parent directory is created when it is missing. A bare file name
        without a directory part is written to the current directory.
        """
        dir_path = os.path.dirname(db_path)
        if dir_path:
            os.makedirs(dir_path, exist_ok=True)
        with open(db_path, "w", encoding="utf-8") as fh:
            json.dump(self.db, fh)
        print("Data was saved to " + db_path + "!")
def __init__(self, db_path):
    """Set up the fetcher, the place database, the queue and the failure log.

    Args:
        db_path: Handed to ``init_database`` to build ``self.db``.
    """
    # Ids waiting to be resolved; starts out empty.
    self.resolve_list = list()
    self.wf = WikiFetcher()
    self.db = self.init_database(db_path)
    # Holds the list of places that could not be retrieved.
    self.failed_places_no_point = open("failed_places.info", mode="w+")