def run(arguments):
    """Print a start banner and the given arguments, then fetch the search term.

    ``arguments`` is a mapping that is indexed by key. The ``term`` entry is
    always read; when it is falsy an informational message is logged and
    nothing is fetched. Otherwise a ``WikiFetcher`` is created from the
    ``term``, ``suggest`` and ``n_results`` entries and its ``fetch()``
    method is called.
    """
    banner_edge = '****************************'
    print(banner_edge)
    print('       Start Get Wiki')
    print(banner_edge)

    # Echo every argument, ordered by name, so a run is easy to reproduce.
    print('\nArguments:')
    for name in sorted(arguments):
        print('  - {:10} : {}'.format(name, arguments[name]))
    print()

    # Validate arguments: without a search term there is nothing to do.
    if not arguments['term']:
        logger.info(
            '[INTERUPTION] No term has been given. Please add a search term using -t or --term. Use -h, --help for help.'
        )
        return

    logger.info('Arguments are valid. Starting to scrap Wikipedia...')

    fetcher = WikiFetcher(
        arguments['term'],
        suggest=arguments['suggest'],
        n_results=arguments['n_results'],
    )
    fetcher.fetch()
Ejemplo n.º 2
0
 def __init__(self, query, geojson_path, place_claim="P19", data_path=None):
     """Run the complete export: fetch the items, process them, write the files.

     Args:
         query: Passed unchanged to ``WikiFetcher.get_items_from_query``.
         geojson_path: Destination handed to ``write_geojson``.
         place_claim: Claim id whose first statement names the place of an
             entity (defaults to ``"P19"``).
         data_path: Destination handed to ``save_database`` together with
             ``geojson_path``; may be ``None``.
     """
     self.wf = WikiFetcher()
     self.pr = PlaceResolver("places.json")
     self.STEP = 50
     self.DEBUG_DIR = "debug"
     self.place_claim = place_claim
     self.db = {"entities": {}}
     self.place_info_cnt = 0
     items = self.wf.get_items_from_query(query)
     print("Fetched " + str(len(items)) + " from query...")
     try:
         self.process_items(items)
         self.write_geojson(geojson_path)
         self.save_database(data_path, geojson_path)
     finally:
         # Release the resolver's file even when one of the steps above fails.
         self.pr.close_failed_places()
     print("Written " + str(self.place_info_cnt) + " items with geoinformation to geojson file...")
     # An empty query result must not end the run with a ZeroDivisionError.
     total = len(items)
     percentage = round(100 * (self.place_info_cnt / total), 2) if total else 0.0
     print("(Equals " + str(percentage) + "%)")
Ejemplo n.º 3
0
class GeoJSONWriter():
    """Fetch the items matching a query and export those with coordinates as GeoJSON.

    Creating an instance runs the whole pipeline: the item ids are fetched
    through ``WikiFetcher``, their entity JSON is collected in ``self.db``,
    referenced places are resolved through ``PlaceResolver``, and finally a
    GeoJSON file plus a JSON dump of ``self.db`` are written.
    """

    def __init__(self, query, geojson_path, place_claim="P19", data_path=None):
        """Run the complete export.

        Args:
            query: Passed unchanged to ``WikiFetcher.get_items_from_query``.
            geojson_path: Path of the GeoJSON file to write.
            place_claim: Claim id whose first statement names the place of an
                entity (defaults to ``"P19"``).
            data_path: Path for the JSON dump of the fetched entities. When
                ``None`` it is derived from ``geojson_path``.
        """
        self.wf = WikiFetcher()
        self.pr = PlaceResolver("places.json")
        self.STEP = 50  # number of item ids requested per batch
        self.DEBUG_DIR = "debug"  # not referenced by any method of this class
        self.place_claim = place_claim
        self.db = {"entities": {}}
        self.place_info_cnt = 0
        items = self.wf.get_items_from_query(query)
        print("Fetched " + str(len(items)) + " from query...")
        try:
            self.process_items(items)
            self.write_geojson(geojson_path)
            self.save_database(data_path, geojson_path)
        finally:
            # Release the resolver's file even when one of the steps above fails.
            self.pr.close_failed_places()
        print("Written " + str(self.place_info_cnt) + " items with geoinformation to geojson file...")
        # An empty query result must not end the run with a ZeroDivisionError.
        total = len(items)
        percentage = round(100 * (self.place_info_cnt / total), 2) if total else 0.0
        print("(Equals " + str(percentage) + "%)")

    def save_database(self, data_path, geojson_path):
        """Dump ``self.db`` as JSON, creating the target directory if needed.

        Args:
            data_path: Destination file. When ``None`` the path is derived
                from ``geojson_path`` by replacing ``".geojson"`` with
                ``"-data.json"``.
            geojson_path: Path of the GeoJSON file; only used to derive the
                default ``data_path``.
        """
        if data_path is None:
            data_path = geojson_path.replace(".geojson", "-data.json")
            if data_path == geojson_path:
                # No ".geojson" in the name: append the suffix instead, so the
                # dump cannot overwrite the GeoJSON file itself.
                data_path = geojson_path + "-data.json"
        dir_path = os.path.dirname(data_path)
        if dir_path:
            os.makedirs(dir_path, exist_ok=True)
        with open(data_path, "w+") as fh:
            json.dump(self.db, fh)

    def write_geojson(self, geojson_path):
        """Build one point feature per locatable entity and write the collection.

        An entity that carries a ``P625`` claim is located by its own
        coordinates. Otherwise, if it carries ``self.place_claim``, the
        coordinates of the referenced place are taken from the resolver.
        ``self.place_info_cnt`` is increased for every feature written.
        Entities that cannot be located are skipped with a console message.

        Args:
            geojson_path: Path of the GeoJSON file to write.
        """
        geojson = {
            "type": "FeatureCollection",
            "crs": {
                "type": "name",
                "properties": {"name": "urn:ogc:def:crs:EPSG::4326"},
            },
            "features": [],
        }
        features = geojson["features"]
        for entity_id, entity_json in self.db["entities"].items():
            # TODO: set properties to be written via cmd-line
            # Entries without any claims cannot be located in either way.
            claims = entity_json.get("claims", {})
            # If P625 is present on the object itself, use its own id and
            # coordinates. Known problem: the same entity is processed twice.
            if "P625" in claims:
                place_info, _error = self.pr.get_place_info_from_json(entity_json)
                if place_info is None:
                    # The claim exists but holds no readable coordinate value.
                    print("No usable P625 coordinates for entity: " + entity_id)
                    continue
                coords = [place_info["lon"], place_info["lat"]]
                features.append(self.build_feature(entity_id, place_info["name"], coords))
                self.place_info_cnt += 1
            elif self.place_claim in claims:
                try:
                    place_id = self._claimed_place_id(claims)
                    if self.pr.has_place(place_id) and self.pr.place_not_none(place_id):
                        name = self.get_name(entity_json)
                        coords = self.pr.return_lon_lat(place_id)
                        features.append(self.build_feature(entity_id, name, coords))
                        self.place_info_cnt += 1
                except Exception as e:
                    # Best effort: one malformed entity must not abort the export.
                    print("Exception processing entity: " + entity_id)
                    print(e)

        with open(geojson_path, "w+") as fh:
            json.dump(geojson, fh)

    def build_feature(self, wikiid, name, coords):
        """Return a GeoJSON point feature.

        Args:
            wikiid: Stored as the ``wikiid`` property.
            name: Stored as the ``name`` property.
            coords: Stored unchanged as the point's ``coordinates``.
        """
        return {
            "properties": {"wikiid": wikiid, "name": name},
            "geometry": {"type": "Point", "coordinates": coords},
            "type": "Feature",
        }

    def get_name(self, entity):
        """Return a display name for ``entity``.

        The English label is preferred. Without one, the first non-empty
        sitelink title is used, and ``""`` if there is none either.
        ``entity`` must contain the keys ``labels`` and ``sitelinks``.
        """
        english = entity["labels"].get("en") if "en" in entity["labels"] else None
        if english is not None and "value" in english:
            return english["value"]
        # No value for en, check for sitelinks
        for link in entity["sitelinks"].values():
            if "title" in link and link["title"] != "":
                return link["title"]
        return ""

    def process_items(self, items):
        """Fetch the entity JSON for ``items`` in batches of ``self.STEP`` ids.

        Every returned entity is stored in ``self.db["entities"]``. The place
        referenced by ``self.place_claim`` is queued at the resolver when the
        resolver does not know it yet, and the queue is resolved after each
        batch.

        Args:
            items: Sequence of item ids; it is sliced into batches.
        """
        for start in range(0, len(items), self.STEP):
            stop = start + self.STEP
            print("Processing items " + str(start) + " to " + str(stop))
            curr_json = self.wf.get_json_for_id_list(items[start:stop])
            if "entities" in curr_json:
                entities = curr_json["entities"]
            else:
                print("No entities returned for items " + str(start) + " to " + str(stop))
                entities = {}
            for entity_id, entity_json in entities.items():
                self.db["entities"][entity_id] = entity_json
                claims = entity_json.get("claims", {})
                if self.place_claim in claims:
                    try:
                        curr_place_id = self._claimed_place_id(claims)
                        if not self.pr.has_place(curr_place_id):
                            self.pr.append_id_to_resolver(curr_place_id)
                    except Exception as e:
                        # Statements without a value carry no numeric id.
                        print("Exception: ")
                        print(e)
            self.pr.resolve_pending_ids()

    def _claimed_place_id(self, claims):
        """Return the "Q"-prefixed id from the first ``self.place_claim`` statement.

        Raises ``KeyError``, ``IndexError`` or ``TypeError`` when the statement
        does not have the expected nested structure.
        """
        numeric_id = claims[self.place_claim][0]["mainsnak"]["datavalue"]["value"]["numeric-id"]
        return "Q" + str(numeric_id)
Ejemplo n.º 4
0
class PlaceResolver():
    """Resolve place ids to coordinates and keep the results in memory.

    ``self.db`` maps a place id to ``{"lat", "lon", "name"}``, or to ``None``
    when no coordinates could be read for that place.
    """

    def __init__(self, db_path):
        """Create an empty resolver and open the failed-places file.

        Args:
            db_path: Forwarded to ``init_database``.
        """
        self.wf = WikiFetcher()
        self.db = self.init_database(db_path)
        self.resolve_list = []
        # Ids already written to the failed-places file, so none is listed twice.
        self._failed_place_ids = set()
        # Holds the list of places that could not be retrieved because they
        # refer to an object without a P625 property (geolocation for a point).
        self.failed_places_no_point = open("failed_places.info", "w+")

    def close_failed_places(self):
        """Close the failed-places file opened by ``__init__``."""
        self.failed_places_no_point.close()

    def append_id_to_resolver(self, place_id):
        """Queue ``place_id`` for the next ``resolve_pending_ids`` call."""
        self.resolve_list.append(place_id)

    def has_place(self, place_id):
        """Return ``True`` when ``place_id`` has an entry in ``self.db``."""
        return place_id in self.db

    def place_not_none(self, place_id):
        """Return ``True`` when ``place_id`` is known and has place information."""
        try:
            return self.db[place_id] is not None
        except (KeyError, TypeError):
            return False

    def return_lat_lon(self, place_id):
        """Return ``[lat, lon]`` of a resolved place."""
        place_info = self.db[place_id]
        return [place_info["lat"], place_info["lon"]]

    def return_lon_lat(self, place_id):
        """Return ``[lon, lat]`` of a resolved place."""
        place_info = self.db[place_id]
        return [place_info["lon"], place_info["lat"]]

    def resolve_pending_ids(self):
        """Fetch and store the place information of all queued ids.

        Ids that are already in ``self.db`` and duplicate queue entries are
        dropped before the request. Ids missing from the response stay queued
        for the next call. Places whose coordinates cannot be read are stored
        as ``None``; those lacking ``P625`` are also written to the
        failed-places file.

        Returns:
            ``False`` when there was nothing to resolve, otherwise ``None``.
        """
        pending = [pid for pid in dict.fromkeys(self.resolve_list) if pid not in self.db]
        if not pending:
            self.resolve_list = []
            return False
        json_resp = self.wf.get_json_for_id_list(pending)
        if "entities" in json_resp:
            for place, place_json in json_resp["entities"].items():
                if place in self.db:
                    continue
                place_info, exception = self.get_place_info_from_json(place_json)
                self.db[place] = place_info
                # A KeyError for the missing claim carries the claim id as its
                # first argument; slicing avoids an IndexError on empty args.
                if exception is not None and exception.args[:1] == ("P625",):
                    self._record_failed_place(place)
        self.resolve_list = [pid for pid in pending if pid not in self.db]

    def _record_failed_place(self, place):
        """Write ``place`` to the failed-places file once."""
        if place in self._failed_place_ids:
            return
        self._failed_place_ids.add(place)
        self.failed_places_no_point.write(place + "\n")
        self.failed_places_no_point.flush()

    def get_place_info_from_json(self, json):
        """Extract coordinates and a name from one entity.

        Args:
            json: Entity dictionary. The coordinates are read from the first
                ``P625`` claim; the name from the English label, falling back
                to the value of the first ``P373`` claim and then to ``""``.

        Returns:
            ``({"lat", "lon", "name"}, None)`` on success, or ``(None, error)``
            with the exception raised while reading the coordinates.
        """
        try:
            coordinate = json["claims"]["P625"][0]["mainsnak"]["datavalue"]["value"]
            lat = coordinate["latitude"]
            lon = coordinate["longitude"]
        except Exception as e:
            print(e)
            return None, e

        try:
            name = json["labels"]["en"]["value"]
        except (LookupError, TypeError):
            try:
                name = json["claims"]["P373"][0]["mainsnak"]["datavalue"]["value"]
            except (LookupError, TypeError):
                name = ""

        return {
            "lat": lat,
            "lon": lon,
            "name": name
        }, None

    def init_database(self, db_path):
        """Return an empty place database; ``db_path`` is currently not read."""
        return {}

    def save_database(self, db_path):
        """Write ``self.db`` as JSON to ``db_path``.

        The parent directory is created when it does not exist. A bare file
        name, which has no directory part, is written to the current
        directory.
        """
        dir_path = os.path.dirname(db_path)
        if dir_path:
            os.makedirs(dir_path, exist_ok=True)
        with open(db_path, "w+") as fh:
            json.dump(self.db, fh)
        print("Data was saved to " + db_path + "!")
Ejemplo n.º 5
0
 def __init__(self, db_path):
     self.wf = WikiFetcher()
     self.db = self.init_database(db_path)
     self.resolve_list = []
     self.failed_places_no_point = open("failed_places.info", "w+") # holds list of non retrieval places