def process_dupes():
    items = {}
    qids = []
    dupes = []

    # First dedupe by QID
    for item in Knead("data/reliwiki/query.csv").data():
        if item["item"] not in qids:
            reliwiki = item["reliwiki"]

            if reliwiki not in items:
                items[reliwiki] = [item]
            else:
                items[reliwiki].append(item)

            qids.append(item["item"])

    for values in items.values():
        if len(values) > 1:
            dupes = dupes + values

    Knead(dupes).write("data/reliwiki/dupes.csv", fieldnames=[
        "item", "reliwiki", "itemLabel", "itemDescription", "instanceLabel",
        "instance"
    ])
def find_titles():
    SPLIT_DASH = re.compile(" –|- ")

    def get_title(pageid):
        html_path = f"data/reliwiki/html/{pageid}.html"
        # print(f"Extracting title from {html_path}")

        with open(html_path) as f:
            soup = BeautifulSoup(f.read(), "lxml")

        title = soup.select_one("title").get_text().strip()
        parts = SPLIT_DASH.split(title)[1:]
        title = "- ".join(parts).replace("- Reliwiki", "").strip()
        return title

    titles = []

    for item in Knead("data/reliwiki/churches_gsheet.csv").data():
        if item["name"] != "":
            continue

        # Try to extract from the page title
        pageid = item["pageid"]

        try:
            title = get_title(pageid)
        except Exception as e:
            print(f"Could not fetch title because of {e}")
            continue

        print(f"Got '{title}'")
        titles.append({"pageid": pageid, "title": title})

    Knead(titles).write("data/reliwiki/church_extracted_titles.csv")
def process_rmm():
    items = []
    qids = {
        i["rmm"]: i["item"] for i in Knead("data/reliwiki/rmm-all.csv").data()
    }

    for path in iter_html():
        with open(path["path"]) as f:
            matches = list(set(RMM_ID.findall(f.read())))

        if len(matches) == 0:
            continue

        for rmm in matches:
            if rmm not in qids:
                print(f"No QID for pageid {path['id']} (RMM {rmm})")
                continue

            items.append({
                "pageid": path["id"],
                "rmm": rmm,
                "qid": qids.get(rmm, None)
            })

    Knead(items).write("data/reliwiki/rmm.csv",
                       fieldnames=["pageid", "rmm", "qid"])
def propvalue(claim):
    claim = Knead(claim)

    return {
        "id": claim.query("mainsnak/datavalue/value/id").data(),
        "property": claim.query("mainsnak/property").data()
    }
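# Usage sketch, added for illustration and not part of the original scripts:
# propvalue() expects a single claim in the standard Wikibase JSON layout,
# with a "mainsnak" holding the property id and an item datavalue. The claim
# below is a made-up example.
def propvalue_example():
    example_claim = {
        "mainsnak": {
            "property": "P31",
            "datavalue": {"value": {"id": "Q16970"}},
        }
    }

    # Should return {"id": "Q16970", "property": "P31"}
    return propvalue(example_claim)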
def parse_page(path):
    print(f"Parsing {path}")
    json_path = Path(path).with_suffix(".json")

    if json_path.exists():
        print(f"Got JSON file, returning that: {json_path}")
        return Knead(str(json_path)).data()

    with open(path) as f:
        soup = BeautifulSoup(f.read(), "lxml")

    # I guess the first table in mw-content-text is always the infobox
    table = soup.select_one("#mw-content-text table")

    if not table:
        print("Could not find table")
        return None

    infobox = {
        "coordinates": None,
        "pageid": Path(path).stem,
        "rijksmonument": []
    }

    for tr in table.select('tr[valign="top"]'):
        td = tr.select("td")

        if len(td) < 2:
            continue

        key = td[0].get_text().strip()
        val = td[1].get_text().strip()
        infobox[key] = val

    # Extract external references
    for a in table.select("a"):
        href = a.get("href")

        if RMM_ID.match(href):
            matches = RMM_ID.findall(href)
            infobox["rijksmonument"].append(matches[0])

    # Extract geocoordinates
    el_mapdata = soup.select_one("#map_leaflet_1 .mapdata")

    if el_mapdata:
        mapdata = json.loads(el_mapdata.get_text())

        if "locations" in mapdata and len(mapdata["locations"]) > 0:
            loc = mapdata["locations"][0]
            infobox["coordinates"] = f"{loc['lat']},{loc['lon']}"

    infobox["rijksmonument"] = ",".join(infobox["rijksmonument"])
    Knead(infobox).write(json_path)
    print(f"Written {json_path}")

    return infobox
def save(self):
    if Path(self.out_file).suffix == ".csv":
        Knead(self.results).write(
            self.out_file,
            fieldnames=self.reconciler.FIELDNAMES
        )
    else:
        Knead(self.results).write(self.out_file)

    print(f"Written to '{self.out_file}'")
def create_csv():
    items = []
    geojson = Knead("data/sonneveld/kerkenkaart.json").data()

    for feature in geojson["features"]:
        prop = feature["properties"]
        coord = feature["geometry"]["coordinates"]
        prop["lat"] = coord[1]
        prop["lon"] = coord[0]
        items.append(prop)

    Knead(items).write("data/sonneveld/kerkenkaart.csv")
def main():
    items = Knead(PATH + "/data/uds/monuments-with-qids.csv").data()
    skiplist = Skiplist("projects/skiplists/uds.txt")

    for index, item in enumerate(items):
        print(item)
        qid = item["qid"]
        bag = item["bag_ok"]
        url = item["url"]

        print()
        print(f"#{index} / #{len(items)}")
        print(f"Handling {qid} / {bag} / {url}")

        if skiplist.has(qid):
            print(f"{qid} in skiplist, skipping")
            continue

        wd_item = WikidataItem(qid)
        claims = wd_item.get_claims()

        if Props.BAG_BUILDING in claims:
            print("This item already has a BAG building ID, skipping")
            continue

        wd_item.add_string_claim(
            Props.BAG_BUILDING,
            bag,
            references=[
                wd_item.get_item_claim(Props.STATED_IN, Items.UDS_DOC),
                wd_item.get_url_claim(Props.REF_URL, url),
                wd_item.get_item_claim(Props.LANGUAGE_WORK, Items.DUTCH)
            ])

        skiplist.add(qid)
def parse_overviews():
    items = []

    for path in OVERVIEW_PATH.glob("*.html"):
        with open(path) as f:
            soup = BeautifulSoup(f.read(), "lxml")

        print(f"Parsing {path}")

        for row in soup.select("table.list tbody tr"):
            cells = row.select("td")
            idx = cells[0].select_one("a").get("href").replace(
                "detail.jsp?id=", "")

            items.append({
                "id": idx,
                "stat_name": cells[0].select_one("a").get_text(),
                "handelsnaam": cells[1].select_one("a").get_text(),
                "plaats": cells[2].select_one("a").get_text()
            })

    parsed_path = str(DATA_PATH / "overview.csv")
    Knead(items).write(parsed_path,
                       fieldnames=["id", "stat_name", "handelsnaam", "plaats"])
def add_sites():
    PATH = str(Path(__file__).parent)
    sites = Knead(PATH + "/data/zomergasten/guest-sites.csv").data()

    for site in sites:
        qid = site["qid"]
        url = site["url"]
        name = site["guest"]

        print()
        print(f"Now handling {qid} / {name}")

        item = WikidataItem(qid)
        claims = item.get_claims()

        if Props.OFFICIAL_WEBSITE in claims:
            print("Already got a site, skip")
            continue

        item.add_url_claim(
            Props.OFFICIAL_WEBSITE,
            url,
            qualifiers=[item.get_item_claim(Props.LANGUAGE_WORK, Items.DUTCH)],
            references=[
                item.get_claim(Props.RETRIEVED, wbtime_now()),
                item.get_url_claim(
                    Props.REF_URL,
                    "https://www.vpro.nl/programmas/zomergasten/a-z.html"),
                item.get_item_claim(Props.LANGUAGE_WORK, Items.DUTCH)
            ])
def __init__(self, botid, datapath, key="id", required_fields=[],
             empty_check=lambda x: x is None):
    print(f"Setting up new bot '{botid}'")
    print(f"Data path: {datapath}")

    # Parse command line arguments and play it safe, assume
    # run_once and dry_run by default, except when they're
    # disabled
    args = pywikibot.handle_args()
    run_once = "-run-all" not in args
    dry_run = "-run-live" not in args

    print(f"Running once? {run_once}")
    print(f"Dry run? {dry_run}")

    self.id = botid
    self.run_once = run_once
    self.dry_run = dry_run
    self.skiplist = Skiplist(f"projects/skiplists/{self.id}.txt")
    self.key = key
    self.current_job = None
    self.data = Knead(datapath).data()
    self.required_fields = required_fields
    self.empty_check = empty_check
def dataknead_newlines():
    Knead("input/entity.json")\
        .query("entities/Q184843/sitelinks")\
        .values()\
        .map("title")\
        .filter(lambda t: t != "Blade Runner")\
        .write("output/sitelinks-other-title.csv", fieldnames=["title"])
def parse_pages(html_path):
    items = []

    for path in Path(html_path).glob("*.html"):
        print()
        print(f"Scraping {path}")

        with open(path) as f:
            soup = BeautifulSoup(f.read(), "lxml")

        url = soup.select_one('[rel="canonical"]').get("href")

        description = soup.select_one(
            '.commons-file-information-table div.description')
        description = description.get_text().replace("Nederlands: ", "").strip()

        geolink = soup.select_one('[href*="wikimap.toolforge.org"]').get("href")
        geolink = parse_urlargs(geolink)

        items.append({
            "url": url,
            "name": None,
            "image": url.replace("https://commons.wikimedia.org/wiki/", ""),
            "inscription": description,
            "lat": geolink["lat"],
            "lon": geolink["lon"],
            "location": None,
            "location_qid": None,
            "inception": None,
            "street": None,
            "street_qid": None,
            "street_nr": None
        })

    filename = Path(html_path).stem + "-parsed.csv"
    out_path = str(Path(html_path).parent / filename)
    Knead(items).write(out_path)
def parse():
    data = []

    for path in Path(f"{BASE}/html/").glob("*.html"):
        print(f"Parsing {path}")
        kid = path.stem

        with open(path) as f:
            soup = BeautifulSoup(f, "lxml")

        year_el = soup.select_one('a[href^="jaartal"]')

        if not year_el:
            year = None
        else:
            year = year_el.get_text()

        print(year)

        # Get ownership
        owner = None

        for label in soup.select(".lbl"):
            if label.get_text() == "eigendom van:":
                owner = label.parent.select_one(".val").get_text()

        data.append({"id": kid, "year": year, "owner": owner})

    Knead(data).write(f"{BASE}/scraped-data.csv")
def __init__(self, botid, datapath=None, sparql=None, run_once=False,
             qid_key="qid", empty_check=lambda x: x is None or x == "",
             precheck_data=lambda x: True):
    print(f"Setting up new bot '{botid}'")

    if (not datapath) and (not sparql):
        raise Error("No datapath and no sparql")

    # Parse command line arguments and play it safe, assume
    # run_once by default, except when it's disabled
    args = pywikibot.handle_args()
    run_once = "-run-all" not in args
    print(f"Running once? {run_once}")

    self.id = botid
    self.run_once = run_once
    self.qid_key = qid_key
    self.empty_check = empty_check
    self.precheck_data = precheck_data
    self.skiplist = Skiplist(f"projects/skiplists/{self.id}.txt")

    if datapath:
        self.data = Knead(datapath, has_header=True).data()
    elif sparql:
        query = Query(sparql)
        self.data = list(query.iter_results())
def scrape():
    churches = []

    with open("data/churchseats/seats.html") as f:
        soup = BeautifulSoup(f.read(), "lxml")

    for table in soup.select(".wikitable"):
        for row in table.select("tr"):
            cells = row.select("td")

            if len(cells) < 4:
                continue

            name = cells[1]
            seats = cells[2]
            name_anchor = name.select_one("a")

            if not name_anchor:
                continue

            if not name_anchor.get("href").startswith("/wiki"):
                continue

            churches.append({
                "name": name_anchor.get("title"),
                "href": name_anchor.get("href"),
                "seats": get_number(seats.get_text()),
                "reference": get_reference(seats, soup)
            })

    Knead(churches).write("data/churchseats/seats.csv",
                          fieldnames=["name", "href", "seats", "reference"])
def run(self):
    if self.input_format == "emlxml":
        results = self.parse_xmls()

    if self.add_percentages:
        results = self.add_percentages(results)

    Knead(results).write(self.output_path, fieldnames=self.fields)
def main():
    results = []
    file_number = 1

    for path in Path(".").glob("download_data/*.xml"):
        data = load_xml(path)
        records = data["OAI-PMH"]["ListRecords"]["record"]
        records = [parse(r) for r in records]
        results = results + records

    chunks = [
        results[i:i + BATCH_SIZE] for i in range(0, len(results), BATCH_SIZE)
    ]

    for index, chunk in enumerate(chunks):
        Knead(chunk).write(f"results-{str(index).zfill(5)}.csv")

    Knead(results).write("results.json")
def parse_all_items():
    items = []

    for path in (DATA_PATH / "html").glob("item-*.html"):
        with open(path) as f:
            item = parse_item(f.read())
            items.append(item)

    Knead(items).write(DATA_PATH / "items.json", indent=4)
def scrape_pages():
    churches = []
    api = AllPages(API_ENDPOINT)

    for page in api.iterate_pages():
        print(page["title"])
        churches.append(page)

    Knead(churches).write("data/reliwiki/pages.json")
def parse_all_overviews():
    items = []

    for path in (DATA_PATH / "html").glob("*.html"):
        with open(path) as f:
            items = items + parse_overview(f.read())

    Knead(items).write(DATA_PATH / "overview.csv",
                       fieldnames=["artist", "title", "href"])
def process_pages():
    churches = []

    for path in Path("data/reliwiki/html/").glob("*.html"):
        data = parse_page(path)

        if data:
            churches.append(data)

    Knead(churches).write("data/reliwiki/churches_data.json")
def get_pages_by_csv(csv_path):
    stem = Path(csv_path).stem
    html_path = BASE_PATH / stem
    print("Saving to: " + str(html_path))
    html_path.mkdir(exist_ok=True)

    for item in Knead(csv_path).data():
        get_page_by_id(item["pid"], html_path)
class Datasheet:
    def __init__(self, path, index):
        self.path = path
        self.data = Knead(path, has_header=True).data()
        self.keys = {i[index]: i for i in self.data}

    def __getitem__(self, key):
        if key in self.keys:
            return self.keys[key]
        else:
            return None

    def append(self, row):
        self.data.append(row)
        self.save()

    def save(self):
        print("Saving")
        Knead(self.data).write(self.path)
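# Usage sketch, added for illustration and not part of the original code:
# Datasheet wraps a CSV as a list of dicts plus a lookup keyed on one column.
# The file path, column names and row values below are made-up examples.
def datasheet_example():
    sheet = Datasheet("data/example/artists.csv", "label")

    # Indexing by key returns the whole row dict, or None when missing
    row = sheet["Vincent van Gogh"]

    # append() adds a row and immediately rewrites the CSV on disk
    if row is None:
        sheet.append({"label": "Vincent van Gogh", "qid": "Q5582"})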
def transform():
    def parse(item):
        d = item["properties"]

        return {
            "title": d["KunstwerkN"],
            "creator": d["Kunstenaar"],
            "location": d["LokatieBee"],
            "lat": d["Breedtegra"].replace(",", "."),
            "lon": d["Lengtegraa"].replace(",", "."),
            "id": d["KunstwerkI"],
            "url": d["Websitever"]
        }

    k = Knead("data/kos-nijmegen/kos.json").apply(
        lambda f: f["features"]).map(parse)
    k.write(
        "data/kos-nijmegen/kos-parsed.csv",
        fieldnames=["id", "title", "creator", "location", "lat", "lon", "url"])
def _create_lookup_table(self):
    lookup = []
    json_path = str(Path(f"{self.data_path}/*.json"))
    logging.debug(f"Getting all data files from {json_path}")

    for path in glob(json_path):
        logging.debug(f"Parsing {path}")
        item = Knead(path).data()
        qid = item["id"]
        labels = self._get_all_labels(item)
        self.qid_count += 1

        for label in labels:
            lookup.append([label, qid])

    self.label_count = len(lookup)
    logging.debug(f"Found {self.label_count} labels")
    logging.debug(f"Writing lookup table to {self.lookup_path}")
    Knead(lookup).write(self.lookup_path)
def parse_json_items():
    qid_artists = Datasheet(str(DATA_PATH / "qid-artists.csv"), "label")
    qid_collections = Datasheet(str(DATA_PATH / "qid-collections.csv"), "label")
    items = []

    def get_qid(datasheet, key):
        if datasheet[key]:
            return datasheet[key]["qid"]
        else:
            return None

    for item in Knead(str(DATA_PATH / "items.json")).data():
        print(item.get("title", None))
        year = None

        if item["jaar"].isdigit():
            year = int(item["jaar"])

        collection_label = item.get("collectie", None)
        artist_label = item.get("artist", None)

        items.append({
            "inventory_nr": item["objectnummer"],
            "title": item.get("title", None),
            "year": year,
            "url": item["href"],
            "artist_label": artist_label,
            "artist_qid": get_qid(qid_artists, artist_label),
            "collection_label": collection_label,
            "collection_qid": get_qid(qid_collections, collection_label)
        })

    Knead(items).write(str(DATA_PATH / "items2.csv"))
def __init__(self, path, data_path, lookup_path, key=None):
    logger.debug(f"Importing {path} to {data_path}")
    self.data_path = data_path
    self.key = key
    self.label_count = 0
    self.lookup_path = lookup_path
    self.path = path
    self.qid_count = 0
    self.qids = Knead(
        path, has_header=self.key is not None).map(self._cleanup).data()
    logger.debug(f"Found {len(self.qids)} ids")
def _create_lookup(self):
    lookup = {}

    for row in Knead(self.lookup_path).data():
        label = row[0]
        qid = row[1]

        if label in lookup:
            lookup[label].append(qid)
        else:
            lookup[label] = [qid]

    return lookup
def get_all():
    churches = []

    for index, page in enumerate(pages):
        pid = page["id"]
        qid = page.get("q", None)
        title = page["title"]
        print(f"#{index} '{title}' ({pid} / {qid})")

        path = DATA_PATH / f"{pid}.json"
        ibox = Infobox("nl", title)

        if path.exists():
            print(f"{path} exists, using that")
            apidata = Knead(str(path), read_as="json").data()
            data = ibox.get_data(apidata)
        else:
            data = ibox.get_data()

            # Write cached data
            print(f"Writing cache data to {path}")
            Knead(ibox.apidata).write(path.resolve())

        boxes = data["infoboxes"]

        if len(boxes) == 0:
            print("No infobox")
            continue

        if len(boxes) > 1:
            print("More than one infobox, picking the first")

        churches.append({
            "pid": pid,
            "qid": qid,
            "title": title,
            "data": boxes[0].box
        })

    Knead(churches).write("parsed.json")
def _get_by_namespace(self, tree, namespace):
    items = Knead(tree).filter(lambda i: i["@id"].startswith(namespace.uri))
    return items.data()