def date_from_match(self, offset, match):
  """Build a sling.Date from a regex match in one of two layouts.

  If capture group (1 + offset) matched, the text was 'DD MONTH YYYY';
  otherwise it was 'MONTH DD, YYYY' (groups 4-6). Month names are mapped
  to numbers via self.months.
  """
  if match.group(1 + offset):
    # DD MONTH YYYY: (day, month, year) are groups 1, 2, 3.
    day_grp, month_grp, year_grp = 1, 2, 3
  else:
    # MONTH DD, YYYY: (month, day, year) are groups 4, 5, 6.
    month_grp, day_grp, year_grp = 4, 5, 6
  return sling.Date(int(match.group(year_grp + offset)),
                    self.months[match.group(month_grp + offset)],
                    int(match.group(day_grp + offset)))
def for_item(self, item, prop, value, store=None):
  """Match the proposed fact (prop, value) against item's existing facts.

  Returns a (FactMatchType, payload) pair classifying the proposed fact as
  NEW, EXACT, SUBSUMES_EXISTING, SUBSUMED_BY_EXISTING, CONFLICT, or
  ADDITIONAL. 'prop' may be a single property frame or a property path.
  """
  assert isinstance(value, sling.Frame)
  # Normalize the property argument to a list (a path of length >= 1).
  prop = [prop] if isinstance(prop, sling.Frame) else list(prop)
  if store is None:
    store = sling.Store(self.kb)

  # Facts currently on the item, without any backoff.
  current = self._existing_facts(store, item, prop, False)
  if not current:
    return (FactMatchType.NEW, item)
  if value in current:
    return (FactMatchType.EXACT, item)

  dated = self._date_valued(prop[-1])
  if dated:
    # Existing dates may be stored as ints or strings, which would never
    # compare equal to 'value' (a sling.Frame); compare normalized date
    # values instead.
    new_date = sling.Date(value)
    old_dates = [sling.Date(f) for f in current]
    if any(d.value() == new_date.value() for d in old_dates):
      return (FactMatchType.EXACT, item)

  # Does the proposed fact subsume an existing fact?
  if value in self._existing_facts(store, item, prop, True):
    return (FactMatchType.SUBSUMES_EXISTING, item)

  # Is the proposed fact subsumed by an existing fact? Dates again get the
  # special comparison treatment.
  if dated:
    for d in old_dates:
      if self._finer_date(new_date, d):
        return (FactMatchType.SUBSUMED_BY_EXISTING, (item, d))
  else:
    for f in current:
      if isinstance(f, sling.Frame) and self.subsumes(store, prop[-1], f, value):
        return (FactMatchType.SUBSUMED_BY_EXISTING, (item, f))

  # A different value on a unique-valued property is a conflict.
  if len(prop) == 1 and prop[0] in self.unique_properties:
    return (FactMatchType.CONFLICT, (item, current[0]))

  # Otherwise the proposed fact is an additional one; report the fanout.
  return (FactMatchType.ADDITIONAL, (item, len(current)))
def precise_date(self, dates):
  """Return True if 'dates' holds more than one date, or a single date with
  finer-than-year precision; False for None, empty, or a lone coarse date."""
  if dates is None:
    return False
  for index, date in enumerate(dates):
    if index > 0:
      return True  # more than one date - don't try to fix
    if date is not None and sling.Date(date).precision > sling.YEAR:
      return True
  return False
def store_records(self, records, batch_size=3):
  # Write fact records to Wikidata via pywikibot, at most batch_size updates
  # per invocation, honoring the flags.arg.first/last record window.
  # NOTE: Python 2 style print statements; runs under Python 2 only.
  updated = 0
  recno = 0
  for item_str, record in records:
    recno += 1
    # Skip records before the configured start, stop past the end.
    if recno < flags.arg.first:
      print "Skipping record number", recno
      continue
    if recno > flags.arg.last:
      break
    if updated >= batch_size:
      print "Hit batch size of", batch_size
      break
    print "Processing", item_str
    fact_record = self.rs.parse(record)
    item = fact_record[self.n_item]
    facts = fact_record[self.n_facts]
    provenance = fact_record[self.n_provenance]
    # Sanity check: record key must resolve to the same item frame.
    if self.rs[item_str] != item:
      self.log_status_skip(item, facts, "inconsistent input")
      continue # read next record in the file
    wd_item = pywikibot.ItemPage(self.repo, item_str)
    if wd_item.isRedirectPage():
      self.log_status_skip(item, facts, "redirect page")
      continue
    wd_claims = wd_item.get().get('claims')
    # Process facts / claims
    for prop, val in facts:
      prop_str = str(prop)
      fact = self.rs.frame({prop: val})
      # Never add a property the item already has, or ever had (to avoid
      # re-adding claims that editors deliberately removed).
      if prop_str in wd_claims:
        self.log_status_skip(item, fact, "already has property")
        continue
      if self.ever_had_prop(wd_item, prop_str):
        self.log_status_skip(item, fact, "already had property")
        continue
      claim = pywikibot.Claim(self.repo, prop_str)
      if claim.type == "time":
        date = sling.Date(val) # parse date from record
        precision = precision_map[date.precision] # sling to wikidata
        target = pywikibot.WbTime(year=date.year, precision=precision)
      elif claim.type == 'wikibase-item':
        target = pywikibot.ItemPage(self.repo, val)
      else:
        # TODO add location and possibly other types
        print "Error: Unknown claim type", claim.type
        continue
      claim.setTarget(target)
      # Summary string records the extraction method and source category.
      cat_str = str(provenance[self.n_category])
      summary = provenance[self.n_method] + " " + cat_str
      wd_item.addClaim(claim, summary=summary)
      rev_id = str(wd_item.latest_revision_id)
      claim.addSources(self.get_sources(cat_str))
      self.log_status_stored(item, fact, rev_id)
      updated += 1
  # NOTE(review): 'item' is undefined here if 'records' was empty — confirm
  # callers never pass an empty record stream.
  print item, recno
  print "Last record:", recno, "Total:", updated, "records updated."
def get_name(self, x):
  """Return name for given wikidata id.

  None maps to None; an int is treated as a date and rendered through the
  calendar; strings not starting with "Q" yield None; otherwise the item's
  name is looked up in the knowledge base (decoded from UTF-8 if needed).
  """
  if x is None:
    return None
  if isinstance(x, int):
    # Integer values encode dates in this code base.
    return self.cal.str(sling.Date(x))
  if not x.startswith("Q"):
    return None
  name = self.kb[x].name
  if isinstance(name, bytes):
    return name.decode("utf-8", errors="ignore")
  return name
def process_log_data(self, files): no_of_files = len(files) file_no = 0 rs = sling.Store(self.store) skipped = 0 updated = 0 errors = 0 deleted = 0 changed = 0 for r_file in files: file_no += 1 print "Processing file {:4d} of {} ({})".format(file_no, no_of_files, r_file) reader = sling.RecordReader(r_file) for item_str, record in reader: rec = rs.parse(record) status = rec[self.n_status] if self.n_skipped in status: skipped += 1 continue elif self.n_revision not in status: print "ERROR - unknown status" errors += 1 continue updated += 1 wd_item = pywikibot.ItemPage(self.repo, item_str) wd_claims = wd_item.get().get('claims') facts = rec[self.n_facts] for prop, val in facts: p_claims = wd_claims.get(str(prop), []) if not p_claims: deleted += 1 continue for wd_claim in p_claims: if wd_claim.type == "time": date = sling.Date(val) # parse date from record precision = precision_map[date.precision] # sling to wikidata target = pywikibot.WbTime(year=date.year, precision=precision) elif wd_claim.type == 'wikibase-item': target = pywikibot.ItemPage(self.repo, val) else: # TODO add location and possibly other types print "Error: Unknown claim type", claim.type continue if not wd_claim.target_equals(target): changed += 1 reader.close() print skipped, "skipped,", updated, "updated,", deleted, "deleted,", \ changed, "changed,", errors, "error records in file" print "Done processing last file"
def match_type(self, store, prop, existing, proposed):
  """Classify 'proposed' against the 'existing' values of 'prop'.

  Returns one of FactMatchType: NEW when nothing exists, EXACT on a match,
  SUBSUMES_EXISTING / SUBSUMED_BY_EXISTING for coarser/finer values,
  CONFLICT for a distinct value on a unique property, else ADDITIONAL.
  """
  existing = [self.kb.resolve(e) for e in existing]
  proposed = self.kb.resolve(proposed)
  if not existing:
    return FactMatchType.NEW

  exact = subsumes = subsumed = False
  if self._date_valued(prop):
    # Existing dates may be ints or strings (not matching a proposed
    # sling.Frame), so compare normalized sling.Date values instead.
    new_date = sling.Date(proposed)
    for old_date in [sling.Date(e) for e in existing]:
      exact = exact or old_date.value() == new_date.value()
      subsumes = subsumes or self._finer_date(old_date, new_date)
      subsumed = subsumed or self._finer_date(new_date, old_date)
  else:
    closure_property = self.closure_properties.get(prop, None)
    for old in existing:
      exact = exact or old == proposed
      if isinstance(old, sling.Frame):
        subsumes = subsumes or self.subsumes(store, closure_property, proposed, old)
        subsumed = subsumed or self.subsumes(store, closure_property, old, proposed)

  # Precedence: exact beats subsumption, which beats conflict/additional.
  if exact:
    return FactMatchType.EXACT
  if subsumes:
    return FactMatchType.SUBSUMES_EXISTING
  if subsumed:
    return FactMatchType.SUBSUMED_BY_EXISTING
  if prop in self.unique_properties:
    return FactMatchType.CONFLICT
  return FactMatchType.ADDITIONAL
def find_inceptions(self, inc_cats):
  # Scan the knowledge base for organizations lacking an inception date and
  # emit a fact record for each whose category memberships imply one.
  # inc_cats: mapping from category frame to inception date.
  # NOTE: Python 2 style print statements; runs under Python 2 only.
  self.out_file = "data/e/wikibot/inc-dates.rec"
  record_file = sling.RecordWriter(self.out_file)
  records = 0
  store = sling.Store(self.kb)
  types = {}  # NOTE(review): unused in this method — candidate for removal
  for item in self.kb:
    # Only organizations: skip categories, humans, and non-orgs.
    if self.wikimedia_category in item(self.instanceof): continue
    if self.human in item(self.instanceof): continue
    if not self.is_org(item): continue
    name = item.name
    if name is not None and name.startswith("Category:"): continue
    # Skip items that already have an inception date.
    if item[self.inception] is not None: continue
    cat_dates = []
    # Collect all the item's inception categories in cat_dates
    for cat in item(self.item_category):
      cat_inc_date = inc_cats.get(cat)
      if cat_inc_date is None: continue
      cat_dates.append((cat, cat_inc_date))
    if not cat_dates: continue # no inception categories found for item
    # Pick the single most specific date, if unambiguous.
    msd = self.most_specific_date(cat_dates)
    if msd is None: continue
    (inc_cat, inc_date) = msd
    records += 1
    facts = store.frame({self.inception: sling.Date(inc_date).value()})
    # Provenance records which category implied the date and how.
    provenance = store.frame({
      self.category: inc_cat,
      self.method: "Member of an inception category, '" + inc_cat.name + "'"
    })
    fact = store.frame({
      self.item: item,
      self.facts: facts,
      self.provenance: provenance
    })
    record_file.write(item.id, fact.data(binary=True))
  record_file.close()
  print records, "inception date records written to file:", self.out_file
  print self.conflicts, "conflicts encountered"
def find_births(self, birth_cats):
  # Scan the knowledge base for humans lacking a date of birth and emit a
  # fact record for each whose category memberships imply one.
  # birth_cats: mapping from category frame to birth date.
  # NOTE: Python 2 style print statements; runs under Python 2 only.
  self.out_file = "local/data/e/wikibot/birth-dates.rec"
  record_file = sling.RecordWriter(self.out_file)
  records = 0
  for item in self.kb:
    # Only humans that do not already have a date of birth.
    if self.human not in item(self.instanceof): continue
    if item[self.date_of_birth] is not None: continue
    cat_dates = []
    # Collect all the item's birth categories in cat_dates
    for cat in item(self.item_category):
      cat_birth_date = birth_cats.get(cat)
      if cat_birth_date is None: continue
      cat_dates.append((cat, cat_birth_date))
    if not cat_dates: continue # no birth categories found for item
    # Pick the single most specific date, if unambiguous.
    msd = self.most_specific_date(cat_dates)
    if msd is None: continue
    (birth_cat, birth_date) = msd
    records += 1
    # Fresh local store per record keeps emitted frames small.
    store = sling.Store(self.kb)
    facts = store.frame({
      self.date_of_birth: self.calendar.value(sling.Date(birth_date))
    })
    # Provenance records which category implied the date and how.
    provenance = store.frame({
      self.category: birth_cat,
      self.method: "Member of a birth category, '" + birth_cat.name + "'"
    })
    fact = store.frame({
      self.item: item,
      self.facts: facts,
      self.provenance: provenance
    })
    record_file.write(item.id, fact.data(binary=True))
  record_file.close()
  print records, "birth date records written to file:", self.out_file
  print self.conflicts, "conflicts encountered"
def same_year(self, year, dates):
  """Return True if any date in 'dates' falls in the given year."""
  if dates is None:
    return False
  # Falsy entries (None, 0, "") are never a match.
  return any(date and year == sling.Date(date).year for date in dates)
relationship = rec[x_relationship] start = relationship[x_start_node] end = relationship[x_end_node] start_lei = start[x_node_id] end_lei = end[x_node_id] reltype = relationship[x_relationship_type] starttime = None endtime = None periods = relationship[x_relationship_periods] if periods != None: for period in periods(x_relationship_period): if period[x_period_type] == "RELATIONSHIP_PERIOD": period_start = period[x_start_date] period_end = period[x_end_date] if period_start: starttime = sling.Date(period_start).value() if period_end: endtime = sling.Date(period_end).value() # Dertermine relationship type. if reltype == "IS_ULTIMATELY_CONSOLIDATED_BY": parent_rel = n_owned_by child_rel = n_owner_of elif reltype == "IS_DIRECTLY_CONSOLIDATED_BY": parent_rel = n_parent child_rel = n_subsidiary else: continue # Get related organizations. subsidiary = store["P1278/" + start_lei] if subsidiary.isglobal():
def store_records(self, records, batch_size=3):
  # Write fact records to Wikidata via pywikibot, at most batch_size updates
  # per invocation, honoring the flags.arg.first/last record window.
  # Unique-valued properties get extra handling: an existing less precise
  # date with matching year/month and only-Wikipedia sources is replaced.
  updated = 0
  recno = 0
  for item_bytes, record in records:
    item_str = item_bytes.decode()
    recno += 1
    # Skip records before the configured start, stop past the end.
    if recno < flags.arg.first:
      print("Skipping record number", recno)
      continue
    if recno > flags.arg.last:
      break
    if updated >= batch_size:
      print("Hit batch size of", batch_size)
      break
    print("Processing https://www.wikidata.org/wiki/" + item_str)
    fact_record = self.rs.parse(record)
    item = fact_record[self.n_item]
    facts = fact_record[self.n_facts]
    provenance = fact_record[self.n_provenance]
    # Sanity check: record key must resolve to the same item frame.
    if self.rs[item_str] != item:
      self.log_status_skip(item, facts, "inconsistent input")
      continue # read next record in the file
    wd_item = pywikibot.ItemPage(self.repo, item_str)
    if not wd_item.exists():
      self.log_status_skip(item, facts, "page does not exist")
      continue
    if wd_item.isRedirectPage():
      self.log_status_skip(item, facts, "redirect page")
      continue
    try:
      wd_item.get()
      wd_claims = wd_item.claims
    except Exception:
      # Bug fix: was a bare 'except:', which also swallowed
      # KeyboardInterrupt/SystemExit; best-effort skip is preserved.
      self.log_status_skip(item, facts, "exception getting claims")
      continue
    # Process facts / claims
    for prop, val in facts:
      prop_str = str(prop)
      fact = self.rs.frame({prop: val})
      claim = pywikibot.Claim(self.repo, prop_str)
      if prop in self.uniq_prop:
        if prop_str not in wd_claims:
          # Don't re-add a claim that editors deliberately removed.
          if self.ever_had_prop(wd_item, prop_str):
            self.log_status_skip(item, fact, "already had property")
            continue
        if claim.type == "time":
          date = sling.Date(val) # parse date from val
          target = self.get_wbtime(date)
          if target is None:
            self.log_status_skip(item, facts, "date precision exception")
            continue
          if prop_str in wd_claims:
            if len(wd_claims[prop_str]) > 1: # more than one property already
              self.log_status_skip(item, fact, "has property more than once")
              continue
            old = wd_claims[prop_str][0].getTarget()
            if old is not None:
              # Only replace an existing date that is strictly less precise
              # and consistent (same year, and same month when known).
              if old.precision >= target.precision:
                err_str = "precise date already exists"
                self.log_status_skip(item, fact, err_str)
                continue
              if old.year != date.year:
                self.log_status_skip(item, fact, "conflicting year in date")
                continue
              if old.precision >= pywikibot.WbTime.PRECISION['month'] and \
                 old.month != date.month:
                self.log_status_skip(item, fact, "conflicting month in date")
                continue
            # Item already has property with a same year less precise date.
            # Ensure sources are all WP or empty
            if not self.all_WP(wd_claims[prop_str][0].getSources()):
              self.log_status_skip(item, fact, "date with non-WP source(s)")
              continue
            wd_item.removeClaims(wd_claims[prop_str])
        elif claim.type == 'wikibase-item':
          if prop_str in wd_claims:
            self.log_status_skip(item, fact, "already has property")
            continue
          target = pywikibot.ItemPage(self.repo, val)
        else:
          # TODO add location and possibly other types
          print("Error: Unknown claim type", claim.type)
          continue
      else: # property not unique
        if claim.type == 'wikibase-item':
          target = pywikibot.ItemPage(self.repo, val.id)
        elif claim.type == "time":
          target = self.get_wbtime(val)
          if target is None:
            self.log_status_skip(item, facts, "date precision exception")
            continue
        else:
          # TODO add location and possibly other types
          print("Error: Unknown claim type", claim.type)
          continue
        # Skip if an equal claim already exists on the item.
        if prop_str in wd_claims:
          old_fact = False
          for clm in wd_claims[prop_str]:
            if clm.target_equals(target):
              self.log_status_skip(item, fact, "already has fact")
              old_fact = True
          if old_fact:
            continue
      # Build sources and an edit summary from the provenance record.
      if provenance[self.n_category]:
        s = str(provenance[self.n_category])
        sources = self.get_sources(item, s)
      elif provenance[self.n_url]:
        s = str(provenance[self.n_url])
        sources = self.get_wp_sources()
      else:
        continue
      summary = provenance[self.n_method] + " " + s
      claim.setTarget(target)
      wd_item.addClaim(claim, summary=summary)
      rev_id = str(wd_item.latest_revision_id)
      if len(sources) > 0:
        claim.addSources(sources)
      self.log_status_stored(item, fact, rev_id)
      updated += 1
  # NOTE(review): 'item' is undefined here if 'records' was empty — confirm
  # callers never pass an empty record stream.
  print(item, recno)
  print("Last record:", recno, "Total:", updated, "records updated.")
def store_records(self, records, batch_size=3):
  # Write fact records to Wikidata via pywikibot, at most batch_size updates
  # per invocation, honoring the flags.arg.first/last record window. Less
  # precise existing dates with consistent year/month and only-Wikipedia
  # sources are replaced by the more precise proposed date.
  # NOTE: Python 2 style print statements; runs under Python 2 only.
  updated = 0
  recno = 0
  for item_str, record in records:
    recno += 1
    # Skip records before the configured start, stop past the end.
    if recno < flags.arg.first:
      print "Skipping record number", recno
      continue
    if recno > flags.arg.last:
      break
    if updated >= batch_size:
      print "Hit batch size of", batch_size
      break
    print "Processing https://www.wikidata.org/wiki/" + item_str
    fact_record = self.rs.parse(record)
    item = fact_record[self.n_item]
    facts = fact_record[self.n_facts]
    provenance = fact_record[self.n_provenance]
    # Sanity check: record key must resolve to the same item frame.
    if self.rs[item_str] != item:
      self.log_status_skip(item, facts, "inconsistent input")
      continue # read next record in the file
    wd_item = pywikibot.ItemPage(self.repo, item_str)
    if not wd_item.exists():
      self.log_status_skip(item, facts, "page does not exist")
      continue
    if wd_item.isRedirectPage():
      self.log_status_skip(item, facts, "redirect page")
      continue
    try:
      wd_item.get()
      wd_claims = wd_item.claims
    except:
      self.log_status_skip(item, facts, "exception getting claims")
      continue
    # Process facts / claims
    for prop, val in facts:
      prop_str = str(prop)
      fact = self.rs.frame({prop: val})
      # Don't re-add a claim that editors deliberately removed.
      if prop_str not in wd_claims and self.ever_had_prop(
          wd_item, prop_str):
        self.log_status_skip(item, fact, "already had property")
        continue
      claim = pywikibot.Claim(self.repo, prop_str)
      if claim.type == "time":
        date = sling.Date(val) # parse date from record
        precision = precision_map[
            date.precision] # sling to wikidata
        # Build a WbTime at the precision the sling date carries.
        if date.precision <= sling.YEAR:
          target = pywikibot.WbTime(year=date.year, precision=precision)
        elif date.precision == sling.MONTH:
          target = pywikibot.WbTime(year=date.year, month=date.month,
                                    precision=precision)
        elif date.precision == sling.DAY:
          target = pywikibot.WbTime(year=date.year, month=date.month,
                                    day=date.day, precision=precision)
        else:
          self.log_status_skip(item, facts, "date precision exception")
          continue
        if prop_str in wd_claims:
          if len(wd_claims[prop_str]
                 ) > 1: # more than one property already
            self.log_status_skip(
                item, fact, "has property more than once")
            continue
          old = wd_claims[prop_str][0].getTarget()
          if old is not None:
            # Only replace an existing date that is strictly less precise
            # and consistent (same year, and same month when known).
            if old.precision >= precision:
              self.log_status_skip(
                  item, fact, "precise date already exists")
              continue
            if old.year != date.year:
              self.log_status_skip(
                  item, fact, "conflicting year in date")
              continue
            if old.precision >= pywikibot.WbTime.PRECISION['month'] and \
               old.month != date.month:
              self.log_status_skip(
                  item, fact, "conflicting month in date")
              continue
          # item already has property with a same year less precise date
          # check that sources are all WP or empty
          if not self.all_WP(
              wd_claims[prop_str][0].getSources()):
            self.log_status_skip(
                item, fact, "date with non-WP source(s)")
            continue
          wd_item.removeClaims(wd_claims[prop_str])
      elif claim.type == 'wikibase-item':
        if prop_str in wd_claims:
          self.log_status_skip(item, fact, "already has property")
          continue
        target = pywikibot.ItemPage(self.repo, val)
      else:
        # TODO add location and possibly other types
        print "Error: Unknown claim type", claim.type
        continue
      # Build sources and an edit summary from the provenance record.
      if provenance[self.n_category]:
        s = str(provenance[self.n_category])
        sources = self.get_sources(s)
      elif provenance[self.n_url]:
        s = str(provenance[self.n_url])
        sources = self.get_wp_sources()
      else:
        continue
      summary = provenance[self.n_method] + " " + s
      claim.setTarget(target)
      wd_item.addClaim(claim, summary=summary)
      rev_id = str(wd_item.latest_revision_id)
      if len(sources) > 0:
        claim.addSources(sources)
      self.log_status_stored(item, fact, rev_id)
      updated += 1
  # NOTE(review): 'item' is undefined here if 'records' was empty — confirm
  # callers never pass an empty record stream.
  print item, recno
  print "Last record:", recno, "Total:", updated, "records updated."
def process_log_data(self, files): no_of_files = len(files) file_no = 0 rs = sling.Store(self.store) skipped = 0 updated = 0 errors = 0 deleted = 0 changed = 0 redirected = 0 updates = {} for r_file in files: file_no += 1 print "Processing file {:4d} of {} ({})".format(file_no, no_of_files, r_file) print r_file reader = sling.RecordReader(r_file) last_updated = updated for item_str, record in reader: rec = rs.parse(record) status = rec[self.n_status] if self.n_skipped in status: skipped += 1 continue elif self.n_revision not in status: print "ERROR - unknown status" errors += 1 continue updated += 1 wd_item = pywikibot.ItemPage(self.repo, item_str) if wd_item.isRedirectPage(): redirected += 1 continue wd_claims = wd_item.get().get('claims') facts = rec[self.n_facts] for prop, val in facts: p_claims = wd_claims.get(str(prop), []) if not p_claims: deleted += 1 continue for wd_claim in p_claims: if wd_claim.type == "time": date = sling.Date(val) # parse date from record precision = precision_map[date.precision] # sling to wikidata target = pywikibot.WbTime(year=date.year, precision=precision) elif wd_claim.type == 'wikibase-item': target = pywikibot.ItemPage(self.repo, val) else: # TODO add location and possibly other types print "Error: Unknown claim type", claim.type continue if not wd_claim.target_equals(target): print item_str, target, wd_claim.target changed += 1 reader.close() print updated - last_updated f = r_file.split("-") date = int(f[1] + f[2] + f[3]) if date not in updates: updates[date] = 0 updates[date] += (updated - last_updated) print skipped, "skipped,", updated, "updated,", deleted, "deleted,", \ changed, "changed,", errors, "error records in file" print "Done processing last file" # Print number of accumulated updates over time first = min(updates) acc_upd = 0 d = datetime.date(first / 10000, (first % 10000) / 100, first % 100) while d <= datetime.date.today(): num = d.year * 10000 + d.month * 100 + d.day if num in updates: acc_upd += updates[num] 
print d.strftime("%Y-%m-%d") + "," + str(acc_upd) d += datetime.timedelta(days = 1)