def find_inceptions(self, inc_cats):
  self.out_file = "data/e/wikibot/inc-dates.rec"
  record_file = sling.RecordWriter(self.out_file)
  records = 0
  store = sling.Store(self.kb)
  for item in self.kb:
    if self.wikimedia_category in item(self.instanceof): continue
    if self.human in item(self.instanceof): continue
    if not self.is_org(item): continue
    name = item.name
    if name is not None and name.startswith("Category:"): continue
    if item[self.inception] is not None: continue

    # Collect all the item's inception categories in cat_dates.
    cat_dates = []
    for cat in item(self.item_category):
      cat_inc_date = inc_cats.get(cat)
      if cat_inc_date is None: continue
      cat_dates.append((cat, cat_inc_date))
    if not cat_dates: continue  # no inception categories found for item

    msd = self.most_specific_date(cat_dates)
    if msd is None: continue
    (inc_cat, inc_date) = msd
    records += 1
    facts = store.frame({self.inception: sling.Date(inc_date).value()})
    provenance = store.frame({
      self.category: inc_cat,
      self.method: "Member of an inception category, '" + inc_cat.name + "'"
    })
    fact = store.frame({
      self.item: item,
      self.facts: facts,
      self.provenance: provenance
    })
    record_file.write(item.id, fact.data(binary=True))
  record_file.close()
  print(records, "inception date records written to file:", self.out_file)
  print(self.conflicts, "conflicts encountered")
def iter_mentions(self,
                  wid_set: Set[str] = None,
                  only_entity: bool = False,
                  split_by: str = None
                  ) -> Iterable[Tuple[str, List[str],
                                      List[Tuple[str, int, int]]]]:
  assert split_by in {'sentence', None}, 'unsupported split_by'
  # Sentence breaks have token break level 3.
  split_by = {'sentence': 3, None: None}[split_by]
  for n, (doc_wid, doc_raw) in enumerate(self.corpus.input):
    doc_wid = str(doc_wid, 'utf-8')
    if wid_set is not None and doc_wid not in wid_set:
      continue
    store = sling.Store(self.commons)
    frame = store.parse(doc_raw)
    document = sling.Document(frame, store, self.docschema)
    sorted_mentions = sorted(document.mentions, key=lambda m: m.begin)
    tokens = [t.word for t in document.tokens]
    split_start = [0] + [
        i for i, t in enumerate(document.tokens) if t.brk == split_by
    ]
    split_ind = 0
    mentions: List[Tuple[str, int, int]] = []
    for mention in sorted_mentions:
      # Emit all splits that end before this mention starts.
      while (len(split_start) > split_ind + 1 and
             mention.begin >= split_start[split_ind + 1]):
        if len(mentions) > 0:
          yield (doc_wid,
                 tokens[split_start[split_ind]:split_start[split_ind + 1]],
                 mentions)
          mentions = []
        split_ind += 1
      if (len(split_start) > split_ind + 1 and
          mention.end > split_start[split_ind + 1]):
        # Skip mentions crossing the split boundary.
        continue
      linked_entity = self.get_linked_entity(mention)
      if only_entity and (type(linked_entity) is not str
                          or not linked_entity.startswith('Q')):
        continue
      mentions.append((linked_entity,
                       mention.begin - split_start[split_ind],
                       mention.end - split_start[split_ind]))
    # Flush the last split of the document.
    if len(mentions) > 0:
      yield (doc_wid, tokens[split_start[split_ind]:], mentions)
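A minimal consumption sketch for iter_mentions, assuming `loader` is an instance of the surrounding class; the WIDs are illustrative. With split_by='sentence', mention offsets are relative to the start of each yielded sentence.

for wid, sent_tokens, sent_mentions in loader.iter_mentions(
    wid_set={'Q76', 'Q937'}, only_entity=True, split_by='sentence'):
  for entity_qid, begin, end in sent_mentions:
    print(wid, entity_qid, ' '.join(sent_tokens[begin:end]))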
_kb_cache = {}  # filename -> loaded, frozen KB store

def load_kb(task):
  if type(task) is str:
    filename = task  # assume filename
  else:
    filename = task.input("kb").name
  if filename in _kb_cache:
    log.info("Retrieving cached KB")
    return _kb_cache[filename]
  else:
    kb = sling.Store()
    kb.load(filename)
    log.info("Knowledge base read")
    kb.lockgc()
    kb.freeze()
    kb.unlockgc()
    log.info("Knowledge base frozen")
    _kb_cache[filename] = kb
    return kb
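A quick calling sketch: load_kb accepts either a filename or a workflow task with a "kb" input, and repeated calls with the same filename are served from the module-level cache. The QID lookup is illustrative.

kb = load_kb("local/data/e/wiki/kb.sling")
print(kb["Q30"].name)                                # look up an item by id
assert load_kb("local/data/e/wiki/kb.sling") is kb   # second call hits _kb_cache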
def get_media_files():
  # Load knowledge base.
  kb = sling.Store()
  kb.load(flags.arg.kb)

  n_media = kb["/w/media"]
  n_role = kb["role"]
  n_target = kb["target"]
  p_media = kb["media"]

  # Find all properties for Wikimedia Commons files.
  imageprops = set()
  for name, prop in kb["/w/entity"]:
    if name != n_role: continue
    if prop[n_target] == n_media:
      imageprops.add(prop)

  # Find media files for all items.
  media = []
  for item in kb:
    for n, v in item:
      if n in imageprops:
        # Add Wikimedia Commons url.
        v = kb.resolve(v)
        if type(v) == str:
          fn = v.replace(' ', '_')
          md5 = md5hash(fn)
          fn = fn.replace("?", "%3F")
          fn = fn.replace("+", "%2B")
          fn = fn.replace("&", "%26")
          url = "%s/%s/%s/%s" % (commons_base_url, md5[0], md5[0:2], fn)
          media.append(url)
        else:
          print("Bad media file name:", item.id, v)
      elif n == p_media:
        # Add media url.
        v = kb.resolve(v)
        if type(v) == str:
          media.append(v)
        else:
          print("Bad media url:", item.id, v)
  return media
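A hedged invocation sketch: flags.arg.kb must point at a kb.sling store before get_media_files() is called, and the output filename is illustrative.

if __name__ == "__main__":
  flags.parse()
  with open("media-urls.txt", "w") as f:
    for url in get_media_files():
      f.write(url + "\n")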
def read(self, parses_filename):
  reader = sling.RecordReader(parses_filename)
  self.category_name_to_qid = {}                      # category name -> qid
  self.category_frame = {}                            # category qid -> frame
  self.full_signature_to_parse = defaultdict(list)    # signature -> parse
  self.coarse_signature_to_parse = defaultdict(list)  # signature -> parse
  store = sling.Store()
  for index, (qid, value) in enumerate(reader):
    if index > 0 and index % 20000 == 0:
      log.info("%d categories read" % index)
    frame = store.parse(value)
    self.category_name_to_qid[frame.name] = qid
    self.category_frame[qid] = frame
    for parse in frame("parse"):
      element = (qid, frame, parse)
      full_signature = util.full_parse_signature(parse)
      self.full_signature_to_parse[full_signature].append(element)
      coarse_signature = util.coarse_parse_signature(parse)
      self.coarse_signature_to_parse[coarse_signature].append(element)
def __init__(self, body=None, store=None, schema=None):
  if body is not None: store = body.store()
  if store is None: store = sling.Store()
  if schema is None: schema = MeasureSchema(store)
  if body is None: body = schema.n_earth
  self.body = body
  self.schema = schema

  # Determine radius of globe and convert to metres.
  radius = self.body[self.schema.radius]
  if radius is not None:
    self.radius = Quantity(radius.resolve(), schema).si().amount
  else:
    diameter = body[schema.diameter]
    if diameter is not None:
      self.radius = Quantity(diameter.resolve(), schema).si().amount / 2
    else:
      self.radius = 6371000  # radius of Earth in metres
def for_parses(self, category, store=None):
  if store is None:
    store = sling.Store(self.kb)
  items = category.members
  output = []  # ith entry = match stats for ith parse
  cache = {}   # (pid, qid) -> match stats
  for parse in category("parse"):
    parse_stats = []
    for span in parse.spans:
      key = (span.pids, span.qid)
      stats = None
      if key in cache:
        stats = cache[key]
      else:
        stats = self.for_items(items, span.pids, span.qid, store)
        cache[key] = stats
      parse_stats.append(stats)
    output.append(parse_stats)
  return output
def run(self, task):
  self.init(task)
  reader = sling.RecordReader(task.input("parses").name)
  writer = sling.RecordWriter(task.output("output").name)
  for key, value in reader:
    store = sling.Store(self.kb)
    category = store.parse(value)
    matches = self.matcher.for_parses(category, store, max_evidences=-1)
    frame_cache = {}  # (pid, qid) -> frame containing their match statistics
    for parse, parse_match in zip(category("parse"), matches):
      for span, span_match in zip(parse.spans, parse_match):
        span_key = (span.pids, span.qid)
        if span_key not in frame_cache:
          match_frame = span_match.as_frame(store)
          frame_cache[span_key] = match_frame
        span["fact_matches"] = frame_cache[span_key]
    writer.write(key, category.data(binary=True))
    task.increment("fact-matcher/categories-processed")
  reader.close()
  writer.close()
def load(record: str,
         load_tokens: bool = True,
         load_mentions: bool = True
         ) -> Iterable[Tuple[sling.nlp.document.Document, Tuple[int, str, str]]]:
  """Load documents from a .rec file.

  Warning: this may take a good amount of RAM (each *.rec file is 5.3GB).
  """
  for k, rec in sling.RecordReader(record):
    store = sling.Store(commons)
    # Parse record into a frame.
    doc_frame = store.parse(rec)
    # Instantiate a document.
    # parsed_doc = sling.Document(doc_frame, store, DOCSCHEMA)
    parsed_doc = MyDocument(doc_frame, store, DOCSCHEMA,
                            load_tokens=load_tokens,
                            load_mentions=load_mentions)
    metadata = get_metadata(doc_frame)
    yield parsed_doc, metadata
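A consumption sketch for the generator above. The shard path is illustrative, and reading the metadata tuple as (page id, WID, title) is a guess from the Tuple[int, str, str] annotation; check get_metadata for the actual layout.

for doc, (page_id, wid, title) in load("data/documents-00000-of-00010.rec",
                                       load_mentions=False):
  print(wid, title, len(doc.tokens))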
def update_item(qid):
  # Fetch item revision from Wikidata.
  url = "%s?id=%s&format=json" % (flags.arg.wiki_fetch_url, qid)
  reply = wdsession.get(url)

  # Convert item to SLING format.
  store = sling.Store(commons)
  item, revision = wikiconv.convert_wikidata(store, reply.text)

  # Coalesce strings.
  store.coalesce(flags.arg.string_buckets)

  # Save item in database.
  print(qid, revision)
  reply = dbsession.put(flags.arg.dburl + "/" + qid,
                        data=item.data(binary=True),
                        headers={
                          "Version": str(revision),
                          "Mode": "ordered",
                        })
  reply.raise_for_status()
def find_births(self, birth_cats):
  self.out_file = "local/data/e/wikibot/birth-dates.rec"
  record_file = sling.RecordWriter(self.out_file)
  records = 0
  for item in self.kb:
    if self.human not in item(self.instanceof): continue
    if item[self.date_of_birth] is not None: continue

    # Collect all the item's birth categories in cat_dates.
    cat_dates = []
    for cat in item(self.item_category):
      cat_birth_date = birth_cats.get(cat)
      if cat_birth_date is None: continue
      cat_dates.append((cat, cat_birth_date))
    if not cat_dates: continue  # no birth categories found for item

    msd = self.most_specific_date(cat_dates)
    if msd is None: continue
    (birth_cat, birth_date) = msd
    records += 1
    store = sling.Store(self.kb)
    facts = store.frame({
      self.date_of_birth: self.calendar.value(sling.Date(birth_date))
    })
    provenance = store.frame({
      self.category: birth_cat,
      self.method: "Member of a birth category, '" + birth_cat.name + "'"
    })
    fact = store.frame({
      self.item: item,
      self.facts: facts,
      self.provenance: provenance
    })
    record_file.write(item.id, fact.data(binary=True))
  record_file.close()
  print(records, "birth date records written to file:", self.out_file)
  print(self.conflicts, "conflicts encountered")
def build(self, commons_path, corpora_path):
  # Prepare lexical dictionaries.
  self.words = Lexicon(self.words_normalize_digits)
  self.suffix = Lexicon(self.words_normalize_digits, oov_item=None)

  # Initialize training corpus.
  corpora = Corpora(corpora_path, commons_path)

  # Collect word and affix lexicons.
  for document in corpora:
    for token in document.tokens:
      word = token.word
      self.words.add(word)
      for s in self.get_suffixes(word):
        assert type(s) is str
        self.suffix.add(s)
  print("Words:", self.words.size(), "items in lexicon, including OOV")
  print("Suffix:", self.suffix.size(), "items in lexicon")

  # Load commons store, but do not freeze it yet. We will add the action table
  # and cascade specification to it.
  self.commons_path = commons_path
  self.commons = sling.Store()
  self.commons.load(commons_path)
  schema = sling.DocumentSchema(self.commons)

  # Prepare action table and cascade.
  self._build_action_table(corpora)
  self.cascade = cascade.ShiftMarkCascade(self.actions)
  print(self.cascade)

  # Save cascade specification in commons store.
  _ = self.cascade.as_frame(self.commons, delegate_cell_prefix="delegate")

  # Freeze the commons store.
  self.commons.freeze()

  # Add feature specs.
  self._specify_features()
def parse(self, obj):
  if type(obj) is sling.Document:
    # Parse document.
    obj.update()
    self.parser.parse(obj.frame)
    obj.refresh_annotations()
    return obj
  elif type(obj) is sling.Frame:
    # Parse document frame and return parsed document.
    self.parser.parse(obj)
    return sling.Document(obj)
  else:
    # Create local store for new document.
    store = sling.Store(self.commons)

    # Tokenize text.
    doc = tokenize(str(obj), store=store, schema=self.schema)

    # Parse document.
    self.parser.parse(doc.frame)
    doc.refresh_annotations()
    return doc
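A usage sketch, assuming `annotator` is an instance of the surrounding class: parse() accepts raw text, a document frame, or a sling.Document, so the simplest call is with a string.

doc = annotator.parse("John loves Mary.")
for mention in doc.mentions:
  print(doc.phrase(mention.begin, mention.end), list(mention.evokes()))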
def pruneTriples(triple_file, output_file):
  """
  ***** This method needs to be called in a python env that has sling! *****
  Input:
    1. name of file with tab-separated SRO triples
    2. name of the output file
  Output:
    file containing triples where both entities have QCodes.
  """
  t1 = time.time()  # start the clock
  base_path = cfg['all']['base_path']
  kb = sling.Store()
  kb.load(base_path + "local/data/e/wiki/kb.sling")
  names = sling.PhraseTable(
      kb, base_path + "local/data/e/wiki/en/phrase-table.repo")
  kb.freeze()
  print("* Sling KB loaded in %0.3fs." % (time.time() - t1))

  verified_triples = []
  with open(triple_file, "r") as inFile:
    tsvreader = csv.reader(inFile, delimiter="\t")
    for triple in tsvreader:
      entity1 = triple[0]
      relation = triple[1]
      entity2 = triple[2]
      score = triple[3]
      hasBoth, qcode1, qcode2 = confirmEntities(names, entity1, entity2)
      if hasBoth:
        q_triple = (qcode1, relation, qcode2, score)
        verified_triples.append(q_triple)

  with open(output_file, "w") as outFile:
    writer = csv.writer(outFile, delimiter='\t')
    for t in verified_triples:
      s = t[0]  # subject
      r = t[1]  # relation
      o = t[2]  # object
      c = t[3]  # confidence score
      writer.writerow([s, r, o, c])
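Calling sketch; the paths are illustrative, and each input row is expected to be subject&lt;TAB&gt;relation&lt;TAB&gt;object&lt;TAB&gt;score as described in the docstring.

pruneTriples("data/openie-triples.tsv", "data/openie-triples.pruned.tsv")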
def run(self, task):
  self.init(task)
  max_parses = int(task.param("max_parses"))
  reader = sling.RecordReader(task.input("input").name)
  writer = sling.RecordWriter(task.output("output").name)
  for index, (key, value) in enumerate(reader):
    store = sling.Store(self.kb)
    category = store.parse(value)
    document = sling.Document(category.document)

    # Score each parse.
    parse_with_score = self.score(category)

    # Keep only the top-k parses.
    ranked_parses = sorted(parse_with_score, key=lambda x: -x[1])
    if len(ranked_parses) > max_parses:
      dropped = len(ranked_parses) - max_parses
      ranked_parses = ranked_parses[0:max_parses]
      task.increment("parses-dropped", dropped)
      task.increment("categories-with-too-many-parses")

    # Compute signature for each parse and store it in the parse.
    for parse, _ in ranked_parses:
      tokens, span_signature = self.signature(document, parse)
      parse["signature"] = tokens
      for span in parse.spans:
        if span in span_signature:
          span["signature"] = span_signature[span]

    # Replace the current set of parses with the ranked list.
    del category["parse"]
    for parse, _ in ranked_parses:
      category.append("parse", parse)
    task.increment("parses-kept", len(ranked_parses))
    writer.write(key, category.data(binary=True))
  reader.close()
  writer.close()
def __init__(self, frame=None, store=None, schema=None,
             load_tokens=True, load_mentions=True):
  # Create store, frame, and schema if missing.
  if frame is not None:
    store = frame.store()
  if store is None:
    store = sling.Store()
  if schema is None:
    schema = DocumentSchema(store)
  if frame is None:
    frame = store.frame([(schema.isa, schema.document)])

  # Initialize document from frame.
  self.frame = frame
  self.schema = schema
  self._text = frame.get(schema.document_text, binary=True)
  self.tokens = []
  self.mentions = []
  self.themes = []
  self.tokens_dirty = False
  self.mentions_dirty = False
  self.themes_dirty = False

  if load_tokens:
    # Get tokens.
    tokens = frame[schema.document_tokens]
    if tokens is not None:
      for t in tokens:
        token = self.get_word(t, schema, self._text)
        self.tokens.append(token)

  if load_mentions:
    # Get mentions.
    for m in frame(schema.document_mention):
      mention = Mention(schema, m)
      self.mentions.append(mention)
def __init__(self, recordio, commons, schema=None, gold=False, loop=False):
  self.filename = recordio
  self.commons_owned = False
  if isinstance(commons, str):
    self.commons = sling.Store()
    self.commons.load(commons)
    self.commons_owned = True
  else:
    assert isinstance(commons, sling.Store)
    self.commons = commons

  if schema is None or self.commons_owned:
    schema = sling.DocumentSchema(self.commons)
  if self.commons_owned:
    self.commons.freeze()
  assert schema is not None
  self.schema = schema

  self.reader = sling.RecordReader(recordio)
  self.generator = None
  self.loop = loop
  self.set_gold(gold)
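A minimal iteration sketch for this reader: the commons store can be passed as a path (loaded and frozen here) or as an already-frozen sling.Store. Paths are illustrative.

corpora = Corpora("local/data/corpora/caspar/train.rec",
                  "local/data/caspar/commons", gold=True)
for document in corpora:
  print(len(document.tokens), "tokens,", len(document.mentions), "mentions")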
def __init__(self):
  # Initialize commons store with knowledge base.
  start = time.time()
  self.commons = sling.Store()
  self.commons.lockgc()
  self.commons.load(wikidir + "/kb.sling", snapshot=True)
  self.n_item_member = self.commons['/w/item/member']
  self.n_instance_of = self.commons['P31']
  self.n_wikimedia_category = self.commons['Q4167836']
  self.n_subject = self.commons['subject']
  self.extractor = sling.FactExtractor(self.commons)

  # Add category subject types.
  self.subjects = {}
  for subject, item in english_subject_types.items():
    self.subjects[subject] = self.commons[item]

  # Add properties for subjects.
  self.subject_properties = []
  for p in subject_properties:
    self.subject_properties.append(self.commons[p])

  self.commons.freeze()
  end = time.time()
  print(end - start, "secs loading commons")

  # Load phrase table.
  # TODO(ringgaard): Load language-dependent phrase table.
  start = time.time()
  self.phrasetab = sling.PhraseTable(self.commons,
                                     wikidir + "/en/phrase-table.repo")
  end = time.time()
  print(end - start, "secs loading phrase table")

  # Open category member database.
  self.member_db = sling.RecordDatabase(wikidir + "/wikipedia-members.rec")
def validate(commons, recordio_filename, output_recordio='', options=Options()):
  schema = None
  if not isinstance(commons, sling.Store):
    assert type(commons) is str
    filename = commons
    commons = sling.Store()
    commons.load(filename)
    schema = sling.DocumentSchema(commons)
    commons.freeze()
  else:
    schema = sling.DocumentSchema(commons)

  corpus = corpora.Corpora(recordio_filename, commons, schema)
  aggregate = Results(options)
  count = 0
  writer = None
  written = 0
  if output_recordio != '':
    writer = sling.RecordWriter(output_recordio)

  for document in corpus:
    results = _validate(count, document, options)
    aggregate.add(results)
    if not results.ok() and options.stop_on_first_bad_document:
      print("Stopping after first bad document as requested")
      break
    count += 1
    if writer and results.ok():
      writer.write('', document.frame.data(binary=True))
      written += 1

  if writer:
    writer.close()
  return aggregate, count, written
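Example invocation with the defaults above; passing output_recordio copies only the documents that validate cleanly. Paths are illustrative.

aggregate, count, written = validate(
    "local/data/caspar/commons",
    "local/data/corpora/caspar/train.rec",
    output_recordio="local/data/corpora/caspar/train-clean.rec")
print("validated %d documents, wrote %d clean ones" % (count, written))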
def __init__(self, frame=None, store=None, schema=None):
  # Create store, frame, and schema if missing.
  if frame is not None:
    store = frame.store()
  if store is None:
    store = sling.Store()
  if schema is None:
    schema = DocumentSchema(store)
  if frame is None:
    frame = store.frame([(schema.isa, schema.document)])

  # Initialize document from frame.
  self.frame = frame
  self.schema = schema
  self._text = frame[schema.document_text]
  self.tokens = []
  self.mentions = []
  self.themes = []
  self.tokens_dirty = False
  self.mentions_dirty = False
  self.themes_dirty = False

  # Get tokens.
  tokens = frame[schema.document_tokens]
  if tokens is not None:
    for t in tokens:
      token = Token(self, t, len(self.tokens))
      self.tokens.append(token)

  # Get mentions.
  for m in frame(schema.document_mention):
    mention = Mention(schema, m)
    self.mentions.append(mention)

  # Get themes.
  for theme in frame(schema.document_theme):
    self.themes.append(theme)
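A short sketch of wrapping an existing document frame with this class; `commons` and `record_data` stand in for a schema store and a serialized document read from a record file.

store = sling.Store(commons)
doc = Document(frame=store.parse(record_data))
for token in doc.tokens:
  print(token.word, token.brk)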
def process_change(change):
  qid = change["title"]
  if qid.startswith("Property:"): qid = qid[9:]
  ts = change["timestamp"]
  kind = change["type"]
  if kind == "log" and change["log_action"] == "delete":
    # Delete item/property from database.
    try:
      print("[%d] %s DELETE" % (queue.qsize(), qid))
      reply = dbsession.delete(flags.arg.dburl + "/" + qid)
      reply.raise_for_status()
    except Exception as e:
      print("DB delete error:", e)
  elif kind == "edit":
    revision = change["revision"]["new"]
    store = sling.Store(commons)
    item = None
    m = redir_pat.fullmatch(change["comment"])
    redir = None
    if m is not None:
      # Handle redirects by adding {=Q<old> +Q<new>} frames.
      redir = m.group(2)
      item = store.parse("{id: %s +%s}" % (qid, redir))
    else:
      # Fetch item.
      again = True
      while again:
        again = False
        try:
          # Fetch item revision from Wikidata.
          url = "%s?id=%s&revision=%d&format=json" % (
              flags.arg.wiki_fetch_url, qid, revision)
          reply = wdsession.get(url)
          if reply.status_code == 429:
            # Too many requests; throttle down and retry.
            print("throttle down...")
            time.sleep(30)
            again = True
          else:
            reply.raise_for_status()
        except Exception as e:
          print("Error fetching item:", e, ":", change)
          return

      # Convert item to SLING format.
      try:
        item, _ = wikiconv.convert_wikidata(store, reply.text)
      except Exception as e:
        print("Error converting item:", e, reply.text)
        return

    # Coalesce strings.
    store.coalesce(flags.arg.string_buckets)

    # Save item in database, retrying on DB errors.
    saved = False
    while not saved:
      try:
        reply = None
        reply = dbsession.put(flags.arg.dburl + "/" + qid,
                              data=item.data(binary=True),
                              headers={
                                "Version": str(revision),
                                "Mode": "ordered",
                              })
        reply.raise_for_status()
        result = reply.headers["Result"]
        saved = True
      except Exception as e:
        print("DB error:", e, ":", reply.text if reply is not None else "")
        time.sleep(30)

    if redir:
      print("[%d] %s REDIR %s" % (queue.qsize(), qid, redir))
    else:
      print("[%d] %d %s %s (%s)" % (queue.qsize(), revision,
                                    item["id"], item["name"], result))

  # Update checkpoint.
  global num_changes
  num_changes += 1
  if flags.arg.checkpoint is not None:
    if num_changes % flags.arg.checkpoint_interval == 0:
      dt = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(ts))
      print("CHECKPOINT", ts, dt)
      with open(flags.arg.checkpoint, 'w') as ckpt:
        ckpt.write(str(ts))

  sys.stdout.flush()
flags.define("--threads", help="number of thread for worker pool", default=10, type=int, metavar="NUM") flags.define("--qsize", help="queue size", default=1024, type=int, metavar="NUM") flags.parse() # Commons store for Wikidata converter. commons = sling.Store() wikiconv = sling.WikiConverter(commons) commons.freeze() # Global variables. dbsession = requests.Session() wdsession = requests.Session() redir_pat = re.compile("\/\* wbcreateredirect:\d+\|\|(Q\d+)\|(Q\d+) \*\/") num_changes = 0 # Fetch changed item and update database. def process_change(change): qid = change["title"] if qid.startswith("Property:"): qid = qid[9:] ts = change["timestamp"]
def run(self, task):
  # Get parameters.
  language = task.param("language")

  # Load knowledge base.
  log.info("Load knowledge base")
  kb = sling.Store()
  kb.load(task.input("kb").name)

  n_infobox = kb["/wp/infobox"]
  n_page_item = kb["/wp/page/item"]
  n_file = kb["/wp/info/file"]
  n_media = kb["/wp/media"]

  image_fields = [
    (kb["/wp/info/image"], kb["/wp/info/caption"]),
    (kb["/wp/info/cover"], kb["/wp/info/caption"]),
    (kb["/wp/info/logo"], kb["/wp/info/logo_caption"]),
    (kb["/wp/info/photo"], kb["/wp/info/photo_caption"]),
    (kb["/wp/info/flag_image"], kb["/wp/info/flag_caption"]),
  ]

  p_media = kb["media"]
  p_id = kb["id"]
  p_is = kb["is"]
  p_imported_from = kb["P143"]
  p_media_legend = kb["P2096"]

  image_properties = [
    kb["P18"],   # image
    kb["P154"],  # logo image
    kb["P41"],   # flag image
  ]

  lang = kb["/lang/" + language]
  wikipedia_item = lang["/lang/wikilang/wikipedia"]
  docschema = sling.DocumentSchema(kb)
  kb.freeze()

  # Fetch media titles for Wikipedia from yesterday.
  log.info("Fetch local media titles")
  yesterday = (date.today() - timedelta(days=1)).strftime("%Y%m%d")
  mediaurl = "https://dumps.wikimedia.org/other/mediatitles/%s/" \
             "%swiki-%s-all-media-titles.gz" % (yesterday, language, yesterday)
  r = urllib.request.urlopen(mediaurl)
  mediatitles = set(gzip.decompress(r.read()).decode().split('\n'))
  task.increment("local_media_files", len(mediatitles))

  # Open output file.
  fout = open(task.output("output").name, "w")

  # Process input articles.
  for res in task.inputs("input"):
    log.info("Extract media files from", res.name)
    for _, data in sling.RecordReader(res.name):
      # Read article into store.
      store = sling.Store(kb)
      doc = store.parse(data)
      task.increment("documents")

      # Find first infobox.
      infobox = None
      for theme in doc(docschema.document_theme):
        if theme.isa(n_infobox):
          infobox = theme
          break
      if infobox is None: continue
      task.increment("infoboxes")

      # Find images in infobox.
      imagelist = []
      for n_image, n_caption in image_fields:
        image = infobox[n_image]
        caption = infobox[n_caption]
        if image is None: continue

        # Get image for repeated image field.
        if type(image) is sling.Frame:
          group = image
          image = group[n_file]
          caption = group[n_caption]
          if image is None: continue

        if "{" in image or "[" in image:
          # Structured annotations.
          annotations = sling.lex(image, store=store, schema=docschema)
          for theme in annotations.themes:
            if theme.isa(n_media):
              image = theme[p_is]
              if image is not None:
                imagelist.append((image, None))
                task.increment("structured_annotations")
        else:
          # Image filename.
          imagelist.append((image, caption))
      if len(imagelist) == 0: continue

      # Process list of images for item.
      known_images = 0
      image_frames = []
      item = doc[n_page_item]
      if item is None: continue
      for image, caption in imagelist:
        # Disregard direct URLs for now.
        if (image.startswith("http://") or
            image.startswith("https://") or
            image.startswith("//")):
          task.increment("url_images")
          continue

        # Trim image name. Remove File: prefix.
        colon = image.find(':')
        if colon > 0 and colon < 10: image = image[colon + 1:]
        image = titlecase(image.strip()).replace('_', ' ')
        if len(image) == 0 or image in default_images:
          task.increment("empty_images")
          continue
        if image.endswith("&lrm;"): image = image[:-5]
        frag = image.find('#')
        if frag > 0: image = image[:frag]
        image = html.unescape(image)
        image = urllib.parse.unquote(image)

        # Discard media files with unknown or ignored extensions.
        dot = image.rfind('.')
        ext = image[dot:].lower() if dot > 0 else None
        if ext in ignored_extensions:
          task.increment("ignored_image_format")
          continue
        if ext not in known_extensions:
          log.info("unknown format:", item.id, image)
          task.increment("unknown_image_format")
          continue

        # Get item from KB and check if image is already known.
        task.increment("images")
        known = False
        for prop in image_properties:
          for img in item(prop):
            img = kb.resolve(img)
            if img == image:
              known = True
              known_images += 1
        if known:
          task.increment("known_images")
          continue
        task.increment("new_images")

        # Check if image is in local Wikipedia or Wikimedia Commons.
        fn = image.replace(' ', '_')
        if fn in mediatitles:
          urlbase = "https://upload.wikimedia.org/wikipedia/" + language
          task.increment("local_images")
        else:
          urlbase = "https://upload.wikimedia.org/wikipedia/commons"
          task.increment("commons_images")
          if known_images == 0: task.increment("commons_imaged_items")

        # Compute URL for image.
        md5 = md5hash(fn)
        fn = fn.replace("?", "%3F")
        fn = fn.replace("+", "%2B")
        fn = fn.replace("&", "%26")
        url = "%s/%s/%s/%s" % (urlbase, md5[0], md5[0:2], fn)

        # Create frame for item with media image.
        slots = [
          (p_is, url),
          (p_imported_from, wikipedia_item),
        ]
        if caption is not None:
          capdoc = sling.lex(caption, store=store, schema=docschema)
          captxt = capdoc.phrase(0, len(capdoc.tokens))
          slots.append((p_media_legend, captxt))
        image_frames.append(store.frame(slots))

      # Create item frame with extra image info.
      if len(image_frames) == 0: continue
      slots = [(p_id, item.id)]
      for image_frame in image_frames:
        slots.append((p_media, image_frame))
      frame = store.frame(slots)
      fout.write(frame.data(utf8=True))
      fout.write("\n")
      if known_images == 0: task.increment("imaged_items")

  fout.close()
def run(self):
  # Build regex for matching "(born ... died ...)" and "(<date> - <date>)"
  # patterns in the first sentence of an article.
  month = "(" + "|".join(self.months.keys()) + ")"
  day = r"(\d{1,2})"
  year = r"(\d{4})"
  date = "(?:(?:" + day + " " + month + " " + year + ")|"
  date += "(?:" + month + " " + day + ", " + year + "))"
  date += "(?:[^)]+?)?"
  dates = date + r"\s*-+\s*" + date
  dates = r"(?:(?:(?:born|b\.|née),? ([^0-9)]*?)" + date + \
          r"(?:(?:died|d\.),? [^0-9)]*?" + date + ")?)|(?:" + dates + "))"
  pat = r"(?:[^(]|\([^0-9]*\))*?\([^0-9)]*?" + dates + r"\s*\)"
  rec = re.compile(pat)

  self.out_file = "local/data/e/wikibot/birth-death-dates.rec"
  record_file = sling.RecordWriter(self.out_file)
  records = 0
  store = sling.Store(self.kb)
  for i in range(10):
    i_file = "local/data/e/wiki/en/documents-0000" + str(i) + "-of-00010.rec"
    print(i_file, records)
    for (item_id, record) in sling.RecordReader(i_file):
      item = self.kb[item_id]
      if self.human not in item(self.instanceof): continue
      if self.precise_date(item(self.date_of_birth)) and \
         self.precise_date(item(self.date_of_death)): continue
      parsed_record = sling.Store().parse(record)
      doc = sling.Document(parsed_record)
      raw_text = parsed_record['text']
      if len(raw_text) == 0: continue

      # Find the first sentence starting at the bolded name.
      start_index = raw_text.find("<b>") + len("<b>")
      first = 1
      while first < len(doc.tokens) and doc.tokens[first].start <= start_index:
        first += 1
      last = first
      while last < len(doc.tokens) and doc.tokens[last].brk < 3:
        last += 1
      text = doc.phrase(max(0, first - 1), min(len(doc.tokens), last + 15))
      m = rec.match(text)
      if m is None: continue
      if text.find("(baptised") >= 0 or text.find("throne") >= 0: continue
      if text.find("(baptized") >= 0 or text.find("partner") >= 0: continue

      if m.group(2) or m.group(5):
        first = self.date_from_match(1, m)
        if first.year < 1753: continue  # possibly Julian calendar date
        if m.group(8) or m.group(11):
          second = self.date_from_match(7, m)
          if second.year < 1753: continue  # possibly Julian calendar date
          facts = store.frame({
            self.date_of_birth: first.value(),
            self.date_of_death: second.value()
          })
        else:
          # Only one date match.
          mg1 = m.group(1)
          dob = item(self.date_of_birth)
          dod = item(self.date_of_death)
          if mg1 and max(mg1.find("died"), mg1.find("d.")) >= 0:
            # Death date only.
            if self.precise_date(dod): continue
            if self.same_year(first.year, dob): continue  # b&d too close
            facts = store.frame({
              self.date_of_death: first.value(),
            })
          else:
            # Birth date only.
            if self.precise_date(dob): continue
            if self.same_year(first.year, dod): continue  # b&d too close
            facts = store.frame({
              self.date_of_birth: first.value(),
            })
      else:
        first = self.date_from_match(13, m)
        second = self.date_from_match(19, m)
        if min(first.year, second.year) < 1753: continue  # possibly Julian
        facts = store.frame({
          self.date_of_birth: first.value(),
          self.date_of_death: second.value()
        })
      records += 1
      provenance = store.frame({
        self.url: parsed_record['url'],
        self.method: "English Wikipedia dates for '" + str(item.name) + "'"
      })
      fact = store.frame({
        self.item: item,
        self.facts: facts,
        self.provenance: provenance
      })
      record_file.write(item.id, fact.data(binary=True))
  record_file.close()
  print(records, "birth/death date records written to file:", self.out_file)
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Import FactGrid and convert to SLING format."""

import sling
import gzip

# Map ids into FactGrid namespace.
def convert_id(idstr):
  if idstr[0] == "P" or idstr[0] == "Q": return "P8168/" + idstr
  return idstr

# Initialize commons store.
commons = sling.Store()
n_id = commons["id"]
n_is = commons["is"]
n_isa = commons["isa"]
n_qid = commons["/w/qid"]
n_pid = commons["P343"]
n_property = commons["/w/property"]
n_fg_item_id = commons["P8168"]
wikiconv = sling.WikiConverter(commons, "en")
commons.freeze()

# Read all items from FactGrid dump.
fin = gzip.open("data/c/wikidata/factgrid.json.gz")
fitem = sling.RecordWriter("data/e/factgrid/factgrid-items.rec")
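The read loop that follows is sketched here under assumptions: Wikidata-style dumps put one JSON object per line inside a JSON array, and convert_wikidata accepts the per-item JSON text as in the other snippets.

for line in fin:
  line = line.strip()
  if line in (b"[", b"]"): continue         # skip array brackets
  if line.endswith(b","): line = line[:-1]  # strip trailing comma
  store = sling.Store(commons)
  item, _ = wikiconv.convert_wikidata(store, line.decode("utf8"))
  fitem.write(item.id, item.data(binary=True))
fin.close()
fitem.close()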
def compare(arg):
  base_reader = sling.RecordReader(arg.base)
  expt_reader = sling.RecordReader(arg.expt)

  commons = sling.Store()
  commons.load(arg.commons)
  schema = sling.DocumentSchema(commons)
  commons.freeze()

  store = sling.Store(commons)
  index = -1
  for (_, base_val), (_, expt_val) in zip(base_reader, expt_reader):
    index += 1
    base_doc = sling.Document(frame=store.parse(base_val), schema=schema)
    expt_doc = sling.Document(frame=store.parse(expt_val), schema=schema)
    checker = Checker(index, base_doc, expt_doc, arg.diff)

    # Basic checks.
    base = base_doc.frame["trace"]
    expt = expt_doc.frame["trace"]
    if base is None and expt is not None:
      checker.error('No trace in base document at index %d' % index)
    elif base is not None and expt is None:
      checker.error('No trace in expt document at index %d' % index)
    if base is None: continue

    # Traces should be over the same token range.
    checker.check_eq(base["begin"], expt["begin"], "Trace Begin")
    checker.check_eq(base["end"], expt["end"], "Trace End")

    # Check LSTM features.
    base_lstm = base["/trace/lstm_features"]
    expt_lstm = expt["/trace/lstm_features"]
    checker.check_eq(len(base_lstm), len(expt_lstm), "LSTM Features Length")
    for i in range(len(base_lstm)):
      checker.frame_eq(base_lstm[i], expt_lstm[i],
                       "LSTM features for token %d (%s)" %
                       (i, base_doc.tokens[i].word))

    # Check steps.
    base_steps = base["/trace/steps"]
    expt_steps = expt["/trace/steps"]
    min_steps = min(len(base_steps), len(expt_steps))
    for i in range(min_steps):
      message = "Step %d's current token index" % i
      checker.check_eq(base_steps[i]["/trace/current"],
                       expt_steps[i]["/trace/current"], message)

      # Check FF features for the step.
      base_ff = base_steps[i]["/trace/ff_features"]
      expt_ff = expt_steps[i]["/trace/ff_features"]
      checker.check_eq(len(base_ff), len(expt_ff),
                       "# of FF features for step %d" % i)
      base_dict = {f["/trace/feature"]: f["/trace/values"] for f in base_ff}
      expt_dict = {f["/trace/feature"]: f["/trace/values"] for f in expt_ff}
      for k, v in base_dict.items():
        checker.check_eq(k in expt_dict, True,
                         "Step %d: FF feature %s not in expt" % (i, k))
        checker.check_eq(v, expt_dict[k],
                         "Step %d: FF feature %s has a different value in expt"
                         % (i, k))
      for k, v in expt_dict.items():
        checker.check_eq(k in base_dict, True,
                         "Step %d: FF feature %s not in base" % (i, k))

      # Check action(s) in the step.
      base_actions = base_steps[i]["/trace/actions"]
      expt_actions = expt_steps[i]["/trace/actions"]
      for idx in range(min(len(base_actions), len(expt_actions))):
        checker.frame_eq(base_actions[idx]["/trace/predicted"],
                         expt_actions[idx]["/trace/predicted"],
                         "Step %d, predicted action %d" % (i, idx),
                         ["/trace/_str"])
        checker.frame_eq(base_actions[idx]["/trace/final"],
                         expt_actions[idx]["/trace/final"],
                         "Step %d, final action %d" % (i, idx),
                         ["/trace/_str"])

      # There should be the same number of actions in the step.
      checker.check_eq(len(base_actions), len(expt_actions),
                       "Step %d: # of actions" % i)

    # There should be the same number of steps.
    checker.check_eq(len(base_steps), len(expt_steps), "# of Steps")

  base_reader.close()
  expt_reader.close()
#
# Fields:
#  0: Registration Authority Code
#  1: Country
#  2: Country Code
#  3: Jurisdiction (country or region)
#  4: International name of Register
#  5: Local name of Register
#  6: International name of organisation responsible for the Register
#  7: Local name of organisation responsible for the Register
#  8: Website
#  9: Date IP disclaimer
# 10: Comments
# 11: End Date

kb = sling.Store()
kb.load("data/e/kb/kb.sling")
aliases = sling.PhraseTable(kb, "data/e/kb/en/phrase-table.repo")

# Resolve name to the highest-ranked matching item in the phrase table.
def resolve_name(name):
  for item in aliases.lookup(name):
    return item
  return None

reader = csv.reader(open("data/c/lei/2019-12-05_ra-list-v1.5.csv", "r"))
next(reader)  # skip header row
for row in reader:
  slots = [
    ("registration_authority_code", row[0]),
    ("country_name", row[1]),
def run(self, task):
  self.init(task)
  writer = sling.RecordWriter(task.output("output").name)
  rejected = sling.RecordWriter(task.output("rejected").name)
  inputs = [t.name for t in task.inputs("items")]

  for filename in inputs:
    reader = sling.RecordReader(filename)
    for index, (key, value) in enumerate(reader):
      store = sling.Store(self.kb)
      frame = store.parse(value)

      # Only process category items.
      if not self.is_category(frame):
        rejected.write(key, "not_category")
        continue

      # See if the category should be skipped.
      members = self.get_members(frame)
      reject, reason = self.reject(key, frame, members)
      if reject:
        task.increment("skipped_categories/" + reason)
        rejected.write(key, reason)
        continue

      # First, collect the targets of all facts of all category members.
      qp_counts = self.qid_pid_counts(store, members)

      # Next, tokenize the category title.
      title = self.get_title(frame)
      colon = title.find(':')
      title = title[colon + 1:]
      document = sling.tokenize(title, store)

      # Next, find matches for all spans. These are reported as a list,
      # where ith item = spans that begin at token i (possibly an empty list).
      begin_to_spans = self.compute_spans(document, qp_counts)

      # Construct maximal parses with non-overlapping spans.
      parses = self.construct_parses(begin_to_spans)

      # Post-process parses.
      parses = self.post_process(parses)
      if len(parses) == 0 or (len(parses) == 1 and len(parses[0]) == 0):
        task.increment("skipped_categories/no_parses")
        rejected.write(key, "no_parses")
        continue

      # Write parses as frames.
      frame = store.frame({"name": title, "members": members})
      frame["document"] = document.frame
      for parse in parses:
        span_array = store.array(len(parse))
        for i, span in enumerate(parse):
          span_array[i] = store.frame({
            "begin": span.begin,
            "end": span.end,
            "qid": span.qid,
            "prior": span.prior,
            "pids": list(span.pids),
            "count": span.count
          })
        parse_frame = store.frame({"spans": span_array})
        frame.append("parse", parse_frame)
      writer.write(key, frame.data(binary=True))
      task.increment("categories_accepted")

      # Compute histogram over number of parses.
      for b in self.num_parses_bins:
        if len(parses) <= b:
          task.increment("#parses <= %d" % b)
      if self.num_parses_bins[-1] < len(parses):
        task.increment("#parses > %d" % self.num_parses_bins[-1])
    reader.close()
  writer.close()
  rejected.close()
def test_fact_matcher():
  RED = "\033[1;31m"
  GREEN = "\033[0;32m"
  RESET = "\033[0;0m"

  def error(entry, message):
    sys.stdout.write(RED)
    print("[FAILED] ", end="")
    sys.stdout.write(RESET)
    print(entry, ":", message)

  def success(entry):
    sys.stdout.write(GREEN)
    print("[SUCCESS] ", end="")
    sys.stdout.write(RESET)
    print(entry)

  kb = load_kb("local/data/e/wiki/kb.sling")
  extractor = sling.api.FactExtractor(kb)
  matcher = FactMatcher(kb, extractor)

  # Test cases.
  tuples = []

  # Adds the given test case and its reverse test case too (if possible).
  def add(pid, existing, proposed, match_type):
    tuples.append((pid, existing, proposed, match_type))

    # Add the reverse case.
    if match_type != FactMatchType.NEW and existing != proposed:
      rev_type = match_type
      if match_type == FactMatchType.SUBSUMED_BY_EXISTING:
        rev_type = FactMatchType.SUBSUMES_EXISTING
      if match_type == FactMatchType.SUBSUMES_EXISTING:
        rev_type = FactMatchType.SUBSUMED_BY_EXISTING
      tuples.append((pid, proposed, existing, rev_type))

  # Place of birth, Kapiolani Medical Center, Honolulu.
  add("P19", "Q6366688", "Q18094", FactMatchType.SUBSUMES_EXISTING)
  # Place of birth, Kapiolani Medical Center, US.
  add("P19", "Q6366688", "Q30", FactMatchType.SUBSUMES_EXISTING)
  # Place of birth, <no existing value>, US.
  add("P19", "", "Q30", FactMatchType.NEW)
  # Place of birth, US, US.
  add("P19", "Q30", "Q30", FactMatchType.EXACT)
  # Place of birth, Honolulu, Chicago.
  add("P19", "Q18094", "Q1297", FactMatchType.CONFLICT)
  # Children, Malia Obama, Sasha Obama.
  add("P40", "Q15070044", "Q15070048", FactMatchType.ADDITIONAL)

  # Date-valued properties: int values.
  # Note: P585 = point in time (unique valued), P580 = start time (non-unique).
  add("P585", 1961, 19610804, FactMatchType.SUBSUMED_BY_EXISTING)
  add("P585", 1961, 196108, FactMatchType.SUBSUMED_BY_EXISTING)
  add("P585", 1961, 1961, FactMatchType.EXACT)
  add("P585", 1961, 196, FactMatchType.SUBSUMES_EXISTING)  # 196 = 196X (decade)
  add("P585", 1961, 19, FactMatchType.SUBSUMES_EXISTING)   # 19 = 19XX (century)
  add("P585", 1961, 1, FactMatchType.SUBSUMES_EXISTING)    # 1 = 1XXX (millennium)
  add("P585", 1962, 19610804, FactMatchType.CONFLICT)
  add("P585", 1962, 196108, FactMatchType.CONFLICT)
  add("P585", 1962, 1961, FactMatchType.CONFLICT)
  add("P580", 1961, 1962, FactMatchType.ADDITIONAL)

  # Date-valued properties: string values.
  add("P585", "1961", "1961-08-04", FactMatchType.SUBSUMED_BY_EXISTING)
  add("P585", "1961", "1961-08", FactMatchType.SUBSUMED_BY_EXISTING)
  add("P585", "1961", "1961", FactMatchType.EXACT)
  add("P585", "1961", "196*", FactMatchType.SUBSUMES_EXISTING)  # decade
  add("P585", "1961", "19**", FactMatchType.SUBSUMES_EXISTING)  # century
  add("P585", "1961", "1***", FactMatchType.SUBSUMES_EXISTING)  # millennium
  add("P585", "1962", "1961-08-04", FactMatchType.CONFLICT)
  add("P585", "1962", "1961-08", FactMatchType.CONFLICT)
  add("P585", "1962", "1961", FactMatchType.CONFLICT)
  add("P580", "1961", "1962-08", FactMatchType.ADDITIONAL)

  # Date-valued properties: QID values. These are only available for years,
  # decades, and millennia.
  q1961 = "Q3696"
  q1962 = "Q2764"
  q196x = "Q35724"
  q197x = "Q35014"
  q19xx = "Q6927"
  q1xxx = "Q25860"
  add("P585", q196x, q1961, FactMatchType.SUBSUMED_BY_EXISTING)
  add("P585", q1xxx, q1961, FactMatchType.SUBSUMED_BY_EXISTING)
  add("P585", q1961, q1961, FactMatchType.EXACT)
  add("P585", q1961, q1962, FactMatchType.CONFLICT)
  add("P585", q196x, q197x, FactMatchType.CONFLICT)
  add("P585", q19xx, q197x, FactMatchType.SUBSUMED_BY_EXISTING)
  add("P580", q1961, q197x, FactMatchType.ADDITIONAL)

  # Date-valued properties: proposed and existing values have different types.
  add("P585", q1961, 1961, FactMatchType.EXACT)
  add("P585", q196x, 196, FactMatchType.EXACT)
  add("P585", q19xx, 19, FactMatchType.EXACT)
  add("P585", q1xxx, 1, FactMatchType.EXACT)
  add("P585", q196x, 1961, FactMatchType.SUBSUMED_BY_EXISTING)
  add("P585", q1961, 19610804, FactMatchType.SUBSUMED_BY_EXISTING)
  add("P585", q1961, 19, FactMatchType.SUBSUMES_EXISTING)
  add("P585", q1961, "1961", FactMatchType.EXACT)
  add("P585", q196x, "196*", FactMatchType.EXACT)
  add("P585", q19xx, "19**", FactMatchType.EXACT)
  add("P585", q196x, "1961", FactMatchType.SUBSUMED_BY_EXISTING)
  add("P585", q196x, "1961-08-04", FactMatchType.SUBSUMED_BY_EXISTING)
  add("P585", q1961, "196*", FactMatchType.SUBSUMES_EXISTING)
  add("P585", "", "196*", FactMatchType.NEW)
  add("P585", q1961, "1962", FactMatchType.CONFLICT)
  add("P585", 1963, "1962", FactMatchType.CONFLICT)
  add("P580", q1961, "1962", FactMatchType.ADDITIONAL)
  add("P580", 1963, "1962", FactMatchType.ADDITIONAL)

  # Genre, melodrama, drama.
  add("P136", "Q191489", "Q21010853", FactMatchType.SUBSUMES_EXISTING)
  # Genre, trip-hop, electronic music.
  add("P136", "Q205560", "Q9778", FactMatchType.SUBSUMES_EXISTING)
  # Genre, rock and roll, electronic music.
  add("P136", "Q7749", "Q9778", FactMatchType.ADDITIONAL)
  # Educated at, Harvard Law School, Harvard University.
  add("P69", "Q49122", "Q13371", FactMatchType.SUBSUMES_EXISTING)
  # Educated at, Harvard Law School, Yale University.
  add("P69", "Q49122", "Q49112", FactMatchType.ADDITIONAL)
  # Employer, Airbus, Airbus SE.
  add("P108", "Q67", "Q2311", FactMatchType.SUBSUMES_EXISTING)
  # Employer, Airbus, Boeing.
  add("P108", "Q67", "Q66", FactMatchType.ADDITIONAL)
  # Occupation, sports cyclist, cyclist.
  add("P106", "Q2309784", "Q2125610", FactMatchType.SUBSUMES_EXISTING)
  # Occupation, sports cyclist, cricketer.
  add("P106", "Q2309784", "Q12299841", FactMatchType.ADDITIONAL)

  store = sling.Store(kb)
  total_successes = 0
  for entry in tuples:
    pid, existing, proposed, expected = entry
    if pid not in kb:
      error(entry, "%s not in KB" % pid)
      continue

    pid = kb[pid]
    if isinstance(existing, str) and existing != "" and existing in kb:
      existing = kb[existing]
    if isinstance(proposed, str) and proposed in kb:
      proposed = kb[proposed]

    if existing == "":
      existing = []
    else:
      existing = [existing]

    actual = matcher.match_type(store, pid, existing, proposed)
    if actual == expected:
      success(entry)
      total_successes += 1
    else:
      error(entry, "Got %s, but expected %s" % (actual.name, expected.name))
  print("Total successful tests: %d out of %d" % (total_successes, len(tuples)))
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Convert GLEIF register to SLING."""

import zipfile
import csv
import sling
import sling.dataset.bizreg

# Load KB.
print("Loading KB")
kb = sling.Store()
kb.load("data/e/kb/kb.sling")

n_id = kb["id"]
n_is = kb["is"]
n_isa = kb["isa"]
n_name = kb["name"]
n_instance_of = kb["P31"]
n_country_code = kb["P297"]
n_region_code = kb["P300"]
n_organization = kb["Q43229"]
n_opencorporates_id = kb["P1320"]
n_country = kb["P17"]
n_street_address = kb["P6375"]
n_postal_code = kb["P281"]
n_headquarters = kb["P159"]