def data_preparation(args):
    """Run OpenNMT's ``preprocess.py`` over the 'train' and 'valid' splits.

    For each split this loads ``<data_dir>/<split>.wiki`` with ``load_kb``,
    converts ``<data_dir>/<split>.pm`` into a source/target file pair with
    ``generate_tmp_file``, and passes the resulting paths to
    ``preprocess.py`` as ``-<split>_src`` / ``-<split>_tgt``.

    The files returned by ``generate_tmp_file`` are removed with
    ``unlink_tmp_file`` on every exit path, including when a later split or
    the command construction raises.

    Args:
        args: Parsed options. Reads ``data_dir``, ``data_out`` and
            ``share_vocab``.
    """
    splits = ['train', 'valid']
    # Every file handed back by generate_tmp_file, in creation order, so the
    # cleanup below can remove exactly what was created.
    created_files = []
    try:
        option_parts = []
        for split in splits:
            kb = load_kb(
                os.path.join(args.data_dir, "{}.wiki".format(split)),
                verbose=False)
            src_path, tgt_path = generate_tmp_file(
                os.path.join(args.data_dir, "{}.pm".format(split)),
                kb, verbose=False)
            created_files.extend((src_path, tgt_path))
            option_parts.append("-{}_src {} -{}_tgt {} ".format(
                split, src_path, split, tgt_path))

        script = ("python {}/preprocess.py {} -save_data {} "
                  "-src_seq_length 5000").format(
                      OpenNMT_dir, "".join(option_parts), args.data_out)
        if args.share_vocab:
            script += " -share_vocab"
        print(script)
        # NOTE: the command line is interpreted by the shell and the paths are
        # interpolated unquoted, so paths containing spaces or shell
        # metacharacters will not be passed through intact. The exit status
        # is not checked.
        os.system(script)
    finally:
        for path in created_files:
            unlink_tmp_file(path)
def init(self, task):
    """Load the knowledge base and build the taxonomy used by this task.

    Sets ``self.kb`` (from ``load_kb(task)``), ``self.extractor`` (a fact
    extractor over that KB), ``self.h_name`` (the KB's 'name' handle) and
    ``self.taxonomy`` (built by the extractor from the ordered type list
    below).

    Args:
        task: The task object; it is passed straight to ``load_kb``.
    """
    # Hand-ordered list of type QIDs; the extractor builds the taxonomy from
    # this custom ordering.
    ordered_type_qids = [
        'Q215627',    # person
        'Q95074',     # fictional character
        'Q729',       # animal
        'Q4164871',   # position
        'Q12737077',  # occupation
        'Q216353',    # title
        'Q618779',    # award
        'Q27020041',  # sports season
        'Q4438121',   # sports organization
        'Q215380',    # band
        'Q2385804',   # educational institution
        'Q783794',    # company
        'Q41710',     # ethnic group (NEW TYPE)
        'Q6256',      # country (NEW TYPE)
        'Q17334923',  # location
        'Q43229',     # organization
        'Q431289',    # brand
        'Q2188189',   # musical work
        'Q571',       # book
        'Q732577',    # publication
        'Q11424',     # film
        'Q15416',     # television program
        'Q12136',     # disease
        'Q16521',     # taxon
        'Q5058355',   # cellular component
        'Q7187',      # gene
        'Q11173',     # chemical compound
        'Q811430',    # construction
        'Q618123',    # geographical object
        'Q1656682',   # event
        'Q101352',    # family name
        'Q202444',    # given name
        'Q577',       # year
        'Q186081',    # time interval
        'Q11563',     # number
        'Q17376908',  # languoid
        'Q1047113',   # specialty (REORDERED)
        'Q968159',    # art movement (NEW TYPE)
        'Q483394',    # genre (REORDERED)
        'Q47574',     # unit of measurement (REPLACES unit)
        'Q39875001',  # measure
        'Q3695082',   # sign
        'Q2996394',   # biological process
        'Q11410',     # game
        'Q7397',      # software
        'Q838948',    # work of art
        'Q47461344',  # written work
        'Q28877',     # goods
        'Q15401930',  # product
        'Q121769',    # reference
        'Q1190554',   # occurrence
    ]

    self.kb = load_kb(task)
    self.extractor = sling.api.FactExtractor(self.kb)
    self.h_name = self.kb['name']
    self.taxonomy = self.extractor.taxonomy(ordered_type_qids)
def init(self, task):
    """Load resources and pre-resolve the handles this task works with.

    Reads from ``task``: the "phrase-table" input (for the phrase table) and
    the "min_members" and "language" parameters. Populates the knowledge
    base, phrase table, a set of frequently used handles, three exclusion
    sets, and a fact extractor on ``self``.

    Args:
        task: The task object supplying inputs and parameters.
    """
    self.kb = load_kb(task)
    self.names = sling.PhraseTable(self.kb, task.input("phrase-table").name)
    self.min_members = int(task.param("min_members"))
    self.num_parses_bins = [1, 2, 3, 5, 10, 20, 50, 100, 200]

    # Resolve commonly used handles once, up front.
    lookup = self.lookup
    self.h_language = lookup("/lang/" + task.param("language"))
    self.h_lang = lookup("lang")
    self.main_topic = lookup("P301")  # present in topical categories
    self.h_member = lookup("/w/item/member")
    self.h_instanceof = lookup('P31')
    self.h_subclassof = lookup('P279')
    self.h_category = lookup('Q4167836')
    self.h_category_contains = lookup('P4224')
    self.english = task.param("language") == "en"

    # Categories of these kinds are skipped entirely.
    self.uninteresting_categories = {
        lookup('Q20769287'),  # disambiguation category
        lookup('Q15407973'),  # list category
        # lookup('Q56428020'),  # template category
        lookup('Q23894233'),  # stub category
        lookup('Q24046192'),  # admin category
        lookup('Q15647814'),  # user category
        lookup('Q20010800'),  # user language category
        lookup('Q30432511'),  # meta category
        lookup('Q13331174'),  # navbox category
    }

    # Properties that are never accepted as the resolution of a span.
    self.pids_to_ignore = {
        self.h_instanceof,  # P31 = instance of
        lookup('P279'),     # P279 = subclass of
        lookup('P971'),     # P971 = category combines topics
        lookup('P4224'),    # P4224 = category contains
    }

    # Items that are never accepted as the resolution of a span.
    self.base_qids = {
        lookup('Q5'),         # human
        lookup('Q215627'),    # person
        lookup('Q17334923'),  # location
        lookup('Q811430'),    # construction
        lookup('Q43229'),     # organization
        lookup('Q2385804'),   # educational institution
        lookup('Q294163'),    # public institution
        lookup('Q15401930'),  # product
        lookup('Q12737077'),  # occupation
        lookup('Q192581'),    # job
        lookup('Q4164871'),   # position
        lookup('Q216353'),    # title
    }

    self.extractor = sling.api.FactExtractor(self.kb)
def generate(args):
    """Run OpenNMT's ``translate.py`` on one dataset.

    Loads ``<data_dir>/<dataset>.wiki`` with ``load_kb``, converts
    ``<data_dir>/<dataset>.pm`` into a source/target file pair with
    ``generate_tmp_file``, and runs ``translate.py`` on the source file,
    writing its output to ``args.out``.

    The files returned by ``generate_tmp_file`` are removed with
    ``unlink_tmp_file`` on every exit path, including when building the
    command raises.

    Args:
        args: Parsed options. Reads ``data_dir``, ``dataset``, ``model`` and
            ``out``.
    """
    wiki_kb = load_kb(
        os.path.join(args.data_dir, "{}.wiki".format(args.dataset)))
    src, tgt = generate_tmp_file(
        os.path.join(args.data_dir, "{}.pm".format(args.dataset)), wiki_kb)
    try:
        script = ("python {}/translate.py -model {} -src {} -output {} "
                  "-gpu 0 -replace_unk -verbose").format(
                      OpenNMT_dir, args.model, src, args.out)
        # NOTE: the command line is interpreted by the shell and the paths are
        # interpolated unquoted; the exit status is not checked.
        os.system(script)
    finally:
        # Only the source file is passed to translate.py, but both files of
        # the pair are removed.
        unlink_tmp_file(src)
        unlink_tmp_file(tgt)
def shell(): kb = load_kb("local/data/e/wiki/kb.sling") extractor = sling.api.FactExtractor(kb) matcher = FactMatcher(kb, extractor) parses = "local/data/e/wikicat/filtered-parses.rec" db = sling.RecordDatabase(parses) while True: item = raw_input("Enter item or category QID:") # See if a category QID was entered, if so, compute and output match # statistics for all its parses. value = db.lookup(item) if value is not None: store = sling.Store(kb) category = store.parse(value) output = matcher.for_parses(category, store, max_evidences=4) print "%s = %s (%d members)" % \ (item, category.name, len(category.members)) for idx, (parse, match) in enumerate(zip(category("parse"), output)): print "%d. %s" % (idx, ' '.join(parse.signature)) for span, span_match in zip(parse.spans, match): print " %s = (%s=%s) : %s" % \ (span.signature, str(list(span.pids)), span.qid, \ str(span_match)) print "" print "" continue item = kb[item] pids = raw_input("Enter [comma-separated] pid(s):") pids = filter(None, pids.replace(' ', '').split(',')) for pid in pids: assert pid in kb, pid pids = [kb[p] for p in pids] qid = raw_input("Enter qid:") assert qid in kb, qid qid = kb[qid] output = matcher.for_item(item, pids, qid) print item, "(" + item.name + ") :", \ output[0].name, "evidence: ", output[1] print ""
def test_fact_matcher(): RED = "\033[1;31m" GREEN = "\033[0;32m" RESET = "\033[0;0m" def error(entry, message): sys.stdout.write(RED) print "[FAILED] ", sys.stdout.write(RESET) print entry, ":", message def success(entry): sys.stdout.write(GREEN) print "[SUCCESS] ", sys.stdout.write(RESET) print entry kb = load_kb("local/data/e/wiki/kb.sling") extractor = sling.api.FactExtractor(kb) matcher = FactMatcher(kb, extractor) # Test cases. tuples = [] # Adds the given test case and its reverse test case too (if possible). def add(pid, existing, proposed, match_type): tuples.append((pid, existing, proposed, match_type)) # Add the reverse case. if match_type != FactMatchType.NEW and existing != proposed: rev_type = match_type if match_type == FactMatchType.SUBSUMED_BY_EXISTING: rev_type = FactMatchType.SUBSUMES_EXISTING if match_type == FactMatchType.SUBSUMES_EXISTING: rev_type = FactMatchType.SUBSUMED_BY_EXISTING tuples.append((pid, proposed, existing, rev_type)) # Place of birth, Kapiolani Medical Center, Honolulu. add("P19", "Q6366688", "Q18094", FactMatchType.SUBSUMES_EXISTING) # Place of birth, Kapiolani Medical Center, US. add("P19", "Q6366688", "Q30", FactMatchType.SUBSUMES_EXISTING) # Place of birth, <no existing value>, US. add("P19", "", "Q30", FactMatchType.NEW) # Place of birth, US, US. add("P19", "Q30", "Q30", FactMatchType.EXACT) # Place of birth, Honolulu, Chicago. add("P19", "Q18094", "Q1297", FactMatchType.CONFLICT) # Children, Malia Obama, Sasha Obama. add("P40", "Q15070044", "Q15070048", FactMatchType.ADDITIONAL) # Date-valued properties: int values. 
# Note: P585 = point in time (unique valued), P580 = start time (non unique) add("P585", 1961, 19610804, FactMatchType.SUBSUMED_BY_EXISTING) add("P585", 1961, 196108, FactMatchType.SUBSUMED_BY_EXISTING) add("P585", 1961, 1961, FactMatchType.EXACT) add("P585", 1961, 196, FactMatchType.SUBSUMES_EXISTING) # 196 = 196X (decade) add("P585", 1961, 19, FactMatchType.SUBSUMES_EXISTING) # 19 = 19XX (century) add("P585", 1961, 1, FactMatchType.SUBSUMES_EXISTING) # 1 = 1XXX (millenium) add("P585", 1962, 19610804, FactMatchType.CONFLICT) add("P585", 1962, 196108, FactMatchType.CONFLICT) add("P585", 1962, 1961, FactMatchType.CONFLICT) add("P580", 1961, 1962, FactMatchType.ADDITIONAL) # Date-valued properties: string values. add("P585", "1961", "1961-08-04", FactMatchType.SUBSUMED_BY_EXISTING) add("P585", "1961", "1961-08", FactMatchType.SUBSUMED_BY_EXISTING) add("P585", "1961", "1961", FactMatchType.EXACT) add("P585", "1961", "196*", FactMatchType.SUBSUMES_EXISTING) # decade add("P585", "1961", "19**", FactMatchType.SUBSUMES_EXISTING) # century add("P585", "1961", "1***", FactMatchType.SUBSUMES_EXISTING) # millenium add("P585", "1962", "1961-08-04", FactMatchType.CONFLICT) add("P585", "1962", "1961-08", FactMatchType.CONFLICT) add("P585", "1962", "1961", FactMatchType.CONFLICT) add("P580", "1961", "1962-08", FactMatchType.ADDITIONAL) # Date-valued properties: QID values. These are only available for years, # decades, and millenia. 
q1961 = "Q3696" q1962 = "Q2764" q196x = "Q35724" q197x = "Q35014" q19xx = "Q6927" q1xxx = "Q25860" add("P585", q196x, q1961, FactMatchType.SUBSUMED_BY_EXISTING) add("P585", q1xxx, q1961, FactMatchType.SUBSUMED_BY_EXISTING) add("P585", q1961, q1961, FactMatchType.EXACT) add("P585", q1961, q1962, FactMatchType.CONFLICT) add("P585", q196x, q197x, FactMatchType.CONFLICT) add("P585", q19xx, q197x, FactMatchType.SUBSUMED_BY_EXISTING) add("P580", q1961, q197x, FactMatchType.ADDITIONAL) # Date-valued properties: proposed and existing values have different types. add("P585", q1961, 1961, FactMatchType.EXACT) add("P585", q196x, 196, FactMatchType.EXACT) add("P585", q19xx, 19, FactMatchType.EXACT) add("P585", q1xxx, 1, FactMatchType.EXACT) add("P585", q196x, 1961, FactMatchType.SUBSUMED_BY_EXISTING) add("P585", q1961, 19610804, FactMatchType.SUBSUMED_BY_EXISTING) add("P585", q1961, 19, FactMatchType.SUBSUMES_EXISTING) add("P585", q1961, "1961", FactMatchType.EXACT) add("P585", q196x, "196*", FactMatchType.EXACT) add("P585", q19xx, "19**", FactMatchType.EXACT) add("P585", q196x, "1961", FactMatchType.SUBSUMED_BY_EXISTING) add("P585", q196x, "1961-08-04", FactMatchType.SUBSUMED_BY_EXISTING) add("P585", q1961, "196*", FactMatchType.SUBSUMES_EXISTING) add("P585", "", "196*", FactMatchType.NEW) add("P585", q1961, "1962", FactMatchType.CONFLICT) add("P585", 1963, "1962", FactMatchType.CONFLICT) add("P580", q1961, "1962", FactMatchType.ADDITIONAL) add("P580", 1963, "1962", FactMatchType.ADDITIONAL) # Genre, melodrama, drama. add("P136", "Q191489", "Q21010853", FactMatchType.SUBSUMES_EXISTING) # Genre, trip-hop, electronic music. add("P136", "Q205560", "Q9778", FactMatchType.SUBSUMES_EXISTING) # Genre, rock and roll, electronic music. add("P136", "Q7749", "Q9778", FactMatchType.ADDITIONAL) # Educated at, Harvard Law School, Harvard University. add("P69", "Q49122", "Q13371", FactMatchType.SUBSUMES_EXISTING) # Educated at, Harvard Law School, Yale University. 
add("P69", "Q49122", "Q49112", FactMatchType.ADDITIONAL) # Employer, Airbus, Airbus SE. add("P108", "Q67", "Q2311", FactMatchType.SUBSUMES_EXISTING) # Employer, Airbus, Boeing. add("P108", "Q67", "Q66", FactMatchType.ADDITIONAL) # Occupation, sports cyclist, cyclist. add("P106", "Q2309784", "Q2125610", FactMatchType.SUBSUMES_EXISTING) # Occupation, sports cyclist, cricketer. add("P106", "Q2309784", "Q12299841", FactMatchType.ADDITIONAL) store = sling.Store(kb) total_successes = 0 for entry in tuples: pid, existing, proposed, expected = entry if pid not in kb: error(entry, "%s not in KB" % pid) continue pid = kb[pid] if isinstance(existing, str) and existing != "" and existing in kb: existing = kb[existing] if isinstance(proposed, str) and proposed in kb: proposed = kb[proposed] if existing == "": existing = [] else: existing = [existing] actual = matcher.match_type(store, pid, existing, proposed) if actual == expected: success(entry) total_successes += 1 else: error(entry, "Got %s, but expected %s" % (actual.name, expected.name)) print "Total successful tests: %d out of %d" % (total_successes, len(tuples))
def init(self, task):
    """Load the knowledge base and create the fact matcher for this task.

    Sets ``self.kb``, ``self.extractor`` and ``self.matcher``.

    Args:
        task: The task object; it is passed straight to ``load_kb``.
    """
    self.kb = kb = load_kb(task)
    # NOTE(review): the extractor is reached as `sling.FactExtractor` here,
    # not via `sling.api` -- confirm that name is exported by the sling
    # package in use.
    self.extractor = extractor = sling.FactExtractor(kb)
    self.matcher = FactMatcher(kb, extractor)