def __init__(self):
    """Load the knowledge base, resolve the handles used later, and open the phrase table.

    The store is loaded from a snapshot, all property/item handles are
    looked up by id, a calendar and the English phrase table are attached,
    and only then is the store frozen.  The per-instance bookkeeping
    (`date_type` cache and `conflicts` counter) starts out empty.
    """
    store = sling.Store()
    self.kb = store
    store.lockgc()
    store.load("local/data/e/wiki/kb.sling", snapshot=True)

    # Property handles.
    self.instanceof = store['P31']
    self.has_part = store['P527']
    self.part_of = store['P361']
    self.item_category = store['/w/item/category']
    self.date_of_birth = store['P569']
    self.date_of_death = store['P570']
    self.inception = store['P571']
    self.dad = store['P576']  # dissolved, abolished or demolished

    # Item handles.
    self.wikimedia_category = store['Q4167836']
    date_type_ids = (
        'Q29964144',  # year BC
        'Q577',       # year
        'Q39911',     # decade
        'Q578',       # century
        'Q36507',     # millennium
    )
    self.date_types = [store[qid] for qid in date_type_ids]
    self.human = store['Q5']

    # Symbol handles looked up by plain name.
    self.item = store["item"]
    self.facts = store["facts"]
    self.provenance = store["provenance"]
    self.category = store["category"]
    self.method = store["method"]

    # Helpers built on top of the store; both are created before freezing.
    self.calendar = sling.Calendar(store)
    self.names = sling.PhraseTable(
        store, "local/data/e/wiki/en/phrase-table.repo")
    store.freeze()

    self.date_type = {}
    self.conflicts = 0
def init_phrase_table(base_path='../../sling/'):
    """Load the SLING knowledge base and return the `en` phrase table for it.

    Args:
        base_path: Directory that contains the `local/data/e/wiki` tree.
            Defaults to the relative path that used to be hard-coded, so
            existing zero-argument callers behave exactly as before.

    Returns:
        The `sling.PhraseTable` opened over the loaded store.  The store is
        frozen before returning.
    """
    kb = sling.Store()
    kb.load(os.path.join(base_path, 'local/data/e/wiki/kb.sling'))
    # The phrase table is attached before the store is frozen.
    names = sling.PhraseTable(
        kb, os.path.join(base_path, 'local/data/e/wiki/en/phrase-table.repo'))
    kb.freeze()
    return names
def init(self, task):
    """Set up the knowledge base, phrase table, and pre-resolved handles.

    Args:
        task: Workflow task object.  This method reads its "phrase-table"
            input and its "min_members" and "language" parameters.
    """
    kb = load_kb(task)
    self.kb = kb
    phrase_table_file = task.input("phrase-table").name
    self.names = sling.PhraseTable(kb, phrase_table_file)
    self.min_members = int(task.param("min_members"))
    self.num_parses_bins = [1, 2, 3, 5, 10, 20, 50, 100, 200]

    # Lookup some handles in advance.
    lookup = self.lookup
    self.h_language = lookup("/lang/" + task.param("language"))
    self.h_lang = lookup("lang")
    self.main_topic = lookup("P301")  # present in topical categories
    self.h_member = lookup("/w/item/member")
    self.h_instanceof = lookup('P31')
    self.h_subclassof = lookup('P279')
    self.h_category = lookup('Q4167836')
    self.h_category_contains = lookup('P4224')
    self.english = task.param("language") == "en"

    # The following kinds of categories won't be processed.
    self.uninteresting_categories = {
        lookup('Q20769287'),  # disambiguation category
        lookup('Q15407973'),  # list category
        #lookup('Q56428020'),  # template category
        lookup('Q23894233'),  # stub category
        lookup('Q24046192'),  # admin category
        lookup('Q15647814'),  # user category
        lookup('Q20010800'),  # user language category
        lookup('Q30432511'),  # meta category
        lookup('Q13331174'),  # navbox category
    }

    # These pids will not be considered as resolution for spans.
    self.pids_to_ignore = {
        self.h_instanceof,  # P31 = instance of
        lookup('P279'),     # P279 = subclass of
        lookup('P971'),     # P971 = category combines topics
        lookup('P4224'),    # P4224 = category contains
    }

    # These QIDs will not be considered as resolutions for spans.
    self.base_qids = {
        lookup('Q5'),         # human
        lookup('Q215627'),    # person
        lookup('Q17334923'),  # location
        lookup('Q811430'),    # construction
        lookup('Q43229'),     # organization
        lookup('Q2385804'),   # educational institution
        lookup('Q294163'),    # public institution
        lookup('Q15401930'),  # product
        lookup('Q12737077'),  # occupation
        lookup('Q192581'),    # job
        lookup('Q4164871'),   # position
        lookup('Q216353'),    # title
    }

    self.extractor = sling.api.FactExtractor(kb)
def __init__(self):
    """Load the knowledge base, resolve the handles used later, and open the phrase table.

    The store is loaded from a snapshot, handles are looked up by id, the
    English phrase table is attached, and the store is then frozen.  Also
    initialises the `date_type` cache, the `conflicts` counter, and the
    English month-name to month-number table.
    """
    store = sling.Store()
    self.kb = store
    store.lockgc()
    store.load("local/data/e/wiki/kb.sling", snapshot=True)

    # Property handles.
    self.instanceof = store['P31']
    self.has_part = store['P527']
    self.part_of = store['P361']
    self.subclass = store['P279']
    self.item_category = store['/w/item/category']
    self.date_of_birth = store['P569']
    self.date_of_death = store['P570']
    self.inception = store['P571']
    self.dad = store['P576']  # dissolved, abolished or demolished

    # Item handles.
    self.wikimedia_category = store['Q4167836']
    date_type_ids = (
        'Q29964144',  # year BC
        'Q577',       # year
        'Q39911',     # decade
        'Q578',       # century
        'Q36507',     # millennium
    )
    self.date_types = [store[qid] for qid in date_type_ids]
    self.human = store['Q5']
    self.business = store['Q4830453']
    self.organization = store['Q43229']

    # Symbol handles looked up by plain name.
    self.item = store["item"]
    self.facts = store["facts"]
    self.provenance = store["provenance"]
    self.category = store["category"]
    self.method = store["method"]

    # The phrase table is attached before the store is frozen.
    self.names = sling.PhraseTable(
        store, "local/data/e/wiki/en/phrase-table.repo")
    store.freeze()

    self.date_type = {}
    self.conflicts = 0

    # Month name -> 1-based month number.
    month_names = (
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    )
    self.months = {
        name: number for number, name in enumerate(month_names, 1)
    }
def pruneTriples(triple_file, output_file):
    """Keep only the triples whose two entities both have QCodes.

    ******** This function must be called in a Python env that has sling! ********

    Args:
        triple_file: Name of a file of tab-separated rows; the first four
            fields of each row are read as subject, relation, object, score.
        output_file: Name of the tab-separated file to write.

    Output:
        `output_file` receives one row (subject QCode, relation, object
        QCode, score) for every input row for which `confirmEntities`
        reports that both entities were found.  Blank input lines are
        skipped.
    """
    t1 = time.time()  # start the clock
    base_path = cfg['all']['base_path']
    kb = sling.Store()
    kb.load(base_path + "local/data/e/wiki/kb.sling")
    names = sling.PhraseTable(
        kb, base_path + "local/data/e/wiki/en/phrase-table.repo")
    kb.freeze()
    print("* Sling KB loaded in %0.3fs." % (time.time() - t1))

    verified_triples = []
    with open(triple_file, "r") as inFile:
        for triple in csv.reader(inFile, delimiter="\t"):
            if not triple:
                # csv.reader yields an empty list for a blank line; indexing
                # it would raise IndexError, so skip it.
                continue
            entity1 = triple[0]
            relation = triple[1]
            entity2 = triple[2]
            score = triple[3]
            hasBoth, qcode1, qcode2 = confirmEntities(names, entity1, entity2)
            if hasBoth:
                verified_triples.append((qcode1, relation, qcode2, score))

    # The output is written only after the whole input has been read.
    with open(output_file, "w") as outFile:
        csv.writer(outFile, delimiter='\t').writerows(verified_triples)
def __init__(self): # Initialize commons store with knowledge base. start = time.time() self.commons = sling.Store() self.commons.lockgc() self.commons.load(wikidir + "/kb.sling", snapshot=True) self.n_item_member = self.commons['/w/item/member'] self.n_instance_of = self.commons['P31'] self.n_wikimedia_category = self.commons['Q4167836'] self.n_subject = self.commons['subject'] self.extractor = sling.FactExtractor(self.commons) # Add category subject types. self.subjects = {} for subject, item in english_subject_types.iteritems(): self.subjects[subject] = self.commons[item] # Add properties for subjects. self.subject_properties = [] for p in subject_properties: self.subject_properties.append(self.commons[p]) self.commons.freeze() end = time.time() print end - start, "secs loading commons" # Load phrase table. # TODO(ringgaard): Load language-dependent phrase table. start = time.time() self.phrasetab = sling.PhraseTable(self.commons, wikidir + "/en/phrase-table.repo") end = time.time() print end - start, "secs loading phrase table" # Open category member database. self.member_db = sling.RecordDatabase(wikidir + "/wikipedia-members.rec")
# Fields of each row, by index:
#  0: Registration Authority Code
#  1: Country
#  2: Country Code
#  3: Jurisdiction (country or region)
#  4: International name of Register
#  5: Local name of Register
#  6: International name of organisation responsible for the Register
#  7: Local name of organisation responsible for the Register
#  8: Website
#  9: Date IP disclaimer
# 10: Comments
# 11: End Date

kb = sling.Store()
kb.load("data/e/kb/kb.sling")
aliases = sling.PhraseTable(kb, "data/e/kb/en/phrase-table.repo")


def resolve_name(name):
    """Return the first item the alias table yields for `name`, or None if there is none."""
    return next(iter(aliases.lookup(name)), None)


reader = csv.reader(open("data/c/lei/2019-12-05_ra-list-v1.5.csv", "r"))
next(reader)  # Discard the first row (presumably the header -- confirm).
for row in reader:
    slots = [
        ("registration_authority_code", row[0]),
        ("country_name", row[1]),
        ("country_code", row[2]),
        ("country", resolve_name(row[1])),
    ]
dictionary[key] = dictionary.get(key, 0) + delta def fact_to_text(fact): l = [] for f in fact: l.append(str(f.name)) return ": ".join(l) commons = sling.Store() commons.lockgc() commons.load(wikidir + "/kb.sling", snapshot=True) n_is = commons["is"] extractor = sling.FactExtractor(commons) phrasetab = sling.PhraseTable(commons, wikidir + "/en/phrase-table.repo") commons.freeze() class Name: def __init__(self, doc): self.doc = doc self.store = doc.frame.store() self.covered = [False] * len(doc.tokens) self.evokes = {} self.matched = set() self.skip = [] for t in self.doc.tokens: self.skip.append(t.word in stop_words) def overlaps(self, begin, end):
self.elements = [None] * (size * size) def index(self, i, j): return i * self.size + j - 1 # second index is 1-based def assign(self, i, j, span): self.elements[self.index(i, j)] = span def get(self, i, j): return self.elements[self.index(i, j)] commons = sling.Store() commons.lockgc() commons.load("data/e/wiki/kb.sling") phrasetab = sling.PhraseTable(commons, "data/e/wiki/en/phrase-table.repo") docschema = sling.DocumentSchema(commons) factex = sling.FactExtractor(commons) taxonomy = factex.taxonomy() titles = [ commons['Q4164871'], # position commons['Q12737077'], # occupation commons['Q216353'], # title ] commons.freeze() documentids = [ #'Q5945076', 'Q23883660', 'Q43287478', 'Q2147524', #'Q25048736', 'Q6525874', 'Q3851366', 'Q308735', 'Q2184354', 'Q5337174', 'Q6218080',