def __init__(self, filename, commons=None):
    """Open a record database and set up the commons store and schema.

    Args:
      filename: File name handed to sling.RecordDatabase.
      commons: Optional existing store to use as commons. When omitted, a
        fresh store is created, a document schema is built in it, and the
        store is then frozen. When supplied, the store is used as given and
        a document schema is only built if the store contains "document";
        otherwise self.docschema is left as None.
    """
    self.input = sling.RecordDatabase(filename)
    self.iter = iter(self.input)

    # Identity test: "== None" would go through whatever equality the store
    # object defines, whereas the intent is only "was an argument passed?".
    if commons is None:
        self.commons = sling.Store()
        # The schema has to be created before the store is frozen.
        self.docschema = sling.DocumentSchema(self.commons)
        self.commons.freeze()
    else:
        self.commons = commons
        if "document" in commons:
            self.docschema = sling.DocumentSchema(commons)
        else:
            self.docschema = None
def shell(): kb = load_kb("local/data/e/wiki/kb.sling") extractor = sling.api.FactExtractor(kb) matcher = FactMatcher(kb, extractor) parses = "local/data/e/wikicat/filtered-parses.rec" db = sling.RecordDatabase(parses) while True: item = raw_input("Enter item or category QID:") # See if a category QID was entered, if so, compute and output match # statistics for all its parses. value = db.lookup(item) if value is not None: store = sling.Store(kb) category = store.parse(value) output = matcher.for_parses(category, store, max_evidences=4) print "%s = %s (%d members)" % \ (item, category.name, len(category.members)) for idx, (parse, match) in enumerate(zip(category("parse"), output)): print "%d. %s" % (idx, ' '.join(parse.signature)) for span, span_match in zip(parse.spans, match): print " %s = (%s=%s) : %s" % \ (span.signature, str(list(span.pids)), span.qid, \ str(span_match)) print "" print "" continue item = kb[item] pids = raw_input("Enter [comma-separated] pid(s):") pids = filter(None, pids.replace(' ', '').split(',')) for pid in pids: assert pid in kb, pid pids = [kb[p] for p in pids] qid = raw_input("Enter qid:") assert qid in kb, qid qid = kb[qid] output = matcher.for_item(item, pids, qid) print item, "(" + item.name + ") :", \ output[0].name, "evidence: ", output[1] print ""
def __init__(self): # Initialize commons store with knowledge base. start = time.time() self.commons = sling.Store() self.commons.lockgc() self.commons.load(wikidir + "/kb.sling", snapshot=True) self.n_item_member = self.commons['/w/item/member'] self.n_instance_of = self.commons['P31'] self.n_wikimedia_category = self.commons['Q4167836'] self.n_subject = self.commons['subject'] self.extractor = sling.FactExtractor(self.commons) # Add category subject types. self.subjects = {} for subject, item in english_subject_types.iteritems(): self.subjects[subject] = self.commons[item] # Add properties for subjects. self.subject_properties = [] for p in subject_properties: self.subject_properties.append(self.commons[p]) self.commons.freeze() end = time.time() print end - start, "secs loading commons" # Load phrase table. # TODO(ringgaard): Load language-dependent phrase table. start = time.time() self.phrasetab = sling.PhraseTable(self.commons, wikidir + "/en/phrase-table.repo") end = time.time() print end - start, "secs loading phrase table" # Open category member database. self.member_db = sling.RecordDatabase(wikidir + "/wikipedia-members.rec")
def __init__(self):
    """Set up the Wikidata connection, stores, record files and claims.

    Connects to the Wikidata site, opens the input record file and a
    timestamped status log (both named with a "test-" marker when
    flags.arg.test is set), loads the knowledge base into a store that is
    frozen after the needed symbols are resolved, prepares the source and
    reference claims, opens one record database per language, and collects
    the properties that carry the single-value constraint.
    """
    self.site = pywikibot.Site("wikidata", "wikidata")
    self.repo = self.site.data_repository()

    # Second-resolution timestamp with ":" replaced by "-".
    stamp = datetime.datetime.now().isoformat("-")[:19].replace(":", "-")
    if flags.arg.test:
        records_path = ("local/data/e/wikibot/test-" + flags.arg.input +
                        ".rec")
        stamp = "test-" + stamp
    else:
        records_path = "local/data/e/wikibot/" + flags.arg.input + ".rec"
    status_path = "local/logs/wikibotlog-" + stamp + ".rec"
    self.record_file = sling.RecordReader(records_path)
    self.status_file = sling.RecordWriter(status_path)

    # Knowledge base store; symbols are resolved before it is frozen.
    store = sling.Store()
    self.store = store
    store.lockgc()
    print("loading kb")
    store.load("local/data/e/wiki/kb.sling")
    print("kb loaded")

    self.page_cat = store["/wp/page/category"]
    self.date_of_birth = store["P569"]
    self.date_of_death = store["P570"]
    self.n_item = store["item"]
    self.n_facts = store["facts"]
    self.n_provenance = store["provenance"]
    self.n_category = store["category"]
    self.n_url = store["url"]
    self.n_method = store["method"]
    self.n_status = store["status"]
    self.n_revision = store["revision"]
    self.n_skipped = store["skipped"]
    store.freeze()

    # Local store layered on top of the frozen knowledge base store.
    self.rs = sling.Store(store)

    # Language code -> id of the item used as the claim target below.
    self.wiki = {
        'fr': 'Q8447', 'en': 'Q328', 'da': 'Q181163',
        'pt': 'Q11921', 'fi': 'Q175482', 'es': 'Q8449',
        'pl': 'Q1551807', 'de': 'Q48183', 'nl': 'Q10000',
        'sv': 'Q169514', 'it': 'Q11920', 'no': 'Q191769',
    }
    self.languages = self.wiki.keys()

    # One claim per language; P143 means 'imported from Wikimedia project'.
    self.wiki_sources = {}
    for lang, wiki_id in self.wiki.items():
        claim = pywikibot.Claim(self.repo, "P143")
        claim.setTarget(pywikibot.ItemPage(self.repo, wiki_id))
        self.wiki_sources[lang] = claim

    # One record database per language.
    # NOTE(review): "[email protected]" looks like an e-mail obfuscation
    # artifact rather than a real file name -- confirm the intended path.
    pattern = "local/data/e/wiki/{}/[email protected]"
    self.record_db = {
        lang: sling.RecordDatabase(pattern.format(lang))
        for lang in self.languages
    }

    # inferred from
    self.source_claim = pywikibot.Claim(self.repo, "P3452")
    # Wikimedia import URL
    self.url_source_claim = pywikibot.Claim(self.repo, "P4656")
    # imported from Wikimedia project
    self.wp_source_claim = pywikibot.Claim(self.repo, "P143")
    self.en_wp = pywikibot.ItemPage(self.repo, "Q328")
    self.wp_source_claim.setTarget(self.en_wp)

    # referenced (on), targeted at today's date
    self.time_claim = pywikibot.Claim(self.repo, "P813")
    now = datetime.date.today()
    self.time_claim.setTarget(
        pywikibot.WbTime(year=now.year, month=now.month, day=now.day))

    # Unique-valued properties, seeded with the two date properties and
    # extended with every property whose P2302 values resolve to the
    # single-value constraint (Q19474404). They will be used to update
    # claims in Wikidata accordingly.
    self.uniq_prop = {self.date_of_birth, self.date_of_death}
    constraint_role = store["P2302"]
    single_value = store["Q19474404"]
    self.uniq_prop.update(
        prop
        for prop in store["/w/entity"]("role")
        for constraint in prop(constraint_role)
        if store.resolve(constraint) == single_value)
#'Q25048736', 'Q6525874', 'Q3851366', 'Q308735', 'Q2184354', 'Q5337174', 'Q6218080', 'Q1606412', 'Q7264446', 'Q2263863', 'Q834815', 'Q2583807', 'Q42887751', 'Q57652', # Helle Thorning-Schmidt 'Q1636974', # Danske Bank 'Q186285', # University of Copenhagen 'Q1687170', # Jens Christian Skou ] articles = sling.RecordDatabase("data/e/wiki/en/[email protected]") output = sling.RecordWriter("/tmp/chunked.rec") for docid in documentids: # Read document from article database. store = sling.Store(commons) if docid.startswith("Q"): record = articles.lookup(docid) article = store.parse(record) document = sling.Document(article, schema=docschema) document.remove_annotations() document.update() else: document = sling.tokenize(docid, store=store, schema=docschema) print document.frame["title"]