Beispiel #1
0
 def __init__(self, filename, commons=None):
     """Open a record database and set up the store used alongside it.

     Args:
       filename: name of the record database, passed to
         sling.RecordDatabase.
       commons: optional store to use. When omitted, a new store is
         created, a document schema is built in it, and the store is
         frozen. When supplied, a document schema is only built if the
         store contains "document"; otherwise self.docschema is None.
     """
     self.input = sling.RecordDatabase(filename)
     self.iter = iter(self.input)
     # Identity test: "== None" would dispatch to the argument's own
     # comparison operator instead of checking for the missing default.
     if commons is None:
         self.commons = sling.Store()
         self.docschema = sling.DocumentSchema(self.commons)
         self.commons.freeze()
     else:
         self.commons = commons
         if "document" in commons:
             self.docschema = sling.DocumentSchema(commons)
         else:
             self.docschema = None
Beispiel #2
0
def shell():
    kb = load_kb("local/data/e/wiki/kb.sling")
    extractor = sling.api.FactExtractor(kb)
    matcher = FactMatcher(kb, extractor)

    parses = "local/data/e/wikicat/filtered-parses.rec"
    db = sling.RecordDatabase(parses)

    while True:
        item = raw_input("Enter item or category QID:")

        # See if a category QID was entered, if so, compute and output match
        # statistics for all its parses.
        value = db.lookup(item)
        if value is not None:
            store = sling.Store(kb)
            category = store.parse(value)
            output = matcher.for_parses(category, store, max_evidences=4)
            print "%s = %s (%d members)" % \
              (item, category.name, len(category.members))
            for idx, (parse, match) in enumerate(zip(category("parse"),
                                                     output)):
                print "%d. %s" % (idx, ' '.join(parse.signature))
                for span, span_match in zip(parse.spans, match):
                    print "  %s = (%s=%s) : %s" % \
                      (span.signature, str(list(span.pids)), span.qid, \
                       str(span_match))
                print ""
            print ""
            continue

        item = kb[item]

        pids = raw_input("Enter [comma-separated] pid(s):")
        pids = filter(None, pids.replace(' ', '').split(','))
        for pid in pids:
            assert pid in kb, pid
        pids = [kb[p] for p in pids]

        qid = raw_input("Enter qid:")
        assert qid in kb, qid
        qid = kb[qid]

        output = matcher.for_item(item, pids, qid)
        print item, "(" + item.name + ") :", \
          output[0].name, "evidence: ", output[1]
        print ""
Beispiel #3
0
    def __init__(self):
        # Initialize commons store with knowledge base.
        start = time.time()
        self.commons = sling.Store()
        self.commons.lockgc()
        self.commons.load(wikidir + "/kb.sling", snapshot=True)
        self.n_item_member = self.commons['/w/item/member']
        self.n_instance_of = self.commons['P31']
        self.n_wikimedia_category = self.commons['Q4167836']
        self.n_subject = self.commons['subject']
        self.extractor = sling.FactExtractor(self.commons)

        # Add category subject types.
        self.subjects = {}
        for subject, item in english_subject_types.iteritems():
            self.subjects[subject] = self.commons[item]

        # Add properties for subjects.
        self.subject_properties = []
        for p in subject_properties:
            self.subject_properties.append(self.commons[p])

        self.commons.freeze()
        end = time.time()
        print end - start, "secs loading commons"

        # Load phrase table.
        # TODO(ringgaard): Load language-dependent phrase table.
        start = time.time()
        self.phrasetab = sling.PhraseTable(self.commons,
                                           wikidir + "/en/phrase-table.repo")
        end = time.time()
        print end - start, "secs loading phrase table"

        # Open category member database.
        self.member_db = sling.RecordDatabase(wikidir +
                                              "/wikipedia-members.rec")
Beispiel #4
0
    def __init__(self):
        """Set up Wikidata access, record files, the knowledge base and claims.

        Reads flags.arg.test and flags.arg.input to choose the input record
        file, opens a timestamped status log for writing, loads the knowledge
        base into a frozen store, opens one document database per supported
        language, prepares the source/time claims used as references, and
        collects the set of properties that carry a single-value constraint.
        """
        self.site = pywikibot.Site("wikidata", "wikidata")
        self.repo = self.site.data_repository()

        # Timestamp of the form YYYY-MM-DD-HH-MM-SS, used in the log name.
        time_str = datetime.datetime.now().isoformat("-")[:19].replace(
            ":", "-")
        if flags.arg.test:
            record_file_name = "local/data/e/wikibot/test-" + flags.arg.input + ".rec"
            time_str = "test-" + time_str
        else:
            record_file_name = "local/data/e/wikibot/" + flags.arg.input + ".rec"
        status_file_name = "local/logs/wikibotlog-" + time_str + ".rec"
        self.record_file = sling.RecordReader(record_file_name)
        self.status_file = sling.RecordWriter(status_file_name)

        self.store = sling.Store()
        self.store.lockgc()
        print("loading kb")
        self.store.load("local/data/e/wiki/kb.sling")
        print("kb loaded")

        self.page_cat = self.store["/wp/page/category"]

        self.date_of_birth = self.store['P569']
        self.date_of_death = self.store['P570']

        self.n_item = self.store["item"]
        self.n_facts = self.store["facts"]
        self.n_provenance = self.store["provenance"]
        self.n_category = self.store["category"]
        self.n_url = self.store["url"]
        self.n_method = self.store["method"]
        self.n_status = self.store["status"]
        self.n_revision = self.store["revision"]
        self.n_skipped = self.store["skipped"]
        self.store.freeze()
        # Local store layered on top of the frozen knowledge base store.
        self.rs = sling.Store(self.store)

        # Language code -> Wikidata item id used as the P143 target below.
        self.wiki = {'fr': 'Q8447',    'en': 'Q328',    'da': 'Q181163',
                     'pt': 'Q11921',   'fi': 'Q175482', 'es': 'Q8449',
                     'pl': 'Q1551807', 'de': 'Q48183',  'nl': 'Q10000',
                     'sv': 'Q169514',  'it': 'Q11920',  'no': 'Q191769'}
        # A real list rather than dict.keys(), which is a live view on
        # Python 3 and cannot be indexed.
        self.languages = list(self.wiki)
        self.wiki_sources = {}
        for lang, wp in self.wiki.items():
            # P143 means 'imported from Wikimedia project'
            source_claim = pywikibot.Claim(self.repo, "P143")
            target = pywikibot.ItemPage(self.repo, wp)
            source_claim.setTarget(target)
            self.wiki_sources[lang] = source_claim
        self.record_db = {}
        # NOTE(review): the file name in this path had been replaced by an
        # e-mail-obfuscation placeholder ("[email protected]"); restored to the
        # sharded documents file name -- confirm the shard count.
        fname = "local/data/e/wiki/{}/documents@10.rec"
        for lang in self.languages:
            self.record_db[lang] = sling.RecordDatabase(fname.format(lang))

        # inferred from
        self.source_claim = pywikibot.Claim(self.repo, "P3452")
        # Wikimedia import URL
        self.url_source_claim = pywikibot.Claim(self.repo, "P4656")
        # imported from Wikimedia project
        self.wp_source_claim = pywikibot.Claim(self.repo, "P143")
        self.en_wp = pywikibot.ItemPage(self.repo, "Q328")
        self.wp_source_claim.setTarget(self.en_wp)

        # referenced (on)
        self.time_claim = pywikibot.Claim(self.repo, "P813")
        today = datetime.date.today()
        time_target = pywikibot.WbTime(year=today.year,
                                       month=today.month,
                                       day=today.day)
        self.time_claim.setTarget(time_target)

        self.uniq_prop = {self.date_of_birth, self.date_of_death}
        kb = self.store
        # Collect unique-valued properties.
        # They will be used to update claims in Wikidata accordingly.
        constraint_role = kb["P2302"]
        unique = kb["Q19474404"]  # single-value constraint
        for prop in kb["/w/entity"]("role"):
            for constraint_type in prop(constraint_role):
                if kb.resolve(constraint_type) == unique:
                    self.uniq_prop.add(prop)
Beispiel #5
0
    #'Q25048736', 'Q6525874', 'Q3851366', 'Q308735', 'Q2184354',
    'Q5337174',
    'Q6218080',
    'Q1606412',
    'Q7264446',
    'Q2263863',
    'Q834815',
    'Q2583807',
    'Q42887751',
    'Q57652',  # Helle Thorning-Schmidt
    'Q1636974',  # Danske Bank
    'Q186285',  # University of Copenhagen
    'Q1687170',  # Jens Christian Skou
]

articles = sling.RecordDatabase("data/e/wiki/en/[email protected]")
output = sling.RecordWriter("/tmp/chunked.rec")

for docid in documentids:
    # Read document from article database.
    store = sling.Store(commons)
    if docid.startswith("Q"):
        record = articles.lookup(docid)
        article = store.parse(record)
        document = sling.Document(article, schema=docschema)
        document.remove_annotations()
        document.update()
    else:
        document = sling.tokenize(docid, store=store, schema=docschema)

    print document.frame["title"]