def find_inceptions(self, inc_cats):
  self.out_file = "data/e/wikibot/inc-dates.rec"
  record_file = sling.RecordWriter(self.out_file)
  records = 0
  store = sling.Store(self.kb)
  for item in self.kb:
    if self.wikimedia_category in item(self.instanceof): continue
    if self.human in item(self.instanceof): continue
    if not self.is_org(item): continue
    name = item.name
    if name is not None and name.startswith("Category:"): continue
    if item[self.inception] is not None: continue

    # Collect all the item's inception categories in cat_dates.
    cat_dates = []
    for cat in item(self.item_category):
      cat_inc_date = inc_cats.get(cat)
      if cat_inc_date is None: continue
      cat_dates.append((cat, cat_inc_date))
    if not cat_dates: continue  # no inception categories found for item

    msd = self.most_specific_date(cat_dates)
    if msd is None: continue
    (inc_cat, inc_date) = msd
    records += 1
    facts = store.frame({self.inception: sling.Date(inc_date).value()})
    provenance = store.frame({
      self.category: inc_cat,
      self.method: "Member of an inception category, '" + inc_cat.name + "'"
    })
    fact = store.frame({
      self.item: item,
      self.facts: facts,
      self.provenance: provenance
    })
    record_file.write(item.id, fact.data(binary=True))
  record_file.close()
  print(records, "inception date records written to file:", self.out_file)
  print(self.conflicts, "conflicts encountered")
def iter_mentions(self,
                  wid_set: Set[str] = None,
                  only_entity: bool = False,
                  split_by: str = None
                  ) -> Iterable[Tuple[str, List[str],
                                      List[Tuple[str, int, int]]]]:
  assert split_by in {'sentence', None}, 'unsupported split_by'
  # Sentence breaks have token break level 3.
  split_by = {'sentence': 3, None: None}[split_by]
  for n, (doc_wid, doc_raw) in enumerate(self.corpus.input):
    doc_wid = str(doc_wid, 'utf-8')
    if wid_set is not None and doc_wid not in wid_set:
      continue
    store = sling.Store(self.commons)
    frame = store.parse(doc_raw)
    document = sling.Document(frame, store, self.docschema)
    sorted_mentions = sorted(document.mentions, key=lambda m: m.begin)
    tokens = [t.word for t in document.tokens]
    split_start = [0] + [
        i for i, t in enumerate(document.tokens) if t.brk == split_by
    ]
    split_ind = 0
    mentions: List[Tuple[str, int, int]] = []
    for mention in sorted_mentions:
      # Emit all splits that end before this mention starts.
      while (len(split_start) > split_ind + 1 and
             mention.begin >= split_start[split_ind + 1]):
        if len(mentions) > 0:
          yield (doc_wid,
                 tokens[split_start[split_ind]:split_start[split_ind + 1]],
                 mentions)
          mentions = []
        split_ind += 1
      if (len(split_start) > split_ind + 1 and
          mention.end > split_start[split_ind + 1]):
        # Skip mentions crossing the split boundary.
        continue
      linked_entity = self.get_linked_entity(mention)
      if only_entity and (type(linked_entity) is not str
                          or not linked_entity.startswith('Q')):
        continue
      mentions.append((linked_entity,
                       mention.begin - split_start[split_ind],
                       mention.end - split_start[split_ind]))
    # Flush the last split of the document.
    if len(mentions) > 0:
      yield (doc_wid, tokens[split_start[split_ind]:], mentions)
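A minimal consumption sketch for iter_mentions, assuming `loader` is an instance of the surrounding class; the WIDs are illustrative. With split_by='sentence', mention offsets are relative to the start of each yielded sentence.

for wid, sent_tokens, sent_mentions in loader.iter_mentions(
    wid_set={'Q76', 'Q937'}, only_entity=True, split_by='sentence'):
  for entity_qid, begin, end in sent_mentions:
    print(wid, entity_qid, ' '.join(sent_tokens[begin:end]))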
_kb_cache = {}  # filename -> loaded, frozen KB store

def load_kb(task):
  if type(task) is str:
    filename = task  # assume filename
  else:
    filename = task.input("kb").name
  if filename in _kb_cache:
    log.info("Retrieving cached KB")
    return _kb_cache[filename]
  else:
    kb = sling.Store()
    kb.load(filename)
    log.info("Knowledge base read")
    kb.lockgc()
    kb.freeze()
    kb.unlockgc()
    log.info("Knowledge base frozen")
    _kb_cache[filename] = kb
    return kb
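A quick calling sketch: load_kb accepts either a filename or a workflow task with a "kb" input, and repeated calls with the same filename are served from the module-level cache. The QID lookup is illustrative.

kb = load_kb("local/data/e/wiki/kb.sling")
print(kb["Q30"].name)                                # look up an item by id
assert load_kb("local/data/e/wiki/kb.sling") is kb   # second call hits _kb_cache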
def get_media_files():
  # Load knowledge base.
  kb = sling.Store()
  kb.load(flags.arg.kb)

  n_media = kb["/w/media"]
  n_role = kb["role"]
  n_target = kb["target"]
  p_media = kb["media"]

  # Find all properties for Wikimedia Commons files.
  imageprops = set()
  for name, prop in kb["/w/entity"]:
    if name != n_role: continue
    if prop[n_target] == n_media:
      imageprops.add(prop)

  # Find media files for all items.
  media = []
  for item in kb:
    for n, v in item:
      if n in imageprops:
        # Add Wikimedia Commons url.
        v = kb.resolve(v)
        if type(v) == str:
          fn = v.replace(' ', '_')
          md5 = md5hash(fn)
          fn = fn.replace("?", "%3F")
          fn = fn.replace("+", "%2B")
          fn = fn.replace("&", "%26")
          url = "%s/%s/%s/%s" % (commons_base_url, md5[0], md5[0:2], fn)
          media.append(url)
        else:
          print("Bad media file name:", item.id, v)
      elif n == p_media:
        # Add media url.
        v = kb.resolve(v)
        if type(v) == str:
          media.append(v)
        else:
          print("Bad media url:", item.id, v)
  return media
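A hedged invocation sketch: flags.arg.kb must point at a kb.sling store before get_media_files() is called, and the output filename is illustrative.

if __name__ == "__main__":
  flags.parse()
  with open("media-urls.txt", "w") as f:
    for url in get_media_files():
      f.write(url + "\n")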
def read(self, parses_filename):
  reader = sling.RecordReader(parses_filename)
  self.category_name_to_qid = {}                      # category name -> qid
  self.category_frame = {}                            # category qid -> frame
  self.full_signature_to_parse = defaultdict(list)    # signature -> parse
  self.coarse_signature_to_parse = defaultdict(list)  # signature -> parse
  store = sling.Store()
  for index, (qid, value) in enumerate(reader):
    if index > 0 and index % 20000 == 0:
      log.info("%d categories read" % index)
    frame = store.parse(value)
    self.category_name_to_qid[frame.name] = qid
    self.category_frame[qid] = frame
    for parse in frame("parse"):
      element = (qid, frame, parse)
      full_signature = util.full_parse_signature(parse)
      self.full_signature_to_parse[full_signature].append(element)
      coarse_signature = util.coarse_parse_signature(parse)
      self.coarse_signature_to_parse[coarse_signature].append(element)
def __init__(self, body=None, store=None, schema=None):
  if body is not None: store = body.store()
  if store is None: store = sling.Store()
  if schema is None: schema = MeasureSchema(store)
  if body is None: body = schema.n_earth
  self.body = body
  self.schema = schema

  # Determine radius of globe and convert to metres.
  radius = self.body[self.schema.radius]
  if radius is not None:
    self.radius = Quantity(radius.resolve(), schema).si().amount
  else:
    diameter = body[schema.diameter]
    if diameter is not None:
      self.radius = Quantity(diameter.resolve(), schema).si().amount / 2
    else:
      self.radius = 6371000  # radius of Earth in metres
def for_parses(self, category, store=None):
  if store is None:
    store = sling.Store(self.kb)
  items = category.members
  output = []  # ith entry = match stats for ith parse
  cache = {}   # (pid, qid) -> match stats
  for parse in category("parse"):
    parse_stats = []
    for span in parse.spans:
      key = (span.pids, span.qid)
      stats = None
      if key in cache:
        stats = cache[key]
      else:
        stats = self.for_items(items, span.pids, span.qid, store)
        cache[key] = stats
      parse_stats.append(stats)
    output.append(parse_stats)
  return output
def run(self, task):
  self.init(task)
  reader = sling.RecordReader(task.input("parses").name)
  writer = sling.RecordWriter(task.output("output").name)
  for key, value in reader:
    store = sling.Store(self.kb)
    category = store.parse(value)
    matches = self.matcher.for_parses(category, store, max_evidences=-1)
    frame_cache = {}  # (pid, qid) -> frame containing their match statistics
    for parse, parse_match in zip(category("parse"), matches):
      for span, span_match in zip(parse.spans, parse_match):
        span_key = (span.pids, span.qid)
        if span_key not in frame_cache:
          match_frame = span_match.as_frame(store)
          frame_cache[span_key] = match_frame
        span["fact_matches"] = frame_cache[span_key]
    writer.write(key, category.data(binary=True))
    task.increment("fact-matcher/categories-processed")
  reader.close()
  writer.close()
def load(record: str,
         load_tokens: bool = True,
         load_mentions: bool = True
         ) -> Iterable[Tuple[sling.nlp.document.Document, Tuple[int, str, str]]]:
  """Load documents from a .rec file.

  Warning: this may take a good amount of RAM (each *.rec file is 5.3GB).
  """
  for k, rec in sling.RecordReader(record):
    store = sling.Store(commons)
    # Parse record into a frame.
    doc_frame = store.parse(rec)
    # Instantiate a document.
    # parsed_doc = sling.Document(doc_frame, store, DOCSCHEMA)
    parsed_doc = MyDocument(doc_frame, store, DOCSCHEMA,
                            load_tokens=load_tokens,
                            load_mentions=load_mentions)
    metadata = get_metadata(doc_frame)
    yield parsed_doc, metadata
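A consumption sketch for the generator above. The shard path is illustrative, and reading the metadata tuple as (page id, WID, title) is a guess from the Tuple[int, str, str] annotation; check get_metadata for the actual layout.

for doc, (page_id, wid, title) in load("data/documents-00000-of-00010.rec",
                                       load_mentions=False):
  print(wid, title, len(doc.tokens))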
def update_item(qid):
  # Fetch item revision from Wikidata.
  url = "%s?id=%s&format=json" % (flags.arg.wiki_fetch_url, qid)
  reply = wdsession.get(url)

  # Convert item to SLING format.
  store = sling.Store(commons)
  item, revision = wikiconv.convert_wikidata(store, reply.text)

  # Coalesce strings.
  store.coalesce(flags.arg.string_buckets)

  # Save item in database.
  print(qid, revision)
  reply = dbsession.put(flags.arg.dburl + "/" + qid,
                        data=item.data(binary=True),
                        headers={
                          "Version": str(revision),
                          "Mode": "ordered",
                        })
  reply.raise_for_status()
def find_births(self, birth_cats):
  self.out_file = "local/data/e/wikibot/birth-dates.rec"
  record_file = sling.RecordWriter(self.out_file)
  records = 0
  for item in self.kb:
    if self.human not in item(self.instanceof): continue
    if item[self.date_of_birth] is not None: continue

    # Collect all the item's birth categories in cat_dates.
    cat_dates = []
    for cat in item(self.item_category):
      cat_birth_date = birth_cats.get(cat)
      if cat_birth_date is None: continue
      cat_dates.append((cat, cat_birth_date))
    if not cat_dates: continue  # no birth categories found for item

    msd = self.most_specific_date(cat_dates)
    if msd is None: continue
    (birth_cat, birth_date) = msd
    records += 1
    store = sling.Store(self.kb)
    facts = store.frame({
      self.date_of_birth: self.calendar.value(sling.Date(birth_date))
    })
    provenance = store.frame({
      self.category: birth_cat,
      self.method: "Member of a birth category, '" + birth_cat.name + "'"
    })
    fact = store.frame({
      self.item: item,
      self.facts: facts,
      self.provenance: provenance
    })
    record_file.write(item.id, fact.data(binary=True))
  record_file.close()
  print(records, "birth date records written to file:", self.out_file)
  print(self.conflicts, "conflicts encountered")
def build(self, commons_path, corpora_path):
  # Prepare lexical dictionaries.
  self.words = Lexicon(self.words_normalize_digits)
  self.suffix = Lexicon(self.words_normalize_digits, oov_item=None)

  # Initialize training corpus.
  corpora = Corpora(corpora_path, commons_path)

  # Collect word and affix lexicons.
  for document in corpora:
    for token in document.tokens:
      word = token.word
      self.words.add(word)
      for s in self.get_suffixes(word):
        assert type(s) is str
        self.suffix.add(s)
  print("Words:", self.words.size(), "items in lexicon, including OOV")
  print("Suffix:", self.suffix.size(), "items in lexicon")

  # Load commons store, but do not freeze it yet. We will add the action table
  # and cascade specification to it.
  self.commons_path = commons_path
  self.commons = sling.Store()
  self.commons.load(commons_path)
  schema = sling.DocumentSchema(self.commons)

  # Prepare action table and cascade.
  self._build_action_table(corpora)
  self.cascade = cascade.ShiftMarkCascade(self.actions)
  print(self.cascade)

  # Save cascade specification in commons store.
  _ = self.cascade.as_frame(self.commons, delegate_cell_prefix="delegate")

  # Freeze the commons store.
  self.commons.freeze()

  # Add feature specs.
  self._specify_features()
def parse(self, obj):
  if type(obj) is sling.Document:
    # Parse document.
    obj.update()
    self.parser.parse(obj.frame)
    obj.refresh_annotations()
    return obj
  elif type(obj) is sling.Frame:
    # Parse document frame and return parsed document.
    self.parser.parse(obj)
    return sling.Document(obj)
  else:
    # Create local store for new document.
    store = sling.Store(self.commons)

    # Tokenize text.
    doc = tokenize(str(obj), store=store, schema=self.schema)

    # Parse document.
    self.parser.parse(doc.frame)
    doc.refresh_annotations()
    return doc
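A usage sketch, assuming `annotator` is an instance of the surrounding class: parse() accepts raw text, a document frame, or a sling.Document, so the simplest call is with a string.

doc = annotator.parse("John loves Mary.")
for mention in doc.mentions:
  print(doc.phrase(mention.begin, mention.end), list(mention.evokes()))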
def pruneTriples(triple_file, output_file):
  """
  ***** This method needs to be called in a python env that has sling! *****
  Input:
    1. name of file with tab-separated SRO triples
    2. name of the output file
  Output:
    file containing triples where both entities have QCodes.
  """
  t1 = time.time()  # start the clock
  base_path = cfg['all']['base_path']
  kb = sling.Store()
  kb.load(base_path + "local/data/e/wiki/kb.sling")
  names = sling.PhraseTable(
      kb, base_path + "local/data/e/wiki/en/phrase-table.repo")
  kb.freeze()
  print("* Sling KB loaded in %0.3fs." % (time.time() - t1))

  verified_triples = []
  with open(triple_file, "r") as inFile:
    tsvreader = csv.reader(inFile, delimiter="\t")
    for triple in tsvreader:
      entity1 = triple[0]
      relation = triple[1]
      entity2 = triple[2]
      score = triple[3]
      hasBoth, qcode1, qcode2 = confirmEntities(names, entity1, entity2)
      if hasBoth:
        q_triple = (qcode1, relation, qcode2, score)
        verified_triples.append(q_triple)

  with open(output_file, "w") as outFile:
    writer = csv.writer(outFile, delimiter='\t')
    for t in verified_triples:
      s = t[0]  # subject
      r = t[1]  # relation
      o = t[2]  # object
      c = t[3]  # confidence score
      writer.writerow([s, r, o, c])
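Calling sketch; the paths are illustrative, and each input row is expected to be subject&lt;TAB&gt;relation&lt;TAB&gt;object&lt;TAB&gt;score as described in the docstring.

pruneTriples("data/openie-triples.tsv", "data/openie-triples.pruned.tsv")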
def run(self, task):
  self.init(task)
  max_parses = int(task.param("max_parses"))
  reader = sling.RecordReader(task.input("input").name)
  writer = sling.RecordWriter(task.output("output").name)
  for index, (key, value) in enumerate(reader):
    store = sling.Store(self.kb)
    category = store.parse(value)
    document = sling.Document(category.document)

    # Score each parse.
    parse_with_score = self.score(category)

    # Keep only the top-k parses.
    ranked_parses = sorted(parse_with_score, key=lambda x: -x[1])
    if len(ranked_parses) > max_parses:
      dropped = len(ranked_parses) - max_parses
      ranked_parses = ranked_parses[0:max_parses]
      task.increment("parses-dropped", dropped)
      task.increment("categories-with-too-many-parses")

    # Compute signature for each parse and store it in the parse.
    for parse, _ in ranked_parses:
      tokens, span_signature = self.signature(document, parse)
      parse["signature"] = tokens
      for span in parse.spans:
        if span in span_signature:
          span["signature"] = span_signature[span]

    # Replace the current set of parses with the ranked list.
    del category["parse"]
    for parse, _ in ranked_parses:
      category.append("parse", parse)
    task.increment("parses-kept", len(ranked_parses))
    writer.write(key, category.data(binary=True))
  reader.close()
  writer.close()
def __init__(self, frame=None, store=None, schema=None,
             load_tokens=True, load_mentions=True):
  # Create store, frame, and schema if missing.
  if frame is not None:
    store = frame.store()
  if store is None:
    store = sling.Store()
  if schema is None:
    schema = DocumentSchema(store)
  if frame is None:
    frame = store.frame([(schema.isa, schema.document)])

  # Initialize document from frame.
  self.frame = frame
  self.schema = schema
  self._text = frame.get(schema.document_text, binary=True)
  self.tokens = []
  self.mentions = []
  self.themes = []
  self.tokens_dirty = False
  self.mentions_dirty = False
  self.themes_dirty = False

  if load_tokens:
    # Get tokens.
    tokens = frame[schema.document_tokens]
    if tokens is not None:
      for t in tokens:
        token = self.get_word(t, schema, self._text)
        self.tokens.append(token)

  if load_mentions:
    # Get mentions.
    for m in frame(schema.document_mention):
      mention = Mention(schema, m)
      self.mentions.append(mention)
def __init__(self, recordio, commons, schema=None, gold=False, loop=False):
  self.filename = recordio
  self.commons_owned = False
  if isinstance(commons, str):
    self.commons = sling.Store()
    self.commons.load(commons)
    self.commons_owned = True
  else:
    assert isinstance(commons, sling.Store)
    self.commons = commons

  if schema is None or self.commons_owned:
    schema = sling.DocumentSchema(self.commons)
  if self.commons_owned:
    self.commons.freeze()
  assert schema is not None
  self.schema = schema

  self.reader = sling.RecordReader(recordio)
  self.generator = None
  self.loop = loop
  self.set_gold(gold)
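A minimal iteration sketch for this reader: the commons store can be passed as a path (loaded and frozen here) or as an already-frozen sling.Store. Paths are illustrative.

corpora = Corpora("local/data/corpora/caspar/train.rec",
                  "local/data/caspar/commons", gold=True)
for document in corpora:
  print(len(document.tokens), "tokens,", len(document.mentions), "mentions")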
def __init__(self):
  # Initialize commons store with knowledge base.
  start = time.time()
  self.commons = sling.Store()
  self.commons.lockgc()
  self.commons.load(wikidir + "/kb.sling", snapshot=True)
  self.n_item_member = self.commons['/w/item/member']
  self.n_instance_of = self.commons['P31']
  self.n_wikimedia_category = self.commons['Q4167836']
  self.n_subject = self.commons['subject']
  self.extractor = sling.FactExtractor(self.commons)

  # Add category subject types.
  self.subjects = {}
  for subject, item in english_subject_types.items():
    self.subjects[subject] = self.commons[item]

  # Add properties for subjects.
  self.subject_properties = []
  for p in subject_properties:
    self.subject_properties.append(self.commons[p])

  self.commons.freeze()
  end = time.time()
  print(end - start, "secs loading commons")

  # Load phrase table.
  # TODO(ringgaard): Load language-dependent phrase table.
  start = time.time()
  self.phrasetab = sling.PhraseTable(self.commons,
                                     wikidir + "/en/phrase-table.repo")
  end = time.time()
  print(end - start, "secs loading phrase table")

  # Open category member database.
  self.member_db = sling.RecordDatabase(wikidir + "/wikipedia-members.rec")
def validate(commons, recordio_filename, output_recordio='', options=Options()):
  schema = None
  if not isinstance(commons, sling.Store):
    assert type(commons) is str
    filename = commons
    commons = sling.Store()
    commons.load(filename)
    schema = sling.DocumentSchema(commons)
    commons.freeze()
  else:
    schema = sling.DocumentSchema(commons)

  corpus = corpora.Corpora(recordio_filename, commons, schema)
  aggregate = Results(options)
  count = 0
  writer = None
  written = 0
  if output_recordio != '':
    writer = sling.RecordWriter(output_recordio)

  for document in corpus:
    results = _validate(count, document, options)
    aggregate.add(results)
    if not results.ok() and options.stop_on_first_bad_document:
      print("Stopping after first bad document as requested")
      break
    count += 1
    if writer and results.ok():
      writer.write('', document.frame.data(binary=True))
      written += 1

  if writer:
    writer.close()
  return aggregate, count, written
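Example invocation with the defaults above; passing output_recordio copies only the documents that validate cleanly. Paths are illustrative.

aggregate, count, written = validate(
    "local/data/caspar/commons",
    "local/data/corpora/caspar/train.rec",
    output_recordio="local/data/corpora/caspar/train-clean.rec")
print("validated %d documents, wrote %d clean ones" % (count, written))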
def __init__(self, frame=None, store=None, schema=None):
  # Create store, frame, and schema if missing.
  if frame is not None:
    store = frame.store()
  if store is None:
    store = sling.Store()
  if schema is None:
    schema = DocumentSchema(store)
  if frame is None:
    frame = store.frame([(schema.isa, schema.document)])

  # Initialize document from frame.
  self.frame = frame
  self.schema = schema
  self._text = frame[schema.document_text]
  self.tokens = []
  self.mentions = []
  self.themes = []
  self.tokens_dirty = False
  self.mentions_dirty = False
  self.themes_dirty = False

  # Get tokens.
  tokens = frame[schema.document_tokens]
  if tokens is not None:
    for t in tokens:
      token = Token(self, t, len(self.tokens))
      self.tokens.append(token)

  # Get mentions.
  for m in frame(schema.document_mention):
    mention = Mention(schema, m)
    self.mentions.append(mention)

  # Get themes.
  for theme in frame(schema.document_theme):
    self.themes.append(theme)
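A short sketch of wrapping an existing document frame with this class; `commons` and `record_data` stand in for a schema store and a serialized document read from a record file.

store = sling.Store(commons)
doc = Document(frame=store.parse(record_data))
for token in doc.tokens:
  print(token.word, token.brk)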
def process_change(change):
  qid = change["title"]
  if qid.startswith("Property:"): qid = qid[9:]
  ts = change["timestamp"]
  kind = change["type"]
  if kind == "log" and change["log_action"] == "delete":
    # Delete item/property from database.
    try:
      print("[%d] %s DELETE" % (queue.qsize(), qid))
      reply = dbsession.delete(flags.arg.dburl + "/" + qid)
      reply.raise_for_status()
    except Exception as e:
      print("DB delete error:", e)
  elif kind == "edit":
    revision = change["revision"]["new"]
    store = sling.Store(commons)
    item = None
    m = redir_pat.fullmatch(change["comment"])
    redir = None
    if m is not None:
      # Handle redirects by adding {=Q<old> +Q<new>} frames.
      redir = m.group(2)
      item = store.parse("{id: %s +%s}" % (qid, redir))
    else:
      # Fetch item.
      again = True
      while again:
        again = False
        try:
          # Fetch item revision from Wikidata.
          url = "%s?id=%s&revision=%d&format=json" % (
              flags.arg.wiki_fetch_url, qid, revision)
          reply = wdsession.get(url)
          if reply.status_code == 429:
            # Too many requests; throttle down and retry.
            print("throttle down...")
            time.sleep(30)
            again = True
          else:
            reply.raise_for_status()
        except Exception as e:
          print("Error fetching item:", e, ":", change)
          return

      # Convert item to SLING format.
      try:
        item, _ = wikiconv.convert_wikidata(store, reply.text)
      except Exception as e:
        print("Error converting item:", e, reply.text)
        return

    # Coalesce strings.
    store.coalesce(flags.arg.string_buckets)

    # Save item in database, retrying on DB errors.
    saved = False
    while not saved:
      try:
        reply = None
        reply = dbsession.put(flags.arg.dburl + "/" + qid,
                              data=item.data(binary=True),
                              headers={
                                "Version": str(revision),
                                "Mode": "ordered",
                              })
        reply.raise_for_status()
        result = reply.headers["Result"]
        saved = True
      except Exception as e:
        print("DB error:", e, ":", reply.text if reply is not None else "")
        time.sleep(30)

    if redir:
      print("[%d] %s REDIR %s" % (queue.qsize(), qid, redir))
    else:
      print("[%d] %d %s %s (%s)" % (queue.qsize(), revision,
                                    item["id"], item["name"], result))

  # Update checkpoint.
  global num_changes
  num_changes += 1
  if flags.arg.checkpoint is not None:
    if num_changes % flags.arg.checkpoint_interval == 0:
      dt = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(ts))
      print("CHECKPOINT", ts, dt)
      with open(flags.arg.checkpoint, 'w') as ckpt:
        ckpt.write(str(ts))

  sys.stdout.flush()
flags.define("--threads", help="number of thread for worker pool", default=10, type=int, metavar="NUM") flags.define("--qsize", help="queue size", default=1024, type=int, metavar="NUM") flags.parse() # Commons store for Wikidata converter. commons = sling.Store() wikiconv = sling.WikiConverter(commons) commons.freeze() # Global variables. dbsession = requests.Session() wdsession = requests.Session() redir_pat = re.compile("\/\* wbcreateredirect:\d+\|\|(Q\d+)\|(Q\d+) \*\/") num_changes = 0 # Fetch changed item and update database. def process_change(change): qid = change["title"] if qid.startswith("Property:"): qid = qid[9:] ts = change["timestamp"]
def run(self, task):
  # Get parameters.
  language = task.param("language")

  # Load knowledge base.
  log.info("Load knowledge base")
  kb = sling.Store()
  kb.load(task.input("kb").name)

  n_infobox = kb["/wp/infobox"]
  n_page_item = kb["/wp/page/item"]
  n_file = kb["/wp/info/file"]
  n_media = kb["/wp/media"]

  image_fields = [
    (kb["/wp/info/image"], kb["/wp/info/caption"]),
    (kb["/wp/info/cover"], kb["/wp/info/caption"]),
    (kb["/wp/info/logo"], kb["/wp/info/logo_caption"]),
    (kb["/wp/info/photo"], kb["/wp/info/photo_caption"]),
    (kb["/wp/info/flag_image"], kb["/wp/info/flag_caption"]),
  ]

  p_media = kb["media"]
  p_id = kb["id"]
  p_is = kb["is"]
  p_imported_from = kb["P143"]
  p_media_legend = kb["P2096"]

  image_properties = [
    kb["P18"],   # image
    kb["P154"],  # logo image
    kb["P41"],   # flag image
  ]

  lang = kb["/lang/" + language]
  wikipedia_item = lang["/lang/wikilang/wikipedia"]
  docschema = sling.DocumentSchema(kb)
  kb.freeze()

  # Fetch media titles for Wikipedia from yesterday.
  log.info("Fetch local media titles")
  yesterday = (date.today() - timedelta(days=1)).strftime("%Y%m%d")
  mediaurl = "https://dumps.wikimedia.org/other/mediatitles/%s/" \
             "%swiki-%s-all-media-titles.gz" % (yesterday, language, yesterday)
  r = urllib.request.urlopen(mediaurl)
  mediatitles = set(gzip.decompress(r.read()).decode().split('\n'))
  task.increment("local_media_files", len(mediatitles))

  # Open output file.
  fout = open(task.output("output").name, "w")

  # Process input articles.
  for res in task.inputs("input"):
    log.info("Extract media files from", res.name)
    for _, data in sling.RecordReader(res.name):
      # Read article into store.
      store = sling.Store(kb)
      doc = store.parse(data)
      task.increment("documents")

      # Find first infobox.
      infobox = None
      for theme in doc(docschema.document_theme):
        if theme.isa(n_infobox):
          infobox = theme
          break
      if infobox is None: continue
      task.increment("infoboxes")

      # Find images in infobox.
      imagelist = []
      for n_image, n_caption in image_fields:
        image = infobox[n_image]
        caption = infobox[n_caption]
        if image is None: continue

        # Get image for repeated image field.
        if type(image) is sling.Frame:
          group = image
          image = group[n_file]
          caption = group[n_caption]
          if image is None: continue

        if "{" in image or "[" in image:
          # Structured annotations.
          annotations = sling.lex(image, store=store, schema=docschema)
          for theme in annotations.themes:
            if theme.isa(n_media):
              image = theme[p_is]
              if image is not None:
                imagelist.append((image, None))
                task.increment("structured_annotations")
        else:
          # Image filename.
          imagelist.append((image, caption))
      if len(imagelist) == 0: continue

      # Process list of images for item.
      known_images = 0
      image_frames = []
      item = doc[n_page_item]
      if item is None: continue
      for image, caption in imagelist:
        # Disregard direct URLs for now.
        if (image.startswith("http://") or
            image.startswith("https://") or
            image.startswith("//")):
          task.increment("url_images")
          continue

        # Trim image name. Remove File: prefix.
        colon = image.find(':')
        if colon > 0 and colon < 10: image = image[colon + 1:]
        image = titlecase(image.strip()).replace('_', ' ')
        if len(image) == 0 or image in default_images:
          task.increment("empty_images")
          continue
        if image.endswith("&lrm;"): image = image[:-5]
        frag = image.find('#')
        if frag > 0: image = image[:frag]
        image = html.unescape(image)
        image = urllib.parse.unquote(image)

        # Discard media files with unknown or ignored extensions.
        dot = image.rfind('.')
        ext = image[dot:].lower() if dot > 0 else None
        if ext in ignored_extensions:
          task.increment("ignored_image_format")
          continue
        if ext not in known_extensions:
          log.info("unknown format:", item.id, image)
          task.increment("unknown_image_format")
          continue

        # Get item from KB and check if image is already known.
        task.increment("images")
        known = False
        for prop in image_properties:
          for img in item(prop):
            img = kb.resolve(img)
            if img == image:
              known = True
              known_images += 1
        if known:
          task.increment("known_images")
          continue
        task.increment("new_images")

        # Check if image is in local Wikipedia or Wikimedia Commons.
        fn = image.replace(' ', '_')
        if fn in mediatitles:
          urlbase = "https://upload.wikimedia.org/wikipedia/" + language
          task.increment("local_images")
        else:
          urlbase = "https://upload.wikimedia.org/wikipedia/commons"
          task.increment("commons_images")
          if known_images == 0: task.increment("commons_imaged_items")

        # Compute URL for image.
        md5 = md5hash(fn)
        fn = fn.replace("?", "%3F")
        fn = fn.replace("+", "%2B")
        fn = fn.replace("&", "%26")
        url = "%s/%s/%s/%s" % (urlbase, md5[0], md5[0:2], fn)

        # Create frame for item with media image.
        slots = [
          (p_is, url),
          (p_imported_from, wikipedia_item),
        ]
        if caption is not None:
          capdoc = sling.lex(caption, store=store, schema=docschema)
          captxt = capdoc.phrase(0, len(capdoc.tokens))
          slots.append((p_media_legend, captxt))
        image_frames.append(store.frame(slots))

      # Create item frame with extra image info.
      if len(image_frames) == 0: continue
      slots = [(p_id, item.id)]
      for image_frame in image_frames:
        slots.append((p_media, image_frame))
      frame = store.frame(slots)
      fout.write(frame.data(utf8=True))
      fout.write("\n")
      if known_images == 0: task.increment("imaged_items")

  fout.close()
def run(self):
  # Build regex for matching "(born ... died ...)" and "(<date> - <date>)"
  # patterns in the first sentence of an article.
  month = "(" + "|".join(self.months.keys()) + ")"
  day = r"(\d{1,2})"
  year = r"(\d{4})"
  date = "(?:(?:" + day + " " + month + " " + year + ")|"
  date += "(?:" + month + " " + day + ", " + year + "))"
  date += "(?:[^)]+?)?"
  dates = date + r"\s*-+\s*" + date
  dates = r"(?:(?:(?:born|b\.|née),? ([^0-9)]*?)" + date + \
          r"(?:(?:died|d\.),? [^0-9)]*?" + date + ")?)|(?:" + dates + "))"
  pat = r"(?:[^(]|\([^0-9]*\))*?\([^0-9)]*?" + dates + r"\s*\)"
  rec = re.compile(pat)

  self.out_file = "local/data/e/wikibot/birth-death-dates.rec"
  record_file = sling.RecordWriter(self.out_file)
  records = 0
  store = sling.Store(self.kb)
  for i in range(10):
    i_file = "local/data/e/wiki/en/documents-0000" + str(i) + "-of-00010.rec"
    print(i_file, records)
    for (item_id, record) in sling.RecordReader(i_file):
      item = self.kb[item_id]
      if self.human not in item(self.instanceof): continue
      if self.precise_date(item(self.date_of_birth)) and \
         self.precise_date(item(self.date_of_death)): continue
      parsed_record = sling.Store().parse(record)
      doc = sling.Document(parsed_record)
      raw_text = parsed_record['text']
      if len(raw_text) == 0: continue

      # Find the first sentence starting at the bolded name.
      start_index = raw_text.find("<b>") + len("<b>")
      first = 1
      while first < len(doc.tokens) and doc.tokens[first].start <= start_index:
        first += 1
      last = first
      while last < len(doc.tokens) and doc.tokens[last].brk < 3:
        last += 1
      text = doc.phrase(max(0, first - 1), min(len(doc.tokens), last + 15))
      m = rec.match(text)
      if m is None: continue
      if text.find("(baptised") >= 0 or text.find("throne") >= 0: continue
      if text.find("(baptized") >= 0 or text.find("partner") >= 0: continue

      if m.group(2) or m.group(5):
        first = self.date_from_match(1, m)
        if first.year < 1753: continue  # possibly Julian calendar date
        if m.group(8) or m.group(11):
          second = self.date_from_match(7, m)
          if second.year < 1753: continue  # possibly Julian calendar date
          facts = store.frame({
            self.date_of_birth: first.value(),
            self.date_of_death: second.value()
          })
        else:
          # Only one date match.
          mg1 = m.group(1)
          dob = item(self.date_of_birth)
          dod = item(self.date_of_death)
          if mg1 and max(mg1.find("died"), mg1.find("d.")) >= 0:
            # Death date only.
            if self.precise_date(dod): continue
            if self.same_year(first.year, dob): continue  # b&d too close
            facts = store.frame({
              self.date_of_death: first.value(),
            })
          else:
            # Birth date only.
            if self.precise_date(dob): continue
            if self.same_year(first.year, dod): continue  # b&d too close
            facts = store.frame({
              self.date_of_birth: first.value(),
            })
      else:
        first = self.date_from_match(13, m)
        second = self.date_from_match(19, m)
        if min(first.year, second.year) < 1753: continue  # possibly Julian
        facts = store.frame({
          self.date_of_birth: first.value(),
          self.date_of_death: second.value()
        })
      records += 1
      provenance = store.frame({
        self.url: parsed_record['url'],
        self.method: "English Wikipedia dates for '" + str(item.name) + "'"
      })
      fact = store.frame({
        self.item: item,
        self.facts: facts,
        self.provenance: provenance
      })
      record_file.write(item.id, fact.data(binary=True))
  record_file.close()
  print(records, "birth/death date records written to file:", self.out_file)
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Import FactGrid and convert to SLING format."""

import sling
import gzip

# Map ids into FactGrid namespace.
def convert_id(idstr):
  if idstr[0] == "P" or idstr[0] == "Q": return "P8168/" + idstr
  return idstr

# Initialize commons store.
commons = sling.Store()
n_id = commons["id"]
n_is = commons["is"]
n_isa = commons["isa"]
n_qid = commons["/w/qid"]
n_pid = commons["P343"]
n_property = commons["/w/property"]
n_fg_item_id = commons["P8168"]
wikiconv = sling.WikiConverter(commons, "en")
commons.freeze()

# Read all items from FactGrid dump.
fin = gzip.open("data/c/wikidata/factgrid.json.gz")
fitem = sling.RecordWriter("data/e/factgrid/factgrid-items.rec")
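The read loop that follows is sketched here under assumptions: Wikidata-style dumps put one JSON object per line inside a JSON array, and convert_wikidata accepts the per-item JSON text as in the other snippets.

for line in fin:
  line = line.strip()
  if line in (b"[", b"]"): continue         # skip array brackets
  if line.endswith(b","): line = line[:-1]  # strip trailing comma
  store = sling.Store(commons)
  item, _ = wikiconv.convert_wikidata(store, line.decode("utf8"))
  fitem.write(item.id, item.data(binary=True))
fin.close()
fitem.close()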
def compare(arg):
  base_reader = sling.RecordReader(arg.base)
  expt_reader = sling.RecordReader(arg.expt)

  commons = sling.Store()
  commons.load(arg.commons)
  schema = sling.DocumentSchema(commons)
  commons.freeze()

  store = sling.Store(commons)
  index = -1
  for (_, base_val), (_, expt_val) in zip(base_reader, expt_reader):
    index += 1
    base_doc = sling.Document(frame=store.parse(base_val), schema=schema)
    expt_doc = sling.Document(frame=store.parse(expt_val), schema=schema)
    checker = Checker(index, base_doc, expt_doc, arg.diff)

    # Basic checks.
    base = base_doc.frame["trace"]
    expt = expt_doc.frame["trace"]
    if base is None and expt is not None:
      checker.error('No trace in base document at index %d' % index)
    elif base is not None and expt is None:
      checker.error('No trace in expt document at index %d' % index)
    if base is None: continue

    # Traces should be over the same token range.
    checker.check_eq(base["begin"], expt["begin"], "Trace Begin")
    checker.check_eq(base["end"], expt["end"], "Trace End")

    # Check LSTM features.
    base_lstm = base["/trace/lstm_features"]
    expt_lstm = expt["/trace/lstm_features"]
    checker.check_eq(len(base_lstm), len(expt_lstm), "LSTM Features Length")
    for i in range(len(base_lstm)):
      checker.frame_eq(base_lstm[i], expt_lstm[i],
                       "LSTM features for token %d (%s)" %
                       (i, base_doc.tokens[i].word))

    # Check steps.
    base_steps = base["/trace/steps"]
    expt_steps = expt["/trace/steps"]
    min_steps = min(len(base_steps), len(expt_steps))
    for i in range(min_steps):
      message = "Step %d's current token index" % i
      checker.check_eq(base_steps[i]["/trace/current"],
                       expt_steps[i]["/trace/current"], message)

      # Check FF features for the step.
      base_ff = base_steps[i]["/trace/ff_features"]
      expt_ff = expt_steps[i]["/trace/ff_features"]
      checker.check_eq(len(base_ff), len(expt_ff),
                       "# of FF features for step %d" % i)
      base_dict = {f["/trace/feature"]: f["/trace/values"] for f in base_ff}
      expt_dict = {f["/trace/feature"]: f["/trace/values"] for f in expt_ff}
      for k, v in base_dict.items():
        checker.check_eq(k in expt_dict, True,
                         "Step %d: FF feature %s not in expt" % (i, k))
        checker.check_eq(v, expt_dict[k],
                         "Step %d: FF feature %s has a different value in expt"
                         % (i, k))
      for k, v in expt_dict.items():
        checker.check_eq(k in base_dict, True,
                         "Step %d: FF feature %s not in base" % (i, k))

      # Check action(s) in the step.
      base_actions = base_steps[i]["/trace/actions"]
      expt_actions = expt_steps[i]["/trace/actions"]
      for idx in range(min(len(base_actions), len(expt_actions))):
        checker.frame_eq(base_actions[idx]["/trace/predicted"],
                         expt_actions[idx]["/trace/predicted"],
                         "Step %d, predicted action %d" % (i, idx),
                         ["/trace/_str"])
        checker.frame_eq(base_actions[idx]["/trace/final"],
                         expt_actions[idx]["/trace/final"],
                         "Step %d, final action %d" % (i, idx),
                         ["/trace/_str"])

      # There should be the same number of actions in the step.
      checker.check_eq(len(base_actions), len(expt_actions),
                       "Step %d: # of actions" % i)

    # There should be the same number of steps.
    checker.check_eq(len(base_steps), len(expt_steps), "# of Steps")

  base_reader.close()
  expt_reader.close()
#
# Fields:
#  0: Registration Authority Code
#  1: Country
#  2: Country Code
#  3: Jurisdiction (country or region)
#  4: International name of Register
#  5: Local name of Register
#  6: International name of organisation responsible for the Register
#  7: Local name of organisation responsible for the Register
#  8: Website
#  9: Date IP disclaimer
# 10: Comments
# 11: End Date

kb = sling.Store()
kb.load("data/e/kb/kb.sling")
aliases = sling.PhraseTable(kb, "data/e/kb/en/phrase-table.repo")

# Resolve name to the highest-ranked matching item in the phrase table.
def resolve_name(name):
  for item in aliases.lookup(name):
    return item
  return None

reader = csv.reader(open("data/c/lei/2019-12-05_ra-list-v1.5.csv", "r"))
next(reader)  # skip header row
for row in reader:
  slots = [
    ("registration_authority_code", row[0]),
    ("country_name", row[1]),
def run(self, task):
  self.init(task)
  writer = sling.RecordWriter(task.output("output").name)
  rejected = sling.RecordWriter(task.output("rejected").name)
  inputs = [t.name for t in task.inputs("items")]

  for filename in inputs:
    reader = sling.RecordReader(filename)
    for index, (key, value) in enumerate(reader):
      store = sling.Store(self.kb)
      frame = store.parse(value)

      # Only process category items.
      if not self.is_category(frame):
        rejected.write(key, "not_category")
        continue

      # See if the category should be skipped.
      members = self.get_members(frame)
      reject, reason = self.reject(key, frame, members)
      if reject:
        task.increment("skipped_categories/" + reason)
        rejected.write(key, reason)
        continue

      # First, collect the targets of all facts of all category members.
      qp_counts = self.qid_pid_counts(store, members)

      # Next, tokenize the category title.
      title = self.get_title(frame)
      colon = title.find(':')
      title = title[colon + 1:]
      document = sling.tokenize(title, store)

      # Next, find matches for all spans. These are reported as a list,
      # where ith item = spans that begin at token i (possibly an empty list).
      begin_to_spans = self.compute_spans(document, qp_counts)

      # Construct maximal parses with non-overlapping spans.
      parses = self.construct_parses(begin_to_spans)

      # Post-process parses.
      parses = self.post_process(parses)
      if len(parses) == 0 or (len(parses) == 1 and len(parses[0]) == 0):
        task.increment("skipped_categories/no_parses")
        rejected.write(key, "no_parses")
        continue

      # Write parses as frames.
      frame = store.frame({"name": title, "members": members})
      frame["document"] = document.frame
      for parse in parses:
        span_array = store.array(len(parse))
        for i, span in enumerate(parse):
          span_array[i] = store.frame({
            "begin": span.begin,
            "end": span.end,
            "qid": span.qid,
            "prior": span.prior,
            "pids": list(span.pids),
            "count": span.count
          })
        parse_frame = store.frame({"spans": span_array})
        frame.append("parse", parse_frame)
      writer.write(key, frame.data(binary=True))
      task.increment("categories_accepted")

      # Compute histogram over number of parses.
      for b in self.num_parses_bins:
        if len(parses) <= b:
          task.increment("#parses <= %d" % b)
      if self.num_parses_bins[-1] < len(parses):
        task.increment("#parses > %d" % self.num_parses_bins[-1])
    reader.close()
  writer.close()
  rejected.close()
def test_fact_matcher():
  RED = "\033[1;31m"
  GREEN = "\033[0;32m"
  RESET = "\033[0;0m"

  def error(entry, message):
    sys.stdout.write(RED)
    print("[FAILED] ", end="")
    sys.stdout.write(RESET)
    print(entry, ":", message)

  def success(entry):
    sys.stdout.write(GREEN)
    print("[SUCCESS] ", end="")
    sys.stdout.write(RESET)
    print(entry)

  kb = load_kb("local/data/e/wiki/kb.sling")
  extractor = sling.api.FactExtractor(kb)
  matcher = FactMatcher(kb, extractor)

  # Test cases.
  tuples = []

  # Adds the given test case and its reverse test case too (if possible).
  def add(pid, existing, proposed, match_type):
    tuples.append((pid, existing, proposed, match_type))

    # Add the reverse case.
    if match_type != FactMatchType.NEW and existing != proposed:
      rev_type = match_type
      if match_type == FactMatchType.SUBSUMED_BY_EXISTING:
        rev_type = FactMatchType.SUBSUMES_EXISTING
      if match_type == FactMatchType.SUBSUMES_EXISTING:
        rev_type = FactMatchType.SUBSUMED_BY_EXISTING
      tuples.append((pid, proposed, existing, rev_type))

  # Place of birth, Kapiolani Medical Center, Honolulu.
  add("P19", "Q6366688", "Q18094", FactMatchType.SUBSUMES_EXISTING)
  # Place of birth, Kapiolani Medical Center, US.
  add("P19", "Q6366688", "Q30", FactMatchType.SUBSUMES_EXISTING)
  # Place of birth, <no existing value>, US.
  add("P19", "", "Q30", FactMatchType.NEW)
  # Place of birth, US, US.
  add("P19", "Q30", "Q30", FactMatchType.EXACT)
  # Place of birth, Honolulu, Chicago.
  add("P19", "Q18094", "Q1297", FactMatchType.CONFLICT)
  # Children, Malia Obama, Sasha Obama.
  add("P40", "Q15070044", "Q15070048", FactMatchType.ADDITIONAL)

  # Date-valued properties: int values.
  # Note: P585 = point in time (unique valued), P580 = start time (non-unique).
  add("P585", 1961, 19610804, FactMatchType.SUBSUMED_BY_EXISTING)
  add("P585", 1961, 196108, FactMatchType.SUBSUMED_BY_EXISTING)
  add("P585", 1961, 1961, FactMatchType.EXACT)
  add("P585", 1961, 196, FactMatchType.SUBSUMES_EXISTING)  # 196 = 196X (decade)
  add("P585", 1961, 19, FactMatchType.SUBSUMES_EXISTING)   # 19 = 19XX (century)
  add("P585", 1961, 1, FactMatchType.SUBSUMES_EXISTING)    # 1 = 1XXX (millennium)
  add("P585", 1962, 19610804, FactMatchType.CONFLICT)
  add("P585", 1962, 196108, FactMatchType.CONFLICT)
  add("P585", 1962, 1961, FactMatchType.CONFLICT)
  add("P580", 1961, 1962, FactMatchType.ADDITIONAL)

  # Date-valued properties: string values.
  add("P585", "1961", "1961-08-04", FactMatchType.SUBSUMED_BY_EXISTING)
  add("P585", "1961", "1961-08", FactMatchType.SUBSUMED_BY_EXISTING)
  add("P585", "1961", "1961", FactMatchType.EXACT)
  add("P585", "1961", "196*", FactMatchType.SUBSUMES_EXISTING)  # decade
  add("P585", "1961", "19**", FactMatchType.SUBSUMES_EXISTING)  # century
  add("P585", "1961", "1***", FactMatchType.SUBSUMES_EXISTING)  # millennium
  add("P585", "1962", "1961-08-04", FactMatchType.CONFLICT)
  add("P585", "1962", "1961-08", FactMatchType.CONFLICT)
  add("P585", "1962", "1961", FactMatchType.CONFLICT)
  add("P580", "1961", "1962-08", FactMatchType.ADDITIONAL)

  # Date-valued properties: QID values. These are only available for years,
  # decades, and millennia.
  q1961 = "Q3696"
  q1962 = "Q2764"
  q196x = "Q35724"
  q197x = "Q35014"
  q19xx = "Q6927"
  q1xxx = "Q25860"
  add("P585", q196x, q1961, FactMatchType.SUBSUMED_BY_EXISTING)
  add("P585", q1xxx, q1961, FactMatchType.SUBSUMED_BY_EXISTING)
  add("P585", q1961, q1961, FactMatchType.EXACT)
  add("P585", q1961, q1962, FactMatchType.CONFLICT)
  add("P585", q196x, q197x, FactMatchType.CONFLICT)
  add("P585", q19xx, q197x, FactMatchType.SUBSUMED_BY_EXISTING)
  add("P580", q1961, q197x, FactMatchType.ADDITIONAL)

  # Date-valued properties: proposed and existing values have different types.
  add("P585", q1961, 1961, FactMatchType.EXACT)
  add("P585", q196x, 196, FactMatchType.EXACT)
  add("P585", q19xx, 19, FactMatchType.EXACT)
  add("P585", q1xxx, 1, FactMatchType.EXACT)
  add("P585", q196x, 1961, FactMatchType.SUBSUMED_BY_EXISTING)
  add("P585", q1961, 19610804, FactMatchType.SUBSUMED_BY_EXISTING)
  add("P585", q1961, 19, FactMatchType.SUBSUMES_EXISTING)
  add("P585", q1961, "1961", FactMatchType.EXACT)
  add("P585", q196x, "196*", FactMatchType.EXACT)
  add("P585", q19xx, "19**", FactMatchType.EXACT)
  add("P585", q196x, "1961", FactMatchType.SUBSUMED_BY_EXISTING)
  add("P585", q196x, "1961-08-04", FactMatchType.SUBSUMED_BY_EXISTING)
  add("P585", q1961, "196*", FactMatchType.SUBSUMES_EXISTING)
  add("P585", "", "196*", FactMatchType.NEW)
  add("P585", q1961, "1962", FactMatchType.CONFLICT)
  add("P585", 1963, "1962", FactMatchType.CONFLICT)
  add("P580", q1961, "1962", FactMatchType.ADDITIONAL)
  add("P580", 1963, "1962", FactMatchType.ADDITIONAL)

  # Genre, melodrama, drama.
  add("P136", "Q191489", "Q21010853", FactMatchType.SUBSUMES_EXISTING)
  # Genre, trip-hop, electronic music.
  add("P136", "Q205560", "Q9778", FactMatchType.SUBSUMES_EXISTING)
  # Genre, rock and roll, electronic music.
  add("P136", "Q7749", "Q9778", FactMatchType.ADDITIONAL)
  # Educated at, Harvard Law School, Harvard University.
  add("P69", "Q49122", "Q13371", FactMatchType.SUBSUMES_EXISTING)
  # Educated at, Harvard Law School, Yale University.
  add("P69", "Q49122", "Q49112", FactMatchType.ADDITIONAL)
  # Employer, Airbus, Airbus SE.
  add("P108", "Q67", "Q2311", FactMatchType.SUBSUMES_EXISTING)
  # Employer, Airbus, Boeing.
  add("P108", "Q67", "Q66", FactMatchType.ADDITIONAL)
  # Occupation, sports cyclist, cyclist.
  add("P106", "Q2309784", "Q2125610", FactMatchType.SUBSUMES_EXISTING)
  # Occupation, sports cyclist, cricketer.
  add("P106", "Q2309784", "Q12299841", FactMatchType.ADDITIONAL)

  store = sling.Store(kb)
  total_successes = 0
  for entry in tuples:
    pid, existing, proposed, expected = entry
    if pid not in kb:
      error(entry, "%s not in KB" % pid)
      continue

    pid = kb[pid]
    if isinstance(existing, str) and existing != "" and existing in kb:
      existing = kb[existing]
    if isinstance(proposed, str) and proposed in kb:
      proposed = kb[proposed]

    if existing == "":
      existing = []
    else:
      existing = [existing]

    actual = matcher.match_type(store, pid, existing, proposed)
    if actual == expected:
      success(entry)
      total_successes += 1
    else:
      error(entry, "Got %s, but expected %s" % (actual.name, expected.name))
  print("Total successful tests: %d out of %d" % (total_successes, len(tuples)))
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Convert GLEIF register to SLING."""

import zipfile
import csv
import sling
import sling.dataset.bizreg

# Load KB.
print("Loading KB")
kb = sling.Store()
kb.load("data/e/kb/kb.sling")

n_id = kb["id"]
n_is = kb["is"]
n_isa = kb["isa"]
n_name = kb["name"]
n_instance_of = kb["P31"]
n_country_code = kb["P297"]
n_region_code = kb["P300"]
n_organization = kb["Q43229"]
n_opencorporates_id = kb["P1320"]
n_country = kb["P17"]
n_street_address = kb["P6375"]
n_postal_code = kb["P281"]
n_headquarters = kb["P159"]