def data_preparation(args):
    """Run OpenNMT's ``preprocess.py`` over the ``train`` and ``valid`` splits.

    For each split, ``<split>.wiki`` under ``args.data_dir`` is loaded with
    ``load_kb`` and handed, together with the path of ``<split>.pm``, to
    ``generate_tmp_file``; the two values it returns are passed to the
    preprocessing script as ``-<split>_src`` / ``-<split>_tgt``.  The command
    line is printed before it is executed.

    The temporary files are removed with ``unlink_tmp_file`` on the way out,
    also when a later step raises, so a failure while handling ``valid`` no
    longer leaves the ``train`` files behind.

    Args:
        args: Object providing ``data_dir``, ``data_out`` and ``share_vocab``.
    """
    dataset = ['train', 'valid']
    src = {}
    tgt = {}

    try:
        arguments = ""
        for set_info in dataset:
            wiki_kb = load_kb(
                os.path.join(args.data_dir, "{}.wiki".format(set_info)),
                verbose=False)
            src[set_info], tgt[set_info] = generate_tmp_file(
                os.path.join(args.data_dir, "{}.pm".format(set_info)),
                wiki_kb,
                verbose=False)
            arguments += "-{}_src {} -{}_tgt {} ".format(
                set_info, src[set_info], set_info, tgt[set_info])

        script = "python {}/preprocess.py {} -save_data {} -src_seq_length 5000".format(
            OpenNMT_dir, arguments, args.data_out)
        if args.share_vocab:
            script += " -share_vocab"

        print(script)
        status = os.system(script)
        if status != 0:
            # os.system never raises; surface a failed run instead of
            # dropping the exit status silently.
            print("preprocess.py exited with status {}".format(status))
    finally:
        # src/tgt are filled pairwise, so a split that is present in src is
        # also present in tgt.
        for set_info in dataset:
            if set_info in src:
                unlink_tmp_file(src[set_info])
                unlink_tmp_file(tgt[set_info])
Ejemplo n.º 2
0
    def init(self, task):
        """Load the knowledge base and set up the extractor's taxonomy.

        Stores the KB returned by ``load_kb(task)``, a ``FactExtractor`` over
        it, the KB's ``'name'`` entry, and the taxonomy the extractor builds
        from the custom type ordering below.
        """
        self.kb = load_kb(task)
        self.extractor = sling.api.FactExtractor(self.kb)
        self.h_name = self.kb['name']

        # Custom type ordering for building a taxonomy, as (QID, label)
        # pairs. Only the QIDs are handed to the extractor; the labels are
        # here for readers.
        ordered_types = [
            ('Q215627', 'person'),
            ('Q95074', 'fictional character'),
            ('Q729', 'animal'),
            ('Q4164871', 'position'),
            ('Q12737077', 'occupation'),
            ('Q216353', 'title'),
            ('Q618779', 'award'),
            ('Q27020041', 'sports season'),
            ('Q4438121', 'sports organization'),
            ('Q215380', 'band'),
            ('Q2385804', 'educational institution'),
            ('Q783794', 'company'),
            ('Q41710', 'ethnic group'),  # (NEW TYPE)
            ('Q6256', 'country'),  # (NEW TYPE)
            ('Q17334923', 'location'),
            ('Q43229', 'organization'),
            ('Q431289', 'brand'),
            ('Q2188189', 'musical work'),
            ('Q571', 'book'),
            ('Q732577', 'publication'),
            ('Q11424', 'film'),
            ('Q15416', 'television program'),
            ('Q12136', 'disease'),
            ('Q16521', 'taxon'),
            ('Q5058355', 'cellular component'),
            ('Q7187', 'gene'),
            ('Q11173', 'chemical compound'),
            ('Q811430', 'construction'),
            ('Q618123', 'geographical object'),
            ('Q1656682', 'event'),
            ('Q101352', 'family name'),
            ('Q202444', 'given name'),
            ('Q577', 'year'),
            ('Q186081', 'time interval'),
            ('Q11563', 'number'),
            ('Q17376908', 'languoid'),
            ('Q1047113', 'specialty'),  # (REORDERED)
            ('Q968159', 'art movement'),  # (NEW TYPE)
            ('Q483394', 'genre'),  # (REORDERED)
            ('Q47574', 'unit of measurement'),  # (REPLACES unit)
            ('Q39875001', 'measure'),
            ('Q3695082', 'sign'),
            ('Q2996394', 'biological process'),
            ('Q11410', 'game'),
            ('Q7397', 'software'),
            ('Q838948', 'work of art'),
            ('Q47461344', 'written work'),
            ('Q28877', 'goods'),
            ('Q15401930', 'product'),
            ('Q121769', 'reference'),
            ('Q1190554', 'occurrence'),
        ]
        type_ids = [type_id for type_id, _ in ordered_types]
        self.taxonomy = self.extractor.taxonomy(type_ids)
Ejemplo n.º 3
0
    def init(self, task):
        """Load the KB and phrase table and resolve the handles used later.

        Reads the ``min_members`` and ``language`` task parameters and the
        ``phrase-table`` task input, looks up a fixed set of ids through
        ``self.lookup``, and finally creates a ``FactExtractor`` over the KB.
        """
        self.kb = load_kb(task)
        phrase_table_file = task.input("phrase-table").name
        self.names = sling.PhraseTable(self.kb, phrase_table_file)

        self.min_members = int(task.param("min_members"))
        self.num_parses_bins = [1, 2, 3, 5, 10, 20, 50, 100, 200]

        # Resolve frequently used handles up front.
        lookup = self.lookup
        self.h_language = lookup("/lang/" + task.param("language"))
        self.h_lang = lookup("lang")
        self.main_topic = lookup("P301")  # present in topical categories
        self.h_member = lookup("/w/item/member")
        self.h_instanceof = lookup("P31")
        self.h_subclassof = lookup("P279")
        self.h_category = lookup("Q4167836")
        self.h_category_contains = lookup("P4224")
        self.english = task.param("language") == "en"

        # Categories of these kinds are not processed.
        skipped_category_qids = (
            "Q20769287",  # disambiguation category
            "Q15407973",  # list category
            # "Q56428020" (template category) is currently disabled.
            "Q23894233",  # stub category
            "Q24046192",  # admin category
            "Q15647814",  # user category
            "Q20010800",  # user language category
            "Q30432511",  # meta category
            "Q13331174",  # navbox category
        )
        self.uninteresting_categories = {
            lookup(qid) for qid in skipped_category_qids
        }

        # Properties that are never used as the resolution of a span.
        self.pids_to_ignore = {
            self.h_instanceof,  # P31 = instance of
            lookup("P279"),  # subclass of
            lookup("P971"),  # category combines topics
            lookup("P4224"),  # category contains
        }

        # Items that are never used as the resolution of a span.
        base_qids = (
            "Q5",  # human
            "Q215627",  # person
            "Q17334923",  # location
            "Q811430",  # construction
            "Q43229",  # organization
            "Q2385804",  # educational institution
            "Q294163",  # public institution
            "Q15401930",  # product
            "Q12737077",  # occupation
            "Q192581",  # job
            "Q4164871",  # position
            "Q216353",  # title
        )
        self.base_qids = {lookup(qid) for qid in base_qids}

        self.extractor = sling.api.FactExtractor(self.kb)
def generate(args):
    """Run OpenNMT's ``translate.py`` for the split named by ``args.dataset``.

    ``<dataset>.wiki`` under ``args.data_dir`` is loaded with ``load_kb`` and
    handed, together with the path of ``<dataset>.pm``, to
    ``generate_tmp_file``.  Only the source file it returns is passed to the
    translation script (``-src``); output goes to ``args.out`` and GPU 0 is
    hard-coded on the command line.

    Both temporary files are removed with ``unlink_tmp_file`` on the way out,
    also when building or running the command raises.

    Args:
        args: Object providing ``data_dir``, ``dataset``, ``model`` and
            ``out``.
    """
    wiki_kb = load_kb(
        os.path.join(args.data_dir, "{}.wiki".format(args.dataset)))
    src, tgt = generate_tmp_file(
        os.path.join(args.data_dir, "{}.pm".format(args.dataset)), wiki_kb)

    try:
        script = "python {}/translate.py -model {} -src {} -output {} -gpu 0 -replace_unk -verbose".format(
            OpenNMT_dir, args.model, src, args.out)

        status = os.system(script)
        if status != 0:
            # os.system never raises; surface a failed run instead of
            # dropping the exit status silently.
            print("translate.py exited with status {}".format(status))
    finally:
        unlink_tmp_file(src)
        unlink_tmp_file(tgt)
Ejemplo n.º 5
0
def shell():
    """Interactive console for inspecting fact matches against the KB.

    Each round first asks for an id.  If the parses database has a record
    under that id, match statistics for all of that category's parses are
    printed and the next round starts.  Otherwise the id is treated as a KB
    item and the user is asked for one or more pids and a qid, after which
    the result of ``matcher.for_item`` is printed.

    Ids that are not in the KB are reported and the round is restarted from
    the first prompt; a typo no longer terminates the shell.  End-of-file on
    any prompt ends the shell.
    """
    # Use raw_input where it exists (Python 2) and fall back to input.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    kb = load_kb("local/data/e/wiki/kb.sling")
    extractor = sling.api.FactExtractor(kb)
    matcher = FactMatcher(kb, extractor)

    parses = "local/data/e/wikicat/filtered-parses.rec"
    db = sling.RecordDatabase(parses)

    try:
        while True:
            item = read_line("Enter item or category QID:")

            # See if a category QID was entered, if so, compute and output
            # match statistics for all its parses.
            value = db.lookup(item)
            if value is not None:
                store = sling.Store(kb)
                category = store.parse(value)
                output = matcher.for_parses(category, store, max_evidences=4)
                print("%s = %s (%d members)" %
                      (item, category.name, len(category.members)))
                for idx, (parse, match) in enumerate(
                        zip(category("parse"), output)):
                    print("%d. %s" % (idx, ' '.join(parse.signature)))
                    for span, span_match in zip(parse.spans, match):
                        print("  %s = (%s=%s) : %s" %
                              (span.signature, str(list(span.pids)),
                               span.qid, str(span_match)))
                    print("")
                print("")
                continue

            if item not in kb:
                print("%s not in KB" % item)
                continue
            item = kb[item]

            pid_text = read_line("Enter [comma-separated] pid(s):")
            # Strip blanks and drop empty entries such as a trailing comma.
            pid_names = [
                name for name in pid_text.replace(' ', '').split(',') if name
            ]
            unknown = [name for name in pid_names if name not in kb]
            if unknown:
                print("%s not in KB" % ", ".join(unknown))
                continue
            pids = [kb[name] for name in pid_names]

            qid_name = read_line("Enter qid:")
            if qid_name not in kb:
                print("%s not in KB" % qid_name)
                continue
            qid = kb[qid_name]

            output = matcher.for_item(item, pids, qid)
            # Same space-separated layout the original print statement
            # produced.
            print("%s %s %s %s %s" %
                  (item, "(" + item.name + ") :", output[0].name,
                   "evidence: ", output[1]))
            print("")
    except EOFError:
        # Input stream closed (e.g. Ctrl-D): finish the line and leave.
        print("")
Ejemplo n.º 6
0
def test_fact_matcher():
    """Run a table of fact-matching cases and print a colored report.

    Loads the KB from ``local/data/e/wiki/kb.sling``, builds a ``FactMatcher``
    and, for every (pid, existing, proposed, expected) case, compares
    ``matcher.match_type`` against the expected ``FactMatchType``.  One
    SUCCESS/FAILED line is printed per case, followed by a final tally.
    Nothing is asserted or returned: failures only show up in the output.
    """
    # ANSI escape sequences used to color the status tags.
    RED = "\033[1;31m"
    GREEN = "\033[0;32m"
    RESET = "\033[0;0m"

    def error(entry, message):
        """Print a red [FAILED] tag followed by the case and the message."""
        sys.stdout.write(RED)
        print "[FAILED] ",
        sys.stdout.write(RESET)
        print entry, ":", message

    def success(entry):
        """Print a green [SUCCESS] tag followed by the case."""
        sys.stdout.write(GREEN)
        print "[SUCCESS] ",
        sys.stdout.write(RESET)
        print entry

    kb = load_kb("local/data/e/wiki/kb.sling")
    extractor = sling.api.FactExtractor(kb)
    matcher = FactMatcher(kb, extractor)

    # Test cases.
    tuples = []

    # Adds the given test case and its reverse test case too (if possible).
    def add(pid, existing, proposed, match_type):
        tuples.append((pid, existing, proposed, match_type))

        # Add the reverse case.
        # Swapping existing and proposed flips the two subsumption types; any
        # other type is kept as is. NEW cases and cases with identical values
        # get no reverse case.
        if match_type != FactMatchType.NEW and existing != proposed:
            rev_type = match_type
            if match_type == FactMatchType.SUBSUMED_BY_EXISTING:
                rev_type = FactMatchType.SUBSUMES_EXISTING
            if match_type == FactMatchType.SUBSUMES_EXISTING:
                rev_type = FactMatchType.SUBSUMED_BY_EXISTING
            tuples.append((pid, proposed, existing, rev_type))

    # Place of birth, Kapiolani Medical Center, Honolulu.
    add("P19", "Q6366688", "Q18094", FactMatchType.SUBSUMES_EXISTING)

    # Place of birth, Kapiolani Medical Center, US.
    add("P19", "Q6366688", "Q30", FactMatchType.SUBSUMES_EXISTING)

    # Place of birth, <no existing value>, US.
    add("P19", "", "Q30", FactMatchType.NEW)

    # Place of birth, US, US.
    add("P19", "Q30", "Q30", FactMatchType.EXACT)

    # Place of birth, Honolulu, Chicago.
    add("P19", "Q18094", "Q1297", FactMatchType.CONFLICT)

    # Children, Malia Obama, Sasha Obama.
    add("P40", "Q15070044", "Q15070048", FactMatchType.ADDITIONAL)

    # Date-valued properties: int values.
    # Note: P585 = point in time (unique valued), P580 = start time (non unique)
    add("P585", 1961, 19610804, FactMatchType.SUBSUMED_BY_EXISTING)
    add("P585", 1961, 196108, FactMatchType.SUBSUMED_BY_EXISTING)
    add("P585", 1961, 1961, FactMatchType.EXACT)
    add("P585", 1961, 196,
        FactMatchType.SUBSUMES_EXISTING)  # 196 = 196X (decade)
    add("P585", 1961, 19,
        FactMatchType.SUBSUMES_EXISTING)  # 19 = 19XX (century)
    add("P585", 1961, 1,
        FactMatchType.SUBSUMES_EXISTING)  # 1 = 1XXX (millennium)
    add("P585", 1962, 19610804, FactMatchType.CONFLICT)
    add("P585", 1962, 196108, FactMatchType.CONFLICT)
    add("P585", 1962, 1961, FactMatchType.CONFLICT)
    add("P580", 1961, 1962, FactMatchType.ADDITIONAL)

    # Date-valued properties: string values.
    add("P585", "1961", "1961-08-04", FactMatchType.SUBSUMED_BY_EXISTING)
    add("P585", "1961", "1961-08", FactMatchType.SUBSUMED_BY_EXISTING)
    add("P585", "1961", "1961", FactMatchType.EXACT)
    add("P585", "1961", "196*", FactMatchType.SUBSUMES_EXISTING)  # decade
    add("P585", "1961", "19**", FactMatchType.SUBSUMES_EXISTING)  # century
    add("P585", "1961", "1***", FactMatchType.SUBSUMES_EXISTING)  # millennium
    add("P585", "1962", "1961-08-04", FactMatchType.CONFLICT)
    add("P585", "1962", "1961-08", FactMatchType.CONFLICT)
    add("P585", "1962", "1961", FactMatchType.CONFLICT)
    add("P580", "1961", "1962-08", FactMatchType.ADDITIONAL)

    # Date-valued properties: QID values. These are only available for years,
    # decades, and millennia.
    q1961 = "Q3696"
    q1962 = "Q2764"
    q196x = "Q35724"
    q197x = "Q35014"
    q19xx = "Q6927"
    q1xxx = "Q25860"
    add("P585", q196x, q1961, FactMatchType.SUBSUMED_BY_EXISTING)
    add("P585", q1xxx, q1961, FactMatchType.SUBSUMED_BY_EXISTING)
    add("P585", q1961, q1961, FactMatchType.EXACT)
    add("P585", q1961, q1962, FactMatchType.CONFLICT)
    add("P585", q196x, q197x, FactMatchType.CONFLICT)
    add("P585", q19xx, q197x, FactMatchType.SUBSUMED_BY_EXISTING)
    add("P580", q1961, q197x, FactMatchType.ADDITIONAL)

    # Date-valued properties: proposed and existing values have different types.
    add("P585", q1961, 1961, FactMatchType.EXACT)
    add("P585", q196x, 196, FactMatchType.EXACT)
    add("P585", q19xx, 19, FactMatchType.EXACT)
    add("P585", q1xxx, 1, FactMatchType.EXACT)
    add("P585", q196x, 1961, FactMatchType.SUBSUMED_BY_EXISTING)
    add("P585", q1961, 19610804, FactMatchType.SUBSUMED_BY_EXISTING)
    add("P585", q1961, 19, FactMatchType.SUBSUMES_EXISTING)
    add("P585", q1961, "1961", FactMatchType.EXACT)
    add("P585", q196x, "196*", FactMatchType.EXACT)
    add("P585", q19xx, "19**", FactMatchType.EXACT)
    add("P585", q196x, "1961", FactMatchType.SUBSUMED_BY_EXISTING)
    add("P585", q196x, "1961-08-04", FactMatchType.SUBSUMED_BY_EXISTING)
    add("P585", q1961, "196*", FactMatchType.SUBSUMES_EXISTING)
    add("P585", "", "196*", FactMatchType.NEW)
    add("P585", q1961, "1962", FactMatchType.CONFLICT)
    add("P585", 1963, "1962", FactMatchType.CONFLICT)
    add("P580", q1961, "1962", FactMatchType.ADDITIONAL)
    add("P580", 1963, "1962", FactMatchType.ADDITIONAL)

    # Genre, melodrama, drama.
    add("P136", "Q191489", "Q21010853", FactMatchType.SUBSUMES_EXISTING)

    # Genre, trip-hop, electronic music.
    add("P136", "Q205560", "Q9778", FactMatchType.SUBSUMES_EXISTING)

    # Genre, rock and roll, electronic music.
    add("P136", "Q7749", "Q9778", FactMatchType.ADDITIONAL)

    # Educated at, Harvard Law School, Harvard University.
    add("P69", "Q49122", "Q13371", FactMatchType.SUBSUMES_EXISTING)

    # Educated at, Harvard Law School, Yale University.
    add("P69", "Q49122", "Q49112", FactMatchType.ADDITIONAL)

    # Employer, Airbus, Airbus SE.
    add("P108", "Q67", "Q2311", FactMatchType.SUBSUMES_EXISTING)

    # Employer, Airbus, Boeing.
    add("P108", "Q67", "Q66", FactMatchType.ADDITIONAL)

    # Occupation, sports cyclist, cyclist.
    add("P106", "Q2309784", "Q2125610", FactMatchType.SUBSUMES_EXISTING)

    # Occupation, sports cyclist, cricketer.
    add("P106", "Q2309784", "Q12299841", FactMatchType.ADDITIONAL)

    # Run every case; the cases are evaluated in the order they were added.
    store = sling.Store(kb)
    total_successes = 0
    for entry in tuples:
        pid, existing, proposed, expected = entry
        if pid not in kb:
            error(entry, "%s not in KB" % pid)
            continue

        # String values that are ids known to the KB are replaced by their KB
        # entries; ints and strings not found in the KB are passed through
        # unchanged.
        pid = kb[pid]
        if isinstance(existing, str) and existing != "" and existing in kb:
            existing = kb[existing]
        if isinstance(proposed, str) and proposed in kb:
            proposed = kb[proposed]

        # The matcher receives the existing values as a list; the empty
        # string stands for "no existing value".
        if existing == "":
            existing = []
        else:
            existing = [existing]
        actual = matcher.match_type(store, pid, existing, proposed)
        if actual == expected:
            success(entry)
            total_successes += 1
        else:
            error(entry,
                  "Got %s, but expected %s" % (actual.name, expected.name))
    print "Total successful tests: %d out of %d" % (total_successes,
                                                    len(tuples))
Ejemplo n.º 7
0
 def init(self, task):
     """Load the KB and create the fact extractor and matcher over it."""
     kb = load_kb(task)
     self.kb = kb

     # NOTE(review): other code in this file constructs the extractor via
     # sling.api.FactExtractor -- confirm sling.FactExtractor is exported.
     extractor = sling.FactExtractor(kb)
     self.extractor = extractor

     self.matcher = FactMatcher(kb, extractor)