Example #1
0
    def __init__(self):
        """Load the knowledge base and cache the handles this object uses.

        Creates a store, loads the KB snapshot into it, resolves a fixed set
        of property, item and symbol handles, builds a calendar and an
        English phrase table on top of the store, and finally freezes it.
        """
        kb = self.kb = sling.Store()
        kb.lockgc()
        kb.load("local/data/e/wiki/kb.sling", snapshot=True)

        # Handles are resolved before the store is frozen below.
        self.instanceof = kb['P31']
        self.has_part = kb['P527']
        self.part_of = kb['P361']
        self.item_category = kb['/w/item/category']
        self.date_of_birth = kb['P569']
        self.date_of_death = kb['P570']
        self.inception = kb['P571']
        self.dad = kb['P576']  # dissolved, abolished or demolished
        self.wikimedia_category = kb['Q4167836']

        date_type_ids = (
            'Q29964144',  # year BC
            'Q577',  # year
            'Q39911',  # decade
            'Q578',  # century
            'Q36507',  # millennium
        )
        self.date_types = [kb[qid] for qid in date_type_ids]

        self.human = kb['Q5']
        self.item = kb["item"]
        self.facts = kb["facts"]
        self.provenance = kb["provenance"]
        self.category = kb["category"]
        self.method = kb["method"]

        # Helpers that are built on the store.
        self.calendar = sling.Calendar(kb)
        self.names = sling.PhraseTable(
            kb, "local/data/e/wiki/en/phrase-table.repo")
        kb.freeze()

        # Bookkeeping that starts out empty.
        self.date_type = {}
        self.conflicts = 0
Example #2
0
def init_phrase_table():
    """Load the KB from the sibling sling checkout and return its phrase table.

    The store is loaded, the English phrase table is created on top of it,
    and the store is frozen before the phrase table is returned.
    """
    sling_root = '../../sling/'
    kb_file = os.path.join(sling_root, 'local/data/e/wiki/kb.sling')
    phrase_file = os.path.join(
        sling_root, 'local/data/e/wiki/en/phrase-table.repo')

    store = sling.Store()
    store.load(kb_file)
    phrase_table = sling.PhraseTable(store, phrase_file)
    store.freeze()
    return phrase_table
Example #3
0
    def init(self, task):
        """Set up the KB, phrase table, parameters and handles for `task`.

        Reads the "phrase-table" input and the "min_members" and "language"
        parameters from `task`, looks up the handles used later on, and
        builds the sets of categories, properties and items to exclude.
        """
        self.kb = load_kb(task)
        phrase_table_file = task.input("phrase-table").name
        self.names = sling.PhraseTable(self.kb, phrase_table_file)

        self.min_members = int(task.param("min_members"))
        self.num_parses_bins = [1, 2, 3, 5, 10, 20, 50, 100, 200]

        # Handles that are looked up once, in advance.
        self.h_language = self.lookup("/lang/" + task.param("language"))
        self.h_lang = self.lookup("lang")
        self.main_topic = self.lookup("P301")  # present in topical categories
        self.h_member = self.lookup("/w/item/member")
        self.h_instanceof = self.lookup('P31')
        self.h_subclassof = self.lookup('P279')
        self.h_category = self.lookup('Q4167836')
        self.h_category_contains = self.lookup('P4224')
        self.english = task.param("language") == "en"

        # Kinds of categories that won't be processed.
        # Q56428020 (template category) is currently disabled.
        skipped_category_ids = (
            'Q20769287',  # disambiguation category
            'Q15407973',  # list category
            'Q23894233',  # stub category
            'Q24046192',  # admin category
            'Q15647814',  # user category
            'Q20010800',  # user language category
            'Q30432511',  # meta category
            'Q13331174',  # navbox category
        )
        self.uninteresting_categories = {
            self.lookup(qid) for qid in skipped_category_ids
        }

        # Properties that will not be considered as resolution for spans.
        self.pids_to_ignore = {
            self.h_instanceof,  # P31 = instance of
            self.lookup('P279'),  # P279 = subclass of
            self.lookup('P971'),  # P971 = category combines topics
            self.lookup('P4224'),  # P4224 = category contains
        }

        # Items that will not be considered as resolutions for spans.
        base_item_ids = (
            'Q5',  # human
            'Q215627',  # person
            'Q17334923',  # location
            'Q811430',  # construction
            'Q43229',  # organization
            'Q2385804',  # educational institution
            'Q294163',  # public institution
            'Q15401930',  # product
            'Q12737077',  # occupation
            'Q192581',  # job
            'Q4164871',  # position
            'Q216353',  # title
        )
        self.base_qids = {self.lookup(qid) for qid in base_item_ids}

        self.extractor = sling.api.FactExtractor(self.kb)
Example #4
0
    def __init__(self):
        """Load the knowledge base and cache the handles this object uses.

        Creates a store, loads the KB snapshot into it, resolves a fixed set
        of property, item and symbol handles, builds an English phrase table
        on top of the store, freezes it, and prepares a month-name table.
        """
        kb = self.kb = sling.Store()
        kb.lockgc()
        kb.load("local/data/e/wiki/kb.sling", snapshot=True)

        # Handles are resolved before the store is frozen below.
        self.instanceof = kb['P31']
        self.has_part = kb['P527']
        self.part_of = kb['P361']
        self.subclass = kb['P279']
        self.item_category = kb['/w/item/category']
        self.date_of_birth = kb['P569']
        self.date_of_death = kb['P570']
        self.inception = kb['P571']
        self.dad = kb['P576']  # dissolved, abolished or demolished
        self.wikimedia_category = kb['Q4167836']

        date_type_ids = (
            'Q29964144',  # year BC
            'Q577',  # year
            'Q39911',  # decade
            'Q578',  # century
            'Q36507',  # millennium
        )
        self.date_types = [kb[qid] for qid in date_type_ids]

        self.human = kb['Q5']
        self.business = kb['Q4830453']
        self.organization = kb['Q43229']
        self.item = kb["item"]
        self.facts = kb["facts"]
        self.provenance = kb["provenance"]
        self.category = kb["category"]
        self.method = kb["method"]

        self.names = sling.PhraseTable(
            kb, "local/data/e/wiki/en/phrase-table.repo")
        kb.freeze()

        # Bookkeeping that starts out empty.
        self.date_type = {}
        self.conflicts = 0

        # English month name -> month number (1-12).
        month_names = (
            "January", "February", "March", "April", "May", "June",
            "July", "August", "September", "October", "November", "December",
        )
        self.months = {
            name: number for number, name in enumerate(month_names, start=1)
        }
Example #5
0
def pruneTriples(triple_file, output_file):
    """
    Keep only the triples whose two entities both resolve to QCodes.

    ******** This method needs to be called in a python env that has sling!!! *******
    Input: 1. name of file with tab separated SRO triples; each row holds
              subject, relation, object and confidence score, in that order
           2. name of the output file
    Output: tab separated file containing the triples where both entities
            have QCodes, with the entity names replaced by those QCodes.

    Blank lines in the input are skipped. A non-blank row with fewer than
    four fields still raises IndexError.
    """

    t1 = time.time()  # start the clock
    base_path = cfg['all']['base_path']
    kb = sling.Store()
    kb.load(base_path + "local/data/e/wiki/kb.sling")
    names = sling.PhraseTable(
        kb, base_path + "local/data/e/wiki/en/phrase-table.repo")
    kb.freeze()
    print("* Sling KB loaded in %0.3fs." % (time.time() - t1))

    verified_triples = []

    # The csv module asks for files to be opened with newline="" so that it
    # can do its own line-ending handling.
    with open(triple_file, "r", newline="") as inFile:
        for triple in csv.reader(inFile, delimiter="\t"):
            if not triple:
                # csv.reader yields an empty list for a blank line.
                continue
            entity1 = triple[0]
            relation = triple[1]
            entity2 = triple[2]
            score = triple[3]
            hasBoth, qcode1, qcode2 = confirmEntities(names, entity1, entity2)
            if hasBoth:
                verified_triples.append((qcode1, relation, qcode2, score))

    # Without newline="" the writer's "\r\n" terminator would be translated
    # to "\r\r\n" on platforms whose line separator is "\r\n".
    with open(output_file, "w", newline="") as outFile:
        writer = csv.writer(outFile, delimiter='\t')
        # Each tuple is (subject, relation, object, confidence score).
        writer.writerows(verified_triples)
Example #6
0
    def __init__(self):
        """Load the knowledge base, phrase table and category member database.

        All paths are relative to the module-level `wikidir`. The time spent
        loading the commons store and the phrase table is printed to stdout.
        The print calls and dict iteration are written so that they behave
        the same under Python 2 and Python 3.
        """
        # Initialize commons store with knowledge base.
        start = time.time()
        self.commons = sling.Store()
        self.commons.lockgc()
        self.commons.load(wikidir + "/kb.sling", snapshot=True)
        self.n_item_member = self.commons['/w/item/member']
        self.n_instance_of = self.commons['P31']
        self.n_wikimedia_category = self.commons['Q4167836']
        self.n_subject = self.commons['subject']
        self.extractor = sling.FactExtractor(self.commons)

        # Add category subject types.
        self.subjects = {
            subject: self.commons[item]
            for subject, item in english_subject_types.items()
        }

        # Add properties for subjects.
        self.subject_properties = [self.commons[p] for p in subject_properties]

        self.commons.freeze()
        end = time.time()
        # A single pre-formatted string prints identically with the Python 2
        # print statement and the Python 3 print function.
        print("%s secs loading commons" % (end - start))

        # Load phrase table.
        # TODO(ringgaard): Load language-dependent phrase table.
        start = time.time()
        self.phrasetab = sling.PhraseTable(self.commons,
                                           wikidir + "/en/phrase-table.repo")
        end = time.time()
        print("%s secs loading phrase table" % (end - start))

        # Open category member database.
        self.member_db = sling.RecordDatabase(wikidir +
                                              "/wikipedia-members.rec")
Example #7
0
# 0: Registration Authority Code
# 1: Country
# 2: Country Code
# 3: Jurisdiction (country or region)
# 4: International name of Register
# 5: Local name of Register
# 6: International name of organisation responsible for the Register
# 7: Local name of organisation responsible for the Register
# 8: Website
# 9: Date IP disclaimer
# 10: Comments
# 11: End Date

# Load the knowledge base into a store at import time and open the English
# phrase table on top of it; `aliases` is used below to resolve names.
kb = sling.Store()
kb.load("data/e/kb/kb.sling")
aliases = sling.PhraseTable(kb, "data/e/kb/en/phrase-table.repo")


def resolve_name(name):
    """Return the first item the phrase table yields for `name`, or None."""
    matches = aliases.lookup(name)
    return next(iter(matches), None)


# Read the registration authority list. The file is opened with newline=""
# as the csv module asks, so quoted fields with embedded line breaks parse
# correctly.
reader = csv.reader(
    open("data/c/lei/2019-12-05_ra-list-v1.5.csv", "r", newline=""))
# Discard the first row (presumably the column header); the default keeps an
# empty file from raising StopIteration.
next(reader, None)

for row in reader:
    # Column positions follow the numbered field list at the top of the file.
    slots = [("registration_authority_code", row[0]), ("country_name", row[1]),
             ("country_code", row[2]), ("country", resolve_name(row[1]))]
Example #8
0
    dictionary[key] = dictionary.get(key, 0) + delta


def fact_to_text(fact):
    """Return the names of the elements of `fact` joined by ": ".

    Each element's `name` attribute is converted with `str()`. An empty
    `fact` gives an empty string.
    """
    return ": ".join(str(f.name) for f in fact)


# Module-level setup, run once at import time: load the knowledge base from
# `wikidir` into a shared store and build the helpers that depend on it.
commons = sling.Store()
commons.lockgc()
commons.load(wikidir + "/kb.sling", snapshot=True)
# Handle for the "is" symbol, looked up before the store is frozen.
n_is = commons["is"]
extractor = sling.FactExtractor(commons)
phrasetab = sling.PhraseTable(commons, wikidir + "/en/phrase-table.repo")
# Freeze only after every handle and helper above has been created.
commons.freeze()


class Name:
    def __init__(self, doc):
        """Set up the per-document state for `doc`.

        Keeps the document and the store of its frame, a per-token `covered`
        flag list (all False), empty `evokes` and `matched` containers, and a
        per-token `skip` list telling whether the token's word is a stop word.
        """
        self.doc = doc
        self.store = doc.frame.store()
        self.covered = [False] * len(doc.tokens)
        self.evokes = {}
        self.matched = set()
        self.skip = [token.word in stop_words for token in self.doc.tokens]

    def overlaps(self, begin, end):
Example #9
0
        self.elements = [None] * (size * size)

    def index(self, i, j):
        """Return the flat position in `elements` for indices `i` and `j`."""
        row_offset = i * self.size
        # The second index is 1-based, so shift it down by one.
        return row_offset + j - 1

    def assign(self, i, j, span):
        """Store `span` in the slot addressed by indices `i` and `j`."""
        slot = self.index(i, j)
        self.elements[slot] = span

    def get(self, i, j):
        """Return whatever is stored in the slot addressed by `i` and `j`."""
        slot = self.index(i, j)
        return self.elements[slot]


# Module-level setup, run once at import time: load the knowledge base into a
# shared store and build the helpers that depend on it.
commons = sling.Store()
commons.lockgc()
commons.load("data/e/wiki/kb.sling")
phrasetab = sling.PhraseTable(commons, "data/e/wiki/en/phrase-table.repo")
docschema = sling.DocumentSchema(commons)
factex = sling.FactExtractor(commons)
taxonomy = factex.taxonomy()
# Handles for the items that are treated as titles.
titles = [
    commons['Q4164871'],  # position
    commons['Q12737077'],  # occupation
    commons['Q216353'],  # title
]
# Freeze only after every handle and helper above has been created.
commons.freeze()

documentids = [
    #'Q5945076', 'Q23883660', 'Q43287478', 'Q2147524',
    #'Q25048736', 'Q6525874', 'Q3851366', 'Q308735', 'Q2184354',
    'Q5337174',
    'Q6218080',