Ejemplo n.º 1
0
    def run(self):
        """Run one ranking-evaluation experiment.

        Parses CLI args, opens the Trident KB located next to the input
        file, loads the pickled vectors and dataset, evaluates the chosen
        test/valid split and logs the ranking scores.
        """
        self.args = self.parser2.parse_args()

        # Log level is hard-coded to 'INFO' for this runner.
        self.logger = Logger(self.args.resultsdir, self.args.resultsfilename, 'INFO')
        self.logger.startExperiment(sys.argv)
        self.log = self.logger.getLog()

        # The Trident KB is expected in a "vlog" directory next to the input file.
        dbpath = os.path.dirname(self.args.input) + "/vlog"
        self.db = trident.Db(dbpath)

        # Bundle the post-processing parameters taken from the CLI flags.
        paramspostp = ParamsPostProcessing(self.args.post_threshold,
                                           self.args.post_nvars,
                                           self.args.post_minsize,
                                           self.args.post_minmatch,
                                           self.args.post_defaultincr,
                                           self.args.post2_enable,
                                           self.args.post2_increment,
                                           self.args.post2_topk)

        # Load the pickled embedding vectors.
        with open(self.args.vectors, 'rb') as finv:
            vectors = pickle.load(finv)
        # Load the pickled dataset (relations, entities, triple splits).
        with open(self.args.input, 'rb') as fin:
            data = pickle.load(fin)
        self.logger.setInput(data)
        self.dictr = data['relations']
        self.dicte = data['entities']

        nameSetToTest = self.args.testValid
        ev = vectors['ev']
        N = len(data['entities'])
        # All known triples from every split — presumably used by the
        # evaluator to filter known facts during ranking; verify in evaluator.
        true_triples = data['train_subs'] + data['test_subs'] + data['valid_subs']
        with open(self.args.fileTestValid, 'rb') as fileTV:
            setToTest = pickle.load(fileTV)

        testMode = self.args.testmode

        while True:
            tester = self.evaluator(self.log, setToTest,
                                    true_triples,
                                    N, ev.getNExtVars(),
                                    ev, data['relations'],
                                    data['entities'],
                                    paramspostp,
                                    self.neval)

            # Compute raw and filtered positions, then the ranking scores.
            pos_v, fpos_v = tester.positions(vectors['model'], self.args.exvars, ev, self.db)
            fmrr_valid = self.ranking_scores(pos_v, fpos_v, 0, nameSetToTest)
            if testMode:
                print(fmrr_valid)
                # NOTE(review): the "change the parameters" step was never
                # implemented, so test mode re-evaluates the exact same
                # configuration forever (infinite loop) — confirm intent.
            else:
                # Single evaluation pass when not in test mode.
                break


        self.logger.stopExperiment()
Ejemplo n.º 2
0
 def __init__(self, KBPATH: str):
     """Connect to the local Elasticsearch node, open the Trident KB at
     *KBPATH*, grab an asyncio event loop and prepare an in-memory cache."""
     hosts = [{"host": "localhost", "port": 9200}]
     self.es = AsyncElasticsearch(hosts, timeout=30)
     self.db = trident.Db(KBPATH)
     self.loop = asyncio.get_event_loop()
     self.cache: Dict[str, str] = {}
Ejemplo n.º 3
0
    def open(self, configuration: str, create=False):
        """Open the Trident store at *configuration*.

        Falls back to ``self.configuration`` when *configuration* is empty.
        Raises ``Exception`` when the path is not an existing directory and
        returns ``rdflib.store.VALID_STORE`` on success. *create* is accepted
        for rdflib store-API compatibility but not acted upon here.
        """
        # Fall back to the configured default when no path was supplied.
        configuration = configuration or self.configuration
        if not os.path.isdir(configuration):
            raise Exception(f"Trident db path {configuration} does not exist!")

        # Imported lazily so the store class can be declared without trident.
        import trident

        self._db = trident.Db(configuration)
        log.debug(f"Using Trident DB with {len(self)} triples")
        return rdflib.store.VALID_STORE
Ejemplo n.º 4
0
    def run(self):
        """Parse CLI arguments, set up logging and the Trident KB, then train.

        Raises:
            ValueError: if ``--mode`` is anything other than 'rank'.
        """
        # Parse command-line arguments.
        self.args = self.parser.parse_args()
        if self.args.mode != 'rank':
            raise ValueError('Unknown experiment mode (%s)' % self.args.mode)
        self.callback = self.ranking_callback

        self.logger = Logger(self.args.resultsdir, self.args.resultsfilename,
                             self.args.loglevel)
        self.logger.startExperiment(sys.argv)
        self.log = self.logger.getLog()

        self.ev = None
        # NOTE(review): self.evActive is assigned only when --extvars is set;
        # confirm a default exists elsewhere, otherwise later reads of it
        # would raise AttributeError.
        if self.args.extvars:
            self.evActive = True
        self.existing_model = None

        # The Trident KB is expected in a "vlog" directory next to the input file.
        dbpath = os.path.dirname(self.args.fin) + "/vlog"
        self.db = trident.Db(dbpath)
        self.minSize = self.args.thre_min
        #self.maxSize = self.args.thre_max

        self.train()
        self.logger.stopExperiment()
Ejemplo n.º 5
0
        # Optional CLI override: argv[3] is the path to the WARC archive.
        if len(sys.argv) == 4:
            warc_path = sys.argv[3]
            #cleantexts, doc_ids = read_warc(warc_path)

        # Optional CLI override: argv[4] points at an annotations file.
        if len(sys.argv) == 5:
            annotations_path = sys.argv[4]
    #else:
    # NOTE(review): this is a fragment — the enclosing function and the
    # definitions of read_warc/warc_path are outside this excerpt, and the
    # indentation of the two blocks above is deeper than this level; confirm
    # against the original source.
    cleantexts, doc_ids = read_warc(
        warc_path)  # Assuming path = './data/sample.warc.gz'

    n_docs = len(cleantexts)

    #sparql_domain = "localhost:1234"
    sparql_path = '/home/jurbani/data/motherkb-trident'
    db = trident.Db(sparql_path)

    # Hyperparameters: how many results of elasticsearch and trident to
    # compare (for discard and popularity), plus scoring weights.
    k = 5
    q = 10
    r = 10
    min_score = 2
    min_ratio_value = 0.82
    small_value = 0.5
    title_multiplier = 3
    label_multiplier = 2
    matching_label_value = 3
    non_match_value = 2
    score_step_value = 1 / r

    # Tagger
Ejemplo n.º 6
0
 def __init__(self):
     """Open the Trident knowledge base at the module-configured path."""
     self.db = trident.Db(TRIDENT_PATH)
Ejemplo n.º 7
0
def queries_and_get_best_id(set_info_and_id):
    """Rank the candidate Freebase ids of each mention and keep the best one.

    For every ``(word, label, ...)`` mention, the candidate ids coming from
    Elasticsearch are scored with a handful of SPARQL queries against the
    Trident KB: title similarity, number of labels, number of equivalent web
    pages, presence of an official website, and an entity-type probe driven
    by the NER label. The candidate with the highest total score wins.

    :param set_info_and_id: tuple ``(set_info, doc_id)`` where ``set_info``
        is an iterable of ``(word, label, set_response, elastic_scores,
        matching_ratio)`` tuples and ``set_response`` maps candidate
        Freebase ids to their label collections. ``elastic_scores`` and
        ``matching_ratio`` are currently unused.
    :return: ``(final_set, doc_id)`` where ``final_set`` is a set of
        ``(word, best_freebase_id)`` pairs.
    """
    set_info, doc_id = set_info_and_id

    # ---- SPARQL templates (instantiated with a Freebase id via %) ----------
    # Wikipedia (en) title of the entity; also used as a validity probe.
    sparql_query_title = "select distinct ?obj where { \
        <http://rdf.freebase.com/ns%s> <http://rdf.freebase.com/key/wikipedia.en_title> ?obj . \
        } "             # % (freebase_id)

    # Equivalent web pages — a rough popularity signal.
    sparql_query_n_webpage = "select distinct ?obj where { \
        <http://rdf.freebase.com/ns%s> <http://rdf.freebase.com/ns/common.topic.topic_equivalent_webpage> ?obj . \
        } "             # % (freebase_id)

    # Official website — small prominence bonus.
    sparql_query_website = "select distinct * where { \
       <http://rdf.freebase.com/ns%s> <http://rdf.freebase.com/ns/common.topic.official_website> ?obj .\
    } "         # % (freebase_id)

    # Location type check (two placeholders: same id twice).
    sparql_query_location = "select distinct * where {\
     { <http://rdf.freebase.com/ns%s> ?rel <http://rdf.freebase.com/ns/location.location>  .  }\
     UNION \
     { <http://rdf.freebase.com/ns%s> <http://rdf.freebase.com/ns/base.biblioness.bibs_location.loc_type> ?type . } \
      } "           # % (freebase_id,freebase_id)

    # Person type check.
    sparql_query_person = "select distinct * where { \
       <http://rdf.freebase.com/ns%s> ?rel <http://rdf.freebase.com/ns/people.person> .\
        } "             # % (freebase_id)

    # Company type check.
    sparql_query_company = "select distinct * where { \
        <http://rdf.freebase.com/ns%s> <http://rdf.freebase.com/key/authority.crunchbase.company> ?obj . \
        } "             # % (freebase_id)

    # Inanimate ("other") type check.
    sparql_query_inanimate = "select distinct * where { \
       <http://rdf.freebase.com/ns%s> ?rel <http://rdf.freebase.com/ns/base.type_ontology.inanimate> .\
        } "             # % (freebase_id)

    # Which entity types to probe, in order of preference, per NER label.
    query_label_order = {
        "GPE": ["GPE", "ORGANIZATION"],
        "GSP": ["GPE", "ORGANIZATION"],
        "LOCATION": ["GPE", "ORGANIZATION"],
        "PERSON": ["PERSON", "GPE"],
        "ORGANIZATION": ["ORGANIZATION", "GPE"],
        "FACILITY": ["ORGANIZATION", "GPE"],
    }

    # Scoring knobs.
    r = 10                    # how many candidate ids to score per mention
    small_value = 0.5         # bonus for having an official website
    title_multiplier = 3      # weight of the title-similarity ratio
    label_multiplier = 2      # weight of the (normalised) label count
    matching_label_value = 3  # base bonus for a matching entity type

    sparql_path = '/home/jurbani/data/motherkb-trident'
    db = trident.Db(sparql_path)

    def _n_results(query):
        # Run a SPARQL query and return its result count.
        output = json.loads(db.sparql(query))
        return int(output["stats"]["nresults"])

    final_set = set()
    for word, label, set_response, _elastic_scores, _matching_ratio in set_info:
        candidate_ids = list(set_response)
        total_scores = {}
        label_order = query_label_order[label]

        for freebase_id in candidate_ids[:r]:
            score = 0

            # Rewrite the id's third character to '.' (e.g. "/m/xyz" ->
            # "/m.xyz") — presumably the URI scheme used by this KB; confirm.
            modified_id = freebase_id[:2] + "." + freebase_id[3:]

            # The title lookup doubles as a validity probe: a missing
            # "nresults" field marks an id unknown to the KB -> score 0.
            output_query = json.loads(db.sparql(sparql_query_title % (modified_id)))
            results = int(output_query["stats"].get("nresults", -1))
            if results == -1:
                total_scores[freebase_id] = 0
                continue

            if results > 0:
                # Compare the cleaned, lowercased KB title with the mention.
                # Assumes only one title, at binding index 0.
                title = output_query["results"]["bindings"][0]["obj"]["value"]
                title = title.translate({
                    ord('"'): None,
                    ord('$'): " ",
                    ord('_'): " "
                })
                ratio = SequenceMatcher(None, word.lower(), title.lower()).ratio()
                score += ratio * title_multiplier

            # More labels -> more prominent entity (capped, normalised to
            # [0, 1], then weighted).
            n_labels = len(set_response[freebase_id])
            score += (min(int(n_labels / 5), 20) / 20) * label_multiplier

            # Number of equivalent web pages, normalised to [0, 1].
            score += min(int(_n_results(sparql_query_n_webpage % (modified_id)) / 5), 20) / 20

            # Small bonus when an official website exists.
            score += (_n_results(sparql_query_website % (modified_id)) > 0) * small_value

            # Probe entity types in preference order; the first hit stops
            # the probing and earlier positions earn a bigger bonus.
            match = False
            for i, wanted in enumerate(label_order):
                addition = matching_label_value - i  # 3 if first choice, 2 if second, ...

                if wanted == "GPE":
                    if _n_results(sparql_query_location % (modified_id, modified_id)):
                        score += addition
                        match = True
                elif wanted == "PERSON":
                    if _n_results(sparql_query_person % (modified_id)):
                        score += addition
                        match = True
                elif wanted == "ORGANIZATION":
                    if _n_results(sparql_query_company % (modified_id)):
                        score += addition
                        match = True
                elif wanted == "OTHER":
                    # NOTE: "OTHER" never occurs in query_label_order above;
                    # branch kept for parity with the original code.
                    if _n_results(sparql_query_inanimate % (modified_id)):
                        score += 1

                if match:
                    break

            total_scores[freebase_id] = score

        # Keep the candidate with the highest total score.
        best_id_key = max(total_scores.items(), key=itemgetter(1))[0]
        final_set.add((word, best_id_key))

    return final_set, doc_id
 def db(self):
     """Return the Trident Db handle, opening it lazily on first access."""
     if self._kb is None:
         self._kb = trident.Db(self.kb_path)
     return self._kb
Ejemplo n.º 9
0
import trident
import json

KBPATH = "assets/wikidata-20200203-truthy-uri-tridentdb"

# Retrieve first 10 entities of type (P31) city (Q515)
query = (
    "PREFIX wde: <http://www.wikidata.org/entity/> "
    "PREFIX wdp: <http://www.wikidata.org/prop/direct/> "
    "PREFIX wdpn: <http://www.wikidata.org/prop/direct-normalized/> "
    "select ?s where { ?s wdp:P31 wde:Q515 . } LIMIT 10"
)

# Open the knowledge base and run the query; the result is a JSON string.
db = trident.Db(KBPATH)
json_results = json.loads(db.sparql(query))

print("*** VARIABLES ***")
variables = json_results["head"]["vars"]
print(variables)

# One output line per binding: "var: value " for every projected variable.
print("\n*** BINDINGS ***")
results = json_results["results"]
for binding in results["bindings"]:
    line = ""
    for var in variables:
        line += var + ": " + binding[var]["value"] + " "
    print(line)

print("\n*** STATISTICS ***")
Ejemplo n.º 10
0
def get_pop(url):
    """Print the popularity of Wikidata entity Q11268, measured as the
    number of triples in which it appears as the object.

    NOTE(review): the ``url`` parameter is currently unused and the entity
    id is hard-coded — parameter kept for caller compatibility; confirm
    whether the entity was meant to be derived from ``url``.

    :param url: unused.
    """
    db = trident.Db("/app/assignment/assets/wikidata-20200203-truthy-uri-tridentdb")

    # Renamed from `id` to avoid shadowing the builtin.
    entity_id = db.lookup_id("<http://www.wikidata.org/entity/Q11268>")
    print(db.count_o(entity_id))