Example #1
 def __init__(self, noun_phrase, matching=None):
     # avoid a shared mutable default; treat a missing mapping as empty
     if matching is None:
         matching = {}
     self.noun_phrase = noun_phrase
     wikidata_results = []
     try:
         results = None
         if noun_phrase.text in matching:
             results = matching[noun_phrase.text]
             if results is None:
                 if RETRY_PARALLEL_MATCHING:
                     results = Wikidata.search_by_label(noun_phrase.text)
                 else:
                     raise WikidataItemsNotFound()
         else:
             results = Wikidata.search_by_label(noun_phrase.text)
         for result in results:
             wikidata_item = WikidataItem.from_search_result(result)
             wikidata_results.append(wikidata_item)
     except WikidataItemsNotFound:
         # print("WikidataItemsNotFound ", noun_phrase)
         pass
     # wrap the Wikidata matches into universal (Wikidata/DBpedia) items
     self.batch = []
     for item in wikidata_results:
         self.batch.append(UniversalItem.from_wikidata_item(item))
     if not self.batch:
         raise EmptyItemsBatch()
     # mark the first PRIMARY_COUNT candidates as primary
     for item in self.batch[:PRIMARY_COUNT]:
         item.primary = True
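For reference, the matching argument maps noun-phrase surface forms to pre-fetched search results, with None marking a parallel lookup that failed (see Example #4). A hedged sketch of that shape; the payload keys are illustrative, not taken from the examples:

matching = {
    # cached parallel hit, same structure Wikidata.search_by_label returns
    "Berlin": [{"id": "Q64", "label": "Berlin"}],
    # parallel lookup failed; retried serially if RETRY_PARALLEL_MATCHING is set
    "asdfgh": None,
}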
Example #2
 def substitutes(self, strict=False):
     if strict:
         from_item, to_item, triples = self.construct_sparql_strict()
     else:
         from_item, to_item, triples = self.construct_sparql()
     template = """
     SELECT  {} {}Label {} {}Label WHERE {{
         {}
         SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" }}
     }} LIMIT 500
     """
     query = template.format(from_item,
                             from_item,
                             to_item,
                             to_item,
                             triples)
     # print(query)
     try:
         response = Wikidata.sparql(query)
     except NoSPARQLResponse:
         return None, []
     count = len(response['results']['bindings'])
     substitutes = []
     for path in response['results']['bindings']:
         # strip the leading '?' to index the label bindings
         question = path[from_item[1:] + 'Label']['value']
         answer = path[to_item[1:] + 'Label']['value']
         substitutes.append((question, answer))
     return count, substitutes
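To make the template concrete: construct_sparql() is not shown, but the slicing from_item[1:] above implies its variable names start with '?'. A minimal rendering under that assumption, with a made-up one-hop property path:

from_item, to_item = "?from0", "?to0"   # assumed variable names
triples = "?from0 wdt:P50 ?to0 ."       # hypothetical path ("author")
query = template.format(from_item, from_item, to_item, to_item, triples)
# SELECT ?from0 ?from0Label ?to0 ?to0Label WHERE {
#     ?from0 wdt:P50 ?to0 .
#     SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
# } LIMIT 500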
Example #3
 @classmethod
 def from_get_result(cls, data):
     item_id = data['id']
     try:
         label = data['labels']['en']['value']
     except KeyError:
         raise NoEnglishLabelAvailable()
     description = None
     try:
         # the entity payload stores descriptions under the plural key
         description = data['descriptions']['en']['value']
     except KeyError:
         pass
     # initialize claims so the constructor call below cannot hit a NameError
     claims = None
     if 'claims' in data:
         claims = Wikidata.extract_claims(data)
     # print(item_id, label, claims)
     return cls(item_id,
                label=label,
                description=description,
                claims=claims)
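A minimal wbgetentities-style payload this parser accepts; the values are illustrative, and attaching the classmethod to WikidataItem (the class seen in Example #1) is an assumption:

data = {
    "id": "Q64",
    "labels": {"en": {"language": "en", "value": "Berlin"}},
    "descriptions": {"en": {"language": "en", "value": "capital of Germany"}},
    "claims": {},  # statements, handed to Wikidata.extract_claims
}
item = WikidataItem.from_get_result(data)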
Example #4
def matching_parallel(indexed_noun_phrases, disable_wordnet=False):
    # optimization step: entity linking in parallel;
    # first build the permutations for every noun phrase
    permuted_noun_phrases = []
    matching_queries = []
    for idx, noun_phrase in indexed_noun_phrases:
        permutations = noun_phrase.get_permutations(
            disable_wordnet=disable_wordnet)
        permuted_noun_phrases.append((
            idx,
            noun_phrase,
            permutations,
        ))
        for permutation in permutations:
            matching_queries.append(permutation.text)
    # parallel linking step
    matching = Wikidata.search_by_label_parallel(matching_queries)
    return permuted_noun_phrases, matching
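A hedged end-to-end sketch of consuming the returned pair; the Entity class name is hypothetical, but the matching dict is exactly what the constructor in Example #1 takes:

permuted, matching = matching_parallel(list(enumerate(noun_phrases)))
for idx, noun_phrase, permutations in permuted:
    for permutation in permutations:
        try:
            entity = Entity(permutation, matching=matching)  # hypothetical class
        except EmptyItemsBatch:
            continue  # no Wikidata candidate for this permutation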
Example #5
 def apply_path(self, from_item):
     from_item = "wd:{}".format(from_item)
     _, to_item, triples = self.construct_sparql()
     template = """
     SELECT {} {}Label WHERE {{
         {}
         SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" }}
     }} LIMIT 5
     """
     query = template.format(to_item,
                             to_item,
                             triples)
     # bind the path's starting variable to the concrete source item
     query = query.replace("?item0", from_item)
     # print(query)
     try:
         response = Wikidata.sparql(query)
     except NoSPARQLResponse:
         return None, []
     count = len(response['results']['bindings'])
     answers = []
     for path in response['results']['bindings']:
         answer = path[to_item[1:]+'Label']['value']
         answers.append(answer)
     return count, answers
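Illustration of the rewrite step, where path stands for an instance of the class owning apply_path: the generated query starts at the variable ?item0, which apply_path rebinds to a concrete entity. With a hypothetical one-hop path over wdt:P17 ("country") and an assumed target variable ?to1:

count, answers = path.apply_path("Q64")  # start the path at Berlin, say
# SELECT ?to1 ?to1Label WHERE {
#     wd:Q64 wdt:P17 ?to1 .
#     SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
# } LIMIT 5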
Example #6
def main():
    """
    Generation of the dataset.
    """
    # check number of arguments
    if len(sys.argv) < 2:
        sys.exit("Too few arguments, please sepcify"
                 " directory as a first argument.")
    # select QALD dataset
    directory = sys.argv[1]
    files_json = list(pathlib.Path(directory).glob('*.json'))

    # resulting dataset
    result = {}
    result["questions"] = []

    # for every dataset
    for file_json in files_json:
        info('Opening', file_json)
        with open(file_json) as data_file:
            data = json.load(data_file)

        dataset_id = data['dataset']['id']
        info("Dataset:\t", dataset_id)

        extended = isinstance(data['questions'][0]['question'], dict)
        info("Extended:\t", extended)

        question_number = len(data['questions'])
        info("Questions:\t", question_number)

        # dataset metrics
        skipped = 0
        interlinked = 0
        resources_questions = 0

        # for every question from the dataset
        for question in data['questions']:
            # get question
            string = None
            keywords = []
            multilingual = None
            if not extended:
                multilingual = question['question']
            else:
                multilingual = question['question']['language']
            for lang_question in multilingual:
                if lang_question["language"] == "en":
                    if not extended:
                        string = lang_question['string']
                    else:
                        string = lang_question['question']
                    if 'keywords' in lang_question:
                        keywords = lang_question['keywords'].split(',')
            keywords = [keyword.strip() for keyword in keywords]

            # check skip
            skip_flag = False
            for added_question in result["questions"]:
                if string == added_question["string"]:
                    skip_flag = True
                    break
            if skip_flag:
                skipped += 1
                continue

            # filter keywords
            items = []
            for keyword in keywords:
                # keep keywords that appear verbatim in the question...
                in_question = all(word.lower() in string.lower()
                                  for word in keyword.split(' '))
                # ...and that contain at least one noun
                collocation = nltk.word_tokenize(keyword)
                is_noun_phrase = any(tag in ("NN", "NNS", "NNP", "NNPS")
                                     for _, tag in nltk.pos_tag(collocation))
                if in_question and is_noun_phrase:
                    items.append(keyword)

            # answer metadata
            if not extended:
                answertype = question["answertype"]
            else:
                answertype = question["question"]["answertype"]
            if not extended:
                aggregation = question["aggregation"]
            else:
                aggregation = question["question"]["metadata"]["aggregation"]

            if answertype == 'resource':
                resources_questions += 1

            # get answers
            result_answers = []
            answers = None
            try:
                if not extended:
                    answers = question["answers"][0]
                else:
                    answers = question["question"]["answers"]
                skip_answers_flag = False
                try:
                    answer_attr = answers['head']['vars'][0]
                except KeyError:
                    if 'boolean' in answers:
                        answertype = 'boolean'
                        result_answers.append(answers['boolean'])
                        skip_answers_flag = True
                    else:
                        raise
                if not skip_answers_flag:
                    answers = answers['results']['bindings']
                    if len(answers) > ANSWERS_LIMIT:
                        skipped += 1
                        continue
                    every_answer_interlinked = True
                    for answer in answers:
                        if answer_attr not in answer:
                            answer_attr = '"' + answer_attr + '"'
                        value = answer[answer_attr]['value']
                        result_answer = {'answertype': answertype}
                        if answertype != 'resource':
                            result_answer['value'] = value
                        else:
                            if value.startswith('http://dbpedia.org/'):
                                result_answer['dbpedia'] = value
                                try:
                                    wikidata = DBpedia.wd_from_link(value)
                                except NoSPARQLResponse:
                                    wikidata = None
                                if wikidata is not None:
                                    result_answer['wikidata'] = wikidata
                                else:
                                    every_answer_interlinked = False
                            elif value.startswith('http://www.wikidata.org/'):
                                result_answer['wikidata'] = value
                        # append labels
                        if 'wikidata' in result_answer:
                            answer_label = Wikidata.get_label_by_uri(
                                result_answer['wikidata'])
                            result_answer['wikidata_label'] = answer_label
                        result_answers.append(result_answer)
                    # count interlinked values
                    if every_answer_interlinked and (answertype == 'resource'):
                        interlinked += 1
            except IndexError:
                pass  # no answer available

            # add question
            result_question = {
                'string': string,
                'keywords': keywords,
                'items': items,
                'answertype': answertype,
                'aggregation': aggregation,
                'answers': result_answers,
                'dataset': dataset_id
            }
            result["questions"].append(result_question)

        info("Skipped:\t", skipped)
        info("Added:\t", question_number - skipped)
        info("Interlinked:\t", interlinked)
        info("Resources:\t", resources_questions)

    # print the final dataset to STDOUT
    print(json.dumps(result, indent=4, sort_keys=True))
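For orientation, one record of the dataset printed at the end, following the result_question structure built above (all values, including the dataset id, are illustrative):

{
    "string": "What is the capital of Germany?",
    "keywords": ["capital", "Germany"],
    "items": ["capital", "Germany"],
    "answertype": "resource",
    "aggregation": false,
    "answers": [
        {
            "answertype": "resource",
            "dbpedia": "http://dbpedia.org/resource/Berlin",
            "wikidata": "http://www.wikidata.org/entity/Q64",
            "wikidata_label": "Berlin"
        }
    ],
    "dataset": "qald-7-train-multilingual"
}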
Example #7
 # searching for a semantic path
 print("Matching:", question)
 start_time = time.time()
 try:
     entities_sets = qa_system.get_entities_set(question)
 except InvalidEntitiesSet:
     continue
 matching_processing_time = time.time() - start_time
 MATCHING_TIMES.append(matching_processing_time)
 # evaluating found semantic path
 # transforming entities set into labels
 items = []
 for entity_set in entities_sets:
     items += entity_set.items
 item_ids = [item.wd_item_id for item in items]
 entities = Wikidata.get_items(item_ids)
 labels = []
 for _, entity in entities.items():
     if 'labels' in entity:
         if 'en' in entity['labels']:
             labels.append(entity['labels']['en']['value'])
 unique_labels = list(set(labels))
 print(unique_labels)
 print(qald_items)
 # calculate measures
 unique_labels = [string.lower() for string in unique_labels]
 qald_items = [string.lower() for string in qald_items]
 true_positive = 0
 false_negatives = 0
 false_positives = 0
 for item in qald_items:
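     # the snippet is truncated here; what follows is a plausible completion,
     # assuming qald_items holds the gold-standard labels (not original code)
     if item in unique_labels:
         true_positive += 1
     else:
         false_negatives += 1
 false_positives = sum(1 for label in unique_labels
                       if label not in qald_items)
 precision = true_positive / ((true_positive + false_positives) or 1)
 recall = true_positive / ((true_positive + false_negatives) or 1)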
Example #8
import json
import random
import sys

sys.path.append('/Users/kusha/qas')
from qas.wikidata import Wikidata
from qas.core import QASystem

with open("dataset.json") as data_file:
    data = json.load(data_file)

pairs = []
for question in data['questions']:
    if question['answertype'] == 'resource':
        if len(question['answers']) == 1:
            if "wikidata" in question['answers'][0]:
                answer_uri = question['answers'][0]["wikidata"]
                answer_label = Wikidata.get_label_by_uri(answer_uri)
                print(answer_uri, answer_label)
                pairs.append((
                    question['string'],
                    answer_label,
                ))

random.shuffle(pairs)

success = 0
processed = 0
with QASystem(db_filename="knowledge.db") as qa_system:
    for question, answer in pairs:
        print("suppress output")
        # sys.stdout = os.devnull
        # sys.stderr = os.devnull
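        # truncated in the source; a plausible continuation. The answer()
        # method is hypothetical (only get_entities_set appears in Example #7),
        # and contextlib.redirect_stdout performs the output suppression
        # hinted at by the commented lines above.
        import contextlib, io
        with contextlib.redirect_stdout(io.StringIO()):
            predicted = qa_system.answer(question)  # hypothetical API
        processed += 1
        if predicted is not None and predicted.lower() == answer.lower():
            success += 1

print("accuracy:", success / processed if processed else 0.0)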
Example #9
    def connect(self, *labels, interrupt="first"):

        print("==== CONNECTION OVER GRAPH ====")

        # directions of search
        # for a basic example: [['question', 'answer']]
        directions = [list(pair)
                      for pair in itertools.combinations(labels, 2)]

        # dictionary for final solutions
        # frozenset is a key, path is a value
        self.solutions = {}

        timeout = None
        # for path length until maximum
        path_length_at_times = []
        for path_length in range(1, MAX_PATH_LENGTH):
            # save processing time measure
            path_length_at_times.append(time.time())

            if timeout is not None:
                timeout = (timeout + 5.0) ** 2

            # optimization step, async SPARQL querying
            if not DISABLE_PARALLEL:
                sparql_queries = []
                for direction in directions:
                    if self.skip_direction(path_length, direction):
                        continue
                    for (item_from, item_to), link_config in \
                            self.path_comb(direction, path_length):
                        query = self.construct_query(link_config,
                                                     item_from,
                                                     item_to)
                        sparql_queries.append(query)
                print("Timeout for path length", path_length, ":", timeout)
                sparql_responses, timeout = Wikidata.sparql_parallel(
                    sparql_queries,
                    timeout=timeout)
                print("Elapsed at path length", path_length, ":", timeout)
            else:
                # print("parallel querying is disabled")
                sparql_responses, timeout = {}, None

            # for direction between labels (question -> answer)
            for direction in directions:
                print("Length: {}, Labels: {} -> {}:".format(
                    path_length, direction[0], direction[1]))

                if self.skip_direction(path_length, direction):
                    continue

                pathes_at_length = []

                for (item_from, item_to), link_config in \
                        self.path_comb(direction, path_length):
                    query = self.construct_query(link_config, item_from, item_to)
                    response = None
                    # use preloaded parallel results
                    if query in sparql_responses:
                        response = sparql_responses[query]
                        if response is None:
                            if RETRY_PARALLEL_SPARQL or DISABLE_PARALLEL:
                                try:
                                    response = Wikidata.sparql(query)
                                except NoSPARQLResponse:
                                    print("RTRETIME @",
                                          self.pp_link_config(link_config))
                                    continue
                            else:
                                print("PARNONE @",
                                      self.pp_link_config(link_config))
                                continue
                    else:
                        try:
                            response = Wikidata.sparql(query)
                        except NoSPARQLResponse:
                            print("TIMEOUT @",
                                  self.pp_link_config(link_config))
                            continue
                    pathes = self.process_response(response)
                    pathes = [Path(path, link_config, item_from, item_to)
                              for path in pathes]
                    pathes = self.filter_pathes(pathes)
                    if len(pathes) == 0:
                        print("NO_CONN @",
                              self.pp_link_config(link_config))
                        continue
                    print("SUCCESS @",
                          self.pp_link_config(link_config))
                    if len(pathes) <= 3:
                        for path in pathes:
                            print(path)
                    else:
                        print("[ ... {} paths found ... ]".format(
                            len(pathes)))
                    pathes_at_length += pathes
                if pathes_at_length:
                    if frozenset(direction) in self.solutions:
                        self.solutions[frozenset(direction)] += pathes_at_length
                    else:
                        self.solutions[frozenset(direction)] = pathes_at_length
        # print processing time info
        path_length_at_times.append(time.time())
        print("-" * 20)
        for idx, timestamp in list(enumerate(path_length_at_times))[1:]:
            processing_time = timestamp - path_length_at_times[idx-1]
            print('TIME AT LENGTH {}: {:.4f}'.format(idx, processing_time, ))

        for direction, pathes in self.solutions.items():
            # debug-only inspection: narrow each direction to its shortest
            # paths (the filtered list is not used further below)
            # print(direction)
            min_length = min(path.length for path in pathes)
            pathes = [path
                      for path in pathes
                      if path.length == min_length]
            # pathes = sorted(pathes, key=lambda x: x.length)
            # for path in pathes:
            #     print(path)
        # print(self.solutions)
        return self.solutions
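For intuition, the directions built at the top of connect() are just the unordered label pairs; a standalone illustration:

import itertools
labels = ["question", "answer", "context"]   # sample labels
directions = [list(pair) for pair in itertools.combinations(labels, 2)]
# [['question', 'answer'], ['question', 'context'], ['answer', 'context']]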