def __init__(self, noun_phrase, matching=None):
    self.noun_phrase = noun_phrase
    # avoid a mutable default argument; an empty dict means "no cache"
    if matching is None:
        matching = {}
    wikidata_results = []
    try:
        results = None
        if noun_phrase.text in matching:
            # reuse the result prefetched by the parallel matching step
            results = matching[noun_phrase.text]
            if results is None:
                if RETRY_PARALLEL_MATCHING:
                    results = Wikidata.search_by_label(noun_phrase.text)
                else:
                    raise WikidataItemsNotFound()
        else:
            results = Wikidata.search_by_label(noun_phrase.text)
        for result in results:
            wikidata_item = WikidataItem.from_search_result(result)
            wikidata_results.append(wikidata_item)
    except WikidataItemsNotFound:
        # print("WikidataItemsNotFound ", noun_phrase)
        pass
    # DBpedia items
    self.batch = []
    for item in wikidata_results:
        self.batch.append(UniversalItem.from_wikidata_item(item))
    if len(self.batch) == 0:
        raise EmptyItemsBatch()
    # mark the first PRIMARY_COUNT items as primary candidates
    for i in range(PRIMARY_COUNT):
        try:
            self.batch[i].primary = True
        except IndexError:
            break
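# A minimal usage sketch, under assumptions: the enclosing class is not
# named in this snippet ("ItemsBatch" below is a hypothetical stand-in),
# and "matching" is an optional cache of prefetched search results keyed
# by phrase text, as produced by the parallel matching step.
#
# try:
#     batch = ItemsBatch(noun_phrase, matching=prefetched_matching)
# except EmptyItemsBatch:
#     pass  # no Wikidata item could be linked to the phrase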
def substitutes(self, strict=False):
    if strict:
        from_item, to_item, triples = self.construct_sparql_strict()
    else:
        from_item, to_item, triples = self.construct_sparql()
    template = """
    SELECT {} {}Label {} {}Label
    WHERE {{
        {}
        SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" }}
    }}
    LIMIT 500
    """
    query = template.format(from_item, from_item, to_item, to_item, triples)
    # print(query)
    try:
        response = Wikidata.sparql(query)
    except NoSPARQLResponse:
        return None, []
    count = len(response['results']['bindings'])
    substitutes = []
    for path in response['results']['bindings']:
        # strip the leading "?" from the variable name to index the binding
        question = path[from_item[1:] + 'Label']['value']
        answer = path[to_item[1:] + 'Label']['value']
        substitutes.append((question, answer))
    return count, substitutes
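# A hedged sketch (not from the source) of how substitutes() can be used:
# given a discovered semantic path, it materializes up to 500 analogous
# (question label, answer label) pairs from Wikidata.
#
# count, pairs = path.substitutes(strict=True)
# if count:
#     for question_label, answer_label in pairs:
#         print(question_label, "->", answer_label)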
@classmethod
def from_get_result(cls, data):
    item_id = data['id']
    label = None
    try:
        label = data['labels']['en']['value']
    except KeyError:
        raise NoEnglishLabelAvailable()
    description = None
    try:
        # wbgetentities returns the plural "descriptions" key
        description = data['descriptions']['en']['value']
    except KeyError:
        pass
    # claims default to None when the record has no 'claims' key
    claims = None
    if 'claims' in data:
        claims = Wikidata.extract_claims(data)
    # print(item_id, label, claims)
    return cls(item_id, label=label, description=description, claims=claims)
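# A hedged example of the input shape from_get_result expects: a single
# entity record in the style of Wikidata's wbgetentities JSON. The values
# are illustrative, not taken from the source.
example_get_result = {
    'id': 'Q64',
    'labels': {'en': {'value': 'Berlin'}},
    'descriptions': {'en': {'value': 'capital of Germany'}},
    'claims': {},  # raw claims, condensed by Wikidata.extract_claims()
}
# item = WikidataItem.from_get_result(example_get_result)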
def matching_parallel(indexed_noun_phrases, disable_wordnet=False):
    # optimization step: parallel entity linking
    # build the paired permutations first
    permuted_noun_phrases = []
    matching_queries = []
    for idx, noun_phrase in indexed_noun_phrases:
        permutations = noun_phrase.get_permutations(
            disable_wordnet=disable_wordnet)
        permuted_noun_phrases.append((idx, noun_phrase, permutations))
        for permutation in permutations:
            matching_queries.append(permutation.text)
    # parallel linking step
    matching = Wikidata.search_by_label_parallel(matching_queries)
    return permuted_noun_phrases, matching
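# A usage sketch under assumptions: indexed_noun_phrases is an iterable of
# (index, NounPhrase) pairs, and the returned "matching" dict maps a query
# string to its prefetched search results (or None on failure), suitable
# for passing into the batch constructor above.
#
# permuted, matching = matching_parallel(enumerate(noun_phrases))
# for idx, noun_phrase, permutations in permuted:
#     for permutation in permutations:
#         results = matching.get(permutation.text)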
def apply_path(self, from_item):
    from_item = "wd:{}".format(from_item)
    _, to_item, triples = self.construct_sparql()
    template = """
    SELECT {} {}Label
    WHERE {{
        {}
        SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" }}
    }}
    LIMIT 5
    """
    query = template.format(to_item, to_item, triples)
    # substitute the path's source variable with the concrete item
    query = query.replace("?item0", from_item)
    # print(query)
    try:
        response = Wikidata.sparql(query)
    except NoSPARQLResponse:
        return None, []
    count = len(response['results']['bindings'])
    answers = []
    for path in response['results']['bindings']:
        answer = path[to_item[1:] + 'Label']['value']
        answers.append(answer)
    return count, answers
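# A hedged sketch of applying a stored path to a new starting entity;
# "Q64" (Berlin) is an illustrative Wikidata item id, not from the source.
#
# count, answers = path.apply_path("Q64")
# if count:
#     print(answers)  # the query itself is capped at LIMIT 5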
def main(): """ Generation of the dataset. """ # check number of arguments if len(sys.argv) < 2: sys.exit("Too few arguments, please sepcify" " directory as a first argument.") # select QALD dataset directory = sys.argv[1] files_json = list(pathlib.Path(directory).glob('*.json')) # resulting dataset result = {} result["questions"] = [] # for every dataset for file_json in files_json: info('Opening', file_json) with open(file_json) as data_file: data = json.load(data_file) dataset_id = data['dataset']['id'] info("Dataset:\t", dataset_id) extended = False if isinstance(data['questions'][0]['question'], dict): extended = True info("Extended:\t", extended) question_number = len(data['questions']) info("Questions:\t", question_number) # dataset metrics skipped = 0 interlinked = 0 resources_questions = 0 # for every question from the dataset for question in data['questions']: # get question string = None keywords = [] multilingual = None if not extended: multilingual = question['question'] else: multilingual = question['question']['language'] for lang_question in multilingual: if lang_question["language"] == "en": if not extended: string = lang_question['string'] else: string = lang_question['question'] if 'keywords' in lang_question: keywords = lang_question['keywords'].split(',') keywords = [keyword.strip() for keyword in keywords] # check skip skip_flag = False for added_question in result["questions"]: if string == added_question["string"]: skip_flag = True break if skip_flag: skipped += 1 continue # filter keywords items = [] for keyword in keywords: in_question = True for word in keyword.split(' '): if not word.lower() in string.lower(): in_question = False is_noun_phrase = False collocation = nltk.word_tokenize(keyword) for word in nltk.pos_tag(collocation): if word[1] in ["NN", "NNS", "NNP", "NNPS"]: is_noun_phrase = True if in_question and is_noun_phrase: items.append(keyword) # answer metadata if not extended: answertype = question["answertype"] else: answertype = question["question"]["answertype"] if not extended: aggregation = question["aggregation"] else: aggregation = question["question"]["metadata"]["aggregation"] if answertype == 'resource': resources_questions += 1 # get answers result_answers = [] answers = None try: if not extended: answers = question["answers"][0] else: answers = question["question"]["answers"] skip_answers_flag = False try: answer_attr = answers['head']['vars'][0] except KeyError: if 'boolean' in answers: answertype = 'boolean' result_answers.append(answers['boolean']) skip_answers_flag = True else: raise if not skip_answers_flag: answers = answers['results']['bindings'] if len(answers) > ANSWERS_LIMIT: skipped += 1 continue every_answer_interlinked = True for answer in answers: if answer_attr not in answer: answer_attr = '"' + answer_attr + '"' value = answer[answer_attr]['value'] result_answer = {'answertype': answertype} if answertype != 'resource': result_answer['value'] = value else: if value.startswith('http://dbpedia.org/'): result_answer['dbpedia'] = value try: wikidata = DBpedia.wd_from_link(value) except NoSPARQLResponse: wikidata = None if wikidata is not None: result_answer['wikidata'] = wikidata else: every_answer_interlinked = False elif value.startswith('http://www.wikidata.org/'): result_answer['wikidata'] = value # append labels if 'wikidata' in result_answer: answer_label = Wikidata.get_label_by_uri( result_answer['wikidata']) result_answer['wikidata_label'] = answer_label result_answers.append(result_answer) # count interlinked values if 
every_answer_interlinked and (answertype == 'resource'): interlinked += 1 except IndexError: pass # no answer available # add question result_question = { 'string': string, 'keywords': keywords, 'items': items, 'answertype': answertype, 'aggregation': aggregation, 'answers': result_answers, 'dataset': dataset_id } result["questions"].append(result_question) info("Skipped:\t", skipped) info("Added:\t", question_number - skipped) info("Interlinked:\t", interlinked) info("Resources:\t", resources_questions) # print the final dataset to STDOUT print(json.dumps(result, indent=4, sort_keys=True))
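# A hedged invocation sketch: the script name below is a placeholder, but
# main() reads a directory of QALD *.json files from argv[1] and writes the
# merged dataset to stdout, so a run would look like:
#
#     python generate_dataset.py qald-datasets/ > dataset.json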
# searching for a semantic path
print("Matching:", question)
start_time = time.time()
try:
    entities_sets = qa_system.get_entities_set(question)
except InvalidEntitiesSet:
    continue
matching_processing_time = time.time() - start_time
MATCHING_TIMES.append(matching_processing_time)
# evaluating the found semantic path:
# transform the entity sets into labels
items = []
for entity_set in entities_sets:
    items += entity_set.items
item_ids = [item.wd_item_id for item in items]
entities = Wikidata.get_items(item_ids)
labels = []
for _, entity in entities.items():
    if 'labels' in entity:
        if 'en' in entity['labels']:
            labels.append(entity['labels']['en']['value'])
unique_labels = list(set(labels))
print(unique_labels)
print(qald_items)
# calculate measures on lowercased labels
unique_labels = [string.lower() for string in unique_labels]
qald_items = [string.lower() for string in qald_items]
true_positive = 0
false_negatives = 0
false_positives = 0
for item in qald_items:
import json
import random
import sys

sys.path.append('/Users/kusha/qas')
from qas.wikidata import Wikidata
from qas.core import QASystem

with open("dataset.json") as data_file:
    data = json.load(data_file)

# collect (question, answer label) pairs for single-resource answers
pairs = []
for question in data['questions']:
    if question['answertype'] == 'resource':
        if len(question['answers']) == 1:
            if "wikidata" in question['answers'][0]:
                answer_uri = question['answers'][0]["wikidata"]
                answer_label = Wikidata.get_label_by_uri(answer_uri)
                print(answer_uri, answer_label)
                pairs.append((question['string'], answer_label))

random.shuffle(pairs)

success = 0
processed = 0
with QASystem(db_filename="knowledge.db") as qa_system:
    for question, answer in pairs:
        print("suppress output")
        # sys.stdout = os.devnull
        # sys.stderr = os.devnull
def connect(self, *labels, interrupt="first"):
    print("==== CONNECTION OVER GRAPH ====")
    # directions of search,
    # for a basic example: [['question', 'answer']]
    directions = [list(pair) for pair in itertools.combinations(labels, 2)]
    # dictionary for the final solutions:
    # a frozenset of the direction is the key, the paths are the value
    self.solutions = {}
    timeout = None
    # for every path length up to the maximum
    path_length_at_times = []
    for path_length in range(1, MAX_PATH_LENGTH):
        # save a processing time measure
        path_length_at_times.append(time.time())
        if timeout is not None:
            timeout = (timeout + 5.0) ** 2
        # optimization step: asynchronous SPARQL querying
        if not DISABLE_PARALLEL:
            sparql_queries = []
            for direction in directions:
                if self.skip_direction(path_length, direction):
                    continue
                for (item_from, item_to), link_config in \
                        self.path_comb(direction, path_length):
                    query = self.construct_query(
                        link_config, item_from, item_to)
                    sparql_queries.append(query)
            print("Timeout for path length", path_length, ":", timeout)
            sparql_responses, timeout = Wikidata.sparql_parallel(
                sparql_queries, timeout=timeout)
            print("Elapsed at path length", path_length, ":", timeout)
        else:
            # print("parallel querying is disabled")
            sparql_responses, timeout = {}, None
        # for every direction between labels (question -> answer)
        for direction in directions:
            print("Length: {}, Labels: {} -> {}:".format(
                path_length, direction[0], direction[1]))
            if self.skip_direction(path_length, direction):
                continue
            pathes_at_length = []
            for (item_from, item_to), link_config in \
                    self.path_comb(direction, path_length):
                query = self.construct_query(link_config, item_from, item_to)
                response = None
                if query in sparql_responses:
                    # use the preloaded parallel result
                    response = sparql_responses[query]
                    if response is None:
                        if RETRY_PARALLEL_SPARQL or DISABLE_PARALLEL:
                            try:
                                response = Wikidata.sparql(query)
                            except NoSPARQLResponse:
                                print("RTRETIME @",
                                      self.pp_link_config(link_config))
                                continue
                        else:
                            print("PARNONE @",
                                  self.pp_link_config(link_config))
                            continue
                else:
                    try:
                        response = Wikidata.sparql(query)
                    except NoSPARQLResponse:
                        print("TIMEOUT @",
                              self.pp_link_config(link_config))
                        continue
                pathes = self.process_response(response)
                pathes = [Path(path, link_config, item_from, item_to)
                          for path in pathes]
                pathes = self.filter_pathes(pathes)
                if len(pathes) == 0:
                    print("NO_CONN @", self.pp_link_config(link_config))
                    continue
                print("SUCCESS @", self.pp_link_config(link_config))
                if len(pathes) <= 3:
                    for path in pathes:
                        print(path)
                else:
                    print("[ ... {} paths found ... ]".format(len(pathes)))
                pathes_at_length += pathes
            if len(pathes_at_length):
                if frozenset(direction) in self.solutions:
                    self.solutions[frozenset(direction)] += pathes_at_length
                else:
                    self.solutions[frozenset(direction)] = pathes_at_length
    # print processing time info
    path_length_at_times.append(time.time())
    print("-" * 20)
    for idx, timestamp in list(enumerate(path_length_at_times))[1:]:
        processing_time = timestamp - path_length_at_times[idx - 1]
        print('TIME AT LENGTH {}: {:.4f}'.format(idx, processing_time))
    for direction, pathes in self.solutions.items():
        # print(direction)
        # keep only the shortest paths; note this rebinds a local copy
        # and does not write the filtered list back into self.solutions
        min_length = min([path.length for path in pathes])
        pathes = [path for path in pathes if path.length == min_length]
        # pathes = sorted(pathes, key=lambda x: x.length)
        # for path in pathes:
        #     print(path)
    # print(self.solutions)
    return self.solutions
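# A minimal sketch of driving connect(); the label names mirror the
# "question -> answer" example in the comments above, but the surrounding
# graph object and its setup are assumptions, not shown in the source.
#
# solutions = graph.connect('question', 'answer')
# for direction, found_paths in solutions.items():
#     print(direction, len(found_paths), "candidate paths")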