def right_subgraph_relation_3_generation(current_uid: int, question_template: Dict[str, Any],
                                         generated_questions: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Generate a "relation_3" variant of a right-subgraph question template.

    Works on a deep copy of ``question_template``. The entity and property ids
    of its ``sparql_wikidata`` query are collected, the first answer of the
    query is fetched, and two candidate-property queries built around that
    answer are handed to ``questions_generator.relation_3_generation_common_part``.
    The ``|...|`` markers inside the query strings are passed through verbatim
    (presumably substituted by the common part -- confirm there).

    Returns the result of ``right_subgraph_relation_2_generation`` when no
    candidate was found, otherwise the copied template after
    ``right_subgraph_nnqt_question_construction`` has been applied to it.
    """
    generated_template = deepcopy(question_template)
    query = generated_template['sparql_wikidata']
    has_filter = 'filter' in query.lower()

    if has_filter:
        entity_ids = questions_generator.get_elements_from_query(query, [0])
        # The second "entity" is the value bound to the filtered variable: select that
        # variable instead of the answer variable and read the first binding
        answer_var = re.findall(r'SELECT (\?\w*) WHERE', query, re.IGNORECASE)[0]
        filtered_var = re.findall(r' (\?\w*) filter', query, re.IGNORECASE)[0]
        entity_query = query.replace(answer_var, filtered_var, 1)
        entity_binding = questions_generator.get_sparql_query_results(entity_query)['results']['bindings'][0]
        entity_ids.append(next(iter(entity_binding.values()))['value'].split("/")[-1])
    else:
        entity_ids = questions_generator.get_elements_from_query(query, [0, 1])

    property_ids = questions_generator.get_elements_from_query(query, [0, 1], True)
    answer_binding = questions_generator.get_sparql_query_results(query)['results']['bindings'][0]
    answer_id = next(iter(answer_binding.values()))['value'].split("/")[-1]

    first_query = ("select ?ans where {?sbj ?ans wd:" + answer_id +
                   " . ?sbj wdt:|rel_entity_type| wd:|entity_type| . FILTER (?ans not in (wdt:|old_property_id|))} LIMIT 20")
    if has_filter:
        second_query = ("select ?ans where {wd:" + answer_id +
                        " ?ans ?obj . FILTER (|filter|?obj| && ?ans not in (wdt:|old_property_id|))} LIMIT 20")
    else:
        second_query = ("select ?ans where {wd:" + answer_id +
                        " ?ans ?obj . ?obj wdt:|rel_entity_type| wd:|entity_type| . FILTER (?ans not in (wdt:|old_property_id|))} LIMIT 20")

    found, old_properties = questions_generator.relation_3_generation_common_part(
        current_uid, generated_template, generated_questions, entity_ids, property_ids,
        [first_query, second_query])
    if found:
        right_subgraph_nnqt_question_construction(generated_template)
        return generated_template
    # There aren't valid candidates, so try with "relation_2" function
    return right_subgraph_relation_2_generation(current_uid, generated_template, generated_questions,
                                                property_ids, old_properties)
def right_subgraph_2_relation_3_generation(current_uid: int, question_template: Dict[str, Any],
                                           generated_questions: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Generate a "relation_3" variant of a "right subgraph 2" question template.

    Works on a deep copy of ``question_template``. Besides the entity found in
    the ``sparql_wikidata`` query, the value bound to the intermediate
    variable (the one matched by the ``wdt:`` regex below) is used as a second
    entity. Two candidate-property queries are built, each in a "filter" or a
    "direct entity" form depending on whether
    ``questions_generator.get_filter_from_element`` returns a non-empty filter,
    and passed to ``questions_generator.relation_3_generation_common_part``.

    Returns the result of ``right_subgraph_2_relation_2_generation`` when no
    candidate was found, otherwise the copied template after
    ``right_subgraph_2_nnqt_question_construction`` has been applied to it.
    """
    generated_template = deepcopy(question_template)
    query = generated_template['sparql_wikidata']
    entity_ids = questions_generator.get_elements_from_query(query, [0])

    # Select the intermediate variable instead of the answer variable to get its value
    answer_var = re.findall(r'SELECT (\?\w*) WHERE', query, re.IGNORECASE)[0]
    middle_var = re.findall(r'. (\?\w*) wdt:', query)[0]
    middle_query = query.replace(answer_var, middle_var, 1)
    middle_binding = questions_generator.get_sparql_query_results(middle_query)['results']['bindings'][0]
    middle_entity = middle_binding[middle_var[1:]]['value'].split("/")[-1]
    entity_ids.append(middle_entity)

    property_ids = questions_generator.get_elements_from_query(query, [0, 1], True)
    answer_binding = questions_generator.get_sparql_query_results(query)['results']['bindings'][0]
    answer_id = next(iter(answer_binding.values()))['value'].split("/")[-1]

    middle_filter, _ = questions_generator.get_filter_from_element(middle_entity, "obj", "", False)
    answer_filter, _ = questions_generator.get_filter_from_element(answer_id, "sbj", "", False)

    # Pieces shared by all the query forms below
    subject_typed = " . ?sbj wdt:|rel_entity_type| wd:|entity_type| . FILTER ("
    object_typed = " . ?obj wdt:|rel_entity_type| wd:|entity_type| . FILTER ("
    exclusion_tail = "?ans not in (wdt:|old_property_id|))} LIMIT 20"

    if middle_filter:
        first_query = "select ?ans where {?sbj ?ans ?obj" + subject_typed + middle_filter + exclusion_tail
    else:
        first_query = "select ?ans where {?sbj ?ans wd:" + middle_entity + subject_typed + exclusion_tail

    if answer_filter:
        second_query = "select ?ans where {?sbj ?ans ?obj" + object_typed + answer_filter + exclusion_tail
    else:
        second_query = "select ?ans where {wd:" + answer_id + " ?ans ?obj" + object_typed + exclusion_tail

    found, old_properties = questions_generator.relation_3_generation_common_part(
        current_uid, generated_template, generated_questions, entity_ids, property_ids,
        [first_query, second_query])
    if found:
        right_subgraph_2_nnqt_question_construction(generated_template)
        return generated_template
    # There aren't valid candidates, so try with "relation_2" function
    return right_subgraph_2_relation_2_generation(current_uid, generated_template, generated_questions,
                                                  property_ids, old_properties)
def filter_questions(filename: str):
    """Keep only the dataset questions whose SPARQL query yields a usable answer.

    Reads ``dataset/<filename>.json`` and runs the ``sparql_wikidata`` query of
    every element. An element is kept when:

    * the result contains a ``boolean`` key, or
    * the result has at least one binding, it is not a ``Count_1``/``Count_2``
      template whose ``value`` is ``"0"``, and one of the first 50 bindings
      resolves to a non-empty label through ``convert_entity_qid_to_label``.

    Progress is written to ``dataset/<filename>_filtered.json`` every 5000
    indices as ``{"num_filtered_questions": <last processed index>,
    "questions": [...]}`` and is resumed from that file when it exists. At the
    end the same file is overwritten with the questions list only.

    NOTE(review): because the final file holds a bare list, re-running after a
    completed run will fail when reading ``num_filtered_questions`` -- confirm
    whether that is intended.

    :param filename: dataset name without directory and extension.
    """
    dataset_path = "dataset/" + filename + ".json"
    filtered_path = "dataset/" + filename + "_filtered.json"

    with open(dataset_path, "r") as json_file:
        json_data = json.load(json_file)
    if os.path.exists(filtered_path):
        with open(filtered_path, "r") as json_file:
            filtered_json_data = json.load(json_file)
    else:
        filtered_json_data = {"num_filtered_questions": -1, "questions": []}
    last_saved_index = filtered_json_data['num_filtered_questions']

    for index, element in enumerate(json_data):
        # Skip already saved data
        if index <= last_saved_index:
            continue
        if index % 100 == 0:
            print(index)
        # There are some questions considered malformed, which have to be skipped and so an empty list is used.
        # On a URLError the query is retried once; a second failure propagates to the caller
        try:
            results = questions_generator.get_sparql_query_results(element['sparql_wikidata'])
        except QueryBadFormed:
            results = []
        except URLError:
            results = questions_generator.get_sparql_query_results(element['sparql_wikidata'])

        if results:
            if 'boolean' in results:
                filtered_json_data['questions'].append(element)
            else:
                answers = results['results']['bindings']
                # For count templates the obtained number must be different from 0
                if answers and (element['template_id'] not in ("Count_1", "Count_2") or
                                answers[0]['value']['value'] != "0"):
                    # Check at most 50 answers until one with a label is found, since getting labels is a
                    # slow process and a question can have thousands of answers
                    answer = ""
                    for binding in answers[:50]:
                        # Fixed: the original indexed ``results`` (a dict) instead of the bindings list
                        values = iter(binding.values())
                        answer = convert_entity_qid_to_label(next(values)['value'].split("/")[-1])
                        # Fixed: the original read ``filtered_question``, which is not defined in this
                        # function; the element being filtered is ``element``
                        if not answer and (not element['subgraph'] or
                                           element['subgraph'] == "two intentions right subgraph"):
                            # A binding may hold a single value: do not fail when there is no second one
                            second_value = next(values, None)
                            if second_value is not None:
                                answer = convert_entity_qid_to_label(second_value['value'].split("/")[-1])
                        if answer:
                            break
                    if answer:
                        filtered_json_data['questions'].append(element)

        # Tracks the index of the last processed element, used to resume an interrupted run
        filtered_json_data['num_filtered_questions'] += 1
        # Save temporary results
        if index % 5000 == 0 and index > 0:
            with open(filtered_path, "w") as json_file:
                json.dump(filtered_json_data, json_file, indent=2, ensure_ascii=False)

    # Save only questions list
    with open(filtered_path, "w") as json_file:
        json.dump(filtered_json_data["questions"], json_file, indent=2, ensure_ascii=False)
def statement_property_2_relation_3_generation(current_uid: int, question_template: Dict[str, Any],
                                               generated_questions: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Generate a "relation_3" variant of a "statement property 2" question template.

    Works on a deep copy of ``question_template``. The property list is built
    as ``[pq property, p property]`` and two candidate queries (one per
    property) are passed to
    ``questions_generator.relation_3_generation_common_part`` together with the
    prefixes ``["pq", ""]``. The second query depends on whether the original
    query contains a filter.

    Returns the result of ``statement_property_2_relation_2_generation`` when
    no candidate was found, otherwise the copied template after
    ``statement_property_2_nnqt_question_construction`` has been applied to it.
    """
    generated_template = deepcopy(question_template)
    query = generated_template['sparql_wikidata']
    # Without a filter the last element of the query is an entity
    has_filter = 'filter' in query.lower()

    if has_filter:
        entity_ids = questions_generator.get_elements_from_query(query, [0])
        # There is only one entity used for both properties
        entity_ids.append(entity_ids[0])
    else:
        entity_ids = questions_generator.get_elements_from_query(query, [0, 1])

    statement_property = questions_generator.get_specific_elements_from_query(query, [0], "p", "P")[0]
    qualifier_property = questions_generator.get_specific_elements_from_query(query, [0], "pq", "P")[0]
    # The order is inverted because in the queries of this case the first entity is linked to the
    # second property and the second entity is linked to the first property
    property_ids = [qualifier_property, statement_property]

    answer_binding = questions_generator.get_sparql_query_results(query)['results']['bindings'][0]
    answer = next(iter(answer_binding.values()))['value'].split("/")[-1]
    # Find answer filter and type
    answer_filter, answer_type = questions_generator.get_filter_from_element(answer, "obj", "s")

    # The first query is the same for both cases
    if answer_type == questions_generator.ElementType.entity:
        first_query = ("select distinct ?ans where {?sbj ?rel ?s . ?s ?ans wd:" + answer +
                       " . ?sbj wdt:|rel_entity_type| wd:|entity_type| . FILTER (" +
                       answer_filter + " && ?ans not in (pq:|old_property_id|))} LIMIT 20")
    else:
        # The answer is not an entity
        first_query = ("SELECT distinct ?ans WHERE {?sbj ?rel ?s . ?s ?ans ?obj . ?sbj wdt:|rel_entity_type| wd:|entity_type| . FILTER (" +
                       answer_filter + " && ?ans not in (pq:|old_property_id|))} LIMIT 10")

    if has_filter:
        # Find the qualifier value by selecting its variable instead of the answer variable
        answer_var = re.findall(r'SELECT (\?\w*) WHERE', query, re.IGNORECASE)[0]
        qualifier_var = re.findall(property_ids[1] + r' (\?\w*) filter', query, re.IGNORECASE)[0]
        qualifier_query = query.replace(answer_var, qualifier_var, 1)
        qualifier_binding = questions_generator.get_sparql_query_results(qualifier_query)['results']['bindings'][0]
        qualifier_value = next(iter(qualifier_binding.values()))['value'].split("/")[-1]
        # Find qualifier filter
        qualifier_filter, _ = questions_generator.get_filter_from_element(qualifier_value, "x", "s")
        # Here the value is constrained only through the corresponding filter, and the excluded
        # property carries the "p" prefix
        second_query = ("SELECT distinct ?ans WHERE { ?sbj ?ans ?s . ?s ?rel2 ?x . ?sbj wdt:|rel_entity_type| wd:|entity_type| . FILTER (" +
                        qualifier_filter + " && ?ans not in (p:|old_property_id|)) } LIMIT 10")
    else:
        # Without a filter the second query starts from the first known entity
        second_query = ("select distinct ?ans where {wd:" + entity_ids[0] +
                        " ?ans ?s . ?s ?rel2 ?obj . ?obj wdt:|rel_entity_type| wd:|entity_type| . FILTER " +
                        "(REGEX(STR(?s), \"Q(\\\\d+)-\") && ?ans not in (p:|old_property_id|))} LIMIT 20")

    found, old_properties = questions_generator.relation_3_generation_common_part(
        current_uid, generated_template, generated_questions, entity_ids, property_ids,
        [first_query, second_query], ["pq", ""])
    if found:
        statement_property_2_nnqt_question_construction(generated_template)
        return generated_template
    # There aren't valid candidates, so try with "relation_2" function
    return statement_property_2_relation_2_generation(current_uid, generated_template, generated_questions,
                                                      property_ids, old_properties)
def simple_question_left_template_generation(current_uid: int, question_template: Dict[str, Any],
                                             generated_questions: List[Dict[str, Any]]) -> Tuple[Dict[str, Any], str, str, str, str]:
    """Build a brand new "simple question left" template around a random Wikidata entity.

    A random entity is drawn with
    ``wikidata_ids_extractor.get_random_wikidata_entity_from_all`` and used as
    the object of a triple; one of up to 10 ``(relation, subject, subject
    type)`` rows pointing to it is picked at random. The attempt is accepted
    only when the first result of the generated query is the picked subject;
    otherwise, or when any step fails, the whole operation is repeated with a
    new random entity.

    ``current_uid`` and ``generated_questions`` are not used by this function.

    :return: ``(generated_template, first_entity_id, first_entity,
        rel_property_id, rel_property)``, where ``generated_template`` is a
        deep copy of ``question_template`` with ``sparql_wikidata``,
        ``NNQT_question``, ``question`` and ``paraphrased_question`` set.
    """
    generated_template = deepcopy(question_template)
    # There could be properties without an English label, in that case repeat the whole operation
    while True:
        # A generated entity could have no incoming properties: in that case a new random entity is drawn
        while True:
            first_entity_id, first_entity = wikidata_ids_extractor.get_random_wikidata_entity_from_all()
            possible_questions_data = questions_generator.get_sparql_query_results(
                "select distinct ?rel ?sbj ?sbjLabel ?objio ?objioLabel where {" +
                "?sbj ?rel wd:" + first_entity_id +
                " . ?sbj wdt:P31 ?objio . ?sbj rdfs:label ?sbjLabel . ?objio rdfs:label ?objioLabel ." +
                " FILTER (LANG(?sbjLabel) = \"en\" && LANG(?objioLabel) = \"en\")} LIMIT 10")
            candidates = possible_questions_data['results']['bindings']
            if candidates:
                break

        question_data = random.choice(candidates)
        sbj_entity_id = question_data['sbj']['value'].split('/')[-1]
        rel_property_id = question_data['rel']['value'].split('/')[-1]
        objio_entity_id = question_data['objio']['value'].split('/')[-1]
        try:
            # Get relation and type English names
            rel_property = questions_generator.get_entity_name_from_wikidata_id(rel_property_id)
            objio_entity = question_data['objioLabel']['value'].replace("_", " ")
            # Verify that the first answer of the query is the subject found during generation: if the
            # query has a lot of results the subject is likely to be in a different position
            generated_template['sparql_wikidata'] = (
                "select distinct ?sbj where { ?sbj wdt:" + rel_property_id + " wd:" + first_entity_id +
                " . ?sbj wdt:P31 wd:" + objio_entity_id +
                " . ?sbj rdfs:label ?sbjLabel . FILTER (LANG(?sbjLabel) = \"en\") } LIMIT 5")
            first_binding = questions_generator.get_sparql_query_results(
                generated_template['sparql_wikidata'])['results']['bindings'][0]
            sparql_result = next(iter(first_binding.values()))['value'].split("/")[-1]
        except Exception:
            # Fixed: a bare ``except`` also swallowed KeyboardInterrupt/SystemExit, which made this
            # endless retry loop impossible to interrupt
            continue
        if sparql_result == sbj_entity_id:
            break

    nnqt_question = "What is the {" + objio_entity + "} for {" + rel_property + "} of {" + first_entity + "}"
    generated_template['NNQT_question'] = nnqt_question
    generated_template['question'] = nnqt_question
    generated_template['paraphrased_question'] = nnqt_question
    return generated_template, first_entity_id, first_entity, rel_property_id, rel_property
def check_original_questions(set_name: str):
    """Report the generated questions whose original question has no usable answer.

    Loads ``dataset/<set_name>_filtered.json`` and
    ``dataset/<set_name>_generated.json``. For every generated question the
    first filtered question with the same ``template_id``, ``template_index``
    and ``subgraph`` is looked up and its ``sparql_wikidata`` query is run.
    The pair ``(filtered uid, generated uid)`` is printed when:

    * the query returns nothing (or is rejected as malformed),
    * it is a ``Count_1``/``Count_2`` template whose ``value`` is ``"0"``, or
    * none of the first 50 bindings resolves to a non-empty label through
      ``convert_entity_qid_to_label``.

    :param set_name: dataset name without directory and suffix.
    """
    # Load filtered questions
    with open("dataset/" + set_name + "_filtered.json", "r") as json_file:
        filtered_questions = json.load(json_file)
    # Load generated questions
    with open("dataset/" + set_name + "_generated.json", "r") as json_file:
        generated_questions = json.load(json_file)

    for generated_question in generated_questions:
        for filtered_question in filtered_questions:
            # "template_id" and "template_index" are short and highly variable, so they are compared first
            if not (generated_question['template_id'] == filtered_question['template_id'] and
                    generated_question['template_index'] == filtered_question['template_index'] and
                    generated_question['subgraph'] == filtered_question['subgraph']):
                continue

            # Malformed queries give an empty result; on a URLError the query is retried once
            try:
                results = questions_generator.get_sparql_query_results(filtered_question['sparql_wikidata'])
            except QueryBadFormed:
                results = []
            except URLError:
                results = questions_generator.get_sparql_query_results(filtered_question['sparql_wikidata'])

            if not results:
                # There aren't answers
                missing_answer = True
            elif 'boolean' in results:
                # Boolean results carry no bindings; ``filter_questions`` accepts them as valid, so this
                # does the same instead of failing on the missing "results" key
                missing_answer = False
            elif filtered_question['template_id'] in ("Count_1", "Count_2") and \
                    results['results']['bindings'][0]['value']['value'] == "0":
                # The question is a count and the answer is 0
                missing_answer = True
            else:
                # Check at most 50 answers until one with a label is found, since getting labels is a
                # slow process and a question can have thousands of answers
                answers = results['results']['bindings']
                answer = ""
                for binding in answers[:50]:
                    # Fixed: the original indexed ``results`` (a dict) instead of the bindings list
                    values = iter(binding.values())
                    answer = convert_entity_qid_to_label(next(values)['value'].split("/")[-1])
                    if not answer and (not filtered_question['subgraph'] or
                                       filtered_question['subgraph'] == "two intentions right subgraph"):
                        # A binding may hold a single value: do not fail when there is no second one
                        second_value = next(values, None)
                        if second_value is not None:
                            answer = convert_entity_qid_to_label(second_value['value'].split("/")[-1])
                    if answer:
                        break
                # True when no checked answer has a label
                missing_answer = not answer

            if missing_answer:
                print("(" + str(filtered_question['uid']) + ", " + str(generated_question['uid']) + ")")
            # Only the first matching filtered question is checked for each generated question
            break
def two_intentions_right_subgraph_relation_3_generation(
        current_uid: int, question_template: Dict[str, Any],
        generated_questions: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Generate a "relation_3" variant of a "two intentions right subgraph" template.

    Works on a deep copy of ``question_template``. The first binding of the
    ``sparql_wikidata`` query provides two answers; one candidate-property
    query is built per answer and both are passed to
    ``questions_generator.relation_3_generation_common_part``.

    Returns the result of
    ``two_intentions_right_subgraph_relation_2_generation`` when no candidate
    was found, otherwise the copied template after
    ``two_intentions_right_subgraph_nnqt_question_construction`` has been
    applied to it.
    """
    generated_template = deepcopy(question_template)
    query = generated_template['sparql_wikidata']
    entity_ids = questions_generator.get_elements_from_query(query, [0, 1])
    property_ids = questions_generator.get_elements_from_query(query, [0, 1], True)

    first_binding = questions_generator.get_sparql_query_results(query)['results']['bindings'][0]
    binding_values = iter(first_binding.values())
    answer_ids = [next(binding_values)['value'].split("/")[-1],
                  next(binding_values)['value'].split("/")[-1]]

    # Same query for both answers, only the answer id changes
    candidate_queries = [
        "select ?ans where {?sbj ?ans wd:" + answer_id +
        " . ?sbj wdt:|rel_entity_type| wd:|entity_type| . FILTER (?ans not in (wdt:|old_property_id|))} LIMIT 20"
        for answer_id in answer_ids
    ]
    found, old_properties = questions_generator.relation_3_generation_common_part(
        current_uid, generated_template, generated_questions, entity_ids, property_ids,
        candidate_queries)
    if found:
        two_intentions_right_subgraph_nnqt_question_construction(generated_template)
        return generated_template
    # There aren't valid candidates, so try with "relation_2" function
    return two_intentions_right_subgraph_relation_2_generation(
        current_uid, generated_template, generated_questions, property_ids, old_properties)
def two_intentions_right_subgraph_entity_3_generation(
        current_uid: int, question_template: Dict[str, Any],
        generated_questions: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Generate an "entity_3" variant of a "two intentions right subgraph" template.

    Works on a deep copy of ``question_template``. The first binding of the
    ``sparql_wikidata`` query provides two answers; one candidate-entity query
    is built per answer and both are passed to
    ``questions_generator.entity_3_generation_common_part``.

    Returns the result of
    ``two_intentions_right_subgraph_entity_2_generation`` (called with the
    original, non-copied ``question_template``) when no candidate was found,
    otherwise the copied template after
    ``two_intentions_right_subgraph_nnqt_question_construction`` has been
    applied to it.
    """
    generated_template = deepcopy(question_template)
    query = generated_template['sparql_wikidata']
    entity_ids = questions_generator.get_elements_from_query(query, [0, 1])

    first_binding = questions_generator.get_sparql_query_results(query)['results']['bindings'][0]
    binding_values = iter(first_binding.values())
    answer_ids = [next(binding_values)['value'].split("/")[-1],
                  next(binding_values)['value'].split("/")[-1]]

    # Same query for both answers, only the answer id changes
    candidate_queries = [
        "select distinct ?ans ?ansLabel where {?ans ?rel wd:" + answer_id +
        " . ?ans wdt:|rel_entity_type| wd:|entity_type| . ?ans rdfs:label ?ansLabel . " +
        "FILTER (LANG(?ansLabel) = \"en\" && ?ans not in (wd:|old_entity_id|))} LIMIT 20"
        for answer_id in answer_ids
    ]
    found, old_entities = questions_generator.entity_3_generation_common_part(
        current_uid, generated_template, generated_questions, entity_ids, candidate_queries)
    if found:
        two_intentions_right_subgraph_nnqt_question_construction(generated_template)
        return generated_template
    # There aren't valid candidates, so try with a random entity of the same type or class
    return two_intentions_right_subgraph_entity_2_generation(
        current_uid, question_template, generated_questions, entity_ids, old_entities)
def unknown_entity_3_generation(current_uid: int, question_template: Dict[str, Any],
                                generated_questions: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Generate an "entity_3" variant of an "unknown" question template.

    Works on a deep copy of ``question_template``. The body of the
    ``sparql_wikidata`` query, with the known entity replaced by ``?ans``, is
    embedded in a ``NOT EXISTS`` clause so that candidates are entities for
    which the original pattern has no match. The single resulting query is
    passed to ``questions_generator.entity_3_generation_common_part``.

    Returns the result of ``unknown_entity_2_generation`` (called with the
    original, non-copied ``question_template``) when no candidate was found,
    otherwise the copied template after ``unknown_nnqt_question_construction``
    has been applied to it.
    """
    generated_template = deepcopy(question_template)
    original_query = generated_template['sparql_wikidata']
    entity_ids = questions_generator.get_elements_from_query(original_query, [0])

    # Unlike the usual procedure, the answer query itself is generalised: the known entity becomes
    # "?ans" and only the text between "{" and "}" is kept
    generalised = original_query.replace("wd:" + entity_ids[0], "?ans")
    generalised_body = re.findall("{(.+)}", generalised)[0]
    type_clause = "?ans wdt:|rel_entity_type| wd:|entity_type|"

    # Disabled variant kept for reference (it was a bare string literal in the code):
    #   query = "SELECT distinct ?ans ?ansLabel WHERE {" + type_common_string + " . ?ans rdfs:label ?ansLabel . FILTER(LANG(?ansLabel) = \"en\" " + \
    #           "&& NOT EXISTS {" + type_common_string + " . " + general_sparql_query + "})} LIMIT 20"
    # Original author's note: "This version of the query is more complete and adapted for "entity_3"
    # logic, but is also slow and unsafe, since sometimes raises a server error"
    # NOTE(review): it is not clear which of the two variants that note refers to -- confirm

    answer_binding = questions_generator.get_sparql_query_results(original_query)['results']['bindings'][0]
    answer = next(iter(answer_binding.values()))['value'].split("/")[-1]
    # Find answer filter (the returned type is not needed here)
    answer_filter, _ = questions_generator.get_filter_from_element(answer, "obj", "s", False)

    candidate_query = ("SELECT distinct ?ans ?ansLabel WHERE {" + type_clause +
                       " . ?ans ?rel ?obj . ?ans rdfs:label ?ansLabel . FILTER(LANG(?ansLabel) = \"en\" && " +
                       answer_filter + "NOT EXISTS {" + type_clause + " . " + generalised_body + "})} LIMIT 5")
    found, old_entities = questions_generator.entity_3_generation_common_part(
        current_uid, generated_template, generated_questions, entity_ids, [candidate_query])
    if found:
        unknown_nnqt_question_construction(generated_template)
        return generated_template
    # There aren't valid candidates, so try with a random entity of the same type or class
    return unknown_entity_2_generation(current_uid, question_template, generated_questions,
                                       entity_ids, old_entities)
def string_matching_simple_contains_word_relation_3_generation(
        current_uid: int, question_template: Dict[str, Any],
        generated_questions: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Generate a "relation_3" variant of a "string matching simple contains word" template.

    Works on a deep copy of ``question_template``. The ``sparql_wikidata``
    query is printed, its first answer is fetched and a single
    candidate-property query starting from that answer is passed to
    ``questions_generator.relation_3_generation_common_part``.

    Returns the result of
    ``string_matching_simple_contains_word_relation_2_generation`` when no
    candidate was found, otherwise the copied template after
    ``string_matching_simple_contains_word_nnqt_question_construction`` has
    been applied to it.
    """
    generated_template = deepcopy(question_template)
    query = generated_template['sparql_wikidata']
    entity_ids = questions_generator.get_elements_from_query(query, [0])
    property_ids = questions_generator.get_elements_from_query(query, [0], True)
    print(query)

    answer_binding = questions_generator.get_sparql_query_results(query)['results']['bindings'][0]
    answer_id = next(iter(answer_binding.values()))['value'].split("/")[-1]

    candidate_query = ("select ?ans where {wd:" + answer_id +
                       " ?ans ?obj . ?obj wdt:|rel_entity_type| wd:|entity_type| . FILTER (?ans not in (wdt:|old_property_id|))} LIMIT 20")
    found, old_properties = questions_generator.relation_3_generation_common_part(
        current_uid, generated_template, generated_questions, entity_ids, property_ids,
        [candidate_query])
    if found:
        string_matching_simple_contains_word_nnqt_question_construction(generated_template)
        return generated_template
    # There aren't valid candidates, so try with "relation_2" function
    return string_matching_simple_contains_word_relation_2_generation(
        current_uid, generated_template, generated_questions, property_ids, old_properties)
def center_2_entity_3_generation(
        current_uid: int, question_template: Dict[str, Any],
        generated_questions: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Generate an "entity_3" variant of a "center 2" question template.

    Works on a deep copy of ``question_template``. Besides the first answer of
    the ``sparql_wikidata`` query, the value bound to the last variable of the
    query (the one right before the closing brace) is fetched. When
    ``questions_generator.get_filter_from_element`` returns a non-empty filter
    for that value the candidate query uses the filter, otherwise it links the
    candidates directly to the original answer. The query is passed to
    ``questions_generator.entity_3_generation_common_part``.

    Returns the result of ``center_2_entity_2_generation`` (called with the
    original, non-copied ``question_template``) when no candidate was found,
    otherwise the copied template after ``center_2_nnqt_question_construction``
    has been applied to it.
    """
    generated_template = deepcopy(question_template)
    query = generated_template['sparql_wikidata']
    entity_ids = questions_generator.get_elements_from_query(query, [0])

    answer_binding = questions_generator.get_sparql_query_results(query)['results']['bindings'][0]
    answer_id = next(iter(answer_binding.values()))['value'].split("/")[-1]

    # Select the last variable of the query instead of the answer variable to get its value
    answer_var = re.findall(r'DISTINCT (\?\w*) WHERE', query, re.IGNORECASE)[0]
    last_var = re.findall(r' (\?\w*)}', query)[0]
    last_var_query = query.replace(answer_var, last_var, 1)
    last_var_binding = questions_generator.get_sparql_query_results(last_var_query)['results']['bindings'][0]
    last_var_value = last_var_binding[last_var[1:]]['value'].split("/")[-1]
    value_filter, _ = questions_generator.get_filter_from_element(last_var_value, "obj", "", False)

    if value_filter:
        candidate_query = ("select ?ans ?ansLabel where {?ans ?rel ?obj . ?ans wdt:|rel_entity_type| wd:|entity_type| . ?ans rdfs:label ?ansLabel . " +
                           "FILTER (" + value_filter + "LANG(?ansLabel) = \"en\" && ?ans not in (wd:|old_entity_id|))} LIMIT 20")
    else:
        candidate_query = ("select ?ans ?ansLabel where {?ans ?rel wd:" + answer_id +
                           " . ?ans wdt:|rel_entity_type| wd:|entity_type| . ?ans rdfs:label ?ansLabel . " +
                           "FILTER (LANG(?ansLabel) = \"en\" && ?ans not in (wd:|old_entity_id|))} LIMIT 20")

    found, old_entities = questions_generator.entity_3_generation_common_part(
        current_uid, generated_template, generated_questions, entity_ids, [candidate_query])
    if found:
        center_2_nnqt_question_construction(generated_template)
        return generated_template
    # There aren't valid candidates, so try with a random entity of the same type or class
    return center_2_entity_2_generation(current_uid, question_template, generated_questions,
                                        entity_ids, old_entities)
def get_sports_json():
    """Write the instances of ``wd:Q31629`` to ``entities_and_properties/Sports.json``.

    Every result row becomes ``{"title": <lower-cased English label>,
    "q": <entity id>}``; the list is wrapped as
    ``{"*": [{"a": {"*": [...]}}]}`` before being dumped.
    """
    query = ("select ?sbj ?sbjLabel {{ select ?sbj ?sbjLabel where {?sbj wdt:P31 wd:Q31629 . " +
             "SERVICE wikibase:label { bd:serviceParam wikibase:language \"en\". }}} FILTER (LANG(?sbjLabel) = \"en\") }")
    rows = questions_generator.get_sparql_query_results(query)['results']['bindings']

    sports = [
        {"title": row['sbjLabel']['value'].lower(), "q": row['sbj']['value'].split('/')[-1]}
        for row in rows
    ]
    wrapped = {"*": [{"a": {"*": sports}}]}
    with open("entities_and_properties/Sports.json", "w") as output_file:
        json.dump(wrapped, output_file)
def statement_property_nnqt_question_construction(generated_template: Dict[str, Any]):
    """Rebuild the NNQT question of a "statement property" template.

    Reads ``generated_template['old_sparql_wikidata']``. When that query has
    no filter, the question is recreated from two entities. When it has one,
    the value bound to the filtered variable is fetched (dates are cut at the
    ``T`` separator) and passed as the fixed element of the question.
    The actual rewriting is done by
    ``questions_generator.recreate_nnqt_question``.
    """
    old_query = generated_template['old_sparql_wikidata']

    if 'filter' not in old_query.lower():
        questions_generator.recreate_nnqt_question(
            generated_template, "What is |property_0| of |entity_0| that is |property_1| is |entity_1| ?",
            [0, 1], [0], False)
        return

    # Find second entity value, selecting the filtered variable instead of the answer variable
    answer_var = re.findall(r'SELECT (\?\w*) WHERE', old_query, re.IGNORECASE)[0]
    filtered_var = re.findall(r' (\?\w*) filter', old_query, re.IGNORECASE)[0]
    value_query = old_query.replace(answer_var, filtered_var, 1)
    value_binding = questions_generator.get_sparql_query_results(value_query)['results']['bindings'][0]
    fixed_entity = next(iter(value_binding.values()))['value'].split("/")[-1]
    # If entity is a date, keep only year-month-day part
    if questions_generator.get_element_type(fixed_entity) == questions_generator.ElementType.date:
        fixed_entity = fixed_entity.split("T")[0]

    questions_generator.recreate_nnqt_question(
        generated_template, "What is |property_0| of |entity_0| that is |property_1| is |element_0| ?",
        [0], [0], False, fixed_entities=[fixed_entity])
def unknown_2_relation_3_generation(current_uid: int, question_template: Dict[str, Any],
                                    generated_questions: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Generate a "relation_3" variant of an "unknown 2" question template.

    Works on a deep copy of ``question_template``. Three properties are
    handled (one ``p`` property followed by two ``pq`` properties) and one
    candidate query is built for each; they are passed to
    ``questions_generator.relation_3_generation_common_part`` together with
    the prefixes ``["", "pq", "pq"]``.

    Returns the result of ``unknown_2_relation_2_generation`` when no
    candidate was found, otherwise the copied template after
    ``unknown_2_nnqt_question_construction`` has been applied to it.
    """

    def qualifier_query(answer: str, answer_filter: str, answer_type) -> str:
        # Both qualifier queries are identical except for the answer and its filter
        if answer_type == questions_generator.ElementType.entity:
            return ("select distinct ?ans where {?sbj ?rel ?s . ?s ?ans wd:" + answer +
                    " . ?sbj wdt:|rel_entity_type| wd:|entity_type| . FILTER (" +
                    answer_filter + " && ?ans not in (pq:|old_property_id|))} LIMIT 20")
        # If the answer is not an entity, the value is constrained only through the corresponding filter
        return ("SELECT distinct ?ans WHERE {?sbj ?rel ?s . ?s ?ans ?obj . ?sbj wdt:|rel_entity_type| wd:|entity_type| . FILTER (" +
                answer_filter + " && ?ans not in (pq:|old_property_id|))} LIMIT 10")

    generated_template = deepcopy(question_template)
    query = generated_template['sparql_wikidata']

    entity_ids = questions_generator.get_elements_from_query(query, [0, 1])
    # The first entity (after reversing) is used for the last two properties
    entity_ids.reverse()
    entity_ids.append(entity_ids[0])

    property_ids = questions_generator.get_specific_elements_from_query(query, [0], "p", "P")
    property_ids += questions_generator.get_specific_elements_from_query(query, [0, 1], "pq", "P")

    # Answers are inverted: the first value of the binding is the second answer
    first_binding = questions_generator.get_sparql_query_results(query)['results']['bindings'][0]
    binding_values = iter(first_binding.values())
    answer_2 = next(binding_values)['value'].split("/")[-1]
    answer_1 = next(binding_values)['value'].split("/")[-1]

    # Find answers filter and type
    filter_1, type_1 = questions_generator.get_filter_from_element(answer_1, "obj", "s")
    filter_2, type_2 = questions_generator.get_filter_from_element(answer_2, "obj", "s")

    statement_query = ("select distinct ?ans where {wd:" + entity_ids[1] +
                       " ?ans ?s . ?s ?rel2 ?obj . ?obj wdt:|rel_entity_type| wd:|entity_type| . FILTER " +
                       "(REGEX(STR(?s), \"Q(\\\\d+)-\") && ?ans not in (p:|old_property_id|))} LIMIT 20")
    candidate_queries = [statement_query,
                         qualifier_query(answer_1, filter_1, type_1),
                         qualifier_query(answer_2, filter_2, type_2)]

    found, old_properties = questions_generator.relation_3_generation_common_part(
        current_uid, generated_template, generated_questions, entity_ids, property_ids,
        candidate_queries, ["", "pq", "pq"])
    if found:
        unknown_2_nnqt_question_construction(generated_template)
        return generated_template
    # There aren't valid candidates, so try with "relation_2" function
    return unknown_2_relation_2_generation(current_uid, generated_template, generated_questions,
                                           property_ids, old_properties)
def string_matching_simple_contains_word_entity_3_generation(
        current_uid: int, question_template: Dict[str, Any],
        generated_questions: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Generate an "entity_3" variant of a "string matching simple contains word" template.

    Works on a deep copy of ``question_template``. The first answer of the
    ``sparql_wikidata`` query is fetched and a single candidate-entity query
    starting from that answer is passed to
    ``questions_generator.entity_3_generation_common_part``.

    Returns the result of
    ``string_matching_simple_contains_word_entity_2_generation`` (called with
    the original, non-copied ``question_template``) when no candidate was
    found, otherwise the copied template after
    ``string_matching_simple_contains_word_nnqt_question_construction`` has
    been applied to it.
    """
    generated_template = deepcopy(question_template)
    query = generated_template['sparql_wikidata']
    entity_ids = questions_generator.get_elements_from_query(query, [0])

    answer_binding = questions_generator.get_sparql_query_results(query)['results']['bindings'][0]
    answer_id = next(iter(answer_binding.values()))['value'].split("/")[-1]

    candidate_query = ("select ?ans ?ansLabel where {wd:" + answer_id +
                       " ?rel ?ans . ?ans wdt:|rel_entity_type| wd:|entity_type| . ?ans rdfs:label ?ansLabel . " +
                       "FILTER (LANG(?ansLabel) = \"en\" && ?ans not in (wd:|old_entity_id|))} LIMIT 20")
    found, old_entities = questions_generator.entity_3_generation_common_part(
        current_uid, generated_template, generated_questions, entity_ids, [candidate_query])
    if found:
        string_matching_simple_contains_word_nnqt_question_construction(generated_template)
        return generated_template
    # There aren't valid candidates, so try with a random entity of the same type or class
    return string_matching_simple_contains_word_entity_2_generation(
        current_uid, question_template, generated_questions, entity_ids, old_entities)
def statement_property_entity_3_generation(current_uid: int, question_template: Dict[str, Any], generated_questions: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Generate an "entity_3" variant of a statement-property question.

    Works on a deep copy of ``question_template``. The original SPARQL query (key
    ``'sparql_wikidata'``) is generalized by replacing its first known entity with
    ``?ans``; the generalized pattern is then used inside a ``NOT EXISTS`` clause so that
    entities which already satisfy the original query shape are excluded from the
    candidates. When the original query has no filter, a second candidate query with two
    triples starting from the first known entity is also supplied.

    Args:
        current_uid: Forwarded unchanged to the generation helpers.
        question_template: Source template; it is not modified here (a deep copy is used).
        generated_questions: Forwarded unchanged to the generation helpers.

    Returns:
        The modified copy of the template when
        ``questions_generator.entity_3_generation_common_part`` reports a candidate was
        found; otherwise whatever ``statement_property_entity_2_generation`` returns.

    Raises:
        IndexError: If the query contains no ``{...}`` group on a single line, or if the
            original query returns no result bindings.
    """
    generated_template = deepcopy(question_template)
    sparql_query = generated_template['sparql_wikidata']

    # Check if there is a filter: if there is none, the last element of the query is an
    # entity, so two entities are extracted instead of one.
    query_contains_filter = 'filter' in sparql_query.lower()
    entity_positions = [0] if query_contains_filter else [0, 1]
    old_entities_ids = questions_generator.get_elements_from_query(sparql_query, entity_positions)

    # In this case the procedure is different from normal: it takes the answer query,
    # modifies it to obtain all entities of the same type or class of the original answer
    # that return results, and then uses this list to exclude these entities from the
    # possible candidates.
    # The replacement is anchored so that only the exact id is rewritten: a plain
    # substring replace of e.g. "wd:Q5" would also mangle "wd:Q50" into "?ans0".
    first_entity_pattern = "wd:" + re.escape(old_entities_ids[0]) + r"(?!\w)"
    general_sparql_query = re.sub(first_entity_pattern, "?ans", sparql_query)
    # Get the substring between "{" and "}"
    general_sparql_query = re.findall("{(.+)}", general_sparql_query)[0]
    type_common_string = "?ans wdt:|rel_entity_type| wd:|entity_type|"

    old_answer = next(iter(questions_generator.get_sparql_query_results(sparql_query)['results']['bindings'][0].values()))['value'].split("/")[-1]
    # Find answer filter and type (the second returned element is not needed here)
    old_answer_filter, _ = questions_generator.get_filter_from_element(old_answer, "obj", "s", False)

    # Original author's note: this version of the query is more complete and adapted for
    # "entity_3" logic, but is also slow and unsafe, since it sometimes raises a server
    # error. A lighter variant (no "?ans ?rel ?obj" triple, no answer filter, LIMIT 20)
    # was previously kept here as disabled code.
    # NOTE(review): old_answer_filter is concatenated directly before "NOT EXISTS" with no
    # " && " in between; presumably get_filter_from_element(..., False) returns a string
    # that already ends with the operator (or is empty) - confirm against the helper.
    first_query = "SELECT distinct ?ans ?ansLabel WHERE {" + type_common_string + " . ?ans ?rel ?obj . ?ans rdfs:label ?ansLabel . FILTER(LANG(?ansLabel) = \"en\" && " + \
        old_answer_filter + "NOT EXISTS {" + type_common_string + " . " + general_sparql_query + "})} LIMIT 5"

    candidate_queries = [first_query]
    if not query_contains_filter:
        # In this case there is an additional normal query with two triples that link the
        # first known entity to the second one
        candidate_queries.append(
            "select ?ans ?ansLabel where {wd:" + old_entities_ids[0] + " ?rel ?s . ?s ?rel2 ?ans . ?ans wdt:|rel_entity_type| wd:|entity_type| . ?ans rdfs:label ?ansLabel . " + \
            "FILTER (LANG(?ansLabel) = \"en\" && REGEX(STR(?s), \"Q(\\\\d+)-\") && ?ans not in (wd:|old_entity_id|))} LIMIT 20")

    found, old_entities = questions_generator.entity_3_generation_common_part(
        current_uid, generated_template, generated_questions, old_entities_ids, candidate_queries)

    if not found:
        # There aren't valid candidates, so try with a random entity of the same type or class
        return statement_property_entity_2_generation(current_uid, question_template, generated_questions, old_entities_ids, old_entities)

    statement_property_nnqt_question_construction(generated_template)
    return generated_template