def get_qualifiers(self, variable_qnode, qualifier_labels=None):
    """Return the qualifiers attached to a variable qnode.

    P2006020002 links a variable to its qualifier pnodes.

    :param variable_qnode: qnode of the variable whose qualifiers are fetched.
    :param qualifier_labels: optional list of qualifier label strings to
        restrict the lookup to.
    :return: if qualifier_labels is given, a dict mapping qualifier label ->
        qualifier pnode; otherwise a dict mapping qualifier pnode -> 1.
    """
    if qualifier_labels is not None:
        formatted_qualifiers = self.format_sql_string(qualifier_labels)
        # noinspection SqlNoDataSourceInspection
        qualifier_query = f"""select e_qual.node1, e_qual.node2
        from edges e_var
        join edges e_qual ON e_var.node2 = e_qual.node1
        where e_var.label = 'P2006020002'
        and e_qual.label = 'label'
        and e_qual.node2 in ({formatted_qualifiers})
        and e_var.node1 = '{variable_qnode}'"""
        qualifier_results = query_to_dicts(qualifier_query)
        # label text -> qualifier pnode
        return {r['node2']: r['node1'] for r in qualifier_results}
    else:
        # No label filter: return every qualifier pnode of the variable.
        # noinspection SqlNoDataSourceInspection
        qualifier_query = f"""select e_qual.node1, e_qual.node2
        from edges e_var
        join edges e_qual ON e_var.node2 = e_qual.node1
        where e_var.label = 'P2006020002'
        and e_var.node1 = '{variable_qnode}'"""
        qualifier_results = query_to_dicts(qualifier_query)
        # qualifier pnode -> 1 (used as a membership set by callers)
        return {r['node1']: 1 for r in qualifier_results}
def create_new_qnodes(self, dataset_id, values, q_type):
    """Mint sequential qnodes of the form {dataset_id}{q_type}-{n}.

    q_type can be 'Unit' or 'Source' for now.

    :param dataset_id: dataset identifier the new qnodes are scoped to.
    :param values: iterable of labels that need newly created qnodes.
    :param q_type: 'Unit' or 'Source'; selects the qnode naming scheme.
    :return: dict mapping each value to its newly minted qnode id.
    """
    # create qnodes for units with this scheme {dataset_id}Unit-{d}
    if q_type == 'Unit':
        # Units are additionally typed as Q47574 (unit of measurement).
        # noinspection SqlNoDataSourceInspection
        _query = "SELECT max(e.node1) as qnode_max FROM edges e WHERE e.label = 'P31' and e.node2 = 'Q47574' and" \
                 " e.node1 like '{}{}-%'".format(dataset_id, q_type)
    else:
        # noinspection SqlNoDataSourceInspection
        _query = "SELECT max(e.node1) as qnode_max FROM edges e WHERE e.node1 like '{}{}-%'".format(
            dataset_id, q_type)
    _result = query_to_dicts(_query)[0]
    # NOTE(review): max() on node1 compares lexicographically, so e.g.
    # 'Unit-9' sorts after 'Unit-10'; numbering may stall past 9 — confirm.
    qnode_max = _result['qnode_max']

    # Raw string: '\d' in a plain literal is an invalid escape sequence
    # (DeprecationWarning on modern Python).
    regex_key = r'({}{}-)(\d*)'.format(dataset_id, q_type)
    if regex_key not in self.qnode_regex:
        # Cache compiled patterns per (dataset, type) pair.
        self.qnode_regex[regex_key] = re.compile(regex_key)
    _regex = self.qnode_regex[regex_key]

    if not qnode_max:
        seed = 0
    else:
        # Continue numbering after the highest existing qnode.
        u, v = _regex.match(qnode_max).groups()
        seed = int(v) + 1

    _dict = {}
    for value in values:
        _dict[value] = '{}{}-{}'.format(dataset_id, q_type, seed)
        seed += 1
    return _dict
def fuzzy_query_variables(questions: List[str], regions: Dict[str, List[str]], limit: int, debug=False):
    """Full-text search for variables matching any of *questions*, restricted
    to variables with data for the given *regions*.

    :param questions: free-text search phrases; their tsqueries are OR-ed
        together with '||'.
    :param regions: maps an admin level name to a list of region qnodes;
        levels with empty lists are ignored.
    :param limit: maximum number of result rows.
    :param debug: when True, print the intermediate pieces and the final SQL.
    :return: result of query_to_dicts over the generated SQL (rows with
        variable_id, dataset_qnode, name, rank).
    """
    def get_region_where():
        # Adds the where clause for regions specified in the regions dict
        # We have two EXIST clauses per admin type - one for variables whose main_subject is the location,
        # and one for variables whose location is a qualifier.
        admin_wheres = []
        for admin, qnodes in regions.items():
            if not qnodes:
                continue
            qnode_list = ', '.join([f"'{qnode}'" for qnode in qnodes])
            view_name = search_views.get_view_name(admin)
            one_where = f"EXISTS (SELECT 1 FROM {view_name} WHERE {view_name}.variable_id=e_var_name.node2 AND {view_name}.dataset_qnode=e_dataset.node1 AND {view_name}.{admin}_qnode IN ({qnode_list}))"
            admin_wheres.append(one_where)
        if not admin_wheres:
            # No region restriction: match everything.
            return "1=1"
        return '\nOR '.join(admin_wheres)

    if not questions:
        # No text to search for - fall back to a region-only lookup.
        return region_only_query_variables(get_region_where(), limit, debug)

    if debug:
        print('questions:', questions)
    sanitized = [sanitize(question) for question in questions]
    ts_queries = [f"plainto_tsquery('{question}')" for question in sanitized]
    if debug:
        print('ts_queries', ts_queries)
    # '||' ORs the individual tsqueries together.
    combined_ts_query = '(' + ' || '.join(ts_queries) + ')'
    if debug:
        print('combined_ts_query:', combined_ts_query)
    region_where = get_region_where()

    # Use Postgres's full text search capabilities
    sql = f"""
    SELECT fuzzy.variable_id, fuzzy.dataset_qnode, fuzzy.name,
           ts_rank(variable_text, {combined_ts_query}) AS rank
    FROM (SELECT e_var_name.node2 AS variable_id,
                 -- e_dataset_name.node2 AS dataset_id,
                 e_dataset.node1 AS dataset_qnode,
                 to_tsvector(CONCAT(s_description.text, ' ', s_name.text, ' ', s_label.text)) AS variable_text,
                 CONCAT(s_name.text, ' ', s_label.text) as name
          FROM edges e_var
          JOIN edges e_var_name ON (e_var_name.node1=e_var.node1 AND e_var_name.label='P1813')
          JOIN edges e_dataset ON (e_dataset.label='P2006020003' AND e_dataset.node2=e_var.node1)
          -- JOIN edges e_dataset_name ON (e_dataset_name.node1=e_dataset.node1 AND e_dataset_name.label='P1813')
          LEFT JOIN edges e_description
               JOIN strings s_description ON (e_description.id=s_description.edge_id)
          ON (e_var.node1=e_description.node1 AND e_description.label='description')
          LEFT JOIN edges e_name
               JOIN strings s_name ON (e_name.id=s_name.edge_id)
          ON (e_var.node1=e_name.node1 AND e_name.label='P1813')
          LEFT JOIN edges e_label
               JOIN strings s_label ON (e_label.id=s_label.edge_id)
          ON (e_var.node1=e_label.node1 AND e_label.label='label')
          WHERE e_var.label='P31' AND e_var.node2='Q50701' AND ({region_where})) AS fuzzy
    WHERE variable_text @@ {combined_ts_query}
    ORDER BY rank DESC
    LIMIT {limit}
    """
    if debug:
        print(sql)
    results = query_to_dicts(sql)
    return results
def query_variable_data(dataset_id, property_id, places: Dict[str, List[str]], qualifiers, limit, cols, debug=False) -> \
        List[Dict[str, Any]]:
    """Return the data rows of one variable, optionally restricted to places.

    :param dataset_id: dataset qnode (sanitized here before interpolation).
    :param property_id: property pnode holding the variable's values.
    :param places: maps an admin level to a list of place qnodes.
    :param qualifiers: qualifier objects of the variable; at most one may
        have data_type 'location'.
    :param limit: maximum number of rows; values <= 0 mean no limit.
    :param cols: requested columns; forwarded to preprocess_qualifiers.
    :param debug: when True, print the generated SQL.
    :raises ValueError: if the variable has more than one location qualifier.
    :return: list of row dicts from query_to_dicts.
    """
    dataset_id = sanitize(dataset_id)
    property_id = sanitize(property_id)

    location_qualifiers = [q for q in qualifiers if q.data_type == 'location']
    if len(location_qualifiers) == 0:
        # No location qualifier: the main subject itself is the location.
        location_node = 'e_main.node1'
    elif len(location_qualifiers) == 1:
        # Location comes from the qualifier edge joined by preprocess_places.
        location_node = 'e_location.node2'
    else:
        raise ValueError(
            "There are more than one location qualifiers for variable")
    places_join, places_where = preprocess_places(places, location_node)
    qualifier_fields, qualifier_joins = preprocess_qualifiers(qualifiers, cols)

    query = f"""
    SELECT e_main.node1 AS main_subject_id,
           s_main_label.text AS main_subject,
           e_dataset.node2 AS dataset_id,
           q_main.number AS value,
           s_value_unit.text AS value_unit,
           {qualifier_fields}
    FROM edges AS e_main
        JOIN quantities AS q_main ON (e_main.id=q_main.edge_id)
        JOIN edges AS e_dataset ON (e_dataset.node1=e_main.id AND e_dataset.label='P2006020004')
        {qualifier_joins}
        {places_join}
        LEFT JOIN edges AS e_value_unit
            LEFT JOIN strings AS s_value_unit ON (e_value_unit.id=s_value_unit.edge_id)
        ON (e_value_unit.node1=q_main.unit AND e_value_unit.label='label')
        LEFT JOIN edges AS e_main_label
            JOIN strings AS s_main_label ON (e_main_label.id=s_main_label.edge_id)
        ON (e_main.node1=e_main_label.node1 AND e_main_label.label='label')
    WHERE e_main.label='{property_id}' AND e_dataset.node2='{dataset_id}' AND ({places_where})
    ORDER BY main_subject_id, time
    """
    # Some remarks on that query:
    # to_json(d_value_date.date_and_time)... is a recommended way to convert dates to ISO format.
    #
    # Since coordinates are optional, we LEFT JOIN on *A JOIN* of e_coordinate and c_coordinate. The weird
    # syntax of T LEFT JOIN A JOIN B ON (...) ON (...) is the SQL way of explicity specifying which INNER
    # JOINS are LEFT JOINed.
    #
    # We use the || operator on fields from the LEFT JOIN, since x || NULL is NULL in SQL, so coordinate is
    # NULL in the result if there is no coordinate

    if limit > 0:
        query += f"\nLIMIT {limit}\n"
    if debug:
        print(query)

    return query_to_dicts(query)
def query_dataset_metadata(dataset_name=None, include_dataset_qnode=False, debug=False):
    """
    Returns the metadata of the dataset. If no name is provided, all datasets are returned

    :param dataset_name: optional dataset short name; when given and unknown,
        returns None.
    :param include_dataset_qnode: also select the dataset qnode column.
    :param debug: when True, print the generated SQL.
    :return: list of metadata dicts, or None for an unknown dataset_name.
    """
    # Shortcut to helper, with only the parameters we need
    def join_edge(alias, label, satellite_type=None, left=False):
        return _join_edge_helper('e_dataset', alias, label, satellite_type, left)

    if dataset_name:
        dataset_id = get_dataset_id(dataset_name)  # Already calls sanitize
        if not dataset_id:
            return None
        # Renamed from 'filter' so the builtin is not shadowed.
        dataset_filter = f"e_dataset.node1='{dataset_id}'"
    else:
        dataset_filter = '1=1'  # No restriction: return all datasets.

    if include_dataset_qnode:
        qnode_select = 'e_dataset.node1 as dataset_qnode,'
    else:
        qnode_select = ''

    query = f'''
    SELECT {qnode_select} s_short_name.text AS dataset_id, s_name.text AS name, s_description.text AS description, s_url.text AS url
    FROM edges e_dataset
    '''
    # Mandatory First
    query += join_edge('name', 'P1476', 's')
    query += join_edge('description', 'description', 's')
    query += join_edge('url', 'P2699', 's')
    query += join_edge('short_name', 'P1813', 's')
    query += f'''
    WHERE e_dataset.label='P31' AND e_dataset.node2='Q1172284' AND {dataset_filter}
    '''
    if debug:
        print(query)
    return query_to_dicts(query)
def get_dataset_id(dataset):
    """Look up the dataset whose short name (P1813) equals *dataset*.

    Returns the dataset node id, or None when no such dataset exists.
    """
    dataset = sanitize(dataset)
    dataset_query = f'''
    SELECT e_dataset.node1 AS dataset_id
    FROM edges e_dataset
    WHERE e_dataset.label='P1813' AND e_dataset.node2='{dataset}';
    '''
    rows = query_to_dicts(dataset_query)
    return rows[0]['dataset_id'] if rows else None
def run_query(select_clause, join_clause, order_by_clause=""):
    # Nested helper: assembles a query from the enclosing function's shared
    # from/where clauses plus the caller-supplied select/join/order-by parts,
    # then runs it. 'debug' is also read from the enclosing scope.
    nonlocal from_clause, where_clause
    query = f"""
    {select_clause}
    {from_clause}
    {join_clause}
    {where_clause}
    {order_by_clause}
    """
    if debug:
        print(query)
    return query_to_dicts(query)
def get_label(qnode, default=None, lang='en'):
    """Fetch the 'label' text of *qnode* in language *lang*.

    Returns *default* when the qnode has no label in that language.
    """
    qnode = sanitize(qnode)
    lang = sanitize(lang)
    label_query = f'''
    SELECT node1 as qnode, text as label
    FROM edges e
    INNER JOIN strings s on e.id = s.edge_id
    WHERE e.node1 = '{qnode}' and e.label = 'label' and s.language='{lang}';
    '''
    rows = query_to_dicts(label_query)
    if not rows:
        return default
    return rows[0]['label']
def variable_data_exists(dataset_id, property_ids, debug=False):
    """Check whether there is some data for any of the property_ids.

    :param dataset_id: dataset qnode.
    :param property_ids: iterable of property pnodes of candidate variables.
    :param debug: when True, print the generated SQL.
    :return: True if at least one data edge exists, else False.
    """
    # Sanitize interpolated values, consistent with the other query helpers
    # in this module.
    dataset_id = sanitize(dataset_id)
    property_ids_str = ', '.join(
        [f"'{sanitize(property_id)}'" for property_id in property_ids])
    query = f"""
    SELECT e_main.id
    FROM edges AS e_main
    JOIN edges AS e_dataset ON (e_dataset.node1=e_main.id AND e_dataset.label='P2006020004')
    WHERE e_main.label IN ({property_ids_str}) AND e_dataset.node2='{dataset_id}'
    LIMIT 1
    """
    if debug:
        print(query)
    result = query_to_dicts(query)
    return len(result) > 0
def query_qualifiers(dataset_id, variable_qnode):
    """Return the Qualifier objects defined for a variable of a dataset.

    :param dataset_id: dataset qnode (sanitized here).
    :param variable_qnode: variable qnode (sanitized here).
    :return: list of Qualifier instances built from the result rows
        (label, name, wikidata_data_type).
    """
    dataset_id = sanitize(dataset_id)
    variable_qnode = sanitize(variable_qnode)

    query = f"""
    SELECT e_qualifier.node2 AS label, s_qualifier_label.text AS name, e_data_type.node2 AS wikidata_data_type
    FROM edges e_var
        JOIN edges e_dataset ON (e_dataset.label='P2006020003' AND e_dataset.node2=e_var.node1)
        JOIN edges e_qualifier ON (e_var.node1=e_qualifier.node1 AND e_qualifier.label='P2006020002')
        LEFT JOIN edges e_qualifier_label -- Location qualifiers have no name
            JOIN strings s_qualifier_label ON (e_qualifier_label.id=s_qualifier_label.edge_id)
        ON (e_qualifier.node2=e_qualifier_label.node1 AND e_qualifier_label.label='label')
        LEFT JOIN edges e_data_type ON (e_qualifier.node2=e_data_type.node1 AND e_data_type.label='wikidata_data_type')
    WHERE e_var.label='P31' AND e_var.node2='Q50701' AND e_dataset.node1='{dataset_id}' AND e_var.node1='{variable_qnode}'
    """
    qualifiers = query_to_dicts(query)
    return [Qualifier(**q) for q in qualifiers]
def next_variable_value(dataset_id, prefix) -> int:
    """Return the next free numeric suffix for variable names '{prefix}N'.

    Scans the short names (P1813) of the dataset's variables matching
    *prefix* followed by digits and returns max+1, or 0 if none exist.

    :param dataset_id: dataset qnode.
    :param prefix: variable-name prefix the numeric suffix follows.
    """
    dataset_id = sanitize(dataset_id)
    # Sanitize prefix too: it is interpolated into the SQL below
    # (consistent with dataset_id above).
    prefix = sanitize(prefix)
    query = f'''
    select max(substring(e_variable.node2 from '{prefix}#"[0-9]+#"' for '#')::INTEGER)
    from edges e_variable
    where e_variable.node1 in (
        select e_dataset.node2 from edges e_dataset
        where e_dataset.node1 = '{dataset_id}' and e_dataset.label = 'P2006020003'
    )
    and e_variable.label = 'P1813' and e_variable.node2 similar to '{prefix}[0-9]+';
    '''
    result = query_to_dicts(query)
    if len(result) > 0 and result[0]['max'] is not None:
        number = result[0]['max'] + 1
    else:
        # No variable with this prefix yet.
        number = 0
    return number
def query_variable(dataset, variable):
    """Resolve a (dataset short name, variable short name) pair.

    Returns a dict with variable_qnode, variable_id, variable_name,
    property_id, dataset_id and dataset_name, or None when not found.
    """
    dataset = sanitize(dataset)
    variable = sanitize(variable)

    variable_query = f'''
    SELECT e_var.node1 AS variable_qnode, e_var.node2 AS variable_id, s_var_label.text AS variable_name,
           e_property.node2 AS property_id, e_dataset.node1 AS dataset_id, e_dataset_label.node2 AS dataset_name
    FROM edges e_var
    JOIN edges e_var_label ON (e_var.node1=e_var_label.node1 AND e_var_label.label='label')
    JOIN strings s_var_label ON (e_var_label.id=s_var_label.edge_id)
    JOIN edges e_property ON (e_property.node1=e_var.node1 AND e_property.label='P1687')
    JOIN edges e_dataset ON (e_dataset.label='P2006020003' AND e_dataset.node2=e_property.node1)
    JOIN edges e_dataset_label ON (e_dataset_label.node1=e_dataset.node1 AND e_dataset_label.label='P1813')
    WHERE e_var.label='P1813' AND e_var.node2='{variable}' AND e_dataset_label.node2='{dataset}';
    '''
    rows = query_to_dicts(variable_query)
    return rows[0] if rows else None
def get_variable_id(dataset_id, variable, debug=False) -> Union[str, None]:
    """Return the node id of *variable* within *dataset_id*, or None.

    :param dataset_id: dataset qnode.
    :param variable: variable short name (P1813).
    :param debug: when True, print a trace line.
    """
    dataset_id = sanitize(dataset_id)
    variable = sanitize(variable)
    if debug:
        print(f'variable_exists({dataset_id}, {variable})')
    variable_query = f'''
    select e_variable.node1 AS variable_id from edges e_variable
    where e_variable.node1 in (
        select e_dataset.node2 from edges e_dataset
        where e_dataset.node1 = '{dataset_id}' and e_dataset.label = 'P2006020003'
    )
    and e_variable.label = 'P1813' and e_variable.node2 = '{variable}';
    '''
    matches = query_to_dicts(variable_query)
    if not matches:
        return None
    return matches[0]['variable_id']
def region_only_query_variables(region_where: str, limit: int, debug=False):
    """List variables having data for some region, without text search.

    Used by fuzzy_query_variables when no questions are given; selects the
    same columns so both paths return rows of the same shape.

    :param region_where: SQL WHERE fragment restricting by region (already
        built by the caller; interpolated verbatim).
    :param limit: maximum number of rows returned.
    :param debug: when True, print the generated SQL.
    :return: result of query_to_dicts over the generated SQL.
    """
    sql = f"""
    SELECT e_var_name.node2 AS variable_id,
           -- e_dataset_name.node2 AS dataset_id,
           e_dataset.node1 AS dataset_qnode,
           to_tsvector(CONCAT(s_description.text, ' ', s_name.text, ' ', s_label.text)) AS variable_text,
           CONCAT(s_name.text, ' ', s_label.text) as name
    FROM edges e_var
    JOIN edges e_var_name ON (e_var_name.node1=e_var.node1 AND e_var_name.label='P1813')
    JOIN edges e_dataset ON (e_dataset.label='P2006020003' AND e_dataset.node2=e_var.node1)
    -- JOIN edges e_dataset_name ON (e_dataset_name.node1=e_dataset.node1 AND e_dataset_name.label='P1813')
    LEFT JOIN edges e_description
         JOIN strings s_description ON (e_description.id=s_description.edge_id)
    ON (e_var.node1=e_description.node1 AND e_description.label='description')
    LEFT JOIN edges e_name
         JOIN strings s_name ON (e_name.id=s_name.edge_id)
    ON (e_var.node1=e_name.node1 AND e_name.label='P1813')
    LEFT JOIN edges e_label
         JOIN strings s_label ON (e_label.id=s_label.edge_id)
    ON (e_var.node1=e_label.node1 AND e_label.label='label')
    WHERE e_var.label='P31' AND e_var.node2='Q50701' AND ({region_where})
    LIMIT {limit}
    """
    if debug:
        print(sql)
    results = query_to_dicts(sql)
    return results
def does_view_exists(conn, admin):
    """Check whether the materialized view for *admin* exists.

    :param conn: database connection passed through to query_to_dicts.
    :param admin: admin level whose view name is derived via get_view_name.
    :return: True if the materialized view exists, else False.
    """
    view_name = get_view_name(admin)
    query = f"SELECT * FROM pg_matviews WHERE matviewname='{view_name}'"
    result = query_to_dicts(query, conn)
    # Return a proper bool for this predicate-named function; bool subclasses
    # int, so callers relying on the old truthy count still work.
    return len(result) > 0
def canonical_data(self, dataset, variable, is_request_put=True):
    """Wikify and import an uploaded canonical-format CSV for a variable.

    Reads the CSV from the current request, resolves or mints qualifier,
    unit and source qnodes, converts every row to KGTK edges and imports
    them. For a PUT request, the variable's existing data is deleted first.

    :param dataset: dataset short name.
    :param variable: variable short name.
    :param is_request_put: True for PUT semantics (replace existing data).
    :return: (payload, http_status) tuple; 404/400 on validation failures,
        201 with a row-count message on success.
    """
    wikify = request.args.get('wikify', 'false').lower() == 'true'
    # check if the dataset exists
    dataset_id = dal.get_dataset_id(dataset)
    if not dataset_id:
        return {'Error': 'Dataset not found: {}'.format(dataset)}, 404

    # check if variable exists for the dataset
    # P2006020003 - Variable Measured
    # P1687 - Corresponds to property
    # noinspection SqlNoDataSourceInspection
    variable_query = f"""Select e.node1, e.node2 from edges e
    where e.node1 in (
        select e_variable.node1 from edges e_variable
        where e_variable.node1 in (
            select e_dataset.node2 from edges e_dataset
            where e_dataset.node1 = '{dataset_id}'
            and e_dataset.label = 'P2006020003'
        )
        and e_variable.label = 'P1813'
        and e_variable.node2 = '{variable}'
    )
    and e.label = 'P1687'
    """
    variable_result = query_to_dicts(variable_query)
    if len(variable_result) == 0:
        return {
            'error': 'Variable: {} not found for the dataset: {}'.format(
                variable, dataset)
        }, 404

    variable_pnode = variable_result[0]['node2']
    variable_qnode = variable_result[0]['node1']
    kgtk_format_list = list()

    # dataset and variable has been found, wikify and upload the data
    df = pd.read_csv(request.files['file'], dtype=object).fillna('')

    # Normalize the uploaded headers: lower-cased and stripped.
    column_map = {}
    _d_columns = list(df.columns)
    for c in _d_columns:
        column_map[c] = c.lower().strip()
    df = df.rename(columns=column_map)
    d_columns = list(df.columns)

    # Any column that is neither required nor a known non-qualifier column
    # is treated as a qualifier column; sanitize those names.
    unsanitized_qualifier_columns = [
        x for x in d_columns
        if x not in self.non_qualifier_columns and x not in self.required_fields
    ]
    s_qualifier_columns = {}
    for qc in unsanitized_qualifier_columns:
        s_qualifier_columns[qc] = sanitize(qc)
    df = df.rename(columns=s_qualifier_columns)
    d_columns = list(df.columns)
    qualifier_columns = [
        s_qualifier_columns[k] for k in s_qualifier_columns
    ]

    qualifer_dict = {}
    if qualifier_columns:
        # extra columns in the file, qualifier time
        # first see if any qualifiers already exist
        qualifer_dict = self.get_qualifiers(
            variable_qnode, qualifier_labels=qualifier_columns)
        qualifier_to_be_created = [
            x for x in qualifier_columns if x not in qualifer_dict
        ]
        qualifier_edges, new_q_dict = self.create_qualifier_edges(
            qualifier_to_be_created, variable_qnode)
        kgtk_format_list.extend(qualifier_edges)
        qualifer_dict.update(new_q_dict)

    # Ensure the standard time (P585) and source (P248) qualifiers exist.
    existing_qualifiers = self.get_qualifiers(variable_qnode)
    extra_qualifiers = []
    if 'P585' not in existing_qualifiers:
        extra_qualifiers.append('P585')
    if 'P248' not in existing_qualifiers:
        extra_qualifiers.append('P248')
    extra_qualifier_edges, _ = self.create_qualifier_edges(
        extra_qualifiers, variable_qnode)
    kgtk_format_list.extend(extra_qualifier_edges)

    # validate file headers first
    validator_header_log, valid_file = self.validate_headers(df)
    if not valid_file:
        return validator_header_log, 400

    countries = list(df['country'].unique())
    if 'main_subject_id' not in d_columns or wikify:
        main_subjects = list(df['main_subject'].unique())
        main_subjects_wikified = self.country_wikifier.wikify(
            main_subjects)
        all_invalid = all([
            main_subjects_wikified[k] is None
            for k in main_subjects_wikified
        ])
        _country = df['country']
        if all_invalid and countries[0].strip().lower() == 'ethiopia':
            # must be Ethiopia regions
            df = self.ethiopia_wikifier.produce(
                input_df=df,
                target_column='main_subject',
                output_column_name='main_subject_id')
        else:
            df['main_subject_id'] = df['main_subject'].map(
                lambda x: main_subjects_wikified[x])

    if 'country_id' not in d_columns or wikify:
        countries_wikified = self.country_wikifier.wikify(countries)
        df['country_id'] = df['country'].map(
            lambda x: countries_wikified[x])

    # validate file contents
    validator_file_log, valid_file = self.validate_input_file(
        df, dataset, variable)
    if not valid_file:
        return validator_file_log, 400

    if 'value_unit' in d_columns and ('value_unit_id' not in d_columns or wikify):
        # Resolve unit labels to qnodes, minting new qnodes for unknowns.
        units = list(df['value_unit'].unique())
        # noinspection SqlNoDataSourceInspection
        units_query = "SELECT e.node1, e.node2 FROM edges e WHERE e.node2 in ({}) and e.label = 'label'".format(
            self.format_sql_string(units))
        units_results = query_to_dicts(units_query)
        unit_qnode_dict = {}
        for ur in units_results:
            unit_qnode_dict[ur['node2']] = ur['node1']
        no_qnode_units = list()
        no_qnode_units.extend(
            [u for u in units if u not in unit_qnode_dict])
        no_unit_qnode_dict = self.create_new_qnodes(
            dataset_id, no_qnode_units, 'Unit')
        df['value_unit_id'] = df['value_unit'].map(
            lambda x: unit_qnode_dict[x]
            if x in unit_qnode_dict else no_unit_qnode_dict[x])

        # create rows for new created variables Unit Qnodes and Source Qnodes
        for k in no_unit_qnode_dict:
            _q = no_unit_qnode_dict[k]
            kgtk_format_list.append(
                self.create_triple(_q, 'label', json.dumps(k)))
            kgtk_format_list.append(self.create_triple(
                _q, 'P31', 'Q47574'))  # unit of measurement

    if 'source' in d_columns and ('source_id' not in d_columns or wikify):
        # Same resolution/minting flow for source labels.
        sources = list(df['source'].unique())
        # noinspection SqlNoDataSourceInspection
        sources_query = "SELECT e.node1, e.node2 FROM edges e WHERE e.label = 'label' and e.node2 in ({})".format(
            self.format_sql_string(sources))
        sources_results = query_to_dicts(sources_query)
        source_qnode_dict = {}
        for sr in sources_results:
            source_qnode_dict[sr['node2']] = sr['node1']
        no_qnode_sources = list()
        no_qnode_sources.extend(
            [s for s in sources if s not in source_qnode_dict])
        no_source_qnode_dict = self.create_new_qnodes(
            dataset_id, no_qnode_sources, 'Source')
        df['source_id'] = df['source'].map(lambda x: source_qnode_dict[
            x] if x in source_qnode_dict else no_source_qnode_dict[x])
        for k in no_source_qnode_dict:
            kgtk_format_list.append(
                self.create_triple(no_source_qnode_dict[k], 'label',
                                   json.dumps(k)))

    # Convert every data row into KGTK measurement edges.
    for i, row in df.iterrows():
        kgtk_format_list.extend(
            self.create_kgtk_measurements(row, dataset_id, variable_pnode,
                                          qualifer_dict))

    if is_request_put:
        # this is a PUT request, delete all data for this variable and upload the current data
        self.vd.delete(dataset, variable)

    df_kgtk = pd.DataFrame(kgtk_format_list)
    import_kgtk_dataframe(df_kgtk)

    return '{} rows imported!'.format(len(df)), 201  # original file