Ejemplo n.º 1
0
    def get_qualifiers(self, variable_qnode, qualifier_labels=None):
        """Look up the qualifier pnodes attached to a variable qnode.

        With qualifier_labels given, returns {label: qualifier_pnode} for the
        labels that already exist; without it, returns {qualifier_pnode: 1}
        for every qualifier of the variable (a membership map).
        P2006020002 is the "qualifier" property linking variable to qualifier.
        """
        if qualifier_labels is None:
            # No label filter: fetch every qualifier edge of this variable.
            # noinspection SqlNoDataSourceInspection
            qualifier_query = f"""select e_qual.node1, e_qual.node2
                                    from edges e_var
                                    join edges e_qual ON e_var.node2 = e_qual.node1
                                    where e_var.label = 'P2006020002'
                                    and e_var.node1 = '{variable_qnode}'"""

            rows = query_to_dicts(qualifier_query)
            return {row['node1']: 1 for row in rows}
        else:
            formatted_qualifiers = self.format_sql_string(qualifier_labels)

            # Only qualifiers whose human-readable label is in the given list.
            # noinspection SqlNoDataSourceInspection
            qualifier_query = f"""select e_qual.node1, e_qual.node2
                                    from edges e_var
                                    join edges e_qual ON e_var.node2 = e_qual.node1
                                    where e_var.label = 'P2006020002'
                                    and e_qual.label = 'label'
                                    and e_qual.node2 in ({formatted_qualifiers})
                                and e_var.node1 = '{variable_qnode}'"""

            rows = query_to_dicts(qualifier_query)
            return {row['node2']: row['node1'] for row in rows}
Ejemplo n.º 2
0
    def create_new_qnodes(self, dataset_id, values, q_type):
        """Mint fresh qnodes of the form '{dataset_id}{q_type}-{n}' for values.

        q_type can be 'Unit' or 'Source' for now. Queries the store for the
        highest existing qnode in that scheme and numbers the new ones from
        one past it. Returns {value: new_qnode}.
        """
        # create qnodes for units with this scheme {dataset_id}Unit-{d}
        if q_type == 'Unit':
            # Units are additionally typed P31=Q47574 (unit of measurement),
            # so restrict the max() scan to those.
            # noinspection SqlNoDataSourceInspection
            _query = "SELECT max(e.node1) as qnode_max FROM edges e WHERE e.label = 'P31' and e.node2 = 'Q47574' and" \
                     " e.node1 like '{}{}-%'".format(dataset_id, q_type)
        else:
            # noinspection SqlNoDataSourceInspection
            _query = "SELECT max(e.node1) as qnode_max FROM edges e WHERE e.node1 like '{}{}-%'".format(
                dataset_id, q_type)

        _result = query_to_dicts(_query)[0]
        qnode_max = _result['qnode_max']  # None when no qnode matches yet

        # Raw string: '\d' in a plain literal is an invalid escape sequence
        # (DeprecationWarning today, a SyntaxError in future Pythons).
        regex_key = r'({}{}-)(\d*)'.format(dataset_id, q_type)
        # Cache compiled patterns per (dataset, type) scheme on the instance.
        if regex_key not in self.qnode_regex:
            self.qnode_regex[regex_key] = re.compile(regex_key)
        _regex = self.qnode_regex[regex_key]
        if not qnode_max:
            seed = 0
        else:
            # groups() -> (prefix, numeric suffix); only the suffix matters.
            _, suffix = _regex.match(qnode_max).groups()
            seed = int(suffix) + 1
        _dict = {}
        for value in values:
            _dict[value] = '{}{}-{}'.format(dataset_id, q_type, seed)
            seed += 1
        return _dict
Ejemplo n.º 3
0
def fuzzy_query_variables(questions: List[str], regions: Dict[str, List[str]], limit: int, debug=False):
    """Full-text search for variables matching any of `questions`, restricted
    to the given regions.

    questions: free-text queries; they are OR-combined into one tsquery.
    regions: maps an admin level name to a list of location qnodes; admin
        levels with empty lists are skipped.
    limit: maximum number of result rows.
    Returns dicts with variable_id, dataset_qnode, name, and a ts_rank score.
    """
    def get_region_where():
        # Adds the where clause for regions specified in the regions dict
        # We have two EXIST clauses per admin type - one for variables whose main_subject is the location,
        # and one for variables whose location is a qualifier.
        # NOTE(review): the loop below emits one EXISTS per admin type; the
        # "two" presumably refers to the materialized view's definition —
        # confirm against search_views.
        admin_wheres = []
        for admin, qnodes in regions.items():
            if not qnodes:
                continue
            qnode_list = ', '.join([f"'{qnode}'" for qnode in qnodes])
            view_name = search_views.get_view_name(admin)
            one_where = f"EXISTS (SELECT 1 FROM {view_name} WHERE {view_name}.variable_id=e_var_name.node2 AND {view_name}.dataset_qnode=e_dataset.node1 AND {view_name}.{admin}_qnode IN ({qnode_list}))"
            admin_wheres.append(one_where)

        # No regions requested: an always-true clause keeps the SQL valid.
        if not admin_wheres:
            return "1=1"
        return '\nOR '.join(admin_wheres)

    # No question text at all: fall back to a pure region filter.
    if not questions:
        return region_only_query_variables(get_region_where(), limit, debug)

    if debug:
        print('questions:', questions)
    sanitized = [sanitize(question) for question in questions]
    # plainto_tsquery parses plain text into a tsquery; '||' ORs them.
    ts_queries = [f"plainto_tsquery('{question}')" for question in sanitized]
    if debug:
        print('ts_queries', ts_queries)
    combined_ts_query = '(' + ' || '.join(ts_queries) + ')'
    if debug:
        print('combined_ts_query:', combined_ts_query)

    region_where = get_region_where()

    # Use Postgres's full text search capabilities
    sql = f"""
    SELECT fuzzy.variable_id, fuzzy.dataset_qnode, fuzzy.name,  ts_rank(variable_text, {combined_ts_query}) AS rank FROM
        (SELECT e_var_name.node2 AS variable_id,
                -- e_dataset_name.node2 AS dataset_id,
                e_dataset.node1 AS dataset_qnode,
                to_tsvector(CONCAT(s_description.text, ' ', s_name.text, ' ', s_label.text)) AS variable_text,
                CONCAT(s_name.text, ' ', s_label.text) as name
            FROM edges e_var
            JOIN edges e_var_name ON (e_var_name.node1=e_var.node1 AND e_var_name.label='P1813')
            JOIN edges e_dataset ON (e_dataset.label='P2006020003' AND e_dataset.node2=e_var.node1)
                    -- JOIN edges e_dataset_name ON (e_dataset_name.node1=e_dataset.node1 AND e_dataset_name.label='P1813')
            LEFT JOIN edges e_description JOIN strings s_description ON (e_description.id=s_description.edge_id) ON (e_var.node1=e_description.node1 AND e_description.label='description')
            LEFT JOIN edges e_name JOIN strings s_name ON (e_name.id=s_name.edge_id) ON (e_var.node1=e_name.node1 AND e_name.label='P1813')
            LEFT JOIN edges e_label JOIN strings s_label ON (e_label.id=s_label.edge_id) ON (e_var.node1=e_label.node1 AND e_label.label='label')

        WHERE e_var.label='P31' AND e_var.node2='Q50701' AND ({region_where})) AS fuzzy
    WHERE variable_text @@ {combined_ts_query}
    ORDER BY rank DESC
    LIMIT {limit}
    """
    if debug:
        print(sql)
    results = query_to_dicts(sql)

    return results
Ejemplo n.º 4
0
def query_variable_data(dataset_id, property_id, places: Dict[str, List[str]], qualifiers, limit, cols, debug=False) -> \
List[Dict[str, Any]]:
    """Fetch the data rows of one variable (property_id) in one dataset.

    places: admin level -> location qnodes, turned into joins/filters by
        preprocess_places. qualifiers: Qualifier objects for the variable;
        cols selects which qualifier columns to emit.
    Returns one dict per measurement (main_subject, value, unit, qualifiers).
    """
    dataset_id = sanitize(dataset_id)
    property_id = sanitize(property_id)

    # A variable may carry at most one location-typed qualifier; if present,
    # the place filter applies to that qualifier's value instead of the
    # main subject.
    location_qualifiers = [q for q in qualifiers if q.data_type == 'location']
    if len(location_qualifiers) == 0:
        location_node = 'e_main.node1'
    elif len(location_qualifiers) == 1:
        location_node = 'e_location.node2'
    else:
        raise ValueError(
            "There are more than one location qualifiers for variable")

    places_join, places_where = preprocess_places(places, location_node)
    qualifier_fields, qualifier_joins = preprocess_qualifiers(qualifiers, cols)

    # NOTE(review): ORDER BY references "time", which presumably comes from
    # qualifier_fields — confirm preprocess_qualifiers always emits it.
    query = f"""
    SELECT  e_main.node1 AS main_subject_id,
            s_main_label.text AS main_subject,
            e_dataset.node2 AS dataset_id,
            q_main.number AS value,
            s_value_unit.text AS value_unit,
            {qualifier_fields}
    FROM edges AS e_main
        JOIN quantities AS q_main ON (e_main.id=q_main.edge_id)
        JOIN edges AS e_dataset ON (e_dataset.node1=e_main.id AND e_dataset.label='P2006020004')
        {qualifier_joins}
        {places_join}
        LEFT JOIN edges AS e_value_unit
            LEFT JOIN strings AS s_value_unit ON (e_value_unit.id=s_value_unit.edge_id)
        ON (e_value_unit.node1=q_main.unit AND e_value_unit.label='label')
        LEFT JOIN edges AS e_main_label
            JOIN strings AS s_main_label ON (e_main_label.id=s_main_label.edge_id)
        ON (e_main.node1=e_main_label.node1 AND e_main_label.label='label')

    WHERE e_main.label='{property_id}' AND e_dataset.node2='{dataset_id}' AND ({places_where})
    ORDER BY main_subject_id, time
    """

    # Some remarks on that query:
    # to_json(d_value_date.date_and_time)... is a recommended way to convert dates to ISO format.
    #
    # Since coordinates are optional, we LEFT JOIN on *A JOIN* of e_coordinate and c_coordinate. The weird
    # syntax of T LEFT JOIN A JOIN B ON (...) ON (...) is the SQL way of explicity specifying which INNER
    # JOINS are LEFT JOINed.
    #
    # We use the || operator on fields from the LEFT JOIN, since x || NULL is NULL in SQL, so coordinate is
    # NULL in the result if there is no coordinate

    # limit <= 0 means "no limit".
    if limit > 0:
        query += f"\nLIMIT {limit}\n"
    if debug:
        print(query)

    return query_to_dicts(query)
Ejemplo n.º 5
0
def query_dataset_metadata(dataset_name=None,
                           include_dataset_qnode=False,
                           debug=False):
    """Return the metadata of a dataset (id, name, description, url).

    If no name is provided, all datasets are returned. Returns None when
    dataset_name is given but does not resolve to a dataset qnode.
    include_dataset_qnode additionally selects the dataset's qnode.
    """

    # Shortcut to helper, with only the parameters we need
    def join_edge(alias, label, satellite_type=None, left=False):
        return _join_edge_helper('e_dataset', alias, label, satellite_type,
                                 left)

    if dataset_name:
        dataset_id = get_dataset_id(dataset_name)  # Already calls sanitize
        if not dataset_id:
            return None

        # Renamed from `filter` to avoid shadowing the builtin.
        dataset_filter = f"e_dataset.node1='{dataset_id}'"
    else:
        dataset_filter = '1=1'  # no-op: match every dataset

    if include_dataset_qnode:
        qnode_select = 'e_dataset.node1 as dataset_qnode,'
    else:
        qnode_select = ''
    query = f'''
    SELECT {qnode_select}
            s_short_name.text AS dataset_id,
            s_name.text AS name,
            s_description.text AS description,
            s_url.text AS url


        FROM edges e_dataset
    '''
    # Mandatory First
    query += join_edge('name', 'P1476', 's')
    query += join_edge('description', 'description', 's')
    query += join_edge('url', 'P2699', 's')
    query += join_edge('short_name', 'P1813', 's')

    query += f'''
    WHERE e_dataset.label='P31' AND e_dataset.node2='Q1172284' AND {dataset_filter}
    '''

    if debug:
        print(query)
    return query_to_dicts(query)
Ejemplo n.º 6
0
def get_dataset_id(dataset):
    """Resolve a dataset short name (P1813) to its qnode, or None."""
    dataset = sanitize(dataset)
    dataset_query = f'''
    SELECT e_dataset.node1 AS dataset_id
        FROM edges e_dataset
    WHERE e_dataset.label='P1813' AND e_dataset.node2='{dataset}';
    '''
    rows = query_to_dicts(dataset_query)
    if not rows:
        return None
    return rows[0]['dataset_id']
Ejemplo n.º 7
0
 def run_query(select_clause, join_clause, order_by_clause=""):
     """Assemble and execute a query from per-call and shared clauses.

     Nested helper: `from_clause`, `where_clause` (declared nonlocal) and
     `debug` are read from the enclosing function's scope. The nonlocal
     declaration is not strictly needed for reads — presumably kept for
     clarity about where the names come from.
     """
     nonlocal from_clause, where_clause
     query = f"""
     {select_clause}
     {from_clause}
     {join_clause}
     {where_clause}
     {order_by_clause}
     """
     if debug:
         print(query)
     return query_to_dicts(query)
Ejemplo n.º 8
0
def get_label(qnode, default=None, lang='en'):
    """Fetch the label of `qnode` in language `lang`, or `default`."""
    qnode = sanitize(qnode)
    lang = sanitize(lang)

    label_query = f'''
    SELECT node1 as qnode, text as label
    FROM edges e
    INNER JOIN strings s on e.id = s.edge_id
    WHERE e.node1 = '{qnode}' and e.label = 'label' and s.language='{lang}';
    '''
    rows = query_to_dicts(label_query)
    return rows[0]['label'] if rows else default
Ejemplo n.º 9
0
def variable_data_exists(dataset_id, property_ids, debug=False):
    """Return True if the dataset has data under any of the property ids.

    Looks for at least one main edge labelled with one of property_ids whose
    P2006020004 (dataset) edge points at dataset_id.
    """
    # Sanitize interpolated values, consistent with the other query
    # builders in this module.
    dataset_id = sanitize(dataset_id)
    property_ids_str = ', '.join(
        [f"'{sanitize(property_id)}'" for property_id in property_ids])
    query = f"""
            SELECT e_main.id
                FROM edges AS e_main
                JOIN edges AS e_dataset ON (e_dataset.node1=e_main.id AND e_dataset.label='P2006020004')
            WHERE e_main.label IN ({property_ids_str}) AND e_dataset.node2='{dataset_id}'
            LIMIT 1
    """
    if debug:
        print(query)

    result = query_to_dicts(query)
    return len(result) > 0
Ejemplo n.º 10
0
def query_qualifiers(dataset_id, variable_qnode):
    """Return Qualifier objects for a variable qnode within a dataset."""
    dataset_id = sanitize(dataset_id)
    variable_qnode = sanitize(variable_qnode)
    query = f"""
    SELECT e_qualifier.node2 AS label, s_qualifier_label.text AS name, e_data_type.node2 AS wikidata_data_type
        FROM edges e_var
        JOIN edges e_dataset ON (e_dataset.label='P2006020003' AND e_dataset.node2=e_var.node1)
        JOIN edges e_qualifier ON (e_var.node1=e_qualifier.node1 AND e_qualifier.label='P2006020002')
        LEFT JOIN edges e_qualifier_label  -- Location qualifiers have no name
            JOIN strings s_qualifier_label ON (e_qualifier_label.id=s_qualifier_label.edge_id)
        ON (e_qualifier.node2=e_qualifier_label.node1 AND e_qualifier_label.label='label')
        LEFT JOIN edges e_data_type
            ON (e_qualifier.node2=e_data_type.node1 AND e_data_type.label='wikidata_data_type')

    WHERE e_var.label='P31' AND e_var.node2='Q50701' AND e_dataset.node1='{dataset_id}'  AND e_var.node1='{variable_qnode}'
    """
    rows = query_to_dicts(query)
    result = []
    for row in rows:
        # Each row maps directly onto the Qualifier constructor's kwargs.
        result.append(Qualifier(**row))
    return result
Ejemplo n.º 11
0
def next_variable_value(dataset_id, prefix) -> int:
    """Return the next free numeric suffix for variables named {prefix}N.

    Scans the dataset's variables whose P1813 short name matches the prefix
    followed by digits, and returns max+1 (0 when none exist yet).
    """
    dataset_id = sanitize(dataset_id)

    query = f'''
    select max(substring(e_variable.node2 from '{prefix}#"[0-9]+#"' for '#')::INTEGER)  from edges e_variable
    where e_variable.node1 in
(
    select e_dataset.node2 from edges e_dataset
    where e_dataset.node1 = '{dataset_id}'
    and e_dataset.label = 'P2006020003'
)
and e_variable.label = 'P1813' and e_variable.node2 similar to '{prefix}[0-9]+';
    '''
    rows = query_to_dicts(query)
    # max() yields NULL (-> None) when nothing matched; start at 0 then.
    if not rows or rows[0]['max'] is None:
        return 0
    return rows[0]['max'] + 1
Ejemplo n.º 12
0
def query_variable(dataset, variable):
    """Look up a variable by (dataset short name, variable short name).

    Returns a dict with the variable's qnode, id, name, property, and its
    dataset's qnode and name — or None when no such variable exists.
    """
    dataset = sanitize(dataset)
    variable = sanitize(variable)

    variable_query = f'''
    SELECT e_var.node1 AS variable_qnode, e_var.node2 AS variable_id, s_var_label.text AS variable_name, e_property.node2 AS property_id, e_dataset.node1 AS dataset_id, e_dataset_label.node2 AS dataset_name
        FROM edges e_var
        JOIN edges e_var_label ON (e_var.node1=e_var_label.node1 AND e_var_label.label='label')
        JOIN strings s_var_label ON (e_var_label.id=s_var_label.edge_id)
        JOIN edges e_property ON (e_property.node1=e_var.node1 AND e_property.label='P1687')
        JOIN edges e_dataset ON (e_dataset.label='P2006020003' AND e_dataset.node2=e_property.node1)
        JOIN edges e_dataset_label ON (e_dataset_label.node1=e_dataset.node1 AND e_dataset_label.label='P1813')
    WHERE e_var.label='P1813' AND e_var.node2='{variable}' AND e_dataset_label.node2='{dataset}';
    '''

    matches = query_to_dicts(variable_query)
    return matches[0] if matches else None
Ejemplo n.º 13
0
def get_variable_id(dataset_id, variable, debug=False) -> Union[str, None]:
    """Resolve a variable short name within a dataset to its qnode, or None."""
    dataset_id = sanitize(dataset_id)
    variable = sanitize(variable)

    if debug:
        print(f'variable_exists({dataset_id}, {variable})')
    variable_query = f'''
    select e_variable.node1 AS variable_id from edges e_variable
    where e_variable.node1 in
(
    select e_dataset.node2 from edges e_dataset
    where e_dataset.node1 = '{dataset_id}'
    and e_dataset.label = 'P2006020003'
)
and e_variable.label = 'P1813' and e_variable.node2 = '{variable}';
    '''
    rows = query_to_dicts(variable_query)
    if not rows:
        return None
    return rows[0]['variable_id']
Ejemplo n.º 14
0
def region_only_query_variables(region_where: str, limit: int, debug=False):
    """Fetch variables filtered only by a pre-built region WHERE clause.

    region_where is a SQL boolean expression (built by the caller) over the
    region search views. Returns dicts with variable_id, dataset_qnode, a
    tsvector of descriptive text, and a display name.
    """
    sql = f"""
    SELECT e_var_name.node2 AS variable_id,
                -- e_dataset_name.node2 AS dataset_id,
                e_dataset.node1 AS dataset_qnode,
                to_tsvector(CONCAT(s_description.text, ' ', s_name.text, ' ', s_label.text)) AS variable_text,
                CONCAT(s_name.text, ' ', s_label.text) as name
            FROM edges e_var
            JOIN edges e_var_name ON (e_var_name.node1=e_var.node1 AND e_var_name.label='P1813')
            JOIN edges e_dataset ON (e_dataset.label='P2006020003' AND e_dataset.node2=e_var.node1)
                    -- JOIN edges e_dataset_name ON (e_dataset_name.node1=e_dataset.node1 AND e_dataset_name.label='P1813')
            LEFT JOIN edges e_description JOIN strings s_description ON (e_description.id=s_description.edge_id) ON (e_var.node1=e_description.node1 AND e_description.label='description')
            LEFT JOIN edges e_name JOIN strings s_name ON (e_name.id=s_name.edge_id) ON (e_var.node1=e_name.node1 AND e_name.label='P1813')
            LEFT JOIN edges e_label JOIN strings s_label ON (e_label.id=s_label.edge_id) ON (e_var.node1=e_label.node1 AND e_label.label='label')

        WHERE e_var.label='P31' AND e_var.node2='Q50701' AND ({region_where})
        LIMIT {limit}
    """
    if debug:
        print(sql)
    results = query_to_dicts(sql)

    return results
Ejemplo n.º 15
0
def does_view_exists(conn, admin):
    """Return True if the materialized view for `admin` exists.

    Checks the pg_matviews catalog for the view name derived from the admin
    level. Returns an explicit bool (previously the raw row count).
    """
    view_name = get_view_name(admin)
    query = f"SELECT * FROM pg_matviews WHERE matviewname='{view_name}'"
    result = query_to_dicts(query, conn)
    return len(result) > 0
Ejemplo n.º 16
0
    def canonical_data(self, dataset, variable, is_request_put=True):
        """Import a canonical-format CSV for (dataset, variable) into the KG.

        Reads the uploaded file from the Flask request, wikifies locations,
        units and sources as needed, builds KGTK-format edges and imports
        them. Returns a (body, status) pair suitable for a Flask response.
        PUT semantics (is_request_put=True) delete existing data first.
        """
        wikify = request.args.get('wikify', 'false').lower() == 'true'

        # check if the dataset exists
        dataset_id = dal.get_dataset_id(dataset)

        if not dataset_id:
            return {'Error': 'Dataset not found: {}'.format(dataset)}, 404

        # check if variable exists for the dataset
        # P2006020003 - Variable Measured
        # P1687 - Corresponds to property

        # noinspection SqlNoDataSourceInspection
        variable_query = f"""Select e.node1, e.node2 from edges e where e.node1 in (
                                select e_variable.node1 from edges e_variable
                                        where e_variable.node1 in
                                    (
                                        select e_dataset.node2 from edges e_dataset
                                        where e_dataset.node1 = '{dataset_id}'
                                        and e_dataset.label = 'P2006020003'
                                    )
                                    and e_variable.label = 'P1813' and e_variable.node2 = '{variable}'
                                )
                                and e.label = 'P1687'
                                    """

        variable_result = query_to_dicts(variable_query)
        if len(variable_result) == 0:
            return {
                'error':
                'Variable: {} not found for the dataset: {}'.format(
                    variable, dataset)
            }, 404

        # node1 is the variable's qnode, node2 its measurement pnode (P1687).
        variable_pnode = variable_result[0]['node2']
        variable_qnode = variable_result[0]['node1']

        # Accumulates every KGTK edge to import at the end.
        kgtk_format_list = list()

        # dataset and variable has been found, wikify and upload the data
        df = pd.read_csv(request.files['file'], dtype=object).fillna('')
        # Normalize headers: lowercase and strip whitespace.
        column_map = {}
        _d_columns = list(df.columns)
        for c in _d_columns:
            column_map[c] = c.lower().strip()

        df = df.rename(columns=column_map)

        d_columns = list(df.columns)

        # Any column that is neither a known structural column nor a
        # required field is treated as a qualifier column.
        unsanitized_qualifier_columns = [
            x for x in d_columns if x not in self.non_qualifier_columns
            and x not in self.required_fields
        ]
        s_qualifier_columns = {}
        for qc in unsanitized_qualifier_columns:
            s_qualifier_columns[qc] = sanitize(qc)

        df = df.rename(columns=s_qualifier_columns)
        d_columns = list(df.columns)

        qualifier_columns = [
            s_qualifier_columns[k] for k in s_qualifier_columns
        ]
        qualifer_dict = {}
        if qualifier_columns:
            # extra columns in the file, qualifier time
            # first see if any qualifiers already exist
            qualifer_dict = self.get_qualifiers(
                variable_qnode, qualifier_labels=qualifier_columns)
            qualifier_to_be_created = [
                x for x in qualifier_columns if x not in qualifer_dict
            ]
            qualifier_edges, new_q_dict = self.create_qualifier_edges(
                qualifier_to_be_created, variable_qnode)
            kgtk_format_list.extend(qualifier_edges)
            qualifer_dict.update(new_q_dict)

        # Make sure the standard time (P585) and source (P248) qualifiers
        # exist for this variable, creating edges for missing ones.
        existing_qualifiers = self.get_qualifiers(variable_qnode)
        extra_qualifiers = []
        if 'P585' not in existing_qualifiers:
            extra_qualifiers.append('P585')
        if 'P248' not in existing_qualifiers:
            extra_qualifiers.append('P248')
        extra_qualifier_edges, _ = self.create_qualifier_edges(
            extra_qualifiers, variable_qnode)
        kgtk_format_list.extend(extra_qualifier_edges)

        # validate file headers first
        validator_header_log, valid_file = self.validate_headers(df)
        if not valid_file:
            return validator_header_log, 400

        countries = list(df['country'].unique())

        # Wikify main subjects when no ids were supplied or re-wikify was
        # explicitly requested via ?wikify=true.
        if 'main_subject_id' not in d_columns or wikify:
            main_subjects = list(df['main_subject'].unique())
            main_subjects_wikified = self.country_wikifier.wikify(
                main_subjects)
            all_invalid = all([
                main_subjects_wikified[k] is None
                for k in main_subjects_wikified
            ])
            # NOTE(review): _country is never read afterwards — presumably
            # leftover; confirm before removing.
            _country = df['country']
            if all_invalid and countries[0].strip().lower() == 'ethiopia':
                # must be Ethiopia regions
                df = self.ethiopia_wikifier.produce(
                    input_df=df,
                    target_column='main_subject',
                    output_column_name='main_subject_id')

            else:
                df['main_subject_id'] = df['main_subject'].map(
                    lambda x: main_subjects_wikified[x])

        if 'country_id' not in d_columns or wikify:
            countries_wikified = self.country_wikifier.wikify(countries)
            df['country_id'] = df['country'].map(
                lambda x: countries_wikified[x])

        # validate file contents
        validator_file_log, valid_file = self.validate_input_file(
            df, dataset, variable)
        if not valid_file:
            return validator_file_log, 400

        # Resolve unit names to qnodes; mint new Unit qnodes for unknowns.
        if 'value_unit' in d_columns and ('value_unit_id' not in d_columns
                                          or wikify):
            units = list(df['value_unit'].unique())

            # noinspection SqlNoDataSourceInspection
            units_query = "SELECT e.node1, e.node2 FROM edges e WHERE e.node2 in ({}) and e.label = 'label'".format(
                self.format_sql_string(units))

            units_results = query_to_dicts(units_query)

            unit_qnode_dict = {}

            for ur in units_results:
                unit_qnode_dict[ur['node2']] = ur['node1']

            no_qnode_units = list()
            no_qnode_units.extend(
                [u for u in units if u not in unit_qnode_dict])

            no_unit_qnode_dict = self.create_new_qnodes(
                dataset_id, no_qnode_units, 'Unit')

            df['value_unit_id'] = df['value_unit'].map(
                lambda x: unit_qnode_dict[x]
                if x in unit_qnode_dict else no_unit_qnode_dict[x])

            # create rows for new created variables Unit Qnodes and Source Qnodes
            for k in no_unit_qnode_dict:
                _q = no_unit_qnode_dict[k]
                kgtk_format_list.append(
                    self.create_triple(_q, 'label', json.dumps(k)))
                kgtk_format_list.append(self.create_triple(
                    _q, 'P31', 'Q47574'))  # unit of measurement

        # Resolve source names to qnodes; mint new Source qnodes for unknowns.
        if 'source' in d_columns and ('source_id' not in d_columns or wikify):
            sources = list(df['source'].unique())

            # noinspection SqlNoDataSourceInspection
            sources_query = "SELECT  e.node1, e.node2 FROM edges e WHERE e.label = 'label' and e.node2 in  ({})".format(
                self.format_sql_string(sources))

            sources_results = query_to_dicts(sources_query)

            source_qnode_dict = {}
            for sr in sources_results:
                source_qnode_dict[sr['node2']] = sr['node1']

            no_qnode_sources = list()
            no_qnode_sources.extend(
                [s for s in sources if s not in source_qnode_dict])

            no_source_qnode_dict = self.create_new_qnodes(
                dataset_id, no_qnode_sources, 'Source')

            df['source_id'] = df['source'].map(lambda x: source_qnode_dict[
                x] if x in source_qnode_dict else no_source_qnode_dict[x])
            for k in no_source_qnode_dict:
                kgtk_format_list.append(
                    self.create_triple(no_source_qnode_dict[k], 'label',
                                       json.dumps(k)))

        # Convert every data row into KGTK measurement edges.
        for i, row in df.iterrows():
            kgtk_format_list.extend(
                self.create_kgtk_measurements(row, dataset_id, variable_pnode,
                                              qualifer_dict))

        if is_request_put:
            # this is a PUT request, delete all data for this variable and upload the current data
            self.vd.delete(dataset, variable)

        df_kgtk = pd.DataFrame(kgtk_format_list)
        import_kgtk_dataframe(df_kgtk)

        return '{} rows imported!'.format(len(df)), 201  # original file