Ejemplo n.º 1
0
def query_variable_data(dataset_id, property_id, places: Dict[str, List[str]], qualifiers, limit, cols, debug=False) -> \
List[Dict[str, Any]]:
    """Build and run the SQL query that returns measurement rows for one variable.

    Args:
        dataset_id: dataset id (node2 of the P2006020004 edge); sanitized here
            before f-string interpolation into the SQL text.
        property_id: edge label of the main measurement edges; sanitized here.
        places: mapping of admin level -> list of place ids; consumed by
            preprocess_places to build the location join/filter.
        qualifiers: qualifier descriptors, each exposing a ``data_type``
            attribute; consumed here (location detection) and by
            preprocess_qualifiers.
        limit: maximum number of rows; values <= 0 mean no LIMIT clause.
        cols: column selection forwarded to preprocess_qualifiers.
        debug: when True, print the generated SQL before executing it.

    Returns:
        List of row dicts as produced by query_to_dicts.

    Raises:
        ValueError: if the variable has more than one location qualifier.
    """
    # sanitize() guards the direct f-string interpolation into SQL below.
    dataset_id = sanitize(dataset_id)
    property_id = sanitize(property_id)

    # Choose the SQL column that identifies each row's place: with no location
    # qualifier it is the main edge's subject; with exactly one it is the
    # qualifier edge's object (the e_location alias is presumably emitted by
    # preprocess_qualifiers -- TODO confirm).
    location_qualifiers = [q for q in qualifiers if q.data_type == 'location']
    if len(location_qualifiers) == 0:
        location_node = 'e_main.node1'
    elif len(location_qualifiers) == 1:
        location_node = 'e_location.node2'
    else:
        raise ValueError(
            "There are more than one location qualifiers for variable")

    places_join, places_where = preprocess_places(places, location_node)
    qualifier_fields, qualifier_joins = preprocess_qualifiers(qualifiers, cols)

    # NOTE(review): ORDER BY references a 'time' column, which must be one of
    # the generated qualifier_fields -- presumably this query fails for
    # variables without a time qualifier; confirm with preprocess_qualifiers.
    query = f"""
    SELECT  e_main.node1 AS main_subject_id,
            s_main_label.text AS main_subject,
            e_dataset.node2 AS dataset_id,
            q_main.number AS value,
            s_value_unit.text AS value_unit,
            {qualifier_fields}
    FROM edges AS e_main
        JOIN quantities AS q_main ON (e_main.id=q_main.edge_id)
        JOIN edges AS e_dataset ON (e_dataset.node1=e_main.id AND e_dataset.label='P2006020004')
        {qualifier_joins}
        {places_join}
        LEFT JOIN edges AS e_value_unit
            LEFT JOIN strings AS s_value_unit ON (e_value_unit.id=s_value_unit.edge_id)
        ON (e_value_unit.node1=q_main.unit AND e_value_unit.label='label')
        LEFT JOIN edges AS e_main_label
            JOIN strings AS s_main_label ON (e_main_label.id=s_main_label.edge_id)
        ON (e_main.node1=e_main_label.node1 AND e_main_label.label='label')

    WHERE e_main.label='{property_id}' AND e_dataset.node2='{dataset_id}' AND ({places_where})
    ORDER BY main_subject_id, time
    """

    # Some remarks on that query:
    # to_json(d_value_date.date_and_time)... is a recommended way to convert dates to ISO format.
    #
    # Since coordinates are optional, we LEFT JOIN on *A JOIN* of e_coordinate and c_coordinate. The weird
    # syntax of T LEFT JOIN A JOIN B ON (...) ON (...) is the SQL way of explicity specifying which INNER
    # JOINS are LEFT JOINed.
    #
    # We use the || operator on fields from the LEFT JOIN, since x || NULL is NULL in SQL, so coordinate is
    # NULL in the result if there is no coordinate

    # limit <= 0 disables the LIMIT clause entirely.
    if limit > 0:
        query += f"\nLIMIT {limit}\n"
    if debug:
        print(query)

    return query_to_dicts(query)
Ejemplo n.º 2
0
def query_qualifiers(dataset_id, variable_qnode):
    """Return the Qualifier objects declared (P2006020002) for a variable.

    Both arguments are sanitized before being interpolated into the SQL.
    Location qualifiers come back with a NULL name (no label edge), and
    wikidata_data_type is NULL when no such edge exists.
    """
    dataset_id = sanitize(dataset_id)
    variable_qnode = sanitize(variable_qnode)

    sql = f"""
    SELECT e_qualifier.node2 AS label, s_qualifier_label.text AS name, e_data_type.node2 AS wikidata_data_type
        FROM edges e_var
        JOIN edges e_dataset ON (e_dataset.label='P2006020003' AND e_dataset.node2=e_var.node1)
        JOIN edges e_qualifier ON (e_var.node1=e_qualifier.node1 AND e_qualifier.label='P2006020002')
        LEFT JOIN edges e_qualifier_label  -- Location qualifiers have no name
            JOIN strings s_qualifier_label ON (e_qualifier_label.id=s_qualifier_label.edge_id)
        ON (e_qualifier.node2=e_qualifier_label.node1 AND e_qualifier_label.label='label')
        LEFT JOIN edges e_data_type
            ON (e_qualifier.node2=e_data_type.node1 AND e_data_type.label='wikidata_data_type')

    WHERE e_var.label='P31' AND e_var.node2='Q50701' AND e_dataset.node1='{dataset_id}'  AND e_var.node1='{variable_qnode}'
    """

    # Wrap each result row in a Qualifier value object.
    return [Qualifier(**row) for row in query_to_dicts(sql)]
Ejemplo n.º 3
0
def query_variable(dataset, variable):
    """Look up a variable by short name within a dataset.

    Returns a dict with variable_qnode, variable_id, variable_name,
    property_id, dataset_id and dataset_name, or None when no match exists.
    """
    dataset = sanitize(dataset)
    variable = sanitize(variable)

    sql = f'''
    SELECT e_var.node1 AS variable_qnode, e_var.node2 AS variable_id, s_var_label.text AS variable_name, e_property.node2 AS property_id, e_dataset.node1 AS dataset_id, e_dataset_label.node2 AS dataset_name
        FROM edges e_var
        JOIN edges e_var_label ON (e_var.node1=e_var_label.node1 AND e_var_label.label='label')
        JOIN strings s_var_label ON (e_var_label.id=s_var_label.edge_id)
        JOIN edges e_property ON (e_property.node1=e_var.node1 AND e_property.label='P1687')
        JOIN edges e_dataset ON (e_dataset.label='P2006020003' AND e_dataset.node2=e_property.node1)
        JOIN edges e_dataset_label ON (e_dataset_label.node1=e_dataset.node1 AND e_dataset_label.label='P1813')
    WHERE e_var.label='P1813' AND e_var.node2='{variable}' AND e_dataset_label.node2='{dataset}';
    '''

    rows = query_to_dicts(sql)
    # First matching row, or None when the variable is unknown.
    return rows[0] if rows else None
Ejemplo n.º 4
0
def get_variable_id(dataset_id, variable, debug=False) -> Union[str, None]:
    """Return the qnode id of `variable` within `dataset_id`, or None.

    Resolves the chain dataset --P2006020003--> variable-qnode whose
    P1813 (short name) edge equals `variable`.

    Args:
        dataset_id: dataset qnode id; sanitized before SQL interpolation.
        variable: variable short name (P1813 value); likewise sanitized.
        debug: when True, print the call for troubleshooting.

    Returns:
        The variable's node1 id as a string, or None when not found.
    """
    dataset_id = sanitize(dataset_id)
    variable = sanitize(variable)

    if debug:
        # Bug fix: the message previously said 'variable_exists', a stale
        # name left over from before this function was renamed.
        print(f'get_variable_id({dataset_id}, {variable})')
    variable_query = f'''
    select e_variable.node1 AS variable_id from edges e_variable
    where e_variable.node1 in
(
    select e_dataset.node2 from edges e_dataset
    where e_dataset.node1 = '{dataset_id}'
    and e_dataset.label = 'P2006020003'
)
and e_variable.label = 'P1813' and e_variable.node2 = '{variable}';
    '''
    variable_dicts = query_to_dicts(variable_query)
    # Empty result list means the variable is not registered for the dataset.
    if variable_dicts:
        return variable_dicts[0]['variable_id']
    return None
Ejemplo n.º 5
0
    def canonical_data(self, dataset, variable, is_request_put=True):
        """Ingest an uploaded canonical-format CSV for (dataset, variable).

        Reads the CSV from ``request.files['file']``, ensures the qualifier,
        unit and source qnodes exist (creating edges for missing ones),
        wikifies main subjects/countries where needed, converts every data
        row to KGTK edges and imports the result.

        Args:
            dataset: dataset short name from the request path.
            variable: variable short name within the dataset.
            is_request_put: True for PUT semantics -- existing data for the
                variable is deleted before import; False appends.

        Returns:
            A (payload, http_status) tuple: 201 with a summary string on
            success, 404 for unknown dataset/variable, 400 on validation
            failure.
        """
        # ?wikify=true forces re-wikification even when *_id columns exist.
        wikify = request.args.get('wikify', 'false').lower() == 'true'

        # check if the dataset exists
        dataset_id = dal.get_dataset_id(dataset)

        if not dataset_id:
            return {'Error': 'Dataset not found: {}'.format(dataset)}, 404

        # check if variable exists for the dataset
        # P2006020003 - Variable Measured
        # P1687 - Corresponds to property

        # NOTE(review): 'variable' is interpolated into SQL without
        # sanitize(), unlike the sibling query helpers -- possible SQL
        # injection; confirm upstream validation or sanitize here.
        # noinspection SqlNoDataSourceInspection
        variable_query = f"""Select e.node1, e.node2 from edges e where e.node1 in (
                                select e_variable.node1 from edges e_variable
                                        where e_variable.node1 in
                                    (
                                        select e_dataset.node2 from edges e_dataset
                                        where e_dataset.node1 = '{dataset_id}'
                                        and e_dataset.label = 'P2006020003'
                                    )
                                    and e_variable.label = 'P1813' and e_variable.node2 = '{variable}'
                                )
                                and e.label = 'P1687'
                                    """

        variable_result = query_to_dicts(variable_query)
        if len(variable_result) == 0:
            return {
                'error':
                'Variable: {} not found for the dataset: {}'.format(
                    variable, dataset)
            }, 404

        # P1687 edge: node1 is the variable qnode, node2 the measured pnode.
        variable_pnode = variable_result[0]['node2']
        variable_qnode = variable_result[0]['node1']

        # Accumulates every KGTK edge produced below; imported in one batch.
        kgtk_format_list = list()

        # dataset and variable has been found, wikify and upload the data
        df = pd.read_csv(request.files['file'], dtype=object).fillna('')
        # Normalize headers: lowercase and strip surrounding whitespace.
        column_map = {}
        _d_columns = list(df.columns)
        for c in _d_columns:
            column_map[c] = c.lower().strip()

        df = df.rename(columns=column_map)

        d_columns = list(df.columns)

        # Any column that is neither structural nor required is a qualifier.
        unsanitized_qualifier_columns = [
            x for x in d_columns if x not in self.non_qualifier_columns
            and x not in self.required_fields
        ]
        s_qualifier_columns = {}
        for qc in unsanitized_qualifier_columns:
            s_qualifier_columns[qc] = sanitize(qc)

        df = df.rename(columns=s_qualifier_columns)
        d_columns = list(df.columns)

        qualifier_columns = [
            s_qualifier_columns[k] for k in s_qualifier_columns
        ]
        qualifer_dict = {}
        if qualifier_columns:
            # extra columns in the file, qualifier time
            # first see if any qualifiers already exist
            qualifer_dict = self.get_qualifiers(
                variable_qnode, qualifier_labels=qualifier_columns)
            qualifier_to_be_created = [
                x for x in qualifier_columns if x not in qualifer_dict
            ]
            qualifier_edges, new_q_dict = self.create_qualifier_edges(
                qualifier_to_be_created, variable_qnode)
            kgtk_format_list.extend(qualifier_edges)
            qualifer_dict.update(new_q_dict)

        # Always ensure P585 and P248 qualifiers exist for the variable
        # (presumably Wikidata's 'point in time' and 'stated in' -- confirm).
        existing_qualifiers = self.get_qualifiers(variable_qnode)
        extra_qualifiers = []
        if 'P585' not in existing_qualifiers:
            extra_qualifiers.append('P585')
        if 'P248' not in existing_qualifiers:
            extra_qualifiers.append('P248')
        extra_qualifier_edges, _ = self.create_qualifier_edges(
            extra_qualifiers, variable_qnode)
        kgtk_format_list.extend(extra_qualifier_edges)

        # validate file headers first
        validator_header_log, valid_file = self.validate_headers(df)
        if not valid_file:
            return validator_header_log, 400

        countries = list(df['country'].unique())

        # Wikify main subjects when ids are missing or re-wikify was forced.
        if 'main_subject_id' not in d_columns or wikify:
            main_subjects = list(df['main_subject'].unique())
            main_subjects_wikified = self.country_wikifier.wikify(
                main_subjects)
            all_invalid = all([
                main_subjects_wikified[k] is None
                for k in main_subjects_wikified
            ])
            _country = df['country']
            if all_invalid and countries[0].strip().lower() == 'ethiopia':
                # must be Ethiopia regions
                df = self.ethiopia_wikifier.produce(
                    input_df=df,
                    target_column='main_subject',
                    output_column_name='main_subject_id')

            else:
                df['main_subject_id'] = df['main_subject'].map(
                    lambda x: main_subjects_wikified[x])

        if 'country_id' not in d_columns or wikify:
            countries_wikified = self.country_wikifier.wikify(countries)
            df['country_id'] = df['country'].map(
                lambda x: countries_wikified[x])

        # validate file contents
        validator_file_log, valid_file = self.validate_input_file(
            df, dataset, variable)
        if not valid_file:
            return validator_file_log, 400

        # Map each unit string to a qnode: reuse existing label edges, and
        # mint new qnodes for units the knowledge graph does not know yet.
        if 'value_unit' in d_columns and ('value_unit_id' not in d_columns
                                          or wikify):
            units = list(df['value_unit'].unique())

            # noinspection SqlNoDataSourceInspection
            units_query = "SELECT e.node1, e.node2 FROM edges e WHERE e.node2 in ({}) and e.label = 'label'".format(
                self.format_sql_string(units))

            units_results = query_to_dicts(units_query)

            unit_qnode_dict = {}

            for ur in units_results:
                unit_qnode_dict[ur['node2']] = ur['node1']

            no_qnode_units = list()
            no_qnode_units.extend(
                [u for u in units if u not in unit_qnode_dict])

            no_unit_qnode_dict = self.create_new_qnodes(
                dataset_id, no_qnode_units, 'Unit')

            df['value_unit_id'] = df['value_unit'].map(
                lambda x: unit_qnode_dict[x]
                if x in unit_qnode_dict else no_unit_qnode_dict[x])

            # create rows for new created variables Unit Qnodes and Source Qnodes
            for k in no_unit_qnode_dict:
                _q = no_unit_qnode_dict[k]
                kgtk_format_list.append(
                    self.create_triple(_q, 'label', json.dumps(k)))
                kgtk_format_list.append(self.create_triple(
                    _q, 'P31', 'Q47574'))  # unit of measurement

        # Same reuse-or-mint treatment for source strings.
        if 'source' in d_columns and ('source_id' not in d_columns or wikify):
            sources = list(df['source'].unique())

            # noinspection SqlNoDataSourceInspection
            sources_query = "SELECT  e.node1, e.node2 FROM edges e WHERE e.label = 'label' and e.node2 in  ({})".format(
                self.format_sql_string(sources))

            sources_results = query_to_dicts(sources_query)

            source_qnode_dict = {}
            for sr in sources_results:
                source_qnode_dict[sr['node2']] = sr['node1']

            no_qnode_sources = list()
            no_qnode_sources.extend(
                [s for s in sources if s not in source_qnode_dict])

            no_source_qnode_dict = self.create_new_qnodes(
                dataset_id, no_qnode_sources, 'Source')

            df['source_id'] = df['source'].map(lambda x: source_qnode_dict[
                x] if x in source_qnode_dict else no_source_qnode_dict[x])
            for k in no_source_qnode_dict:
                kgtk_format_list.append(
                    self.create_triple(no_source_qnode_dict[k], 'label',
                                       json.dumps(k)))

        # Convert every data row into its KGTK measurement edges.
        for i, row in df.iterrows():
            kgtk_format_list.extend(
                self.create_kgtk_measurements(row, dataset_id, variable_pnode,
                                              qualifer_dict))

        if is_request_put:
            # this is a PUT request, delete all data for this variable and upload the current data
            self.vd.delete(dataset, variable)

        df_kgtk = pd.DataFrame(kgtk_format_list)
        import_kgtk_dataframe(df_kgtk)

        return '{} rows imported!'.format(len(df)), 201  # original file
Ejemplo n.º 6
0
    def _init_sql(self):
        main_name = self.main_column
        underscored_main_name = sanitize(main_name.replace(' ', '_'))
        main_table = 'e_' + underscored_main_name
        if self.is_optional:
            join_clause = 'LEFT '
        else:
            join_clause = ''
        join_clause += f"JOIN edges {main_table}"
        join_on_clause = f"ON (e_main.id={main_table}.node1 AND {main_table}.label='{self.label}')"
        if self.data_type == 'date_and_time':
            satellite_table = 'd_' + main_name
            satellite_join = f"JOIN dates {satellite_table} ON ({main_table}.id={satellite_table}.edge_id)"
            self.fields = {
                main_name:
                f"to_json({satellite_table}.date_and_time)#>>'{{}}' || 'Z'",
                main_name + "_precision": f"{satellite_table}.precision",
            }
        elif self.data_type == 'quantity':
            satellite_table = 'q_' + underscored_main_name
            unit_table = 'e_' + underscored_main_name + '_unit_label'
            unit_string_table = 's' + unit_table[1:]
            satellite_join = f"""
                JOIN quantities {satellite_table}
                    LEFT JOIN edges {unit_table}
                        LEFT JOIN strings {unit_string_table} ON ({unit_table}.id={unit_string_table}.edge_id)
                    ON ({satellite_table}.unit={unit_table}.node1 AND {unit_table}.label='label')
                ON ({main_table}.id={satellite_table}.edge_id)
            """
            self.fields = {
                main_name: f"{satellite_table}.number",
                main_name + "_unit_id": f"{satellite_table}.unit",
                main_name + "_unit": f"{unit_string_table}.text"
            }
        elif self.data_type == 'symbol' or self.data_type == 'location':  # Locations are just symbols at this point
            label_table = 'e_' + underscored_main_name + '_label'
            label_string_table = 's' + label_table[1:]
            satellite_join = f"""
                JOIN edges {label_table}
                    JOIN strings {label_string_table} ON ({label_table}.id={label_string_table}.edge_id)
                ON ({label_table}.node1={main_table}.node2 AND {label_table}.label='label')"""
            self.fields = {
                main_name: f"{label_string_table}.text",
                main_name + "_id": f"{main_table}.node2"
            }
        elif self.data_type == 'string':
            satellite_table = 's_' + underscored_main_name
            satellite_join = f"""
                JOIN strings {satellite_table} ON ({main_table}.id={satellite_table}.edge_id)
            """
            self.fields = {
                main_name: f"{satellite_table}.text",
            }
        elif self.data_type == 'coordinate':
            satellite_table = 'c_' + underscored_main_name
            satellite_join = f"""
                JOIN coordinates {satellite_table} ON ({main_table}.id={satellite_table}.edge_id)
            """
            self.fields = {
                main_name:
                f"""'POINT(' || {satellite_table}.longitude || ' ' || {satellite_table}.latitude || ')'"""
            }
        else:
            raise ValueError('Qualifiers of type ' + self.data_type +
                             ' are not supported yet')

        self.join_clause = f"""