Code example #1
    def post(self, dataset, variable=None):
        if not request.json:
            content = {'Error': 'JSON content body is empty'}
            return content, 400
        # print('Post variable: ', request.json)

        if variable:
            content = {'Error': 'Please do not supply a variable when POSTing'}
            return content, 400

        metadata: VariableMetadata = VariableMetadata()
        status, code = metadata.from_request(request.json)
        if code != 200:
            return status, code

        dataset_id = dal.get_dataset_id(dataset)
        if not dataset_id:
            status = {'Error': f'Cannot find dataset {dataset}'}
            return status, 404
        metadata.dataset_id = dataset

        if metadata.variable_id and dal.get_variable_id(
                dataset_id, metadata.variable_id) is not None:
            status = {
                'Error':
                f'Variable {metadata.variable_id} has already been defined in dataset {dataset}'
            }
            return status, 409

        # Create qnode for variable
        if not metadata.variable_id:
            prefix = f'V{metadata.dataset_id}-'
            number = dal.next_variable_value(dataset_id, prefix)
            metadata.variable_id = f'{prefix}{number}'
        variable_id = f'Q{metadata.dataset_id}-{metadata.variable_id}'
        variable_pnode = f'P{metadata.dataset_id}-{metadata.variable_id}'
        metadata._variable_id = variable_id
        metadata.corresponds_to_property = variable_pnode

        # pprint(metadata.to_dict())
        edges = pd.DataFrame(metadata.to_kgtk_edges(dataset_id, variable_id))
        # pprint(edges)

        if 'test' not in request.args:
            import_kgtk_dataframe(edges)

        content = metadata.to_dict()

        if 'tsv' in request.args:
            tsv = edges.to_csv(sep='\t', quoting=csv.QUOTE_NONE, index=False)
            output = make_response(tsv)
            output.headers[
                'Content-Disposition'] = f'attachment; filename={metadata.dataset_id}-{metadata.variable_id}.tsv'
            output.headers['Content-type'] = 'text/tsv'
            return output

        return content, 201
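
A minimal client-side sketch of calling this POST handler with the requests library. The endpoint path and the 'name' field are assumptions for illustration; only 'variable_id' (auto-generated by the handler when omitted) comes from the code above.

    import requests

    # Hypothetical route; the actual Flask-RESTful URL mapping is defined elsewhere.
    url = 'http://localhost:5000/metadata/datasets/mydataset/variables'

    # 'variable_id' is optional: the handler generates one from the dataset prefix
    # when it is missing. 'name' is an assumed metadata field.
    body = {'name': 'Population count', 'variable_id': 'population_count'}

    response = requests.post(url, json=body)
    print(response.status_code, response.json())
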
Code example #2
    def get(self, dataset):
        # check if the dataset exists
        dataset_id = dal.get_dataset_id(dataset)

        if not dataset_id:
            return {'Error': 'Dataset not found: {}'.format(dataset)}, 404

        request_variables = request.args.getlist('variable') or []
        include_cols = request.args.getlist('include') or []
        exclude_cols = request.args.getlist('exclude') or []

        limit = 20
        if request.args.get('limit') is not None:
            try:
                limit = int(request.args.get('limit'))
            except ValueError:
                # ignore a non-numeric limit and keep the default
                pass

        try:
            regions = get_query_region_ids(request.args)
        except UnknownSubjectError as ex:
            return ex.get_error_dict(), 404

        variables_metadata = []
        if request_variables:
            for v in request_variables:
                variables_metadata.append(self.vmr.get(dataset, variable=v)[0])
        else:
            variables_metadata = self.vmr.get(dataset)[0]

        variables_metadata = variables_metadata[:limit]
        df_list = []

        for variable in variables_metadata:
            variable_df = self.vg.get_direct(dataset, variable['variable_id'],
                                             include_cols, exclude_cols, -1,
                                             regions, return_df=True)
            # get_direct may return None when a variable has no canonical data
            if variable_df is not None:
                df_list.append(variable_df)

        df = pd.concat(df_list).replace('N/A', '')

        csv_data = df.to_csv(index=False)
        output = make_response(csv_data)
        output.headers['Content-Disposition'] = f'attachment; filename={dataset}_variables_all.csv'
        output.headers['Content-type'] = 'text/csv'
        return output
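
A hedged sketch of how a client might request the bulk CSV download above. The base URL is a placeholder; the query parameters 'variable' and 'limit' are read directly by the handler (via getlist and args.get).

    import requests

    # Hypothetical route; only the query parameters are taken from the handler above.
    url = 'http://localhost:5000/datasets/mydataset/variables'

    params = [
        ('variable', 'population_count'),  # repeatable, read via getlist('variable')
        ('variable', 'gdp'),
        ('limit', '5'),
    ]
    response = requests.get(url, params=params)

    # The handler returns a CSV attachment built with df.to_csv(index=False)
    with open('mydataset_variables_all.csv', 'wb') as fh:
        fh.write(response.content)
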
Code example #3
    def create_dataset(metadata: DatasetMetadata, *, create: bool = True):
        # Create qnode
        dataset_id = f'Q{metadata.dataset_id}'
        edges = None
        if dal.get_dataset_id(metadata.dataset_id) is None:
            metadata._dataset_id = dataset_id

            edges = pd.DataFrame(metadata.to_kgtk_edges(dataset_id))

            if create:
                import_kgtk_dataframe(edges)

        return dataset_id, edges
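
A small usage sketch for the helper above. It assumes DatasetMetadata.from_dict accepts the same dict shape used in code example #6; create=False builds the KGTK edges without importing them.

    metadata = DatasetMetadata()
    metadata.from_dict({
        'dataset_id': 'mydataset',
        'name': 'My dataset',
        'description': 'An example dataset',
        'url': 'https://example.org/mydataset',
    })

    # Build the qnode id and the KGTK edges, but skip the database import.
    dataset_qnode, edges = DatasetMetadataResource.create_dataset(metadata, create=False)
    print(dataset_qnode)  # 'Qmydataset'
    print(edges)          # DataFrame of KGTK edges, or None if the dataset already exists
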
Code example #4
    def post(self, dataset=None):
        if not request.json:
            content = {'Error': 'JSON content body is empty'}
            return content, 400

        if dataset:
            content = {
                'Error': 'Please do not supply a dataset-id when POSTing'
            }
            return content, 400

        request_metadata = request.json

        error_report = []
        for key, value in request_metadata.items():
            if value.strip() == "":
                error_report.append(
                    {'error': f'Metadata field: {key}, cannot be blank'})

        if error_report:
            return error_report, 400

        metadata = DatasetMetadata()
        status, code = metadata.from_request(request_metadata)
        if code != 200:
            return status, code

        if dal.get_dataset_id(metadata.dataset_id):
            content = {
                'Error':
                f'Dataset identifier {metadata.dataset_id} has already been used'
            }
            return content, 409

        _, edges = DatasetMetadataResource.create_dataset(
            metadata, create='test' not in request.args)

        content = metadata.to_dict()

        if 'tsv' in request.args:
            tsv = edges.to_csv(sep='\t', quoting=csv.QUOTE_NONE, index=False)
            output = make_response(tsv)
            output.headers[
                'Content-Disposition'] = f'attachment; filename={metadata.dataset_id}.tsv'
            output.headers['Content-type'] = 'text/tsv'
            return output

        return content, 201
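
A hedged client-side sketch for the dataset-creation POST above. The route is a placeholder; the field names mirror the dataset_dict used in code example #6.

    import requests

    # Hypothetical route; field names follow the dataset_dict in code example #6.
    url = 'http://localhost:5000/metadata/datasets'
    body = {
        'dataset_id': 'mydataset',
        'name': 'My dataset',
        'description': 'An example dataset',
        'url': 'https://example.org/mydataset',
    }

    # '?tsv' returns the KGTK edges as a TSV attachment instead of JSON;
    # '?test' builds the edges without importing them into the database.
    response = requests.post(url, json=body)
    print(response.status_code, response.json())
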
Code example #5
    def process(self, dataset, is_request_put=True):

        # check if the dataset exists
        dataset_qnode = dal.get_dataset_id(dataset)

        if not dataset_qnode:
            return {'Error': 'Dataset not found: {}'.format(dataset)}, 404

        t2wml_file_name = request.files['kgtk_output'].filename
        item_defs_file_name = request.files['item_definitions'].filename
        if not t2wml_file_name.endswith('.tsv'):
            return {"error": "Please upload a TSV file (T2WML output)"}, 400

        if not item_defs_file_name.endswith('.tsv'):
            return {"error": "Please upload a TSV file (T2WML output)"}, 400

        t2wml_output_df = pd.read_csv(request.files['kgtk_output'],
                                      dtype=object,
                                      sep='\t',
                                      quoting=csv.QUOTE_NONE).fillna('')
        item_defs_df = pd.read_csv(request.files['item_definitions'],
                                   dtype=object,
                                   sep='\t',
                                   quoting=csv.QUOTE_NONE).fillna('')

        df = self.convert_t2wml_files(t2wml_output_df, item_defs_df)
        variable_ids = self.identify_variables(df)

        if is_request_put:
            # a PUT request replaces existing data: delete the variable's canonical
            # data and metadata before inserting into the database again
            for v in variable_ids:
                print(self.vd.delete(dataset, v))
                print(self.vmr.delete(dataset, v))

        # All good, ingest the TSV file into the database.
        import_kgtk_dataframe(df, is_file_exploded=True)

        variables_metadata = []
        for v in variable_ids:
            variables_metadata.append(self.vmr.get(dataset, variable=v)[0])

        return variables_metadata, 201
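
A hedged sketch of uploading the two TSV files this handler expects. The route is a placeholder; the multipart field names 'kgtk_output' and 'item_definitions' come from request.files in the handler.

    import requests

    # Hypothetical route; the multipart field names match request.files above.
    url = 'http://localhost:5000/datasets/mydataset/t2wml'

    with open('t2wml_output.tsv', 'rb') as kgtk_file, \
            open('item_definitions.tsv', 'rb') as item_defs_file:
        response = requests.put(url, files={
            'kgtk_output': kgtk_file,
            'item_definitions': item_defs_file,
        })
    print(response.status_code, response.json())
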
Code example #6
    def process(self, dataset, is_request_put=False):
        overall_start = time()
        validate = request.args.get('validate', 'true').lower() == 'true'
        files_only = request.args.get('files_only', 'false').lower() == 'true'
        create_if_not_exist = request.args.get('create_if_not_exist',
                                               'false').lower() == 'true'

        # check if the dataset exists
        s = time()
        dataset_qnode = dal.get_dataset_id(dataset)
        print(f'time taken to get dataset: {time() - s} seconds')

        if not create_if_not_exist and not dataset_qnode:
            print(f'Dataset not defined: {dataset}')
            return {'Error': 'Dataset not found: {}'.format(dataset)}, 404

        file_name = request.files['file'].filename

        if not (file_name.endswith('.xlsx') or file_name.endswith('.csv')):
            return {
                "Error":
                "Please upload an annotated excel file or a csv file "
                "(file name ending with .xlsx or .csv)"
            }, 400

        if file_name.endswith('.xlsx'):
            df = pd.read_excel(request.files['file'],
                               dtype=object,
                               header=None).fillna('')
        elif file_name.endswith('.csv'):
            df = pd.read_csv(request.files['file'], dtype=object,
                             header=None).fillna('')

        if create_if_not_exist and not dataset_qnode:
            try:
                dataset_dict = {
                    'dataset_id': df.iloc[0, 1],
                    'name': df.iloc[0, 2],
                    'description': df.iloc[0, 3],
                    'url': df.iloc[0, 4]
                }
            except Exception as e:
                return {'Error': 'Failed to create dataset: ' + str(e)}, 400

            missing = [key for key, value in dataset_dict.items() if not value]

            if missing:
                print(f'Dataset metadata missing fields: {missing}')
                return {
                    'Error': f'Dataset metadata missing fields: {missing}'
                }, 404

            metadata = DatasetMetadata()
            metadata.from_dict(dataset_dict)
            dataset_qnode, _ = DatasetMetadataResource.create_dataset(metadata)

        s = time()
        validation_report, valid_annotated_file, rename_columns = self.va.validate(
            dataset, df=df)
        print(f'time taken to validate annotated file: {time() - s} seconds')
        if validate and not valid_annotated_file:
            return json.loads(validation_report), 400

        if files_only:
            t2wml_yaml, combined_item_def_df, consolidated_wikifier_df = self.ta.process(
                dataset_qnode, df, rename_columns, extra_files=True)

            temp_tar_dir = tempfile.mkdtemp()
            with open(f'{temp_tar_dir}/t2wml.yaml', 'w') as yaml_file:
                yaml_file.write(t2wml_yaml)
            combined_item_def_df.to_csv(
                f'{temp_tar_dir}/item_definitions_all.tsv',
                sep='\t',
                index=False)
            consolidated_wikifier_df.to_csv(
                f'{temp_tar_dir}/consolidated_wikifier.csv', index=False)

            with tarfile.open(f'{temp_tar_dir}/t2wml_annotation_files.tar.gz',
                              "w:gz") as tar:
                tar.add(temp_tar_dir, arcname='.')
            return send_from_directory(temp_tar_dir,
                                       't2wml_annotation_files.tar.gz')

        else:
            s = time()
            variable_ids, kgtk_exploded_df = self.ta.process(
                dataset_qnode, df, rename_columns)
            print(f'time taken to create kgtk files: {time() - s} seconds')

            if is_request_put:
                # a PUT request replaces existing data: delete the variable's canonical
                # data and metadata before inserting into the database again
                for v in variable_ids:
                    print(self.vd.delete(dataset, v))
                    print(self.vmr.delete(dataset, v))

            # import to database
            s = time()
            print('number of rows to be imported: {}'.format(
                len(kgtk_exploded_df)))
            try:
                import_kgtk_dataframe(kgtk_exploded_df, is_file_exploded=True)
            except Exception as e:
                # Not sure what's going on here, so print for debugging purposes
                print("Can't import exploded kgtk file")
                traceback.print_exc(file=sys.stdout)
                raise e
            print(
                f'time taken to import kgtk file into database: {time() - s} seconds'
            )

            variables_metadata = []
            for v in variable_ids:
                variables_metadata.append(self.vmr.get(dataset, variable=v)[0])
            print(f'total time taken: {time() - overall_start} seconds')
            return variables_metadata, 201
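
A hedged sketch of uploading an annotated spreadsheet to this handler. The route is a placeholder; the 'file' field name and the validate / files_only / create_if_not_exist query flags are taken from the handler above.

    import requests

    # Hypothetical route; the 'file' field and the query flags match the handler above.
    url = 'http://localhost:5000/datasets/mydataset/annotated'

    with open('annotated_data.xlsx', 'rb') as annotated_file:
        response = requests.put(
            url,
            params={'validate': 'true', 'create_if_not_exist': 'true'},
            files={'file': annotated_file})

    # With files_only=true the response is a t2wml_annotation_files.tar.gz instead.
    print(response.status_code, response.json())
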
Code example #7
    def canonical_data(self, dataset, variable, is_request_put=True):
        wikify = request.args.get('wikify', 'false').lower() == 'true'

        # check if the dataset exists
        dataset_id = dal.get_dataset_id(dataset)

        if not dataset_id:
            return {'Error': 'Dataset not found: {}'.format(dataset)}, 404

        # check if variable exists for the dataset
        # P2006020003 - Variable Measured
        # P1687 - Corresponds to property

        # noinspection SqlNoDataSourceInspection
        variable_query = f"""Select e.node1, e.node2 from edges e where e.node1 in (
                                select e_variable.node1 from edges e_variable
                                        where e_variable.node1 in
                                    (
                                        select e_dataset.node2 from edges e_dataset
                                        where e_dataset.node1 = '{dataset_id}'
                                        and e_dataset.label = 'P2006020003'
                                    )
                                    and e_variable.label = 'P1813' and e_variable.node2 = '{variable}'
                                )
                                and e.label = 'P1687'
                                    """

        variable_result = query_to_dicts(variable_query)
        if len(variable_result) == 0:
            return {
                'error':
                'Variable: {} not found for the dataset: {}'.format(
                    variable, dataset)
            }, 404

        variable_pnode = variable_result[0]['node2']
        variable_qnode = variable_result[0]['node1']

        kgtk_format_list = list()

        # dataset and variable have been found; wikify and upload the data
        df = pd.read_csv(request.files['file'], dtype=object).fillna('')
        column_map = {c: c.lower().strip() for c in df.columns}
        df = df.rename(columns=column_map)

        d_columns = list(df.columns)

        unsanitized_qualifier_columns = [
            x for x in d_columns if x not in self.non_qualifier_columns
            and x not in self.required_fields
        ]
        s_qualifier_columns = {qc: sanitize(qc) for qc in unsanitized_qualifier_columns}

        df = df.rename(columns=s_qualifier_columns)
        d_columns = list(df.columns)

        qualifier_columns = list(s_qualifier_columns.values())
        qualifier_dict = {}
        if qualifier_columns:
            # extra columns in the file are treated as qualifiers:
            # first check whether any of them already exist
            qualifier_dict = self.get_qualifiers(
                variable_qnode, qualifier_labels=qualifier_columns)
            qualifier_to_be_created = [
                x for x in qualifier_columns if x not in qualifier_dict
            ]
            qualifier_edges, new_q_dict = self.create_qualifier_edges(
                qualifier_to_be_created, variable_qnode)
            kgtk_format_list.extend(qualifier_edges)
            qualifier_dict.update(new_q_dict)

        existing_qualifiers = self.get_qualifiers(variable_qnode)
        extra_qualifiers = []
        if 'P585' not in existing_qualifiers:
            extra_qualifiers.append('P585')
        if 'P248' not in existing_qualifiers:
            extra_qualifiers.append('P248')
        extra_qualifier_edges, _ = self.create_qualifier_edges(
            extra_qualifiers, variable_qnode)
        kgtk_format_list.extend(extra_qualifier_edges)

        # validate file headers first
        validator_header_log, valid_file = self.validate_headers(df)
        if not valid_file:
            return validator_header_log, 400

        countries = list(df['country'].unique())

        if 'main_subject_id' not in d_columns or wikify:
            main_subjects = list(df['main_subject'].unique())
            main_subjects_wikified = self.country_wikifier.wikify(
                main_subjects)
            all_invalid = all(
                v is None for v in main_subjects_wikified.values())
            if all_invalid and countries[0].strip().lower() == 'ethiopia':
                # must be Ethiopia regions
                df = self.ethiopia_wikifier.produce(
                    input_df=df,
                    target_column='main_subject',
                    output_column_name='main_subject_id')

            else:
                df['main_subject_id'] = df['main_subject'].map(
                    lambda x: main_subjects_wikified[x])

        if 'country_id' not in d_columns or wikify:
            countries_wikified = self.country_wikifier.wikify(countries)
            df['country_id'] = df['country'].map(
                lambda x: countries_wikified[x])

        # validate file contents
        validator_file_log, valid_file = self.validate_input_file(
            df, dataset, variable)
        if not valid_file:
            return validator_file_log, 400

        if 'value_unit' in d_columns and ('value_unit_id' not in d_columns
                                          or wikify):
            units = list(df['value_unit'].unique())

            # noinspection SqlNoDataSourceInspection
            units_query = "SELECT e.node1, e.node2 FROM edges e WHERE e.node2 in ({}) and e.label = 'label'".format(
                self.format_sql_string(units))

            units_results = query_to_dicts(units_query)

            unit_qnode_dict = {ur['node2']: ur['node1'] for ur in units_results}

            no_qnode_units = [u for u in units if u not in unit_qnode_dict]

            no_unit_qnode_dict = self.create_new_qnodes(
                dataset_id, no_qnode_units, 'Unit')

            df['value_unit_id'] = df['value_unit'].map(
                lambda x: unit_qnode_dict[x]
                if x in unit_qnode_dict else no_unit_qnode_dict[x])

            # create label and instance-of edges for the newly created unit qnodes
            for k in no_unit_qnode_dict:
                _q = no_unit_qnode_dict[k]
                kgtk_format_list.append(
                    self.create_triple(_q, 'label', json.dumps(k)))
                kgtk_format_list.append(self.create_triple(
                    _q, 'P31', 'Q47574'))  # unit of measurement

        if 'source' in d_columns and ('source_id' not in d_columns or wikify):
            sources = list(df['source'].unique())

            # noinspection SqlNoDataSourceInspection
            sources_query = "SELECT  e.node1, e.node2 FROM edges e WHERE e.label = 'label' and e.node2 in  ({})".format(
                self.format_sql_string(sources))

            sources_results = query_to_dicts(sources_query)

            source_qnode_dict = {sr['node2']: sr['node1'] for sr in sources_results}

            no_qnode_sources = [s for s in sources if s not in source_qnode_dict]

            no_source_qnode_dict = self.create_new_qnodes(
                dataset_id, no_qnode_sources, 'Source')

            df['source_id'] = df['source'].map(
                lambda x: source_qnode_dict[x]
                if x in source_qnode_dict else no_source_qnode_dict[x])
            for k in no_source_qnode_dict:
                kgtk_format_list.append(
                    self.create_triple(no_source_qnode_dict[k], 'label',
                                       json.dumps(k)))

        for i, row in df.iterrows():
            kgtk_format_list.extend(
                self.create_kgtk_measurements(row, dataset_id, variable_pnode,
                                              qualifier_dict))

        if is_request_put:
            # this is a PUT request, delete all data for this variable and upload the current data
            self.vd.delete(dataset, variable)

        df_kgtk = pd.DataFrame(kgtk_format_list)
        import_kgtk_dataframe(df_kgtk)

        return '{} rows imported!'.format(len(df)), 201  # row count of the original input file
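
A hedged sketch of PUTting a canonical-data CSV to this handler. The route is a placeholder; the main_subject, country, value_unit, and source columns are checked in the handler above, while the value and time columns are assumed because self.required_fields is not shown in the snippet.

    import io
    import requests

    # Hypothetical route; see the lead-in for which columns are assumptions.
    url = 'http://localhost:5000/datasets/mydataset/variables/population_count'

    canonical_csv = io.BytesIO(
        b'main_subject,country,value,value_unit,source,time\n'
        b'Ethiopia,Ethiopia,112078730,person,WorldBank,2019-01-01\n')

    # '?wikify=true' forces re-wikification of main_subject and country.
    response = requests.put(url, params={'wikify': 'true'},
                            files={'file': ('data.csv', canonical_csv)})
    print(response.status_code, response.text)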