Example #2
def test_dublicates(path, schema, primary_keys, dublicates):
    schema['primaryKey'] = primary_keys
    profiler = TableProfiler(schema)
    table = QcTable(path, schema=None)
    rows = table.read(cast=False)
    profiler.validate(rows, table.headers)
    assert profiler.rows_with_dublicates == dublicates
def test_infer_schema_empty_file():
    s = QcTable(EMPTY_FILEPATH, schema=None)
    d = s.infer()
    assert d == {
        'fields': [],
        'missingValues': [''],
    }
def test_schema_infer_storage(import_module, apply_defaults):
    import_module.return_value = Mock(Storage=Mock(return_value=Mock(
        describe=Mock(return_value=SCHEMA_MIN),
        iter=Mock(return_value=DATA_MIN[1:]),
    )))
    table = QcTable('table', schema=None, storage='storage')
    table.infer()
    assert table.headers == ['key', 'value']
    assert table.schema.descriptor == apply_defaults(SCHEMA_MIN)
def test_column_values(path, column_name):
    with open(path) as csvfile:
        reader = csv.reader(csvfile)
        # read header
        headers = next(reader)
        index = headers.index(column_name)
        result = [row[index] for row in reader]
        table = QcTable(path, schema=None)
        table.infer()
        assert table.column_values(column_name) == result
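The same flow works outside pytest; a minimal sketch, assuming QcTable is imported as in the examples above (the file name and column name are hypothetical):

table = QcTable('patients.csv', schema=None)   # hypothetical csv file
table.infer()                                  # infer the schema before reading columns
print(table.column_values('diagnosis'))        # hypothetical column name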
Example #6
def test_missing(path, schema, primary_keys, missing_pk, missing_rq):
    schema['primaryKey'] = primary_keys
    schema['missingValues'].append('NA')
    schema['fields'][2]['constraints'] = {'required': True}
    profiler = TableProfiler(schema)
    table = QcTable(path, schema=None)
    rows = table.read(cast=False)
    profiler.validate(rows, table.headers)
    assert profiler.rows_with_missing_pk == missing_pk
    assert profiler.rows_with_missing_required == missing_rq
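For orientation, a minimal frictionless table-schema dict of the shape these profiler tests mutate; the field names here are hypothetical:

# minimal schema sketch; field names are hypothetical
schema = {
    'fields': [
        {'name': 'id', 'type': 'integer'},
        {'name': 'name', 'type': 'string'},
        {'name': 'diagnosis', 'type': 'string'},
    ],
    'missingValues': [''],
}
schema['primaryKey'] = ['id']                             # as set in test_missing
schema['missingValues'].append('NA')                      # 'NA' also counts as missing
schema['fields'][2]['constraints'] = {'required': True}   # third field becomes required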
Example #7
@classmethod
def from_disc(cls,
              csvpath,
              dict_schema,
              schema_type='qc',
              id_column=1,
              threshold=3):
    """
    Constructs a TableReport from a csv file and a given schema.
    Arguments:
    :param csvpath: string, the csv filepath
    :param dict_schema: dictionary describing the csv schema
    :param schema_type: 'qc' for frictionless type, 'dc' for Data Catalogue
                        type json schema
    :param id_column: column number of the dataset's primary key (id)
    :param threshold: outlier threshold; a numerical value outside
                      (mean - threshold * std, mean + threshold * std)
                      is considered an outlier
    """
    if schema_type == 'qc':
        dataset_schema = QcSchema(dict_schema)
    elif schema_type == 'dc':
        LOGGER.info(
            'Translating from Data Catalogue to Frictionless json format...'
        )
        qcdict_schema = FrictionlessFromDC(dict_schema).qcdescriptor
        dataset_schema = QcSchema(qcdict_schema)
    dataset = QcTable(csvpath, schema=dataset_schema)
    return cls(dataset, id_column=id_column, threshold=threshold)
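A hedged usage sketch of from_disc with a 'qc' schema (the file paths are hypothetical). With the default threshold=3, a numerical value x is flagged as an outlier when x < mean - 3 * std or x > mean + 3 * std:

import json

# hypothetical file paths
with open('dataset_schema.json') as f:
    dict_schema = json.load(f)
report = TableReport.from_disc('dataset.csv', dict_schema,
                               schema_type='qc', id_column=1, threshold=3)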
def test_valid_rows_stats(datasetpath, schemapath, id_column, result):
    with open(schemapath) as json_file:
        dict_schema = json.load(json_file)
    schema = QcSchema(dict_schema)
    testtable = QcTable(datasetpath, schema=schema)
    testreport = TableReport(testtable, id_column)
    with pytest.warns(None) as recorded:
        assert testreport.valid_rows_stats == result
        assert recorded.list == []
def test_corrected_filled_rows_stats(datasetpath, schemapath, id_column, result):
    with open(schemapath) as json_file:
        dict_schema = json.load(json_file)
    schema = QcSchema(dict_schema)
    testtable = QcTable(datasetpath, schema=schema)
    testreport = TableReport(testtable, id_column)
    testreport.printpdf(os.path.join(APP_PATH, 'test_datasets/dataset_report.pdf'))
    testreport.printexcel(os.path.join(APP_PATH, 'test_datasets/dataset_report.xlsx'))
    testreport.apply_corrections()
    testreport.printpdf(os.path.join(APP_PATH, 'test_datasets/dataset_report_after.pdf'))
    testreport.save_corrected(os.path.join(APP_PATH, 'test_datasets/corrected.csv'))
    with pytest.warns(None) as recorded:
        assert testreport.filled_rows_stats == result
        assert recorded.list == []
Example #10
def csv(input_csv, schema_json, clean, metadata, report, outlier):
    """This command produces a validation report for <csv file>.

    The report file is stored in the same folder where <csv file> is located.
    
    <schema json> file MUST be compliant with frirctionless
      data table-schema specs(https://specs.frictionlessdata.io/table-schema/) or
      with Data Catalogue json format.
    """
    filename = os.path.basename(input_csv)
    # Get the path of the csv file
    path = os.path.dirname(os.path.abspath(input_csv))

    dataset_name = os.path.splitext(filename)[0]
    pdfreportfile = os.path.join(path, dataset_name + '_report.pdf')
    xlsxreportfile = os.path.join(path, dataset_name + '_report.xlsx')
    correctedcsvfile = os.path.join(path, dataset_name + '_corrected.csv')

    # read the json file with the csv schema
    with open(schema_json) as json_file:
        dict_schema = json.load(json_file)

    # check metadata json type is Data Catalogue specs
    if metadata == 'dc':
        LOGGER.info(
            'Translating from Data Catalogue to Frictionless json format...')
        dict_schema = FrictionlessFromDC(dict_schema).qcdescriptor

    schema = QcSchema(dict_schema)
    dataset = QcTable(input_csv, schema=schema)

    datasetreport = TableReport(dataset, threshold=outlier)

    # Apply data cleaning corrections?
    if clean:
        datasetreport.apply_corrections()
        datasetreport.save_corrected(correctedcsvfile)

    if datasetreport.isvalid:
        LOGGER.info('The dataset is valid.')
    else:
        LOGGER.info('CAUTION! The dataset is invalid!')

    # export the report
    if report == 'pdf':
        datasetreport.printpdf(pdfreportfile)
    elif report == 'xls':
        datasetreport.printexcel(xlsxreportfile)
Example #11
    def __init__(self, dbname, filepaths, schematype='source'):
        """"
        Arguments:
        :param dbname: tha name of the database
        :param tables: list of QcTable objects
        :param schematype: 'source' or 'target'
        """
        self.__dbname = dbname
        self.__dbtype = 'CSV'
        self.__schematype = schematype

        tables = [QcTable(fpath, schema=None) for fpath in filepaths]

        # store QcTable objects in a dictionary with filename as key
        self.__tables = {table.filename: table for table in tables}
        # duplicates per table: {table.filename: int}
        self.__dublications = {}
        self.__xml_elements = None
        self.__create_xml_element()
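A sketch of how source and target CsvDB objects feed a Mapping, mirroring the MipCDEMapper constructor in Example #18; the file paths and database names are hypothetical:

# hypothetical paths and names
sourcedb = CsvDB('hospitaldb', ['/data/hospital.csv'], schematype='source')
targetdb = CsvDB('dementia_cdes', ['/tmp/dementia_cdes.csv'],
                 schematype='target')
mapping = Mapping(sourcedb, targetdb)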
Example #12
def infercsv(input_csv,
             schema_spec,
             sample_rows,
             max_levels,
             threshold,
             cde_file=None):
    """This command infers the schema of the <csv file> it and stored in <output file>.

    The <output file> either a json file following the frictionless data specs(https://specs.frictionlessdata.io/table-schema/)
    or an xlsx file following MIP Data Catalogue's format.
    
    """
    filename = os.path.basename(input_csv)
    # Get the path of the csv file
    path = os.path.dirname(os.path.abspath(input_csv))

    dataset_name = os.path.splitext(filename)[0]
    qcjsonfile = os.path.join(path, dataset_name + '_qcschema.json')
    dcxlsxfile = os.path.join(path, dataset_name + '_dcschema.xlsx')

    dataset = QcTable(input_csv, schema=None)
    # Is cde dictionary file available?
    if cde_file:
        cde_dict = CdeDict(cde_file)
    else:
        cde_dict = None

    infer = InferSchema(dataset, dataset_name, sample_rows, max_levels,
                        cde_dict)

    # suggest cdes and concept paths if cde dictionary is available
    if cde_dict:
        infer.suggest_cdes(threshold=threshold)

    if schema_spec == 'dc':
        infer.export2excel(dcxlsxfile)
    elif schema_spec == 'qc':
        infer.expoct2qcjson(qcjsonfile)
def test_mipmapheaders(filepath, result):
    test = QcTable(filepath, schema=None)
    assert test.headers4mipmap == result
def test_schema_infer_tabulator(path, schema):
    table = QcTable(path, schema=None)
    table.infer(maxlevels=3)
    assert table.headers == ['id', 'name', 'diagnosis']
    assert table.schema.descriptor == schema
Example #15
def test_invalid_rows():
    profiler = TableProfiler(SCHEMA_SIMPLE)
    table = QcTable('tests/test_datasets/simple_invalid_2.csv', schema=None)
    rows = table.read(cast=False)
    assert profiler.validate(rows, table.headers) == False
    assert profiler.invalid_rows == [5]
Example #16
def test_vadidate_rows_with_invalids(path, schema, valid, rows_with_invalids):
    profiler = TableProfiler(schema)
    table = QcTable(path, schema=None)
    rows = table.read(cast=False)
    assert profiler.validate(rows, table.headers) == valid
    assert profiler.rows_with_invalids == rows_with_invalids
def test_column_values_exception(path, column_name):
    table = QcTable(path, schema=None)
    table.infer()
    with pytest.raises(QCToolException):
        assert table.column_values(column_name)
Example #18
class MipCDEMapper(object):
    """Class for handling a simple (one to one) mapping task 
    and creating  the mapping xml for mipmap engine. 
    
    :Arguments:
    :source path: the filepath of the source csv
    :cdescontroller: a CDEsController object containing info about the target CDE dataset
    :param sample_rows: number of sample rows for schema inferance of
                        source csv
    :param maxlevels: total unique string values in a column to be 
                      considered as categorical type in schema inference 
                      of the source csv file. 
    :param na_empty_strings_only: (boolean) If True, only the empty strings
                                  will be infered as NAs

    """
    def __init__(self,
                 source_path,
                 cdescontroller,
                 sample_rows,
                 maxlevels,
                 na_empty_strings_only=False):

        sourcedb = CsvDB('hospitaldb', [source_path], schematype='source')
        # get user home directory
        self.__homepath = os.getenv('HOME')
        # create mapping folder
        self.__mappingpath = os.path.join(self.__homepath, '.mipqctool',
                                          'mapping')
        if not os.path.isdir(self.__mappingpath):
            os.makedirs(self.__mappingpath)
        # create the target mapping folder and xml folder
        if not os.path.isdir(os.path.join(self.__mappingpath, 'target')):
            os.mkdir(os.path.join(self.__mappingpath, 'target'))
        if not os.path.isdir(os.path.join(self.__mappingpath, 'xml')):
            os.mkdir(os.path.join(self.__mappingpath, 'xml'))
        # get the cde dataset name
        self.__target_dbname = cdescontroller.cdedataset_name
        # use it also as filename by adding .csv extension
        self.__target_filename = self.__target_dbname + '.csv'
        # this folder will be used in the mapping execution by the mipmap engine
        self.__target_folder = os.path.join(self.__mappingpath, 'target')
        self.__target_path = os.path.join(self.__target_folder,
                                          self.__target_filename)
        # create a csv file with the cde headers only
        cdescontroller.save_csv_headers_only(self.__target_path)
        self.__cdecontroller = cdescontroller
        # now we can create the CsvDB for the target schema
        targetdb = CsvDB(self.__target_dbname, [self.__target_path],
                         schematype='target')
        # create the Mapping object
        self.__mapping = Mapping(sourcedb, targetdb)
        # store the QcSchema for the CDE dataset
        self.__cde_schema = cdescontroller.dataset_schema
        # with QcTable we can access metadata about the source csv
        self.__srctbl = QcTable(source_path, schema=None)
        # infer the table schema
        self.__srctbl.infer(limit=sample_rows,
                            maxlevels=maxlevels,
                            na_empty_strings_only=na_empty_strings_only)
        self.__src_path = source_path
        self.__src_folder = os.path.dirname(source_path)
        # create table report for the source file
        self.__tblreport = TableReport(self.__srctbl)
        self.__src_filename = self.__srctbl.filename
        self.__src_headers = self.__srctbl.headers4mipmap
        srcname_no_ext = os.path.splitext(self.__src_path)[0]
        reportfilepath = srcname_no_ext + '_report.xlsx'
        self.__tblreport.printexcel(reportfilepath)
        # get the cde headers
        self.__cde_headers = cdescontroller.cde_headers
        self.__cde_mapped = self.__mapping.correspondences.keys()
        self.__cde_not_mapped = self.__cde_headers
        # get source vars for each cde correspondence
        self.__cde_corrs_sources = []

    @property
    def sourcereport(self):
        return self.__tblreport

    @property
    def source_filename(self):
        return self.__src_filename

    @property
    def source_headers(self):
        return self.__src_headers

    @property
    def corr_sources(self):
        """source vars for each cde correspondence"""
        return self.__cde_corrs_sources

    @property
    def cde_filename(self):
        return self.__target_filename

    @property
    def cde_mapped(self):
        return self.__cde_mapped

    @property
    def cde_not_mapped(self):
        return self.__cde_not_mapped

    @property
    def cdecontroller(self):
        return self.__cdecontroller

    def suggest_corr(self, cdedict, threshold):
        """
        Arguments:
        :param cdedict: CdeDict object
        :param threshold: 0-1 similarity threshold; below it, no cde is suggested
        """
        cde_sugg_dict = {}  # {cdecode:sourcecolumn}
        source_table = self.__srctbl.filename
        target_table = self.__target_filename
        # the suggested replacements are stored here: {cdecode: [Replacement]}
        sugg_replacemnts = {}
        #source_raw_headers = self.__mapping.sourcedb.get_raw_table_headers(source_table)

        # for each source column
        for name, columnreport in self.__tblreport.columnreports.items():

            cde = cdedict.suggest_cde(columnreport, threshold=threshold)
            # check if a cde mapping already exists
            if cde and (cde.code not in cde_sugg_dict.keys()):
                cde_sugg_dict[
                    cde.code] = self.__mapping.sourcedb.raw_2_mipmap_header(
                        self.__src_filename, columnreport.name)
                # suggest category replacements when both the source column and the cde are nominal
                sugg_reps = cdedict.suggest_replecements(cde.code,
                                                         columnreport,
                                                         threshold=threshold)
                if sugg_reps:
                    sugg_replacemnts[cde.code] = sugg_reps
        for cdecode, source_var in cde_sugg_dict.items():
            source_paths = [(source_table, source_var, None)]
            target_path = (target_table, cdecode, None)
            filename_column = '.'.join(
                [os.path.splitext(source_table)[0], source_var])
            # let's see if this cde has value replacement suggestions; if so, create the if statement
            if cdecode in sugg_replacemnts.keys():
                expression = ifstr(filename_column, sugg_replacemnts[cdecode])
            else:
                expression = filename_column

            # let's try to create the correspondence now
            try:
                self.__mapping.add_corr(
                    source_paths=source_paths,
                    target_path=target_path,
                    expression=expression,
                    replacements=sugg_replacemnts.get(cdecode))
            # if a cde correspondence already exists then pass
            except MappingError:
                LOGGER.warning(
                    'found cde match for source column "{}" but cde "{}" '
                    'is not included in the selected cde pathology.'.format(
                        source_var, cdecode))

        self.__update_cde_mapped()

    def add_corr(self, cde, source_cols, expression):
        source_paths = [(self.__srctbl.filename, col, None)
                        for col in source_cols]
        target_path = (self.__target_filename, cde, None)
        self.__mapping.add_corr(source_paths, target_path, expression)
        self.__update_cde_mapped()

    def remove_corr(self, cde):
        self.__mapping.remove_corr(cde)
        self.__update_cde_mapped()

    def update_corr(self, cde, source_cols, expression):
        source_paths = [(self.__srctbl.filename, col, None)
                        for col in source_cols]
        target_path = (self.__target_filename, cde, None)
        self.__mapping.update_corr(source_paths, target_path, expression)
        self.__update_cde_mapped()

    def get_corr_expression(self, cde):
        return self.__mapping.correspondences[cde].expression

    def get_corr_replacements(self, cde):
        return self.__mapping.correspondences[cde].replacements

    def get_col_stats(self, mipmap_column) -> dict:
        """returns source columns stats.
        Arguments:
        :param mipmap_column: mipmap tranformed column name
        """
        stats = {}
        raw_headers = self.__mapping.sourcedb.get_raw_table_headers(
            self.source_filename)
        # convert the mipmap-formatted column name back to the initial column name
        col = raw_headers[mipmap_column]
        col_report = self.__tblreport.columnreports[col]
        if col_report:
            stats['miptype'] = col_report.miptype
            stats['value_range'] = col_report.value_range
            if col_report.miptype in ['numerical', 'integer']:
                stats['mean'] = col_report.stats['mean']
                stats['std'] = col_report.stats['std']
            return stats
        else:
            return None

    def get_cde_info(self, mipmap_cde) -> dict:
        """returns cde type and values.
        Arguments:
        :param mipmap_cde: mipmap tranformed cde name
        """
        cde_info = {}
        raw_cde_dict = self.__mapping.targetdb.get_raw_table_headers(
            self.cde_filename)
        raw_cde = raw_cde_dict[mipmap_cde]
        cde_schema = self.__cde_schema
        cdefield = cde_schema.get_field(raw_cde)
        cde_type = cdefield.miptype
        constraints = cdefield.constraints
        if cde_type == 'nominal' and constraints:
            con = constraints.get('enum')
        elif cde_type in ['numerical', 'integer'] and constraints:
            con = [constraints.get('minimum'), constraints.get('maximum')]
        else:
            con = None

        cde_info = {'miptype': cde_type, 'constraints': con}
        return cde_info

    def get_source_raw_header(self, mipmap_col):
        raw_headers = self.__mapping.sourcedb.get_raw_table_headers(
            self.source_filename)
        return raw_headers[mipmap_col]

    def get_cde_raw_header(self, mipmap_col):
        raw_headers = self.__mapping.targetdb.get_raw_table_headers(
            self.__target_filename)
        return raw_headers[mipmap_col]

    def get_cde_mipmap_header(self, raw_cde_header):
        raw_headers = self.__mapping.targetdb.get_raw_table_headers(
            self.__target_filename)
        mipmap_headers = {value: key for key, value in raw_headers.items()}
        return mipmap_headers[raw_cde_header]

    def run_mapping(self, output):
        xml_folder = os.path.join(self.__mappingpath, 'xml')
        xml_path = os.path.join(xml_folder, 'map.xml')
        with open(xml_path, 'w') as mapxml:
            mapxml.write(self.__mapping.xml_string)
        DockerMipmap(xml_folder, self.__src_folder, self.__target_folder,
                     output)

    def save_mapping(self, filepath):
        with open(filepath, 'w') as mapxml:
            mapxml.write(self.__mapping.xml_string)

    def replace_function(self, column, replacments):
        """Returns a mipmap function string with encapsulated if statements 
        for replacing given values of a column with predefined ones. 
        This is used in a categorical/nominal column type

        Arguments:
        :param columnname: the column name(str)
        :param repls: list with Replacement namedtuples
                    Replacement('source', 'target')   
        """
        return ifstr(column, replacments)

    def __update_cde_mapped(self):
        self.__cde_mapped = list(self.__mapping.correspondences.keys())
        cde_not_mapped = self.__cde_headers.copy()
        cde_corrs_sources = OrderedDict()
        for cde in self.__cde_mapped:
            cde_not_mapped.remove(cde)
            source_paths = self.__mapping.correspondences[cde].source_paths
            pathstring = ', '.join([path[1] for path in source_paths])
            cde_corrs_sources[cde] = pathstring

        self.__cde_corrs_sources = cde_corrs_sources
        self.__cde_not_mapped = cde_not_mapped
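A hedged end-to-end sketch of the class above; the paths, the cdescontroller instance (a CDEsController built elsewhere), the threshold, and the column/cde names are all hypothetical, and Replacement is assumed to be the namedtuple described in the replace_function docstring:

from collections import namedtuple

# cdescontroller is a CDEsController instance built elsewhere (hypothetical)
mapper = MipCDEMapper('/data/hospital.csv', cdescontroller,
                      sample_rows=100, maxlevels=10)
cde_dict = CdeDict('cde_dictionary.xlsx')    # as in the infercsv example
mapper.suggest_corr(cde_dict, threshold=0.6)
print(mapper.cde_mapped)
print(mapper.cde_not_mapped)

# a manual correspondence with value replacements for a nominal column
Replacement = namedtuple('Replacement', ['source', 'target'])
repls = [Replacement('M', 'male'), Replacement('F', 'female')]
expression = mapper.replace_function('hospital.gender', repls)
mapper.add_corr('gender', ['gender'], expression)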
def test_actual_headers(path, result):
    table = QcTable(path, schema=None)
    assert table.actual_headers == result
    'filled_25_49': 1,
    'filled_0_24': 0
}


F_ROWS_PER_COLUMN1 = {
    4: 14,
    3: 4,
    2: 1,
    1: 1
}


# these 3 lines are for creating the initial
# metadata json file with the dataset's schema
TEST_TABLE = QcTable(DATASET2_PATH, schema=None)
TEST_TABLE.infer()
TEST_TABLE.schema.save(os.path.join(APP_PATH, 'test_datasets/test_dataset2.json'))


@pytest.mark.parametrize('datasetpath, schemapath, result, total_rows', [
    (DATASET1_PATH, METADATA1_PATH, F_ROWS_PER_COLUMN1, 20)
])
def test_calc_rows_per_column(datasetpath, schemapath, result, total_rows):
    with open(schemapath) as json_file:
        dict_schema = json.load(json_file)
    testreport = TableReport.from_disc(datasetpath, dict_schema)
    with pytest.warns(None) as recorded:
        assert testreport.total_rows == total_rows
        assert testreport._TableReport__tfilled_columns == result
        assert recorded.list == []