def __init__(self, source_path, cdescontroller, sample_rows, maxlevels, na_empty_strings_only=False):
    """Set up the source/target CsvDBs, the Mapping object and the source
    TableReport needed for a one-to-one mipmap mapping task.

    Side effects: creates ~/.mipqctool/mapping (+ 'target' and 'xml'
    subfolders), writes a headers-only target csv and an xlsx report
    next to the source csv.
    """
    sourcedb = CsvDB('hospitaldb', [source_path], schematype='source')
    # get user home directory
    # NOTE(review): os.getenv('HOME') is None on Windows — presumably the
    # tool is POSIX-only; confirm before relying on this elsewhere
    self.__homepath = os.getenv('HOME')
    # create mapping folder
    self.__mappingpath = os.path.join(self.__homepath, '.mipqctool', 'mapping')
    if not os.path.isdir(self.__mappingpath):
        os.makedirs(self.__mappingpath)
    # create the target mapping folder and xml folder
    if not os.path.isdir(os.path.join(self.__mappingpath, 'target')):
        os.mkdir(os.path.join(self.__mappingpath, 'target'))
    if not os.path.isdir(os.path.join(self.__mappingpath, 'xml')):
        os.mkdir(os.path.join(self.__mappingpath, 'xml'))
    # get the cde dataset name
    self.__target_dbname = cdescontroller.cdedataset_name
    # use it also as filename by adding .csv extension
    self.__target_filename = self.__target_dbname + '.csv'
    # this will be used in the mapping execution by mipmap engine
    self.__target_folder = os.path.join(self.__mappingpath, 'target')
    self.__target_path = os.path.join(self.__target_folder,
                                      self.__target_filename)
    # create a csv file with the cde headers only
    cdescontroller.save_csv_headers_only(self.__target_path)
    self.__cdecontroller = cdescontroller
    # now we can create the CsvDB for the target schema
    targetdb = CsvDB(self.__target_dbname, [self.__target_path],
                     schematype='target')
    # create the Mapping object
    self.__mapping = Mapping(sourcedb, targetdb)
    # store the QcSchema for the cde dataset
    self.__cde_schema = cdescontroller.dataset_schema
    # With QcTable we can access metadata about the source csv
    self.__srctbl = QcTable(source_path, schema=None)
    # infer the table schema
    self.__srctbl.infer(limit=sample_rows, maxlevels=maxlevels,
                        na_empty_strings_only=na_empty_strings_only)
    self.__src_path = source_path
    self.__src_folder = os.path.dirname(source_path)
    # create table report for the source file
    self.__tblreport = TableReport(self.__srctbl)
    self.__src_filename = self.__srctbl.filename
    self.__src_headers = self.__srctbl.headers4mipmap
    # write the source validation report next to the source csv
    srcname_no_ext = os.path.splitext(self.__src_path)[0]
    reportfilepath = srcname_no_ext + '_report.xlsx'
    self.__tblreport.printexcel(reportfilepath)
    # get the cde headers
    self.__cde_headers = cdescontroller.cde_headers
    self.__cde_mapped = self.__mapping.correspondences.keys()
    self.__cde_not_mapped = self.__cde_headers
    # get source vars for each cde correspondence
    self.__cde_corrs_sources = []
def test_dublicates(path, schema, primary_keys, dublicates):
    """The profiler must count rows with duplicated primary keys."""
    schema['primaryKey'] = primary_keys
    profiler = TableProfiler(schema)
    source = QcTable(path, schema=None)
    raw_rows = source.read(cast=False)
    profiler.validate(raw_rows, source.headers)
    assert profiler.rows_with_dublicates == dublicates
def test_infer_schema_empty_file():
    """Inferring an empty csv yields a schema with no fields."""
    table = QcTable(EMPTY_FILEPATH, schema=None)
    descriptor = table.infer()
    expected = {'fields': [], 'missingValues': ['']}
    assert descriptor == expected
def test_schema_infer_storage(import_module, apply_defaults):
    """Schema inference must also work when data comes via a storage plugin."""
    fake_storage = Mock(
        describe=Mock(return_value=SCHEMA_MIN),
        iter=Mock(return_value=DATA_MIN[1:]),
    )
    import_module.return_value = Mock(Storage=Mock(return_value=fake_storage))
    table = QcTable('table', schema=None, storage='storage')
    table.infer()
    assert table.headers == ['key', 'value']
    assert table.schema.descriptor == apply_defaults(SCHEMA_MIN)
def test_column_values(path, column_name):
    """column_values must match the raw csv column read directly from disc."""
    with open(path) as source:
        records = csv.reader(source)
        # first row is the header; locate the requested column
        header_row = next(records)
        position = header_row.index(column_name)
        expected = [record[position] for record in records]
    table = QcTable(path, schema=None)
    table.infer()
    assert table.column_values(column_name) == expected
def test_missing(path, schema, primary_keys, missing_pk, missing_rq):
    """Profiler must count rows with missing pk and missing required values."""
    schema['primaryKey'] = primary_keys
    schema['missingValues'].append('NA')
    schema['fields'][2]['constraints'] = {'required': True}
    profiler = TableProfiler(schema)
    source = QcTable(path, schema=None)
    raw_rows = source.read(cast=False)
    profiler.validate(raw_rows, source.headers)
    assert profiler.rows_with_missing_pk == missing_pk
    assert profiler.rows_with_missing_required == missing_rq
def from_disc(cls, csvpath, dict_schema, schema_type='qc', id_column=1, threshold=3):
    """ Constructs a TableReport from a csvfile and a given schema.
    Arguments:
    :param csvpath: string, the csv filepath
    :param dict_schema: dictionary describing the csv schema
    :param schema_type: 'qc' for frictionless type,
                        'dc' for Data Catalogue type json schema
    :param id_column: column number of dataset's primary key (id)
    :param threshold: outlier threshold - (mean - threshold * std,
                      mean + threshold * std) outside this length,
                      a numerical value is considered outlier
    :raises ValueError: if schema_type is neither 'qc' nor 'dc'
    """
    if schema_type == 'qc':
        dataset_schema = QcSchema(dict_schema)
    elif schema_type == 'dc':
        LOGGER.info(
            'Translating from Data Catalogue to Frictionless json format...'
        )
        qcdict_schema = FrictionlessFromDC(dict_schema).qcdescriptor
        dataset_schema = QcSchema(qcdict_schema)
    else:
        # previously an unknown schema_type fell through and crashed later
        # with UnboundLocalError on dataset_schema; fail fast and clearly
        raise ValueError(
            "schema_type must be 'qc' or 'dc', got %r" % schema_type)
    dataset = QcTable(csvpath, schema=dataset_schema)
    return cls(dataset, id_column=id_column, threshold=threshold)
def test_valid_rows_stats(datasetpath, schemapath, id_column, result):
    """valid_rows_stats must match expectations and emit no warnings."""
    with open(schemapath) as schema_file:
        descriptor = json.load(schema_file)
    table = QcTable(datasetpath, schema=QcSchema(descriptor))
    report = TableReport(table, id_column)
    with pytest.warns(None) as caught:
        assert report.valid_rows_stats == result
    assert caught.list == []
def test_corrected_filled_rows_stats(datasetpath, schemapath, id_column, result):
    """After corrections, filled_rows_stats must match and emit no warnings."""
    with open(schemapath) as schema_file:
        descriptor = json.load(schema_file)
    table = QcTable(datasetpath, schema=QcSchema(descriptor))
    report = TableReport(table, id_column)
    # exercise the export paths before and after applying corrections
    report.printpdf(os.path.join(APP_PATH, 'test_datasets/dataset_report.pdf'))
    report.printexcel(os.path.join(APP_PATH, 'test_datasets/dataset_report.xlsx'))
    report.apply_corrections()
    report.printpdf(os.path.join(APP_PATH, 'test_datasets/dataset_report_after.pdf'))
    report.save_corrected(os.path.join(APP_PATH, 'test_datasets/corrected.csv'))
    with pytest.warns(None) as caught:
        assert report.filled_rows_stats == result
    assert caught.list == []
def csv(input_csv, schema_json, clean, metadata, report, outlier):
    """This command produces a validation report for <csv file>.

    The report file is stored in the same folder where <csv file> is located.

    <schema json> file MUST be compliant with frictionless data
    table-schema specs(https://specs.frictionlessdata.io/table-schema/)
    or with Data Catalogue json format.
    """
    # NOTE: this CLI entry point shadows the stdlib `csv` module name;
    # kept as-is because the command name is part of the CLI interface
    filename = os.path.basename(input_csv)
    # Get the path of the csv file
    path = os.path.dirname(os.path.abspath(input_csv))
    dataset_name = os.path.splitext(filename)[0]
    pdfreportfile = os.path.join(path, dataset_name + '_report.pdf')
    xlsxreportfile = os.path.join(path, dataset_name + '_report.xlsx')
    correctedcsvfile = os.path.join(path, dataset_name + '_corrected.csv')
    # read the json file with the csv schema
    with open(schema_json) as json_file:
        dict_schema = json.load(json_file)
    # check metadata json type is Data Catalogue specs
    if metadata == 'dc':
        LOGGER.info(
            'Translating from Data Catalogue to Frictionless json format...')
        dict_schema = FrictionlessFromDC(dict_schema).qcdescriptor
    schema = QcSchema(dict_schema)
    dataset = QcTable(input_csv, schema=schema)
    datasetreport = TableReport(dataset, threshold=outlier)
    # Apply data cleaning corrections?
    if clean:
        datasetreport.apply_corrections()
        datasetreport.save_corrected(correctedcsvfile)
    if datasetreport.isvalid:
        # fixed garbled message (was 'The dataset has is valid.')
        LOGGER.info('The dataset is valid.')
    else:
        LOGGER.info('CAUTION! The dataset is invalid!')
    # export the report
    if report == 'pdf':
        datasetreport.printpdf(pdfreportfile)
    elif report == 'xls':
        datasetreport.printexcel(xlsxreportfile)
def __init__(self, dbname, filepaths, schematype='source'):
    """Build a CSV-backed database wrapper from a list of csv files.

    Arguments:
    :param dbname: the name of the database
    :param filepaths: list of csv file paths; each becomes a QcTable
    :param schematype: 'source' or 'target'
    """
    self.__dbname = dbname
    self.__dbtype = 'CSV'
    self.__schematype = schematype
    tables = [QcTable(fpath, schema=None) for fpath in filepaths]
    # store QcTable objects in a dictionary with filename as key
    self.__tables = {table.filename: table for table in tables}
    # duplications per table: {table.filename: int}
    self.__dublications = {}
    # xml representation of the db schema, built by the call below
    self.__xml_elements = None
    self.__create_xml_element()
def infercsv(input_csv, schema_spec, sample_rows, max_levels, threshold, cde_file=None):
    """This command infers the schema of the <csv file> and stores it
    in an output file.

    The output file is either a json file following the frictionless data
    specs(https://specs.frictionlessdata.io/table-schema/)
    or an xlsx file following MIP Data Catalogue's format.
    """
    filename = os.path.basename(input_csv)
    # Get the path of the csv file
    path = os.path.dirname(os.path.abspath(input_csv))
    dataset_name = os.path.splitext(filename)[0]
    # output filepaths, named after the dataset
    qcjsonfile = os.path.join(path, dataset_name + '_qcschema.json')
    dcxlsxfile = os.path.join(path, dataset_name + '_dcschema.xlsx')
    dataset = QcTable(input_csv, schema=None)
    # Is cde dictionary file available?
    if cde_file:
        cde_dict = CdeDict(cde_file)
    else:
        cde_dict = None
    infer = InferSchema(dataset, dataset_name, sample_rows, max_levels, cde_dict)
    # suggest cdes and concept paths if cde dictionary is available
    if cde_dict:
        infer.suggest_cdes(threshold=threshold)
    if schema_spec == 'dc':
        infer.export2excel(dcxlsxfile)
    elif schema_spec == 'qc':
        # NOTE(review): 'expoct2qcjson' looks like a typo for 'export2qcjson'
        # — verify against the InferSchema API before renaming
        infer.expoct2qcjson(qcjsonfile)
def test_mipmapheaders(filepath, result):
    """headers4mipmap must expose the mipmap-formatted header names."""
    table = QcTable(filepath, schema=None)
    mipmap_headers = table.headers4mipmap
    assert mipmap_headers == result
def test_schema_infer_tabulator(path, schema):
    """Inference with maxlevels=3 yields the expected headers and schema."""
    source = QcTable(path, schema=None)
    source.infer(maxlevels=3)
    expected_headers = ['id', 'name', 'diagnosis']
    assert source.headers == expected_headers
    assert source.schema.descriptor == schema
def test_invalid_rows():
    """validate() must fail on the invalid fixture and flag row 5."""
    profiler = TableProfiler(SCHEMA_SIMPLE)
    table = QcTable('tests/test_datasets/simple_invalid_2.csv', schema=None)
    rows = table.read(cast=False)
    # `is False` instead of `== False` (PEP 8 / flake8 E712)
    assert profiler.validate(rows, table.headers) is False
    assert profiler.invalid_rows == [5]
def test_vadidate_rows_with_invalids(path, schema, valid, rows_with_invalids):
    """validate() outcome and the invalid-row bookkeeping must agree."""
    profiler = TableProfiler(schema)
    source = QcTable(path, schema=None)
    raw_rows = source.read(cast=False)
    outcome = profiler.validate(raw_rows, source.headers)
    assert outcome == valid
    assert profiler.rows_with_invalids == rows_with_invalids
def test_column_values_exception(path, column_name):
    """Requesting values of an unknown column must raise QCToolException."""
    table = QcTable(path, schema=None)
    table.infer()
    with pytest.raises(QCToolException):
        # plain call: an `assert` inside pytest.raises is dead code because
        # the expected exception aborts the block before the assert runs
        table.column_values(column_name)
class MipCDEMapper(object):
    """Class for handling a simple (one to one) mapping task and creating
    the mapping xml for mipmap engine.

    :Arguments:
    :source path: the filepath of the source csv
    :cdescontroller: a CDEsController object containing info about the
                     target CDE dataset
    :param sample_rows: number of sample rows for schema inference of
                        source csv
    :param maxlevels: total unique string values in a column to be
                      considered as categorical type in schema inference
                      of the source csv file.
    :param na_empty_strings_only: (boolean) If True, only the empty
                                  strings will be inferred as NAs
    """
    def __init__(self, source_path, cdescontroller, sample_rows, maxlevels,
                 na_empty_strings_only=False):
        sourcedb = CsvDB('hospitaldb', [source_path], schematype='source')
        # get user home directory
        # NOTE(review): os.getenv('HOME') is None on Windows — presumably
        # the tool is POSIX-only; confirm
        self.__homepath = os.getenv('HOME')
        # create mapping folder
        self.__mappingpath = os.path.join(self.__homepath, '.mipqctool',
                                          'mapping')
        if not os.path.isdir(self.__mappingpath):
            os.makedirs(self.__mappingpath)
        # create the target mapping folder and xml folder
        if not os.path.isdir(os.path.join(self.__mappingpath, 'target')):
            os.mkdir(os.path.join(self.__mappingpath, 'target'))
        if not os.path.isdir(os.path.join(self.__mappingpath, 'xml')):
            os.mkdir(os.path.join(self.__mappingpath, 'xml'))
        # get the cde dataset name
        self.__target_dbname = cdescontroller.cdedataset_name
        # use it also as filename by adding .csv extension
        self.__target_filename = self.__target_dbname + '.csv'
        # this will be used in the mapping execution by mipmap engine
        self.__target_folder = os.path.join(self.__mappingpath, 'target')
        self.__target_path = os.path.join(self.__target_folder,
                                          self.__target_filename)
        # create a csv file with the cde headers only
        cdescontroller.save_csv_headers_only(self.__target_path)
        self.__cdecontroller = cdescontroller
        # now we can create the CsvDB for the target schema
        targetdb = CsvDB(self.__target_dbname, [self.__target_path],
                         schematype='target')
        # create the Mapping object
        self.__mapping = Mapping(sourcedb, targetdb)
        # store the QcSchema for the cde dataset
        self.__cde_schema = cdescontroller.dataset_schema
        # With QcTable we can access metadata about the source csv
        self.__srctbl = QcTable(source_path, schema=None)
        # infer the table schema
        self.__srctbl.infer(limit=sample_rows, maxlevels=maxlevels,
                            na_empty_strings_only=na_empty_strings_only)
        self.__src_path = source_path
        self.__src_folder = os.path.dirname(source_path)
        # create table report for the source file
        self.__tblreport = TableReport(self.__srctbl)
        self.__src_filename = self.__srctbl.filename
        self.__src_headers = self.__srctbl.headers4mipmap
        # write the source validation report next to the source csv
        srcname_no_ext = os.path.splitext(self.__src_path)[0]
        reportfilepath = srcname_no_ext + '_report.xlsx'
        self.__tblreport.printexcel(reportfilepath)
        # get the cde headers
        self.__cde_headers = cdescontroller.cde_headers
        self.__cde_mapped = self.__mapping.correspondences.keys()
        self.__cde_not_mapped = self.__cde_headers
        # get source vars for each cde correspondence
        self.__cde_corrs_sources = []

    @property
    def sourcereport(self):
        # TableReport of the source csv
        return self.__tblreport

    @property
    def source_filename(self):
        return self.__src_filename

    @property
    def source_headers(self):
        # source headers in mipmap format
        return self.__src_headers

    @property
    def corr_sources(self):
        """source vars for each cde correspondence"""
        return self.__cde_corrs_sources

    @property
    def cde_filename(self):
        return self.__target_filename

    @property
    def cde_mapped(self):
        # cde codes that currently have a correspondence
        return self.__cde_mapped

    @property
    def cde_not_mapped(self):
        # cde codes still without a correspondence
        return self.__cde_not_mapped

    @property
    def cdecontroller(self):
        return self.__cdecontroller

    def suggest_corr(self, cdedict, threshold):
        """Suggest correspondences (and value replacements) for each
        source column using a cde dictionary.

        Arguments:
        :param cdedict: CdeDict object
        :param threshold: 0-1 similarity threshold, below that not a
                          cde is suggested
        """
        cde_sugg_dict = {}  # {cdecode:sourcecolumn}
        source_table = self.__srctbl.filename
        target_table = self.__target_filename
        # here will be stored the suggestions replacments {cdecode:[Replacemsnts]}
        sugg_replacemnts = {}
        #source_raw_headers = self.__mapping.sourcedb.get_raw_table_headers(source_table)
        # for each source column
        for name, columnreport in self.__tblreport.columnreports.items():
            cde = cdedict.suggest_cde(columnreport, threshold=threshold)
            # check if a cde mapping already exist
            if cde and (cde.code not in cde_sugg_dict.keys()):
                cde_sugg_dict[
                    cde.code] = self.__mapping.sourcedb.raw_2_mipmap_header(
                        self.__src_filename, columnreport.name)
                # suggest category replacements for cases where source
                # col and cde are nominal
                sugg_reps = cdedict.suggest_replecements(cde.code,
                                                         columnreport,
                                                         threshold=threshold)
                if sugg_reps:
                    sugg_replacemnts[cde.code] = sugg_reps
        for cdecode, source_var in cde_sugg_dict.items():
            source_paths = [(source_table, source_var, None)]
            target_path = (target_table, cdecode, None)
            filename_column = '.'.join(
                [os.path.splitext(source_table)[0], source_var])
            # lets see if this cde have value replacements suggestions,
            # if so create the if statment
            if cdecode in sugg_replacemnts.keys():
                expression = ifstr(filename_column, sugg_replacemnts[cdecode])
            else:
                expression = filename_column
            # let's try to create the correspondence now
            try:
                self.__mapping.add_corr(
                    source_paths=source_paths,
                    target_path=target_path,
                    expression=expression,
                    replacements=sugg_replacemnts.get(cdecode))
            # If a cde correspondance already exists then pass
            except MappingError:
                LOGGER.warning(
                    'found cde macth for source column "{}" but cde "{}" \
is not included in the selected cde pathology.'.format(source_var, cdecode))
        self.__update_cde_mapped()

    def add_corr(self, cde, source_cols, expression):
        # register a new correspondence for the given cde
        source_paths = [(self.__srctbl.filename, col, None)
                        for col in source_cols]
        target_path = (self.__target_filename, cde, None)
        self.__mapping.add_corr(source_paths, target_path, expression)
        self.__update_cde_mapped()

    def remove_corr(self, cde):
        # drop the correspondence of the given cde
        self.__mapping.remove_corr(cde)
        self.__update_cde_mapped()

    def update_corr(self, cde, source_cols, expression):
        # replace an existing correspondence for the given cde
        source_paths = [(self.__srctbl.filename, col, None)
                        for col in source_cols]
        target_path = (self.__target_filename, cde, None)
        self.__mapping.update_corr(source_paths, target_path, expression)
        self.__update_cde_mapped()

    def get_corr_expression(self, cde):
        return self.__mapping.correspondences[cde].expression

    def get_corr_replacements(self, cde):
        return self.__mapping.correspondences[cde].replacements

    def get_col_stats(self, mipmap_column) -> dict:
        """returns source columns stats.

        Arguments:
        :param mipmap_column: mipmap tranformed column name
        """
        stats = {}
        raw_headers = self.__mapping.sourcedb.get_raw_table_headers(
            self.source_filename)
        # convert the column that is mipmap formated to the initial column name
        col = raw_headers[mipmap_column]
        col_report = self.__tblreport.columnreports[col]
        if col_report:
            stats['miptype'] = col_report.miptype
            stats['value_range'] = col_report.value_range
            # mean/std only make sense for numeric columns
            if col_report.miptype in ['numerical', 'integer']:
                stats['mean'] = col_report.stats['mean']
                stats['std'] = col_report.stats['std']
            return stats
        else:
            return None

    def get_cde_info(self, mipmap_cde) -> dict:
        """returns cde type and values.

        Arguments:
        :param mipmap_cde: mipmap tranformed cde name
        """
        cde_info = {}
        raw_cde_dict = self.__mapping.targetdb.get_raw_table_headers(
            self.cde_filename)
        raw_cde = raw_cde_dict[mipmap_cde]
        cde_schema = self.__cde_schema
        cdefield = cde_schema.get_field(raw_cde)
        cde_type = cdefield.miptype
        constraints = cdefield.constraints
        # nominal -> enum list, numeric -> [min, max], otherwise None
        if cde_type == 'nominal' and constraints:
            con = constraints.get('enum')
        elif cde_type in ['numerical', 'integer'] and constraints:
            con = [constraints.get('minimum'), constraints.get('maximum')]
        else:
            con = None
        cde_info = {'miptype': cde_type, 'constraints': con}
        return cde_info

    def get_source_raw_header(self, mipmap_col):
        # mipmap-formatted source column name -> raw csv header
        raw_headers = self.__mapping.sourcedb.get_raw_table_headers(
            self.source_filename)
        return raw_headers[mipmap_col]

    def get_cde_raw_header(self, mipmap_col):
        # mipmap-formatted cde name -> raw cde header
        raw_headers = self.__mapping.targetdb.get_raw_table_headers(
            self.__target_filename)
        return raw_headers[mipmap_col]

    def get_cde_mipmap_header(self, raw_cde_header):
        # raw cde header -> mipmap-formatted cde name (inverse lookup)
        raw_headers = self.__mapping.targetdb.get_raw_table_headers(
            self.__target_filename)
        mipmap_headers = {value: key for key, value in raw_headers.items()}
        return mipmap_headers[raw_cde_header]

    def run_mapping(self, output):
        # write the mapping xml and execute it with the mipmap docker engine
        xml_folder = os.path.join(self.__mappingpath, 'xml')
        xml_path = os.path.join(xml_folder, 'map.xml')
        with open(xml_path, 'w') as mapxml:
            mapxml.write(self.__mapping.xml_string)
        DockerMipmap(xml_folder, self.__src_folder,
                     self.__target_folder, output)

    def save_mapping(self, filepath):
        # dump the mapping xml to an arbitrary filepath
        with open(filepath, 'w') as mapxml:
            mapxml.write(self.__mapping.xml_string)

    def replace_function(self, column, replacments):
        """Returns a mipmap function string with encapsulated if
        statements for replacing given values of a column with
        predefined ones.
        This is used in a categorical/nominal column type
        Arguments:
        :param columnname: the column name(str)
        :param repls: list with Replacement namedtuples
                      Replacement('source', 'target')
        """
        return ifstr(column, replacments)

    def __update_cde_mapped(self):
        # recompute mapped / not-mapped cde lists and the per-cde
        # source-column summary after any correspondence change
        self.__cde_mapped = list(self.__mapping.correspondences.keys())
        cde_not_mapped = self.__cde_headers.copy()
        cde_corrs_sources = OrderedDict()
        for cde in self.__cde_mapped:
            cde_not_mapped.remove(cde)
            source_paths = self.__mapping.correspondences[cde].source_paths
            # path[1] is the column name within the (table, column, _) tuple
            pathstring = ', '.join([path[1] for path in source_paths])
            cde_corrs_sources[cde] = pathstring
        self.__cde_corrs_sources = cde_corrs_sources
        self.__cde_not_mapped = cde_not_mapped
def test_actual_headers(path, result):
    """actual_headers must reflect the headers found in the csv file."""
    source = QcTable(path, schema=None)
    headers = source.actual_headers
    assert headers == result
    # tail of a fixture dict whose opening lines lie outside this chunk
    'filled_25_49': 1,
    'filled_0_24': 0
}
# expected row counts keyed by number of filled columns
F_ROWS_PER_COLUMN1 = {
    4: 14,
    3: 4,
    2: 1,
    1: 1
}
# these 3 lines is for creating the initial
# metadata json file with the dataset's schema
TEST_TABLE = QcTable(DATASET2_PATH, schema=None)
TEST_TABLE.infer()
TEST_TABLE.schema.save(os.path.join(APP_PATH, 'test_datasets/test_dataset2.json'))


@pytest.mark.parametrize('datasetpath, schemapath, result, total_rows', [
    (DATASET1_PATH, METADATA1_PATH, F_ROWS_PER_COLUMN1, 20)
])
def test_calc_rows_per_column(datasetpath, schemapath, result, total_rows):
    """A TableReport built from disc must report the expected totals
    and per-column fill counts, without emitting warnings."""
    with open(schemapath) as json_file:
        dict_schema = json.load(json_file)
    testreport = TableReport.from_disc(datasetpath, dict_schema)
    with pytest.warns(None) as recorded:
        assert testreport.total_rows == total_rows
        # reach into the name-mangled private attribute on purpose
        assert testreport._TableReport__tfilled_columns == result
    assert recorded.list == []