def from_disc(cls, csvpath, dict_schema, schema_type='qc', id_column=1, threshold=3):
    """Constructs a TableReport from a csvfile and a given schema.

    Arguments:
    :param csvpath: string, the csv filepath
    :param dict_schema: dictionary describing the csv schema
    :param schema_type: 'qc' for frictionless type, 'dc' for Data
                        Catalogue type json schema
    :param id_column: column number of dataset's primary key (id)
    :param threshold: outlier threshold - (mean - threshold * std,
                      mean + threshold * std) outside this length, a
                      numerical value is considered outlier
    :raises ValueError: if schema_type is neither 'qc' nor 'dc'
    """
    if schema_type == 'qc':
        dataset_schema = QcSchema(dict_schema)
    elif schema_type == 'dc':
        LOGGER.info(
            'Transating from Data Catalogue to Frictionless json format...'
        )
        qcdict_schema = FrictionlessFromDC(dict_schema).qcdescriptor
        dataset_schema = QcSchema(qcdict_schema)
    else:
        # BUG FIX: an unknown schema_type previously fell through and
        # raised an obscure NameError on dataset_schema below.
        raise ValueError(
            "schema_type must be 'qc' or 'dc', got {!r}".format(schema_type))
    dataset = QcTable(csvpath, schema=dataset_schema)
    return cls(dataset, id_column=id_column, threshold=threshold)
def reorganizefiles(self, output):
    """reorganize the dcm files in a folder structure for LORIS import pipeline.

    Arguments:
    :param output: output folder
    """
    LOGGER.info('Reorganizing files for LORIS pipeline into folder: %s'
                % output)
    for patient in self.patients:
        patientid = patient.patientid
        patdir = os.path.join(output, patientid)
        # makedirs(exist_ok=True) avoids the check-then-create race of
        # os.path.exists + os.mkdir and also creates the output folder
        # itself when it does not exist yet.
        os.makedirs(patdir, exist_ok=True)
        # one numbered visit folder per study: <patientid>_<n>
        for study_count, study in enumerate(patient.studies, start=1):
            studydir = os.path.join(patdir,
                                    '_'.join([patientid, str(study_count)]))
            os.makedirs(studydir, exist_ok=True)
            for seq in study.sequences:
                for dicom in seq.dicoms:
                    destpath = os.path.join(studydir, dicom.filename)
                    shutil.copy(dicom.filepath, destpath)
def suggest_cde(self, columnreport, threshold=0.6):
    """Suggests the most similar CDE for the column.

    Arguments:
    :param columnreport: ColumnReport object with info of a datset column
    :param threshold: 0-1 similarity threshold, below that not a cde is
                      suggested
    :returns: a CdeVariable object, or None when no candidate of the same
              miptype reaches the threshold
    """
    name = columnreport.name
    val_range = columnreport.value_range
    mip_type = columnreport.miptype
    LOGGER.debug('The incoming column name is: {}'.format(name))
    # select cdes with the same type and calculate similarity
    canditates = [
        cde for cde in self.__cdes.values() if cde.miptype == mip_type
    ]
    LOGGER.debug('Number of cdes with miptype {} is: {}'.format(
        mip_type, len(canditates)))
    if canditates:
        # max() picks the best match in O(n) instead of sorting, and
        # evaluates each candidate's similarity only once; ties resolve
        # to the first candidate, matching the stable reverse sort.
        canditate = max(canditates,
                        key=lambda x: x.similarity(name, val_range))
        similarity = canditate.similarity(name, val_range)
        LOGGER.debug(
            'The simirarity between "{}" and cde "{}" is: {}'.format(
                name, canditate.code, similarity))
        if similarity >= threshold:
            return canditate
    # no candidates of this miptype, or best match below threshold
    LOGGER.info(
        'No cde match found for incoming column "{}"'.format(name))
    return None
def __init__(self, rootfolder, username):
    """Scan a DICOM root folder and collect QC statistics.

    Arguments:
    :param rootfolder: folder path with DICOMs subfolders
    :param username: str with the username
    """
    start_time = time.time()
    self.reportdata = None
    self.rootfolder = rootfolder
    # mapping of subfolders to their files (helper defined elsewhere)
    self.subfolders = getsubfolders(rootfolder)
    self.username = username
    # header/metadata recorded in the final QC report
    self.dataset = {
        'version': __version__,
        'date_qc_ran': datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S'),
        'username': username,
        'dicomfolder': str(os.path.abspath(rootfolder))
    }
    # files that are not DICOMS (.dcm)
    # list of (folder, filename)
    self.__notprocessed = []
    # MRISequences objects
    self.__invalidseq = []
    # MRIPatient objects
    self.__patients = []
    # statistic info
    self.__totalvalidseq = 0
    self.__totalstudies = 0
    self.__totalinvaliddicoms = 0
    self.__seriesdescriptions_valid = set()
    self.__seriesdescriptions_invalid = set()
    # distribution of patients by number of valid sequences
    self.__seqperpatient_distr = {
        's1': 0,
        's2': 0,
        's3-s5': 0,
        's6more': 0,
    }
    self.__patientid_with_invalids = set()
    # Read all the dcm files and calc QC stats
    # NOTE(review): cpu_count() + 1 workers — presumably tuned so the
    # pool stays busy while chunks are merged; confirm.
    self.__readicoms_parallel(mp.cpu_count() + 1)
    self.__collect_stats()
    LOGGER.debug('Dicom analysis running time: %s seconds' %
                 (time.time() - start_time))
    LOGGER.debug('Folders read: %i' % len(self.subfolders))
    LOGGER.debug('Total MRI sequences found: %i' %
                 (self.totalvalidsequences + self.totalinvalidsequences))
    LOGGER.info('Patients with good seq: %i' % self.totalpatients)
    LOGGER.info('Total visits: %i' % self.totalvisits)
    LOGGER.info('Good seq: %i' % self.totalvalidsequences)
    LOGGER.info('Bad seq: %i' % self.totalinvalidsequences)
    LOGGER.info('Files not processed: %i' % self.totalbadfiles)
def save_schema(self):
    """Infer a schema from the selected dataset and save it to disk.

    The output format (xlsx or json) follows the schema_output radio
    selection; CDE suggestions are added when a CDE dictionary is
    available in the inference options frame.
    """
    self.save_button.config(state='disabled')
    if self.schema_output.get() == 1:
        filetypes = (('excel files', '*.xlsx'), ('all files', '*.*'))
    else:
        filetypes = (('json files', '*.json'), ('all files', '*.*'))
    output_file = tkfiledialog.asksaveasfilename(title='enter file name',
                                                 filetypes=filetypes)
    if output_file:
        warningtitle = 'Can not save the schema'
        if not self.dname:
            tkmessagebox.showwarning(warningtitle,
                                     'Please, select dataset file')
            # BUG FIX: previously execution fell through after the warning
            # and crashed on self.datasetpath; bail out instead.
            self.save_button.config(state='normal')
            return

        def _notify_success():
            # single place for the previously triplicated notification
            LOGGER.info('Schema file has been created successully')
            tkmessagebox.showinfo(
                title='Status info',
                message='Schema file has been created successully'
            )

        max_categories = int(self.inf_opt_frame.max_categories.get())
        sample_rows = int(self.inf_opt_frame.sample_rows.get())
        na_empty_strings_only = self.inf_opt_frame.na_empty_strings_only.get()
        if self.inf_opt_frame.cde_dict:
            infer = InferSchema.from_disc(
                self.datasetpath,
                sample_rows=sample_rows,
                maxlevels=max_categories,
                cdedict=self.inf_opt_frame.cde_dict,
                na_empty_strings_only=na_empty_strings_only)
            if self.inf_opt_frame.thresholdstring.get() == '':
                threshold = 0.6
            else:
                threshold = float(self.inf_opt_frame.thresholdstring.get())
            LOGGER.info('CDE similarity threshold: %f' % threshold)
            infer.suggest_cdes(threshold=threshold)
            # NOTE(review): with a CDE dict the schema is always exported
            # as excel, regardless of schema_output — confirm intended.
            infer.export2excel(output_file)
            _notify_success()
        else:
            infer = InferSchema.from_disc(
                self.datasetpath,
                sample_rows=sample_rows,
                maxlevels=max_categories,
                cdedict=None,
                na_empty_strings_only=na_empty_strings_only)
            if self.schema_output.get() == 1:
                infer.export2excel(output_file)
            else:
                # NOTE(review): 'expoct2qcjson' looks like a typo of
                # 'export2qcjson' in the InferSchema API — confirm upstream.
                infer.expoct2qcjson(output_file)
            _notify_success()
    self.save_button.config(state='normal')
def add_replacement_expr(self):
    """Build an `ifstr` replacement expression from the listbox entries
    ("old->new" strings) and insert it into the expressions text widget.
    """
    reps = self.func_replace_trg_listbox.get(0, tk.END)
    replacemnts = []
    source_col = self.parent.csv_name.replace(
        ".csv", "") + '.' + self.selected_column
    LOGGER.debug('the repleacements are: {}'.format(reps))
    if len(reps) > 0:
        self.expressions_text.delete("1.0", "end-1c")
        for rep in reps:
            # partition instead of split so a '->' inside the replacement
            # value is kept intact rather than truncated
            old, _, new = rep.partition('->')
            replacemnts.append(Replacement(old, new))
        expr = ifstr(source_col, replacemnts)
        LOGGER.info('the expression is: {}'.format(expr))
        self.expressions_text.insert('1.0', expr)
def suggest_corrs(self):
    """Run CDE correspondence suggestions using the loaded CDE dictionary
    and refresh the correspondences listbox."""
    self.suggest_btn.config(state='disabled')
    warningtitle = 'Could not make suggestions'
    cde_dict = self.infer_opt_frame.cde_dict
    if not cde_dict:
        tkmessagebox.showwarning(warningtitle,
                                 'Could not find the CDE dictionary file')
    else:
        raw_threshold = self.infer_opt_frame.thresholdstring.get()
        # empty entry means "use the default similarity threshold"
        threshold = 0.6 if raw_threshold == '' else float(raw_threshold)
        self.cdemapper.suggest_corr(cde_dict, threshold=threshold)
        LOGGER.info('Done with the correspondences suggestions..Updating listbox...')
        self.update_listbox_corr()
    self.suggest_btn.config(state='normal')
def __readicoms_parallel(self, processes):
    """Read all the dicoms using multiprocessing."""
    def merge(chunk):
        # fold one worker result into the instance accumulators
        self.__patients += chunk['patients']
        self.__invalidseq += chunk['invalidseq']
        self.__notprocessed += chunk['notprocessed']

    if len(self.subfolders) > processes:
        LOGGER.info('dicom parallel processing with {} Processes'.format(
            processes))
        slices = list(splitdict(self.subfolders, processes))
        with Pool(processes) as p:
            for chunk in p.map(self.readicoms_chunks, slices):
                merge(chunk)
    else:
        LOGGER.info('Single core processing...')
        merge(self.readicoms_chunks(self.subfolders))
def __create_db_container(self):
    """Creates a postgres 9.6 container.

    A running container is reused; an existing stopped one is restarted;
    otherwise a fresh container is created via ``docker run``.
    """
    self.__check_db_container(mode='running')
    self.__check_db_container(mode='exist')
    if self.__is_db_running:
        LOGGER.info('db container ({}) is already up and'
                    ' running. Skipping creation step...'.format(
                        self.__db_cont_name))
        self.__remove_create_db()
    elif self.__is_db_exist and not self.__is_db_running:
        LOGGER.info('db container({}) already exists. '
                    'Restarting db container'.format(self.__db_cont_name))
        subprocess.run(['docker', 'restart', self.__db_cont_name])
        # give postgres a moment to accept connections after restart
        time.sleep(10)
        self.__remove_create_db()
    else:
        # create the db container
        LOGGER.debug('Creating db container with name {}'.format(
            self.__db_cont_name))
        arg_port = ['-p', '{}:5432'.format(self.__dbport)]
        arg_name = ['--name', self.__db_cont_name]
        arg_env1 = ['-e', 'POSTGRES_PASSWORD={}'.format(self.__dbpassword)]
        arg_env2 = ['-e', 'POSTGRES_USER={}'.format(self.__dbuser)]
        arg_img = ['-d', self.__db_image]
        command2 = ['docker', 'run'
                    ] + arg_port + arg_name + arg_env1 + arg_env2 + arg_img
        try:
            # BUG FIX: check=True is required for CalledProcessError to be
            # raised on failure; without it the except below was dead code.
            subprocess.run(command2, check=True)
            # wait for postgres initdb to finish before creating the db
            time.sleep(50)
            self.__remove_create_db()
        except subprocess.CalledProcessError:
            LOGGER.warning(
                'There was an error while attempting creating the db container.'
            )
            raise DockerExecError(
                'There was an error while attempting creating the db container.'
            )
def get_all_cdes(self):
    """Fetch pathology/CDE metadata from the Data Catalogue and, on
    success, populate the pathology combobox."""
    LOGGER.info(
        'Trying to retrive cde metadata from Data Cataloge. Using DC url: {}'
        .format(DC_DOMAIN))
    all_pathologies_url = ''.join([DC_DOMAIN, DC_SUBDOMAIN_ALLPATHOLOGIES])
    self.dc = DcConnector(requests.get(all_pathologies_url))
    status = self.dc.status_code
    if status == 200:
        self.dc_combox1.config(values=self.dc.pathology_names)
    elif 500 <= status <= 599:
        LOGGER.info('Data Cataloge server internal error.')
    elif 400 <= status <= 499:
        LOGGER.info(
            'Data Cataloge could not be reach!. Please check DC_DOMAIN in config url'
        )
def csv(input_csv, schema_json, clean, metadata, report, outlier):
    """This command produces a validation report for <csv file>.

    The report file is stored in the same folder where <csv file> is
    located.

    <schema json> file MUST be compliant with frirctionless data
    table-schema specs(https://specs.frictionlessdata.io/table-schema/)
    or with Data Catalogue json format.
    """
    # derive all output paths from the input csv location
    abspath = os.path.abspath(input_csv)
    path = os.path.dirname(abspath)
    dataset_name = os.path.splitext(os.path.basename(input_csv))[0]
    pdfreportfile = os.path.join(path, dataset_name + '_report.pdf')
    xlsxreportfile = os.path.join(path, dataset_name + '_report.xlsx')
    correctedcsvfile = os.path.join(path, dataset_name + '_corrected.csv')
    # read the json file with the csv schema
    with open(schema_json) as json_file:
        dict_schema = json.load(json_file)
    # check metadata json type is Data Catalogue specs
    if metadata == 'dc':
        LOGGER.info(
            'Transating from Data Catalogue to Frictionless json format...')
        dict_schema = FrictionlessFromDC(dict_schema).qcdescriptor
    dataset = QcTable(input_csv, schema=QcSchema(dict_schema))
    datasetreport = TableReport(dataset, threshold=outlier)
    # Apply data cleaning corrections?
    if clean:
        datasetreport.apply_corrections()
        datasetreport.save_corrected(correctedcsvfile)
    if datasetreport.isvalid:
        LOGGER.info('The dataset has is valid.')
    else:
        LOGGER.info('CAUTION! The dataset is invalid!')
    # export the report
    if report == 'pdf':
        datasetreport.printpdf(pdfreportfile)
    elif report == 'xls':
        datasetreport.printexcel(xlsxreportfile)
def extractSColumnsFunctions(expr, ddlFunctions, ddlColumns):
    """Validate *expr* against the known functions/columns and return the
    list of source columns it references.

    Every parser error is logged and re-raised as an ExpressionError.
    """
    try:
        # raises SyntaxError on unbalanced parentheses
        CorrespondenceParser.firstCheckParentheses(expr)
    except SyntaxError as se:
        LOGGER.info(str(se))
        raise ExpressionError(str(se))
    try:
        columns = CorrespondenceParser.extractColumnsList(expr, ddlColumns)
    except ColumnNameError as cne:
        LOGGER.info(str(cne))
        raise ExpressionError(str(cne))
    try:
        CorrespondenceParser.extractSColumnsFunctionsR(
            expr, ddlFunctions, expr)
    except (FunctionNameError, ArgsFunctionError) as err:
        # both function-related errors get identical treatment
        LOGGER.info(str(err))
        raise ExpressionError(str(err))
    # return columns, functions
    return columns
def __run_mapping(self):
    """Render the docker-compose template with the mapping environment and
    run the mipmap ETL service, logging its combined output."""
    env = {
        'mipmap_map': self.__mapping,
        'mipmap_source': self.__source,
        'mipmap_output': self.__output,
        'mipmap_pgproperties': self.__dbprop,
        'mipmap_script': self.__scriptpath,
        'mipmap_target': self.__target,
        'mipmap_db': MIPMAP_DB_CONTAINER,
    }
    #LOGGER.debug(os.getenv('mipmap_pgproperties'))
    self.__template.stream(env).dump(self.__dcompose)
    if self.__is_mipmap_container_exist:
        LOGGER.info('Removing previous mipmap container...')
        subprocess.run(['docker', 'rm', self.__name])
    arguments = ['docker-compose', '-f', self.__dcompose, 'up', 'mipmap_etl']
    process = subprocess.Popen(arguments,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.STDOUT)
    LOGGER.info('Running docker-compose...')
    output, _ = process.communicate()
    LOGGER.info(output.decode('utf-8'))
def infer(self, rows, headers=1, confidence=0.75, maxlevels=10,
          na_empty_strings_only=False):
    """Infer a table-schema descriptor from sample rows.

    :param rows: sample rows, each a list of raw string cell values
    :param headers: int -> number of leading rows to pop (the last one
                    popped becomes the header row), or an explicit list
                    of header names
    :param confidence: 0-1 ratio passed to the type resolver
    :param maxlevels: max distinct values for a column to qualify as
                      nominal/categorical
    :param na_empty_strings_only: when True, only empty strings are
                                  treated as missing values
    :returns: the inferred descriptor dict (also stored on the schema)
    """
    # Get headers
    if isinstance(headers, int):
        headers_row = headers
        # pop `headers` rows off the top; keep the last popped as header
        while True:
            headers_row -= 1
            headers = rows.pop(0)
            if not headers_row:
                break
    elif not isinstance(headers, list):
        headers = []

    # Get descriptor
    guesser = _QcTypeGuesser()
    resolver = _QcTypeResolver()
    descriptor = {'fields': []}
    type_matches = {}      # column index -> list of (type, pattern) guesses
    unique_values = {}     # column index -> set of distinct raw values
    missingvalues = set()  # raw strings recognized as NA
    for header in headers:
        descriptor['fields'].append({'name': header})
    LOGGER.info('{} of sample rows are used for table schema inference'.format(len(rows)))
    for row_number, row in enumerate(rows):
        # Normalize rows with invalid dimensions for sanity
        row_length = len(row)
        headers_length = len(headers)
        if row_length > headers_length:
            row = row[:len(headers)]
        if row_length < headers_length:
            diff = headers_length - row_length
            fill = [''] * diff
            row = row + fill
        # build a column-wise lookup of type matches
        for index, value in enumerate(row):
            # remove leading and trailing whitespacing
            value = value.strip()
            rv = guesser.infer(value, na_empty_strings_only=na_empty_strings_only)
            name = rv[0]
            pattern = rv[1]
            # collect unique values for possible nominal variable
            if pattern == 'text' or name == 'integer':
                if unique_values.get(index):
                    unique_values[index].add(value)
                else:
                    unique_values[index] = set()
                    unique_values[index].add(value)
            # collect the nans
            elif pattern == 'nan':
                missingvalues.add(value)
            if type_matches.get(index):
                type_matches[index].append(rv)
            else:
                type_matches[index] = [rv]
    # choose a type/format for each column based on the matches
    for index, results in type_matches.items():
        uniques = unique_values.get(index)
        rv = resolver.get(results, uniques, maxlevels, confidence)
        descriptor['fields'][index].update(**rv)
    # case missing values have been found
    if len(missingvalues) > 0:
        # add the default missing value in any case
        missingvalues.update(set(config.DEFAULT_MISSING_VALUES))
        # sort missing values
        missing_sorted = list(missingvalues)
        missing_sorted.sort()
        # update the missing values
        descriptor['missingValues'] = list(missing_sorted)
    # case missing values not found use default
    elif len(missingvalues) == 0:
        descriptor['missingValues'] = config.DEFAULT_MISSING_VALUES
    # Save descriptor
    # NOTE: writes the name-mangled descriptor attribute of the parent
    # Schema class, then rebuilds the schema from it
    self._Schema__current_descriptor = descriptor
    self.__build()
    self.__infered = True
    return descriptor
def on_select_version(self, event):
    """Callback: download the metadata json for the currently selected
    pathology and CDE version."""
    LOGGER.info('Retrieving metadata json')
    dc = self.dc
    if dc and dc.status_code == 200:
        pathology = self.selected_pathology.get()
        version = self.selected_version.get()
        self.dc_json = dc.getjson(pathology, version)
def createreport(self):
    """Validate the GUI inputs, build a TableReport for the selected
    dataset and export it as an xlsx or pdf report file."""
    self.button_exec.config(state='disabled')
    LOGGER.info('Checking if the necessary fields are filled in...')
    warningtitle = 'Cannot create report'
    if not self.dname:
        tkmessagebox.showwarning(warningtitle,
                                 'Please, select dataset file')
    #elif not self.d_headers_cbox.get():
    #    tkmessagebox.showwarning(warningtitle,
    #                             'Please, select ColumnID')
    elif self.md_frame.from_disk.get() and not self.md_frame.metafilepath:
        tkmessagebox.showwarning(warningtitle,
                                 'Please, select metadata file')
    elif self.md_frame.from_dc.get() and not self.md_frame.dc_json:
        tkmessagebox.showwarning(warningtitle,
                                 'Could not get metadata from Data Cataloge')
    elif not self.__reportfilepath:
        tkmessagebox.showwarning(warningtitle,
                                 'Please, select report file first')
    else:
        # outlier threshold falls back to 3 on unparsable user input
        try:
            threshold = float(self.outlier_threshold.get())
            LOGGER.info('Outlier threshold: %s'
                        % self.outlier_threshold.get())
        except ValueError:
            LOGGER.warning('Could not retrieve outlier threshold. \
                Setting it to default value: 3')
            threshold = 3
        LOGGER.info('Everything looks ok...')
        #filedir = self.__exportfiledir
        #basename = os.path.splitext(self.dname)[0]
        #pdfreportfile = os.path.join(filedir, basename + '_report.pdf')
        #xlsxreportfile = os.path.join(filedir, basename + '_report.xlsx')
        schema_type = 'qc'
        if self.md_frame.from_disk.get():
            LOGGER.info('Retrieving Metadata from localdisk...')
            LOGGER.info('Using metadata file: %s'
                        % self.md_frame.metafilepath)
            with open(self.md_frame.metafilepath) as json_file:
                dict_schema = json.load(json_file)
            # json_type == 2 means Data Catalogue flavoured json
            if self.md_frame.json_type.get() == 2:
                schema_type = 'dc'
        elif self.md_frame.from_dc.get():
            LOGGER.info('Retrieving Metadata from Data Catalogue...')
            LOGGER.info('Selected pathology is {}, CDE version: {}'.format(
                self.md_frame.selected_pathology.get(),
                self.md_frame.selected_version.get())
            )
            dict_schema = self.md_frame.dc_json
            schema_type = 'dc'
        # NOTE(review): if neither from_disk nor from_dc is set,
        # dict_schema is unbound here and raises NameError — confirm the
        # UI guarantees one of the two checkboxes is always selected.
        try:
            self.reportcsv = TableReport.from_disc(self.datasetpath,
                                                   dict_schema=dict_schema,
                                                   schema_type=schema_type,
                                                   threshold=threshold)#id_column=self.d_headers_cbox.current())
            if self.reportcsv.isvalid:
                LOGGER.info('The dataset is valid.')
            else:
                LOGGER.info('CAUTION! The dataset is invalid!')
            # Perform Data Cleaning?
            #if self.cleaning.get():
            #    self.reportcsv.apply_corrections()
            #self.reportcsv.save_corrected(correctedcsvfile)
            # Create the report
            if self.report_type.get() == 1:
                self.reportcsv.printexcel(self.__reportfilepath)
            else:
                self.reportcsv.printpdf(self.__reportfilepath)
            #self.label_export2.config(text=filedir)
            tkmessagebox.showinfo(
                title='Status info',
                message='Reports have been created successully'
            )
            self.show_sugg_button.config(state='normal')
            self.clean_button.config(state='normal')
        except QCToolException as e:
            errortitle = 'Something went wrong!'
            tkmessagebox.showerror(errortitle, e)
    self.button_exec.config(state='normal')
def __init__(self, dcjson):
    """Build the variable tree structure from a loaded Data Catalogue
    json and translate it to the QC representation."""
    LOGGER.info('Finding variable tree...')
    self.rootnode = Node(dcjson)
    self.__dc2qc()