def __init__(self):
    """Build the validation schema for the CAERS adverse-event spreadsheet.

    Column names are the raw spreadsheet headers; dates are validated via
    ``self.parse_date`` and categorical columns via fixed value lists.
    """
    columns = [
        Column("RA_Report #", [CanConvertValidation(int)]),
        Column("RA_CAERS Created Date", [CanCallValidation(self.parse_date)]),
        # The event start date is allowed to be blank.
        Column(
            "AEC_Event Start Date",
            [CanCallValidation(self.parse_date)],
            allow_empty=True,
        ),
        Column("PRI_Product Role", [InListValidation(["Suspect", "Concomitant"])]),
    ]
    # Free-text columns: presence only, no value validation.
    for unconstrained in (
        "PRI_Reported Brand/Product Name",
        "PRI_FDA Industry Code",
        "PRI_FDA Industry Name",
        "CI_Age at Adverse Event",
    ):
        columns.append(Column(unconstrained))
    columns.append(
        Column(
            "CI_Age Unit",
            [
                InListValidation(
                    ["Year(s)", "Decade(s)", "Month(s)", "Week(s)", "Day(s)"]
                )
            ],
        )
    )
    columns.append(Column("CI_Gender", [InListValidation(["Female", "Male"])]))
    columns.append(Column("AEC_One Row Outcomes"))
    columns.append(Column("SYM_One Row Coded Symptoms"))
    self.schemas = Schema(columns)
def create_schema(self) -> Schema:
    """
    Create Pandas schema with all the necessary validation rules read in from config
    """
    col_list = []
    for column in self.__spreadsheet_def.keys():
        # Every column gets whitespace-hygiene checks by default.
        validators = [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation()
        ]
        mandatory_field_flag = self.__spreadsheet_def.is_mandatory(column)
        # Special cases for checking institutions/countries...
        if column == 'submitting_institution':
            validators.append(
                InListValidation([i.name for i in self.__institutions]))
        # NOTE(review): the 'else' below pairs only with the 'country' test, so
        # the 'submitting_institution' column ALSO falls through into the
        # regex/allowed-values/length branch — confirm whether an 'elif'
        # chain was intended here.
        if column == 'country':
            validators.append(
                InListValidation([i.country for i in self.__institutions]))
        else:
            # Regex validation
            if self.__spreadsheet_def.get_regex(column):
                validators.append(
                    MatchesPatternValidation(
                        self.__spreadsheet_def.get_regex(column),
                        message=self.__spreadsheet_def.
                        get_regex_validation_message(column)))
            # Validate allowed values
            elif self.__spreadsheet_def.get_allowed_values(column):
                validators.append(
                    InListValidation(
                        self.__spreadsheet_def.get_allowed_values(column),
                        case_sensitive=False))
            # Field length validation
            max_len = self.__spreadsheet_def.get_max_length(column)
            if max_len and max_len > 0:
                validators.append(
                    _StringLengthValidation(
                        'field length is greater than {} characters'.
                        format(str(max_len)), max_len))
        # Mandatory field validation: optional columns may be left empty.
        col_list.append(
            Column(self.__spreadsheet_def.get_column_name(column),
                   validators,
                   allow_empty=not mandatory_field_flag))
    return Schema(col_list)
def defSchema():
    """Return the expected Schema for the comment-classification data.

    Two object-dtype text columns plus six binary (0/1) label columns;
    none of the columns may be empty.
    """
    print('Define expected Schema')
    # Free-text columns must hold object-dtype values.
    text_columns = [
        Column(name=field,
               validations=[IsDtypeValidation(np.object_)],
               allow_empty=False)
        for field in ('id', 'comment_text')
    ]
    # Each label column is a binary flag.
    label_columns = [
        Column(name=label,
               validations=[InListValidation([0, 1])],
               allow_empty=False)
        for label in ('toxic', 'severe_toxic', 'obscene',
                      'threat', 'insult', 'identity_hate')
    ]
    return Schema(text_columns + label_columns)
def __init__(self):
    """Create the customer-record validation schema."""
    def stripped(label):
        # Reject values carrying leading or trailing whitespace.
        return Column(
            label,
            [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
        )

    self.schemas = Schema([
        stripped("Given Name"),
        stripped("Family Name"),
        Column("Age", [InRangeValidation(0, 120)]),
        Column("Sex", [InListValidation(["Male", "Female", "Other"])]),
        # Four digits followed by four uppercase letters.
        Column("Customer ID", [MatchesPatternValidation(r"\d{4}[A-Z]{4}")]),
    ])
def _validate(self, diagnosis_df):
    """Validate the diagnosis DataFrame and abort the process on failure.

    :param diagnosis_df: DataFrame with 'visit_dt', 'sex' and 'icd10' columns
    :raises SystemExit: with status 1 when any validation error is found;
        every error is logged first via ``self.Logger``.
    """
    schema = Schema([
        # Timestamps like '2020-01-31 12:45:00' (seconds fixed at :00).
        Column('visit_dt', [
            MatchesPatternValidation(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:00$')
        ]),
        Column('sex', [InListValidation(['M', 'K'])]),
        # BUG FIX: the '.' was unescaped and therefore matched ANY character;
        # escape it so only a literal decimal point (as in 'C50.1') passes.
        Column('icd10', [
            MatchesPatternValidation(r'^[CDIJKMNRZ]{1}\d{1,2}\.?\d{0,2}$')
        ])
    ])
    errors = schema.validate(diagnosis_df)
    for error in errors:
        self.Logger.error(error)
    if errors:
        # Was exit(): meant for interactive use and exits with status 0.
        # Raise SystemExit(1) so callers and the shell see the failure.
        raise SystemExit(1)
def __init__(self):
    """Build the payment-document validation schema.

    Most columns are unconstrained; text columns are checked for stray
    whitespace, dates/status against a fixed format or value list.
    """
    def _trimmed(label):
        # Column whose values may not have leading/trailing whitespace.
        return Column(
            label,
            [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
        )

    self.schemas = Schema([
        Column("id"),
        _trimmed("payer_name"),
        Column("document_amount"),
        Column("payed_amount"),
        Column("payer_id_number"),
        _trimmed("payer_address"),
        Column("barcode"),
        Column("typable_line"),
        Column("number"),
        _trimmed("document_number"),
        # Dates like 12/31/20 (month/day/two-digit year).
        Column("due_date", [DateFormatValidation("%m/%d/%y")]),
        _trimmed("city"),
        _trimmed("state"),
        Column("zip_code"),
        Column("bank_answer_date"),
        Column("pdf_upload_date"),
        Column("status", [InListValidation(["pending", "paid", "due", "error"])]),
        Column("callback"),
        Column("object_id"),
        Column("extra"),
    ])
# Map from shorthand genome-build numbers to full assembly names.
BUILD_MAP = {'28': 'NCBI28', '29': 'NCBI29', '30': 'NCBI30', '31': 'NCBI31',
             '33': 'NCBI33', '34': 'NCBI34', '35': 'NCBI35', '36': 'NCBI36',
             '37': 'GRCh37', '38': 'GRCh38'}

# Accepted input file extensions (plain and gzip-compressed variants).
VALID_FILE_EXTENSIONS = [".txt", ".tsv", ".csv", ".tsv.gz", ".csv.gz", "gz",
                         "gzip", ".tsv.gzip", ".csv.gzip"]

# Baseline per-dataset column validators; most columns tolerate empty values.
# (SNP_DSET, DSET_TYPES, VALID_CHROMOSOMES, etc. are defined elsewhere in
# this module — presumably dataset-name constants; confirm against the
# surrounding file.)
GENERIC_VALIDATORS = {
    # rsIDs such as rs123456.
    SNP_DSET: Column(SNP_DSET, [CanConvertValidation(DSET_TYPES[SNP_DSET]), MatchesPatternValidation(r'^rs[0-9]+$')], allow_empty=True),
    CHR_DSET: Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=True),
    # Base-pair position: positive and within a 9-digit bound.
    BP_DSET: Column(BP_DSET, [CanConvertValidation(DSET_TYPES[BP_DSET]), InInclusiveRangeValidation(1, 999999999)], allow_empty=True),
    EFFECT_WEIGHT_DSET: Column(EFFECT_WEIGHT_DSET, [CanConvertValidation(DSET_TYPES[EFFECT_WEIGHT_DSET])], allow_empty=True),
    OR_DSET: Column(OR_DSET, [CanConvertValidation(DSET_TYPES[OR_DSET])], allow_empty=True),
    HR_DSET: Column(HR_DSET, [CanConvertValidation(DSET_TYPES[HR_DSET])], allow_empty=True),
    BETA_DSET: Column(BETA_DSET, [CanConvertValidation(DSET_TYPES[BETA_DSET])], allow_empty=True),
    # Effect allele: one or more of A/C/T/G/N; required (non-empty).
    EFFECT_DSET: Column(EFFECT_DSET, [MatchesPatternValidation(r'^[ACTGN]+$')], allow_empty=False),
    REF_DSET: Column(REF_DSET, [MatchesPatternValidation(r'^[ACTGN]+$')], allow_empty=True),
    FREQ_DSET: Column(FREQ_DSET, [CanConvertValidation(DSET_TYPES[FREQ_DSET])], allow_empty=True),
    LOCUS_DSET: Column(LOCUS_DSET, [CanConvertValidation(DSET_TYPES[LOCUS_DSET]), LeadingWhitespaceValidation(), TrailingWhitespaceValidation()], allow_empty=True)
}

# Variant of the generic validators where the SNP column is mandatory.
SNP_VALIDATORS = {k:v for k,v in GENERIC_VALIDATORS.items()}
SNP_VALIDATORS[SNP_DSET] = Column(SNP_DSET, [CanConvertValidation(DSET_TYPES[SNP_DSET]), MatchesPatternValidation(r'^rs[0-9]+$')], allow_empty=False)

# Copy of the generic validators (SNP column may remain empty).
SNP_EMPTY_VALIDATORS = {k:v for k,v in GENERIC_VALIDATORS.items()}
def clean(spreadsheet, rules, tempfolder, progress=None):
    """Validate *spreadsheet* against *rules* and write an annotated workbook.

    Loads the input as CSV (falling back to Excel), validates each column's
    values against the rule's allowed-value set, then writes an .xlsx copy to
    *tempfolder* where invalid entries are either auto-corrected (when the
    rule provides a mapping) or highlighted and commented for manual review.
    Rows without flagged entries are hidden in the output sheet.

    :param spreadsheet: path or buffer of the input CSV/Excel file
    :param rules: mapping column name -> dict with 'allowed_value_set' and
        'autocorrect_dict'
    :param tempfolder: directory where the annotated workbook is written
    :param progress: optional progress-bar-like widget (config()/step())
    :return: SpreadCheck results; ``results.spreadsheet`` is None on failure
    """
    if not os.path.exists(tempfolder):
        os.makedirs(tempfolder)
    filepath = tempfolder + '/temporary_file.xlsx'
    results = SpreadCheck(filepath)

    # ---- load spreadsheet into a DataFrame ----
    try:
        df = pd.read_csv(spreadsheet)
    except Exception:  # narrowed from bare 'except:' (don't catch SystemExit etc.)
        try:
            df = pd.read_excel(spreadsheet)
        except Exception:
            results.spreadsheet = None
            results.errors.append("ERROR: Unaccepted File Format")
            return results

    # ---- schema validation ----
    schemas_list = []
    df.fillna("NULL", inplace=True)
    for field, rule in rules.items():
        schemas_list.append(
            Column(field, [InListValidation(list(rule['allowed_value_set']))]))
    schema = Schema(schemas_list)
    errors = schema.validate(df, columns=schema.get_column_names())

    # reset progress bar
    if progress:
        progress.config(value=0)
    # increment progress bar #1
    if progress:
        progress.step(20)

    # ---- build output sheet with formatting ----
    writer = pd.ExcelWriter(filepath, engine='xlsxwriter')
    # Skip row 1 headers so we can add manually with formatting
    df.to_excel(writer,
                sheet_name='Sheet1',
                startrow=1,
                header=False,
                index=False)
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']

    ### WORKBOOK FORMATS ###
    yellow_highlight = workbook.add_format({'bg_color': '#FFEB9C'})
    header = workbook.add_format({
        'bold': True,
        'text_wrap': True,
        'valign': 'top',
        'fg_color': '#D7E4BC',
        'border': 1
    })

    # increment progress bar #2
    if progress:
        progress.step(20)

    # Set column widths
    worksheet.set_column(0, len(df.columns) - 1, 22)
    worksheet.set_default_row(hide_unused_rows=True)

    # Write the column headers with the defined format.
    for col_num, value in enumerate(df.columns.values):
        worksheet.write(0, col_num, value, header)

    # increment progress bar #3
    if progress:
        progress.step(20)

    flag_rows = []
    df_length = len(df)
    for error in errors:
        # Catch Key errors, like rules_file / data_file column name mismatch
        try:
            row = error.row + 1  # +1: row 0 holds the header in the worksheet
            column = df.columns.get_loc(error.column)
        except Exception:
            results.spreadsheet = None
            results.errors.append("ERROR: " + str(error))
            return results
        try:
            # If an autocorrect mapping exists for value, replace it.
            if error.value in rules[error.column]['autocorrect_dict'].keys():
                worksheet.write(
                    row, column,
                    rules[error.column]['autocorrect_dict'][error.value])
                results.corrections += 1
            else:
                # No autocorrect mapping: highlight and annotate the entry.
                flag_rows.append(error.row)
                worksheet.write_comment(row, column, error.message)
                worksheet.conditional_format(row, column, row, column, {
                    'type': 'no_errors',
                    'format': yellow_highlight
                })
                results.flags += 1
        except Exception as err:
            results.spreadsheet = None
            # BUG FIX: was "ERROR: " + err, which raises TypeError because an
            # exception instance cannot be concatenated to str.
            results.errors.append("ERROR: " + str(err))
            return results

    # increment progress bar #4
    if progress:
        progress.step(20)

    # Hide rows that don't contain annotations
    for i in range(df_length + 1):
        if i not in flag_rows:
            worksheet.set_row(i + 1, None, None, {'hidden': True})

    # increment progress bar #5
    if progress:
        progress.step(20)

    writer.save()
    results.messages.append("SUCCESS:")
    results.messages.append("{} entries corrected".format(results.corrections))
    results.messages.append("{} entries flagged for review".format(
        results.flags))
    return results
from collections import defaultdict
import re

from pandas_schema import Column, Schema
from pandas_schema.validation import (LeadingWhitespaceValidation,
                                      TrailingWhitespaceValidation,
                                      CanConvertValidation,
                                      MatchesPatternValidation,
                                      CustomSeriesValidation,
                                      InRangeValidation, InListValidation,
                                      DateFormatValidation)

# Validation schema for study metadata rows.
# Conventions used below:
#   * required fields: non-null AND not the empty string
#   * optional fields: empty string OR a value matching the given pattern/list
study_schema = Schema([
    Column('study_id', [CustomSeriesValidation(lambda x: ~x.isnull(),
                                               'A value is required for the study_id column.') &
                        ~InListValidation([''])]),
    Column('pi_name', [CustomSeriesValidation(lambda x: ~x.isnull(),
                                              'A value is required for the pi_name column.') &
                       ~InListValidation([''])]),
    Column('sample_type', [InListValidation(['wmgx', 'wmtx', '16S', 'other'])]),
    # Optional; must look like an accession (PRJ + word chars + digits).
    Column('bioproject_accession', [InListValidation(['']) |
                                    MatchesPatternValidation(r'PRJ\w+\d+')]),
    # Optional; colon-separated triple, e.g. country:region:locality.
    Column('geo_loc_name', [InListValidation(['']) |
                            MatchesPatternValidation(r'\w+:\w+:\w+')]),
    Column('analysis_desc', [InListValidation(['']) | CanConvertValidation(str)]),
    Column('sequencing_facility', [LeadingWhitespaceValidation()]),
    # Optional ontology-style terms of the form ENVO:<digits>.
    Column('env_biom', [MatchesPatternValidation(r'ENVO:\d+') |
                        InListValidation([''])]),
    Column('env_feature', [MatchesPatternValidation(r'ENVO:\d+') |
                           InListValidation([''])]),
    Column('env_material', [MatchesPatternValidation(r'ENVO:\d+') |
                            InListValidation([''])]),
    # Optional ontology-style term of the form BTO:<digits>.
    Column('host_tissue_sampled', [InListValidation(['']) |
                                   MatchesPatternValidation(r'BTO:\d+')]),
    Column('animal_vendor', [LeadingWhitespaceValidation()]),
    Column('paired', [InListValidation(['true', 'false'])]),
    Column('paired_id', [InListValidation(['']) |
                         MatchesPatternValidation(r'[a-zA-Z0-9_.]+')]),
    Column('pi_email', [CustomSeriesValidation(lambda x: ~x.isnull(),
                                               'A value is required for the pi_email column.') &
                        ~InListValidation([''])])
])
import pandas as pd
from pandas_schema import Column, Schema
from pandas_schema.validation import LeadingWhitespaceValidation, TrailingWhitespaceValidation, InRangeValidation, \
    DateFormatValidation, InListValidation

# Expected layout of the fixed-width input records.
schema = Schema([
    Column('name', [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()]),
    Column('title', [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()]),
    Column('salary', [InRangeValidation(0, 33000)]),
    Column('sex', [InListValidation(['F', 'M'])]),
    Column('date', [DateFormatValidation('%Y-%m-%d')])
])

# Character widths of the fixed-width fields, in column order.
widths = [
    9,   # name
    19,  # title
    6,   # salary
    4,   # sex
    11,  # date
]

# read source data
test_data = pd.read_fwf("data/fixed_width.txt", widths=widths)
print('orig dataset')
print(test_data)

# data verification
# Schema for the publication-tracking spreadsheet: most columns are
# free-form (no validators); only the date and the two yes/no flags
# are actually validated.
schema = Schema([
    Column('ID', []),
    Column('Title', []),
    Column('Authors', []),
    Column('Publication Venue', []),
    Column('Publication Date', [DateFormatValidation("%Y-%m-%d")]),
    Column('Abstract', []),
    Column('Link to PDF', []),
    Column('Type', []),
    Column('Keywords', []),
    Column('Tags', []),
    Column('Access', []),
    Column('Area', []),
    Column('Included in map', [InListValidation(["yes", "no"])]),
    Column('Ready for publication?', [InListValidation(["yes", "no"])]),
    Column('Validation', []),
    Column('Comment 1', []),
    Column('Author Comment 1', []),
    Column('Comment 2', []),
    Column('Author Comment 2', []),
    Column('Comment 3', []),
    Column('Author Comment 3', []),
    Column('Comment 4', []),
    Column('Author Comment 4', [])
])


def process_comments(row):
    # Convert the pandas row to a plain list for positional access.
    row = row.tolist()
    # NOTE(review): this function's definition continues beyond this excerpt.
import pandas as pd
from io import StringIO
from pandas_schema import Column, Schema
from pandas_schema.validation import LeadingWhitespaceValidation, TrailingWhitespaceValidation, CanConvertValidation, MatchesPatternValidation, InRangeValidation, InListValidation

# Schema for the sample customer data: names must carry no surrounding
# whitespace, age is range-checked, sex is one of three values, and the
# customer ID is four digits followed by four uppercase letters.
schema = Schema([
    Column('Given Name', [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()]),
    Column('Family Name', [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()]),
    Column('Age', [InRangeValidation(0, 120)]),
    Column('Sex', [InListValidation(['Male', 'Female', 'Other'])]),
    Column('Customer ID', [MatchesPatternValidation(r'\d{4}[A-Z]{4}')])
])

# Inline CSV fixture; several cells deliberately violate the schema
# (stray spaces, out-of-range age, lowercase sex, malformed ID).
test_data = pd.read_csv(
    StringIO('''Given Name,Family Name,Age,Sex,Customer ID
Gerald ,Hampton,82,Male,2582GABK
Yuuwa,Miyake,270,male,7951WVLW
Edyta,Majewska ,50,Female,775ANSID
'''))

# Run validation and print every failure.
errors = schema.validate(test_data)

for error in errors:
    print(error)
import time import pandas as pd from pandas_schema import Column, Schema from pandas_schema.validation import DateFormatValidation, MatchesPatternValidation, InListValidation pattern_id = r'^-?\d{1,16}$' # Number / integer - up to 16 pattern_dec = r'^-?\d*\.\d{1,2}$' pattern_geo = r'^-?\d*\.\d{1,20}$' # geo location / decimal with up to 18 decimal place pattern_date = r'%Y-%m-%d %H:%M:%S' # Timestamp yyyy-MM-dd HH:mm:ss (in Zulu/UTC time zone) e.g. 2017-07-01 00:00:07 taxiRide_schema = Schema([ Column('rideId', [MatchesPatternValidation(pattern_id)]), Column('isStart', [InListValidation(['START', 'END'])]), Column('endTime', [DateFormatValidation(pattern_date)]), Column('startTime', [DateFormatValidation(pattern_date)]), Column('startLon', [MatchesPatternValidation(pattern_geo)]), Column('startLat', [MatchesPatternValidation(pattern_geo)]), Column('endLon', [MatchesPatternValidation(pattern_geo)]), Column('endLat', [MatchesPatternValidation(pattern_geo)]), Column('passengerCnt', [MatchesPatternValidation(pattern_id)]) ], ordered=True) taxiFare_schema = Schema([ Column('rideId', [MatchesPatternValidation(pattern_id)]), Column('taxiId', [MatchesPatternValidation(pattern_id)]), Column('driverId', [MatchesPatternValidation(pattern_id)]), Column('startTime', [DateFormatValidation(pattern_date)]), Column('paymentType', [InListValidation(['CSH', 'CRD', 'NOC', 'DIS', 'UNK'])]), Column('tip', [MatchesPatternValidation(pattern_dec)]),
'atrialfib', # ATCDs 'stroke_pre', 'tia_pre', 'ich_pre', # Treatment 'treat_antipatelet', 'treat_anticoagulant', 'ivt_start', # needed to check if started before imaging 'treat_ivt', 'iat_start', # needed to check if started before imaging ] validation_schema = Schema([ Column('age', [InRangeValidation(1, 120)]), Column('sex', [InListValidation(['m', 'f'])]), Column('height', [InRangeValidation(50, 300)]), Column('weight', [InRangeValidation(10, 400)]), Column('onset_known', [InListValidation(['yes', 'no', 'wake_up'])]), Column('Firstimage_date', [DateFormatValidation('%Y-%m-%d %H:%M:%S')]), Column('onset_time', [DateFormatValidation('%Y-%m-%d %H:%M:%S')]), Column('NIH admission', [InRangeValidation(0, 43)]), Column('bp_syst', [InRangeValidation(0, 300)]), Column('bp_diast', [InRangeValidation(0, 300)]), Column('glucose', [InRangeValidation(0.1, 30)]), Column('créatinine', [InRangeValidation(0.1, 1000)]), Column('hypertension', [InListValidation(['yes', 'no'])]), Column('diabetes', [InListValidation(['yes', 'no'])]), Column('hyperlipidemia', [InListValidation(['yes', 'no'])]), Column('smoking', [InListValidation(['yes', 'no'])]), Column('atrialfib', [InListValidation(['yes', 'no'])]),
def validate(df):
    """Validate the hospital/customer DataFrame and collect error messages.

    Returns a dict mapping a (translated) human-readable warning string to
    the literal '<br>' separator, merged with the cross-column consistency
    checks performed by ``check_inconsist`` and ``check_hplevel_with_dept``.
    """
    d_error = {}
    # Allowed values are the first element of each Django-style choices pair.
    list_bu = [x[0] for x in BU_CHOICES]
    list_rd = [x[0] for x in RD_CHOICES]
    list_dept = [x[0] for x in DEPT_CHOICES]
    list_hplevel = [x[0] for x in HPLEVEL_CHOICES]
    list_province = [x[0] for x in PROVINCE_CHOICES]
    list_title = [x[0] for x in TITLE_CHOICES]

    # Rejects NaN cells. NOTE(review): relies on identity with the np.nan
    # singleton — confirm upstream cells always carry that exact object.
    NullValidation = CustomElementValidation(lambda d: d is not np.nan,
                                             "该字段不能为空")
    schema = Schema([
        Column("南北中国", [InListValidation(list_bu)]),
        Column("区域", [InListValidation(list_rd)]),
        Column("大区", [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(), NullValidation
        ]),
        Column("地区经理", [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(), NullValidation
        ]),
        Column("负责代表", [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(), NullValidation
        ]),
        # Hospital code: 'H' followed by exactly nine digits.
        Column(
            "医院编码",
            [
                LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation(),
                NullValidation,
                MatchesPatternValidation(r"^[H]{1}(\d){9}$"),
            ],
        ),
        Column("医院全称", [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(), NullValidation
        ]),
        Column("省/自治区/直辖市", [InListValidation(list_province)]),
        Column("是否双call", [InListValidation(["是", "否"])]),
        Column("医院级别", [InListValidation(list_hplevel)]),
        Column("开户进展", [InListValidation(["已开户", "未开户"])]),
        # Customer name must additionally be unique across rows.
        Column("客户姓名", [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(),
            IsDistinctValidation()
        ]),
        Column("所在科室", [InListValidation(list_dept)]),
        Column("职称", [InListValidation(list_title)]),
        Column("月出诊次数(半天计)",
               [CanConvertValidation(int),
                InRangeValidation(0, 63)]),
        Column("每半天\n门诊量", [CanConvertValidation(int),
                             InRangeValidation(0, )]),
        Column("相关病人\n比例(%)\n建议比例:40%-80%",
               [CanConvertValidation(int),
                InRangeValidation(0, 101)]),
        Column("备注"),
    ])

    # Validate only the configured columns, then translate each message
    # via D_TRANSLATE and rewrite ': <row>' via row_refined (defined elsewhere).
    errors = schema.validate(df.loc[:, COL])
    for error in errors:
        str_warning = str(error)
        for term in D_TRANSLATE:
            str_warning = str_warning.replace(term, D_TRANSLATE[term])
        findword = r": [0-9]\d*"
        str_warning = re.sub(findword, row_refined, str_warning)
        d_error[str_warning] = "<br>"

    # Cross-column consistency checks (e.g. code <-> name must agree).
    d_error = {**d_error,
               **check_inconsist(df, "医院编码", "医院全称", "both")}
    d_error = {**d_error, **check_inconsist(df, "区域", "大区", "right")}
    d_error = {**d_error, **check_inconsist(df, "大区", "地区经理", "right")}
    d_error = {**d_error, **check_inconsist(df, "地区经理", "负责代表", "right")}
    d_error = {**d_error,
               **check_inconsist(df, "医院编码", "省/自治区/直辖市", "left")}
    d_error = {**d_error, **check_inconsist(df, "医院编码", "是否双call", "left")}
    d_error = {**d_error, **check_inconsist(df, "医院编码", "医院级别", "left")}
    d_error = {**d_error, **check_inconsist(df, "医院编码", "开户进展", "left")}
    d_error = {**d_error,
               **check_inconsist(df, "医院全称", "省/自治区/直辖市", "left")}
    d_error = {**d_error, **check_inconsist(df, "医院全称", "是否双call", "left")}
    d_error = {**d_error, **check_inconsist(df, "医院全称", "医院级别", "left")}
    d_error = {**d_error, **check_inconsist(df, "医院全称", "开户进展", "left")}
    # Check whether hospital level conflicts with the department.
    d_error = {**d_error, **check_hplevel_with_dept(df)}

    return d_error
Column("Chr15", [CanConvertValidation(float)]), Column("Chr16", [CanConvertValidation(float)]), Column("Chr17", [CanConvertValidation(float)]), Column("Chr18", [CanConvertValidation(float)]), Column("Chr19", [CanConvertValidation(float)]), Column("Chr20", [CanConvertValidation(float)]), Column("Chr21", [CanConvertValidation(float)]), Column("Chr22", [CanConvertValidation(float)]), Column("ChrX", [CanConvertValidation(float)]), Column("ChrY", [CanConvertValidation(float)]), Column("FF_Formatted", [CanConvertValidation(float)]), Column("FFY", [CanConvertValidation(float)]), Column("FFX", [CanConvertValidation(float)]), Column("DuplicationRate", [CanConvertValidation(float)]), Column("Bin2BinVariance", [CanConvertValidation(float)]), Column("UnfilteredCNVcalls", [CanConvertValidation(float)]), Column("CNVSegment", [InListValidation(["Found"])]), Column("Flowcell", []), Column("SequencingDate", []), Column("Median_13", [CanConvertValidation(float)]), Column("Median_18", [CanConvertValidation(float)]), Column("Median_21", [CanConvertValidation(float)]), Column("Median_X", [CanConvertValidation(float)]), Column("Median_Y", [CanConvertValidation(float)]), Column("Stdev_13", [CanConvertValidation(float)]), Column("Stdev_18", [CanConvertValidation(float)]), Column("Stdev_21", [CanConvertValidation(float)]), Column("Stdev_X", [CanConvertValidation(float)]), Column("Stdev_Y", [CanConvertValidation(float)]), ])
def main():
    """Check a BIDS-style dataset: participants.tsv vs. subject folders,
    missing JSON sidecars, and the contents of participants.tsv."""
    # Parse input arguments
    parser = get_parser()
    args = parser.parse_args()
    data_path = args.path_in
    path_tsv = os.path.join(data_path, 'participants.tsv')
    tsv_file = pd.read_csv(path_tsv, sep='\t')
    # Subject folders are directories named 'sub*'.
    list_subj = [
        name for name in os.listdir(data_path)
        if os.path.isdir(os.path.join(data_path, name))
        and name.startswith('sub')
    ]
    df = pd.DataFrame(tsv_file)
    list_tsv_participants = df['participant_id'].tolist()
    # Cross-check folders against the TSV in both directions.
    missing_subjects_tsv = list(set(list_subj) - set(list_tsv_participants))
    missing_subjects_folder = list(set(list_tsv_participants) - set(list_subj))

    if missing_subjects_tsv:
        # print ('Warning missing following subjects from participants.tsv : %s' %missing_subjects_tsv)
        print('\nWarning missing following subjects from participants.tsv: ')
        missing_subjects_tsv.sort()
        pprint(missing_subjects_tsv)

    if missing_subjects_folder:
        # print ('\nWarning missing data for subjects listed in participants.tsv : %s' %missing_subjects_folder)
        print(
            '\nWarning missing data for subjects listed in participants.tsv: ')
        missing_subjects_folder.sort()
        pprint(missing_subjects_folder)

    # Every .nii.gz image should have a matching .json sidecar next to it.
    for dirName, subdirList, fileList in os.walk(data_path):
        for file in fileList:
            if file.endswith('.nii.gz'):
                originalFilePath = os.path.join(dirName, file)
                # split(".")[0] strips everything after the first dot.
                jsonSidecarPath = os.path.join(dirName,
                                               file.split(".")[0] + '.json')
                if os.path.exists(jsonSidecarPath) == False:
                    print("Missing jsonSidecar: " + jsonSidecarPath)

    # Checking participants.tsv contents
    # height/weight/date_of_scan accept '-' as a "not available" marker.
    schema = Schema([
        Column('participant_id',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('sex', [InListValidation(['M', 'F'])]),
        Column('age', [InRangeValidation(18, 60)]),
        Column('height', [MatchesPatternValidation(r"[0-9]|-")]),
        Column('weight', [MatchesPatternValidation(r"[0-9]|-")]),
        Column('date_of_scan', [
            DateFormatValidation('%Y-%m-%d') | MatchesPatternValidation(r"-")
        ]),
        Column('institution_id',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('institution',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('manufacturer',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('manufacturers_model_name',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('receive_coil_name',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('software_versions',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('researcher',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
    ])
    errors = schema.validate(tsv_file)
    print('\nChecking the contents of participants.tsv')
    if not errors:
        print("--> all good 👍")
    else:
        for error in errors:
            print(error)
Column(PVAL_DSET, [ CanConvertValidation(DSET_TYPES[PVAL_DSET]), InInclusiveRangeValidation(0, 1) ], allow_empty=False), BETA_DSET: Column(BETA_DSET, [CanConvertValidation(float)], allow_empty=True), SNP_DSET: Column(SNP_DSET, [ CanConvertValidation(DSET_TYPES[SNP_DSET]), MatchesPatternValidation( r'^chr[0-9XY]+_[0-9]+_[ACTGNactgn]+_[ACTGNactgn]+|LONG_STRING$') ], allow_empty=True), CHR_DSET: Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=False), BP_DSET: Column(BP_DSET, [ CanConvertValidation(DSET_TYPES[BP_DSET]), InInclusiveRangeValidation(1, 999999999) ], allow_empty=False), EFFECT_DSET: Column(EFFECT_DSET, [MatchesPatternValidation(r'^[ACTGNactgn]+|LONG_STRING$')], allow_empty=True), OTHER_DSET: Column(OTHER_DSET, [MatchesPatternValidation(r'^[ACTGNactgn]+|LONG_STRING$')], allow_empty=True), FREQ_DSET:
def validate_and_annotate(self, file_path=''):
    """Validate hospital data and write an annotated workbook for review.

    Loads *file_path* as CSV (falling back to Excel), validates the two
    plan columns, then writes an .xlsx copy under ``self.dirname/for_review``
    in which each invalid cell is commented and highlighted and error-free
    rows are hidden. Ends by popping up ``self.review_changes()``.

    :param file_path: input file; when empty, the previously written
        intermediate workbook is re-validated.
    """
    # FIX ME: this is a really janky way to do this. Passing empty string for
    # file_path is a temp fix for the review_changes re-validate button
    # causing an infinite loop when calling this with an argument :(
    if not file_path:
        file_path = self.dirname + '/for_review/intermediate_hospital_data.xlsx'

    try:
        df = pd.read_csv(file_path)
    except Exception:  # narrowed from bare 'except:'
        try:
            df = pd.read_excel(file_path)
        except Exception:
            print("UNACCEPTED FILE FORMAT")
            # BUG FIX: previously fell through with df unbound, so the
            # df.fillna call below raised UnboundLocalError. Bail out instead.
            return

    df.fillna("NULL", inplace=True)

    # ---- schema validation ----
    schema = Schema([
        Column('N95PlanFitTested', [InListValidation(['Y', 'N', 'NULL'])]),
        Column('PARPsPlanTrained', [InListValidation(['Y', 'N', 'NULL'])]),
    ])
    errors = schema.validate(df, columns=schema.get_column_names())

    # ---- build excel worksheet with formatting ----
    save_path2 = self.dirname + '/for_review/'
    if not os.path.exists(save_path2):
        os.makedirs(save_path2)

    writer = pd.ExcelWriter(save_path2 + 'intermediate_hospital_data.xlsx',
                            engine='xlsxwriter')
    # Skip row 1 headers so we can add manually with formatting
    df.to_excel(writer,
                sheet_name='Sheet1',
                startrow=1,
                header=False,
                index=False)
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']

    ### WORKBOOK FORMATS ###
    yellow_highlight = workbook.add_format({'bg_color': '#FFEB9C'})
    header = workbook.add_format({
        'bold': True,
        'text_wrap': True,
        'valign': 'top',
        'fg_color': '#D7E4BC',
        'border': 1
    })

    # Set column widths
    worksheet.set_column('A:II', 30)
    worksheet.set_default_row(hide_unused_rows=True)

    # Write the column headers with the defined format.
    for col_num, value in enumerate(df.columns.values):
        worksheet.write(0, col_num, value, header)

    # Store error row numbers while iterating the error objects;
    # used afterwards for hiding clean rows.
    error_rows = []
    df_length = len(df)
    for error in errors:
        error_rows.append(error.row)
        row = error.row + 1  # +1: row 0 holds the header in the worksheet
        column = df.columns.get_loc(error.column)
        # Comments
        worksheet.write_comment(row, column, error.message)
        # Highlights
        worksheet.conditional_format(row, column, row, column, {
            'type': 'no_errors',
            'format': yellow_highlight
        })

    # Hide rows that don't contain errors
    for i in range(df_length + 1):
        if i not in error_rows:
            worksheet.set_row(i + 1, None, None, {'hidden': True})

    writer.save()

    # Pop up
    self.review_changes()
'smoking', 'atrialfib', # ATCDs 'stroke_pre', 'tia_pre', 'ich_pre', # general Treatment 'treat_antipatelet', 'treat_anticoagulant', # acute treatment 'treat_ivt_before_ct', # ie ivt was started before imaging ] validation_schema = Schema([ Column('age', [InRangeValidation(1, 120)]), Column('sex', [InListValidation([0, 1])]), Column('height', [InRangeValidation(50, 300)]), Column('weight', [InRangeValidation(10, 400)]), Column('onset_known_yes', [InListValidation([0, 1])]), Column('onset_known_wake_up', [InListValidation([0, 1])]), Column('onset_to_ct', [InRangeValidation(0, 43800)]), # onset to CT time max is 1 month Column('NIH admission', [InRangeValidation(0, 43)]), Column('bp_syst', [InRangeValidation(0, 300)]), Column('bp_diast', [InRangeValidation(0, 300)]), Column('glucose', [InRangeValidation(0.1, 30)]), Column('créatinine', [InRangeValidation(0.1, 1000)]), Column('hypertension', [InListValidation([0, 1])]), Column('diabetes', [InListValidation([0, 1])]), Column('hyperlipidemia', [InListValidation([0, 1])]), Column('smoking', [InListValidation([0, 1])]),