def __init__(self):
    """Build the column schema and store it on ``self.schemas``.

    Columns are declared in a fixed order. Most carry no validation; the
    free-text columns reject leading/trailing whitespace, ``due_date`` must
    parse with ``%m/%d/%y`` and ``status`` must be one of a fixed set.
    """

    def unchecked(name):
        # Column that only needs to be present; no validations attached.
        return Column(name)

    def trimmed(name):
        # Column whose values may not start or end with whitespace.
        return Column(
            name,
            [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
        )

    allowed_statuses = ["pending", "paid", "due", "error"]

    columns = [
        unchecked("id"),
        trimmed("payer_name"),
        unchecked("document_amount"),
        unchecked("payed_amount"),
        unchecked("payer_id_number"),
        trimmed("payer_address"),
        unchecked("barcode"),
        unchecked("typable_line"),
        unchecked("number"),
        trimmed("document_number"),
        Column("due_date", [DateFormatValidation("%m/%d/%y")]),
        trimmed("city"),
        trimmed("state"),
        unchecked("zip_code"),
        unchecked("bank_answer_date"),
        unchecked("pdf_upload_date"),
        Column("status", [InListValidation(allowed_statuses)]),
        unchecked("callback"),
        unchecked("object_id"),
        unchecked("extra"),
    ]
    self.schemas = Schema(columns)
import pandas as pd
from pandas_schema import Column, Schema
from pandas_schema.validation import (
    LeadingWhitespaceValidation,
    TrailingWhitespaceValidation,
    InRangeValidation,
    DateFormatValidation,
    InListValidation,
)

# Validation rules for the fixed-width sample file. The two text columns get
# identical whitespace checks, so they are generated from their names; the
# remaining columns each have their own rule.
schema = Schema(
    [
        Column(field, [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()])
        for field in ('name', 'title')
    ]
    + [
        Column('salary', [InRangeValidation(0, 33000)]),
        Column('sex', [InListValidation(['F', 'M'])]),
        Column('date', [DateFormatValidation('%Y-%m-%d')]),
    ]
)

# Field widths passed to read_fwf, in file order: name, title, salary, sex, date.
widths = [9, 19, 6, 4, 11]

# read source data
test_data = pd.read_fwf("data/fixed_width.txt", widths=widths)
print('orig dataset')
print(test_data)

# data verification
def main():
    """Run consistency checks on a dataset folder and its participants.tsv.

    The folder is taken from the ``path_in`` command-line argument. Three
    checks run in order and every finding is printed to stdout:

    1. ``sub*`` directories without a row in ``participants.tsv``, and rows
       without a matching directory.
    2. ``.nii.gz`` files that have no ``.json`` sidecar next to them.
    3. Column-level validation of ``participants.tsv`` against a schema.
    """
    # Parse input arguments
    parser = get_parser()
    args = parser.parse_args()
    data_path = args.path_in

    path_tsv = os.path.join(data_path, 'participants.tsv')
    tsv_file = pd.read_csv(path_tsv, sep='\t')

    # --- 1. Subject folders vs. participants.tsv -------------------------
    subjects_on_disk = {
        name for name in os.listdir(data_path)
        if name.startswith('sub')
        and os.path.isdir(os.path.join(data_path, name))
    }
    subjects_in_tsv = set(tsv_file['participant_id'].tolist())

    missing_subjects_tsv = list(subjects_on_disk - subjects_in_tsv)
    missing_subjects_folder = list(subjects_in_tsv - subjects_on_disk)

    if missing_subjects_tsv:
        print('\nWarning missing following subjects from participants.tsv: ')
        missing_subjects_tsv.sort()
        pprint(missing_subjects_tsv)
    if missing_subjects_folder:
        print(
            '\nWarning missing data for subjects listed in participants.tsv: ')
        missing_subjects_folder.sort()
        pprint(missing_subjects_folder)

    # --- 2. JSON sidecars ------------------------------------------------
    # Strip the full '.nii.gz' suffix instead of cutting at the first dot,
    # so a file name that contains a dot elsewhere (e.g. 'acq-0.5mm') still
    # maps to the right sidecar path.
    image_suffix = '.nii.gz'
    for dir_name, _subdirs, file_names in os.walk(data_path):
        for file_name in file_names:
            if not file_name.endswith(image_suffix):
                continue
            stem = file_name[:-len(image_suffix)]
            json_sidecar_path = os.path.join(dir_name, stem + '.json')
            if not os.path.exists(json_sidecar_path):
                print("Missing jsonSidecar: " + json_sidecar_path)

    # --- 3. participants.tsv contents ------------------------------------
    def trimmed(name):
        # Free-text column: values may not start or end with whitespace.
        return Column(
            name,
            [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()])

    schema = Schema([
        trimmed('participant_id'),
        Column('sex', [InListValidation(['M', 'F'])]),
        Column('age', [InRangeValidation(18, 60)]),
        # Height/weight: a digit, or '-' as the placeholder for "unknown".
        Column('height', [MatchesPatternValidation(r"[0-9]|-")]),
        Column('weight', [MatchesPatternValidation(r"[0-9]|-")]),
        # Either an ISO date or the '-' placeholder is accepted.
        Column('date_of_scan', [
            DateFormatValidation('%Y-%m-%d') | MatchesPatternValidation(r"-")
        ]),
        trimmed('institution_id'),
        trimmed('institution'),
        trimmed('manufacturer'),
        trimmed('manufacturers_model_name'),
        trimmed('receive_coil_name'),
        trimmed('software_versions'),
        trimmed('researcher'),
    ])

    errors = schema.validate(tsv_file)
    print('\nChecking the contents of participants.tsv')
    if not errors:
        print("--> all good 👍")
    else:
        for error in errors:
            print(error)
# Schema for the sensor CSV. Columns that share one pattern are generated from
# their names; the comprehension variables do not leak into module scope.
schema = Schema(
    # key, sensor_id, location: Number / integer - up to 16
    [
        Column(field, [MatchesPatternValidation(pattern_id)])
        for field in ('key', 'sensor_id', 'location')
    ]
    # lat, lon: Number / decimal with up to 16 decimal places
    + [
        Column(field, [MatchesPatternValidation(pattern_geo)])
        for field in ('lat', 'lon')
    ]
    + [
        # Timestamp yyyy-MM-dd'T'HH:mm:ss (in Zulu/UTC time zone),
        # e.g. 2017-07-01T00:00:07
        Column('timestamp', [DateFormatValidation('%Y-%m-%dT%H:%M:%S')]),
        # Decimal with 1 or 2 decimals (.00)
        Column('pressure', [MatchesPatternValidation(pattern_dec)]),
        # Decimal with up to 2 decimal places, additionally range-checked
        Column(
            'temperature',
            [
                InRangeValidation(-146, 60),
                MatchesPatternValidation(r'^-?\d*\.\d{1,2}$'),
            ],
        ),
        # Decimal with 1 or 2 decimals (.00)
        Column('humidity', [MatchesPatternValidation(pattern_dec)]),
    ]
)

### get data from File
test_data = pd.read_csv("data/testCSV_short.csv")
# Example input row:
# 1 ,2266 ,1140 ,42.738 ,23.272 ,2017-07-01T00:00:07 ,95270.27 ,23.46 ,62.48
start_time = time.time()

pattern_id = r'^-?\d{1,16}$'       # integer, up to 16 digits
pattern_dec = r'^-?\d*\.\d{1,2}$'  # decimal with 1 or 2 decimals (.00)
pattern_geo = r'^-?\d*\.\d{1,16}$'  # decimal with up to 16 decimal places

# Columns sharing a pattern are expanded in place from their names; the
# generator variables stay out of module scope.
schema = Schema([
    *(
        Column(field, [MatchesPatternValidation(pattern_id)])
        for field in ('key', 'sensor_id', 'location')
    ),
    *(
        Column(field, [MatchesPatternValidation(pattern_geo)])
        for field in ('lat', 'lon')
    ),
    # Timestamp yyyy-MM-dd'T'HH:mm:ss (in Zulu/UTC time zone),
    # e.g. 2017-07-01T00:00:07
    Column('timestamp', [DateFormatValidation('%Y-%m-%dT%H:%M:%S')]),
    Column('pressure', [MatchesPatternValidation(pattern_dec)]),
    # Range check plus the two-decimal format check
    Column(
        'temperature',
        [
            InRangeValidation(-146, 60),
            MatchesPatternValidation(r'^-?\d*\.\d{1,2}$'),
        ],
    ),
    Column('humidity', [MatchesPatternValidation(pattern_dec)]),
])

### get data from File
print('load orig dataset from file')
test_data = pd.read_csv("data/testCSV_short.csv")
print('orig dataset')
print(test_data)

# data verification
print('start data verification on orig dataset')
# NOTE(review): the next three statements are the tail of a function whose
# header, loop and original indentation lie outside this view. They remove the
# "<key>_output" entry from `store`, leave the enclosing loop, and return
# `result`.
store.delete(key + "_output")
break
return result

# Names of additional context fields.
# NOTE(review): how these are consumed is not visible from here.
additional_context_fields = [
    "Project name", "Project website", "Topic", "Main curator name",
    "Main curator e-mail"
]

# Expected columns. Every column has an empty validation list except
# 'Publication Date', which must parse as YYYY-MM-DD. The column list
# continues past the end of this view.
schema = Schema([
    Column('ID', []),
    Column('Title', []),
    Column('Authors', []),
    Column('Publication Venue', []),
    Column('Publication Date', [DateFormatValidation("%Y-%m-%d")]),
    Column('Abstract', []),
    Column('Link to PDF', []),
    Column('Type', []),
    Column('Keywords', []),
    Column('Tags', []),
    Column('Access', []),
    Column('Area', []),
    Column('Comment 1', []),
    Column('Author Comment 1', []),
    Column('Comment 2', []),
    Column('Author Comment 2', []),
    Column('Comment 3', []),
    Column('Author Comment 3', []),
    Column('Comment 4', []),
    Column('Author Comment 4', [])
import pandas as pd
from pandas_schema import Column, Schema
from pandas_schema.validation import \
    DateFormatValidation, \
    LeadingWhitespaceValidation, \
    TrailingWhitespaceValidation, \
    CustomSeriesValidation, \
    CanCallValidation, \
    CanConvertValidation

# Location of the combined data file.
# NOTE(review): the path ends in ".cvs", not ".csv" - confirm this matches the
# actual file name in the repository.
URL_ALL = "https://raw.githubusercontent.com/tryggvigy/CoronaWatchIS/master/data/covid_in_is_all.cvs"

# Shared 'date' column: no surrounding whitespace, YYYY-MM-DD format, and the
# column as a whole must be increasing with no repeated values.
# NOTE(review): the lambda evaluates to a single bool for the whole column
# rather than one flag per row - confirm CustomSeriesValidation accepts that.
date_validator = Column('date', [
    LeadingWhitespaceValidation(),
    TrailingWhitespaceValidation(),
    DateFormatValidation("%Y-%m-%d") & CustomSeriesValidation(
        lambda x: x.is_monotonic_increasing and x.is_unique,
        'date is not monotonic')
])

# Whitespace checks that are unpacked into the validator list of value columns.
default_value_validators = [
    LeadingWhitespaceValidation(),
    TrailingWhitespaceValidation()
]

# One schema per data-set key. The mapping continues past the end of this view.
schemas_by_key = {
    'cases': Schema([
        date_validator,
        Column('cases', [
            *default_value_validators,
import time

import pandas as pd
from pandas_schema import Column, Schema
from pandas_schema.validation import DateFormatValidation, MatchesPatternValidation, InListValidation

# Regular expressions / formats shared by both schemas below.
pattern_id = r'^-?\d{1,16}$'  # integer: optional minus sign, 1 to 16 digits
pattern_dec = r'^-?\d*\.\d{1,2}$'  # decimal: 1 or 2 digits after the point
pattern_geo = r'^-?\d*\.\d{1,20}$'  # geo location: decimal with 1 to 20 digits after the point
pattern_date = r'%Y-%m-%d %H:%M:%S'  # strftime format yyyy-MM-dd HH:mm:ss (in Zulu/UTC time zone) e.g. 2017-07-01 00:00:07

# Schema for taxi-ride records; `ordered=True` is passed so column order is
# part of the check.
taxiRide_schema = Schema([
    Column('rideId', [MatchesPatternValidation(pattern_id)]),
    Column('isStart', [InListValidation(['START', 'END'])]),
    Column('endTime', [DateFormatValidation(pattern_date)]),
    Column('startTime', [DateFormatValidation(pattern_date)]),
    Column('startLon', [MatchesPatternValidation(pattern_geo)]),
    Column('startLat', [MatchesPatternValidation(pattern_geo)]),
    Column('endLon', [MatchesPatternValidation(pattern_geo)]),
    Column('endLat', [MatchesPatternValidation(pattern_geo)]),
    Column('passengerCnt', [MatchesPatternValidation(pattern_id)])
], ordered=True)

# Schema for taxi-fare records. The column list continues past the end of
# this view.
taxiFare_schema = Schema([
    Column('rideId', [MatchesPatternValidation(pattern_id)]),
    Column('taxiId', [MatchesPatternValidation(pattern_id)]),
    Column('driverId', [MatchesPatternValidation(pattern_id)]),
    Column('startTime', [DateFormatValidation(pattern_date)]),
    Column('paymentType', [InListValidation(['CSH', 'CRD', 'NOC', 'DIS', 'UNK'])]),
    Column('tip', [MatchesPatternValidation(pattern_dec)]),
'ich_pre', # Treatment 'treat_antipatelet', 'treat_anticoagulant', 'ivt_start', # needed to check if started before imaging 'treat_ivt', 'iat_start', # needed to check if started before imaging ] validation_schema = Schema([ Column('age', [InRangeValidation(1, 120)]), Column('sex', [InListValidation(['m', 'f'])]), Column('height', [InRangeValidation(50, 300)]), Column('weight', [InRangeValidation(10, 400)]), Column('onset_known', [InListValidation(['yes', 'no', 'wake_up'])]), Column('Firstimage_date', [DateFormatValidation('%Y-%m-%d %H:%M:%S')]), Column('onset_time', [DateFormatValidation('%Y-%m-%d %H:%M:%S')]), Column('NIH admission', [InRangeValidation(0, 43)]), Column('bp_syst', [InRangeValidation(0, 300)]), Column('bp_diast', [InRangeValidation(0, 300)]), Column('glucose', [InRangeValidation(0.1, 30)]), Column('créatinine', [InRangeValidation(0.1, 1000)]), Column('hypertension', [InListValidation(['yes', 'no'])]), Column('diabetes', [InListValidation(['yes', 'no'])]), Column('hyperlipidemia', [InListValidation(['yes', 'no'])]), Column('smoking', [InListValidation(['yes', 'no'])]), Column('atrialfib', [InListValidation(['yes', 'no'])]), Column('stroke_pre', [InListValidation(['yes', 'no'])]), Column('tia_pre', [InListValidation(['yes', 'no'])]), Column('ich_pre', [InListValidation(['yes', 'no'])]), Column('treat_antipatelet', [InListValidation(['yes', 'no'])]),
# Validation schema for sample metadata. Columns whose list starts with a
# `CustomSeriesValidation(lambda x: ~x.isnull(), ...)` reject null values;
# validators joined with `|` are alternatives (presumably passing either one
# is enough - verify against pandas_schema). The column list continues past
# the end of this view.
sample_schema = Schema([
    Column('host_subject_id', [
        MatchesPatternValidation(r'\w+', message='Host Subject ID may only contain alphanumeric characters.')]),
    Column('host_diet', [LeadingWhitespaceValidation()]),
    Column('source_material_id', [LeadingWhitespaceValidation()]),
    # NOTE(review): the check is "convertible to str" while the message talks
    # about alphanumeric characters - confirm which one is intended.
    Column('ethnicity', [
        CanConvertValidation(str, message='Ethnicity may only contain alphanumeric characters.')]),
    Column('host_family_relationship', [LeadingWhitespaceValidation()]),
    # NOTE(review): '[http|www]' is a regex character class (one of the
    # characters h, t, p, |, w), not the alternatives "http" or "www".
    Column('host_genotype', [
        LeadingWhitespaceValidation() | MatchesPatternValidation(r'^[http|www]', message='Host Genotype may only be a valid URL to the associated DbGap project.')]),
    Column('isolation_source', [LeadingWhitespaceValidation()]),
    Column('samp_mat_process', [LeadingWhitespaceValidation()]),
    # NOTE(review): in this pattern the dots are unescaped (match any
    # character) and '[fastq|fasta|fq|raw]' is a character class matching a
    # single character, so it does not enforce the extensions the message lists.
    Column('filename', [
        CustomSeriesValidation(lambda x: ~x.isnull(), 'A value is required for the filename column.'),
        MatchesPatternValidation(r'\w+.[fastq|fasta|fq|raw](.gz)?', message='Filename must be a valid fasta/fastq file with the following supported extensions: .fasta.gz, .fastq.gz, fq.gz')]),
    Column('sample_id', [
        CustomSeriesValidation(lambda x: ~x.isnull(), 'A value is required for the sample_id column.')]),
    Column('collection_date', [
        CustomSeriesValidation(lambda x: ~x.isnull(), 'A value is required for the collection_date column.'),
        DateFormatValidation('%Y-%m-%d', message='Collection date must be in YYYY-MM-DD date format.')]),
    Column('subject_tax_id', [InListValidation(['10090', '9606'])]),
    Column('subject_age', [
        CustomSeriesValidation(lambda x: ~x.isnull(), 'A value is required for the subject_age column.'),
        InRangeValidation(0, 120)]),
    Column('subject_sex', [
        CustomSeriesValidation(lambda x: ~x.isnull(), 'A value is required for the subject_sex column.'),
        InListValidation(['M', 'm', 'F', 'f'])]),
    # NOTE(review): the pattern has no end anchor; whether longer values pass
    # depends on how MatchesPatternValidation applies it - verify.
    Column('md5_checksum', [
        CustomSeriesValidation(lambda x: ~x.isnull(), 'A value is required for the md5_checksum column.'),
        MatchesPatternValidation(r'[a-zA-Z0-9]{32}', message='MD5 Checksum may only contain 32 alphanumeric characters.')]),
    Column('host_body_mass_index', [LeadingWhitespaceValidation() | CanConvertValidation(float)]),
    Column('host_disease', [
        LeadingWhitespaceValidation() | MatchesPatternValidation(r'DOID:\d+', message='Must provide a valid Disease Ontology ID in format \'DOID:<NUMBERS>\'')]),
    # The null check here carries an empty message string.
    Column('variable_region', [
        CustomSeriesValidation(lambda x: ~x.isnull(), '') | MatchesPatternValidation(r'(V[1-9],?)+', message='Variable region must be a valid 16S hypervariable region.')]),
    Column('gastrointest_disord', [LeadingWhitespaceValidation() | CanConvertValidation(int)]),
    Column('host_body_product', [
        LeadingWhitespaceValidation() | MatchesPatternValidation(r'GENEPIO_\d+', message='Must provide a valid Genetic epidemiology ontology ID in format \'GENEPIO_<NUMBERS>\'')]),