Beispiel #1
0
 def __init__(self):
     """Build the column schema and store it on ``self.schemas``.

     The free-text columns (payer name, payer address, document number,
     city, state) are checked for leading and trailing whitespace,
     ``due_date`` is checked against the ``%m/%d/%y`` format and
     ``status`` must be one of four listed values. All remaining columns
     are declared by name only, with no validations attached.
     """

     def trimmed(column_name):
         # Each call creates its own validator instances, exactly as a
         # literal per-column declaration would.
         return Column(
             column_name,
             [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
         )

     allowed_statuses = ["pending", "paid", "due", "error"]

     # Column order is part of the schema; keep it as declared.
     columns = [
         Column("id"),
         trimmed("payer_name"),
         Column("document_amount"),
         Column("payed_amount"),
         Column("payer_id_number"),
         trimmed("payer_address"),
         Column("barcode"),
         Column("typable_line"),
         Column("number"),
         trimmed("document_number"),
         Column("due_date", [DateFormatValidation("%m/%d/%y")]),
         trimmed("city"),
         trimmed("state"),
         Column("zip_code"),
         Column("bank_answer_date"),
         Column("pdf_upload_date"),
         Column("status", [InListValidation(allowed_statuses)]),
         Column("callback"),
         Column("object_id"),
         Column("extra"),
     ]
     self.schemas = Schema(columns)
import pandas as pd
from pandas_schema import Column, Schema
from pandas_schema.validation import LeadingWhitespaceValidation, TrailingWhitespaceValidation, InRangeValidation, \
    DateFormatValidation, InListValidation

# Validation rules for the five columns of the fixed-width file.
schema = Schema(
    [
        Column(
            'name',
            [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
        ),
        Column(
            'title',
            [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
        ),
        Column('salary', [InRangeValidation(0, 33000)]),
        Column('sex', [InListValidation(['F', 'M'])]),
        Column('date', [DateFormatValidation('%Y-%m-%d')]),
    ]
)

# Field widths in characters, in file order:
# name, title, salary, sex, date.
widths = [9, 19, 6, 4, 11]

# Read the source data and echo it before any verification.
test_data = pd.read_fwf("data/fixed_width.txt", widths=widths)
print('orig dataset')
print(test_data)

# data verification
Beispiel #3
0
def main():
    """Cross-check a dataset directory against its ``participants.tsv``.

    The dataset root is taken from the ``path_in`` command-line argument.
    The function prints, in this order:

    * ``sub*`` folders that have no row in ``participants.tsv``;
    * ``participant_id`` values that have no matching folder;
    * every ``.nii.gz`` file that has no ``.json`` file with the same stem
      in the same directory;
    * the schema-validation errors found in ``participants.tsv``.

    Nothing is returned; all findings are written to standard output.
    """
    # Parse input arguments
    parser = get_parser()
    args = parser.parse_args()

    data_path = args.path_in

    path_tsv = os.path.join(data_path, 'participants.tsv')
    # read_csv already yields a DataFrame, so it is used directly below.
    tsv_file = pd.read_csv(path_tsv, sep='\t')

    subject_folders = {
        name for name in os.listdir(data_path)
        if os.path.isdir(os.path.join(data_path, name))
        and name.startswith('sub')
    }
    listed_participants = set(tsv_file['participant_id'].tolist())

    missing_subjects_tsv = sorted(subject_folders - listed_participants)
    missing_subjects_folder = sorted(listed_participants - subject_folders)

    if missing_subjects_tsv:
        print('\nWarning missing following subjects from participants.tsv: ')
        pprint(missing_subjects_tsv)
    if missing_subjects_folder:
        print(
            '\nWarning missing data for subjects listed in participants.tsv: ')
        pprint(missing_subjects_folder)

    nifti_suffix = '.nii.gz'
    for dirName, _subdirList, fileList in os.walk(data_path):
        for file in fileList:
            if not file.endswith(nifti_suffix):
                continue
            # Strip only the trailing '.nii.gz'. Splitting on the first '.'
            # would truncate file names that contain additional dots and
            # look for the wrong sidecar.
            stem = file[:-len(nifti_suffix)]
            jsonSidecarPath = os.path.join(dirName, stem + '.json')
            if not os.path.exists(jsonSidecarPath):
                print("Missing jsonSidecar: " + jsonSidecarPath)

    def no_padding(column_name):
        # Column that only forbids leading/trailing whitespace.
        return Column(
            column_name,
            [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
        )

    # Checking participants.tsv contents
    schema = Schema([
        no_padding('participant_id'),
        Column('sex', [InListValidation(['M', 'F'])]),
        Column('age', [InRangeValidation(18, 60)]),
        Column('height', [MatchesPatternValidation(r"[0-9]|-")]),
        Column('weight', [MatchesPatternValidation(r"[0-9]|-")]),
        # Either a real date or the "-" placeholder is accepted.
        Column('date_of_scan', [
            DateFormatValidation('%Y-%m-%d') | MatchesPatternValidation(r"-")
        ]),
        no_padding('institution_id'),
        no_padding('institution'),
        no_padding('manufacturer'),
        no_padding('manufacturers_model_name'),
        no_padding('receive_coil_name'),
        no_padding('software_versions'),
        no_padding('researcher'),
    ])

    errors = schema.validate(tsv_file)
    print('\nChecking the contents of participants.tsv')
    if not errors:
        print("--> all good 👍")
    else:
        for error in errors:
            print(error)
# Timer started before the schema is built and the data is loaded.
start_time = time.time()

# The patterns are defined first: the schema below references them, and
# using them before assignment raises NameError.
pattern_id = r'^-?\d{1,16}$'       # integer, optional minus, 1 to 16 digits
pattern_dec = r'^-?\d*\.\d{1,2}$'  # decimal with 1 or 2 digits after the point
pattern_geo = r'^-?\d*\.\d{1,16}$'  # decimal with 1 to 16 digits after the point

# Single schema definition (previously built twice with identical rules).
schema = Schema([
    Column('key', [MatchesPatternValidation(pattern_id)]),
    Column('sensor_id', [MatchesPatternValidation(pattern_id)]),
    Column('location', [MatchesPatternValidation(pattern_id)]),
    Column('lat', [MatchesPatternValidation(pattern_geo)]),
    Column('lon', [MatchesPatternValidation(pattern_geo)]),
    # Timestamp formatted like 2017-07-01T00:00:07
    Column('timestamp', [DateFormatValidation('%Y-%m-%dT%H:%M:%S')]),
    Column('pressure', [MatchesPatternValidation(pattern_dec)]),
    # Temperature must be within [-146, 60] and use the same decimal
    # format as pattern_dec.
    Column('temperature', [
        InRangeValidation(-146, 60),
        MatchesPatternValidation(pattern_dec),
    ]),
    Column('humidity', [MatchesPatternValidation(pattern_dec)]),
])

### get data from File
print('load orig dataset from file')

# Example row:
# 1         ,2266      ,1140      ,42.738 ,23.272    ,2017-07-01T00:00:07 ,95270.27 ,23.46        ,62.48
test_data = pd.read_csv("data/testCSV_short.csv")
print('orig dataset')
print(test_data)

# data verification
print('start data verification on orig dataset')
Beispiel #6
0
            store.delete(key + "_output")
            break
    return result


# Names of the additional context fields, in their declared order.
additional_context_fields = [
    "Project name",
    "Project website",
    "Topic",
    "Main curator name",
    "Main curator e-mail",
]

schema = Schema([
    Column('ID', []),
    Column('Title', []),
    Column('Authors', []),
    Column('Publication Venue', []),
    Column('Publication Date', [DateFormatValidation("%Y-%m-%d")]),
    Column('Abstract', []),
    Column('Link to PDF', []),
    Column('Type', []),
    Column('Keywords', []),
    Column('Tags', []),
    Column('Access', []),
    Column('Area', []),
    Column('Comment 1', []),
    Column('Author Comment 1', []),
    Column('Comment 2', []),
    Column('Author Comment 2', []),
    Column('Comment 3', []),
    Column('Author Comment 3', []),
    Column('Comment 4', []),
    Column('Author Comment 4', [])
import pandas as pd
from pandas_schema import Column, Schema
from pandas_schema.validation import \
    DateFormatValidation, \
    LeadingWhitespaceValidation, \
    TrailingWhitespaceValidation, \
    CustomSeriesValidation, \
    CanCallValidation, \
    CanConvertValidation

# Remote location of the combined data file.
URL_ALL = "https://raw.githubusercontent.com/tryggvigy/CoronaWatchIS/master/data/covid_in_is_all.cvs"

# 'date' column: no surrounding whitespace, ISO-style date format, and a
# series-level check that the values are increasing without duplicates.
date_validator = Column(
    'date',
    [
        LeadingWhitespaceValidation(),
        TrailingWhitespaceValidation(),
        DateFormatValidation("%Y-%m-%d")
        & CustomSeriesValidation(
            lambda series: series.is_monotonic_increasing and series.is_unique,
            'date is not monotonic',
        ),
    ],
)

# Validators applied to value columns by default: whitespace checks only.
default_value_validators = [
    LeadingWhitespaceValidation(),
    TrailingWhitespaceValidation(),
]

schemas_by_key = {
    'cases':
    Schema([
        date_validator,
        Column('cases', [
            *default_value_validators,
import time

import pandas as pd
from pandas_schema import Column, Schema
from pandas_schema.validation import DateFormatValidation, MatchesPatternValidation, InListValidation

# Integer: optional leading minus followed by 1 to 16 digits.
pattern_id = r'^-?\d{1,16}$'
# Decimal number with 1 or 2 digits after the point.
pattern_dec = r'^-?\d*\.\d{1,2}$'
# Geo coordinate: decimal number with 1 to 20 digits after the point.
pattern_geo = r'^-?\d*\.\d{1,20}$'
# Date format string (not a regex), e.g. 2017-07-01 00:00:07.
pattern_date = r'%Y-%m-%d %H:%M:%S'

# Schema for taxi ride records; ordered=True is passed to Schema.
taxiRide_schema = Schema(
    [
        Column('rideId', [MatchesPatternValidation(pattern_id)]),
        Column('isStart', [InListValidation(['START', 'END'])]),
        Column('endTime', [DateFormatValidation(pattern_date)]),
        Column('startTime', [DateFormatValidation(pattern_date)]),
        Column('startLon', [MatchesPatternValidation(pattern_geo)]),
        Column('startLat', [MatchesPatternValidation(pattern_geo)]),
        Column('endLon', [MatchesPatternValidation(pattern_geo)]),
        Column('endLat', [MatchesPatternValidation(pattern_geo)]),
        Column('passengerCnt', [MatchesPatternValidation(pattern_id)]),
    ],
    ordered=True,
)


taxiFare_schema = Schema([
    Column('rideId', [MatchesPatternValidation(pattern_id)]),
    Column('taxiId', [MatchesPatternValidation(pattern_id)]),
    Column('driverId', [MatchesPatternValidation(pattern_id)]),
    Column('startTime', [DateFormatValidation(pattern_date)]),
    Column('paymentType', [InListValidation(['CSH', 'CRD', 'NOC', 'DIS', 'UNK'])]),
    Column('tip', [MatchesPatternValidation(pattern_dec)]),
Beispiel #9
0
    'ich_pre',
    # Treatment
    'treat_antipatelet',
    'treat_anticoagulant',
    'ivt_start',  # needed to check if started before imaging
    'treat_ivt',
    'iat_start',  # needed to check if started before imaging
]

validation_schema = Schema([
    Column('age', [InRangeValidation(1, 120)]),
    Column('sex', [InListValidation(['m', 'f'])]),
    Column('height', [InRangeValidation(50, 300)]),
    Column('weight', [InRangeValidation(10, 400)]),
    Column('onset_known', [InListValidation(['yes', 'no', 'wake_up'])]),
    Column('Firstimage_date', [DateFormatValidation('%Y-%m-%d %H:%M:%S')]),
    Column('onset_time', [DateFormatValidation('%Y-%m-%d %H:%M:%S')]),
    Column('NIH admission', [InRangeValidation(0, 43)]),
    Column('bp_syst', [InRangeValidation(0, 300)]),
    Column('bp_diast', [InRangeValidation(0, 300)]),
    Column('glucose', [InRangeValidation(0.1, 30)]),
    Column('créatinine', [InRangeValidation(0.1, 1000)]),
    Column('hypertension', [InListValidation(['yes', 'no'])]),
    Column('diabetes', [InListValidation(['yes', 'no'])]),
    Column('hyperlipidemia', [InListValidation(['yes', 'no'])]),
    Column('smoking', [InListValidation(['yes', 'no'])]),
    Column('atrialfib', [InListValidation(['yes', 'no'])]),
    Column('stroke_pre', [InListValidation(['yes', 'no'])]),
    Column('tia_pre', [InListValidation(['yes', 'no'])]),
    Column('ich_pre', [InListValidation(['yes', 'no'])]),
    Column('treat_antipatelet', [InListValidation(['yes', 'no'])]),
Beispiel #10
0
sample_schema = Schema([
    Column('host_subject_id', [MatchesPatternValidation(r'\w+', message='Host Subject ID may only contain alphanumeric characters.')]),
    Column('host_diet', [LeadingWhitespaceValidation()]),
    Column('source_material_id', [LeadingWhitespaceValidation()]),
    Column('ethnicity', [CanConvertValidation(str, message='Ethnicity may only contain alphanumeric characters.')]),
    Column('host_family_relationship', [LeadingWhitespaceValidation()]),
    Column('host_genotype', [LeadingWhitespaceValidation() |
                             MatchesPatternValidation(r'^[http|www]', message='Host Genotype may only be a valid URL to the associated DbGap project.')]),
    Column('isolation_source', [LeadingWhitespaceValidation()]), 
    Column('samp_mat_process', [LeadingWhitespaceValidation()]),
    Column('filename', [CustomSeriesValidation(lambda x: ~x.isnull(), 'A value is required for the filename column.'),
                        MatchesPatternValidation(r'\w+.[fastq|fasta|fq|raw](.gz)?', message='Filename must be a valid fasta/fastq file with the following supported extensions: .fasta.gz, .fastq.gz, fq.gz')]),
    Column('sample_id', [CustomSeriesValidation(lambda x: ~x.isnull(), 'A value is required for the sample_id column.')]),
    Column('collection_date', [CustomSeriesValidation(lambda x: ~x.isnull(), 'A value is required for the collection_date column.'),
                               DateFormatValidation('%Y-%m-%d', message='Collection date must be in YYYY-MM-DD date format.')]),
    Column('subject_tax_id', [InListValidation(['10090', '9606'])]),
    Column('subject_age', [CustomSeriesValidation(lambda x: ~x.isnull(), 'A value is required for the subject_age column.'),
                           InRangeValidation(0, 120)]),
    Column('subject_sex', [CustomSeriesValidation(lambda x: ~x.isnull(), 'A value is required for the subject_sex column.'),
                           InListValidation(['M', 'm', 'F', 'f'])]),
    Column('md5_checksum', [CustomSeriesValidation(lambda x: ~x.isnull(), 'A value is required for the md5_checksum column.'),
                            MatchesPatternValidation(r'[a-zA-Z0-9]{32}', message='MD5 Checksum may only contain 32 alphanumeric characters.')]),
    Column('host_body_mass_index', [LeadingWhitespaceValidation() | CanConvertValidation(float)]),
    Column('host_disease', [LeadingWhitespaceValidation() | 
                            MatchesPatternValidation(r'DOID:\d+', message='Must provide a valid Disease Ontology ID in format \'DOID:<NUMBERS>\'')]),
    Column('variable_region', [CustomSeriesValidation(lambda x: ~x.isnull(), '') |
                               MatchesPatternValidation(r'(V[1-9],?)+', message='Variable region must be a valid 16S hypervariable region.')]),
    Column('gastrointest_disord', [LeadingWhitespaceValidation() | CanConvertValidation(int)]),
    Column('host_body_product', [LeadingWhitespaceValidation() |
                                 MatchesPatternValidation(r'GENEPIO_\d+', message='Must provide a valid Genetic epidemiology ontology ID in format \'GENEPIO_<NUMBERS>\'')]),