Example #1
 def __init__(self):
     self.schemas = Schema(
         [
             Column("RA_Report #", [CanConvertValidation(int)]),
             Column("RA_CAERS Created Date", [CanCallValidation(self.parse_date)]),
             Column(
                 "AEC_Event Start Date",
                 [CanCallValidation(self.parse_date)],
                 allow_empty=True,
             ),
             Column(
                 "PRI_Product Role", [InListValidation(["Suspect", "Concomitant"])]
             ),
             Column("PRI_Reported Brand/Product Name"),
             Column("PRI_FDA Industry Code"),
             Column("PRI_FDA Industry Name"),
             Column("CI_Age at Adverse Event"),
             Column(
                 "CI_Age Unit",
                 [
                     InListValidation(
                         ["Year(s)", "Decade(s)", "Month(s)", "Week(s)", "Day(s)"]
                     )
                 ],
             ),
             Column("CI_Gender", [InListValidation(["Female", "Male"])]),
             Column("AEC_One Row Outcomes"),
             Column("SYM_One Row Coded Symptoms"),
         ]
     )
Example #2
    def create_schema(self) -> Schema:
        """ Create Pandas schema with all the necessary validation rules read in from config """
        col_list = []
        for column in self.__spreadsheet_def.keys():
            validators = [
                LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()
            ]

            mandatory_field_flag = self.__spreadsheet_def.is_mandatory(column)

            # Special cases for checking institutions/countries...
            if column == 'submitting_institution':
                validators.append(
                    InListValidation([i.name for i in self.__institutions]))
            if column == 'country':
                validators.append(
                    InListValidation([i.country for i in self.__institutions]))
            else:
                # Regex validation
                if self.__spreadsheet_def.get_regex(column):
                    validators.append(
                        MatchesPatternValidation(
                            self.__spreadsheet_def.get_regex(column),
                            message=self.__spreadsheet_def.
                            get_regex_validation_message(column)))

                # Validate allowed values
                elif self.__spreadsheet_def.get_allowed_values(column):
                    validators.append(
                        InListValidation(
                            self.__spreadsheet_def.get_allowed_values(column),
                            case_sensitive=False))

                # Field length validation
                max_len = self.__spreadsheet_def.get_max_length(column)
                if max_len and max_len > 0:
                    validators.append(
                        _StringLengthValidation(
                            'field length is greater than {} characters'.
                            format(str(max_len)), max_len))

            # Mandatory field validation
            col_list.append(
                Column(self.__spreadsheet_def.get_column_name(column),
                       validators,
                       allow_empty=not mandatory_field_flag))

        return Schema(col_list)
Example #3
def defSchema():
    print('Define expected Schema')
    schema = Schema([
        Column(name='id',
               validations=[IsDtypeValidation(np.object_)],
               allow_empty=False),
        Column(name='comment_text',
               validations=[IsDtypeValidation(np.object_)],
               allow_empty=False),
        Column(name='toxic',
               validations=[InListValidation([0, 1])],
               allow_empty=False),
        Column(name='severe_toxic',
               validations=[InListValidation([0, 1])],
               allow_empty=False),
        Column(name='obscene',
               validations=[InListValidation([0, 1])],
               allow_empty=False),
        Column(name='threat',
               validations=[InListValidation([0, 1])],
               allow_empty=False),
        Column(name='insult',
               validations=[InListValidation([0, 1])],
               allow_empty=False),
        Column(name='identity_hate',
               validations=[InListValidation([0, 1])],
               allow_empty=False)
    ])
    return schema
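A quick way to exercise this schema (a minimal sketch; the toy DataFrame below is invented, not part of the original example):

import pandas as pd  # the example itself assumes numpy and pandas_schema are already imported

toy = pd.DataFrame({
    'id': ['a1', 'a2'],
    'comment_text': ['looks fine', 'not fine at all'],
    'toxic': [0, 1],
    'severe_toxic': [0, 0],
    'obscene': [0, 0],
    'threat': [0, 0],
    'insult': [0, 1],
    'identity_hate': [0, 0],
})

for error in defSchema().validate(toy):
    print(error)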
Example #4
 def __init__(self):
     self.schemas = Schema(
         [
             Column(
                 "Given Name",
                 [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
             ),
             Column(
                 "Family Name",
                 [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
             ),
             Column("Age", [InRangeValidation(0, 120)]),
             Column("Sex", [InListValidation(["Male", "Female", "Other"])]),
             Column("Customer ID", [MatchesPatternValidation(r"\d{4}[A-Z]{4}")]),
         ]
     )
Example #5
    def _validate(self, diagnosis_df):
        schema = Schema([
            Column('visit_dt', [
                MatchesPatternValidation(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:00$')
            ]),
            Column('sex', [InListValidation(['M', 'K'])]),
            Column('icd10', [
            MatchesPatternValidation(r'^[CDIJKMNRZ]\d{1,2}\.?\d{0,2}$')
            ])
        ])

        errors = schema.validate(diagnosis_df)

        for error in errors:
            self.Logger.error(error)

        if len(errors) > 0:
            exit()
Example #6
 def __init__(self):
     self.schemas = Schema(
         [
             Column("id"),
             Column(
                 "payer_name",
                 [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
             ),
             Column("document_amount"),
             Column("payed_amount"),
             Column("payer_id_number"),
             Column(
                 "payer_address",
                 [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
             ),
             Column("barcode"),
             Column("typable_line"),
             Column("number"),
             Column(
                 "document_number",
                 [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
             ),
             Column("due_date", [DateFormatValidation("%m/%d/%y")]),
             Column(
                 "city",
                 [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
             ),
             Column(
                 "state",
                 [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
             ),
             Column("zip_code"),
             Column("bank_answer_date"),
             Column("pdf_upload_date"),
             Column(
                 "status", [InListValidation(["pending", "paid", "due", "error"])]
             ),
             Column("callback"),
             Column("object_id"),
             Column("extra"),
         ]
     )
Example #7
BUILD_MAP = {'28': 'NCBI28',
             '29': 'NCBI29',
             '30': 'NCBI30',
             '31': 'NCBI31',
             '33': 'NCBI33',
             '34': 'NCBI34',
             '35': 'NCBI35',
             '36': 'NCBI36',
             '37': 'GRCh37',
             '38': 'GRCh38'}

VALID_FILE_EXTENSIONS = [".txt", ".tsv", ".csv", ".tsv.gz", ".csv.gz", "gz", "gzip", ".tsv.gzip", ".csv.gzip"]

GENERIC_VALIDATORS = {
    SNP_DSET: Column(SNP_DSET, [CanConvertValidation(DSET_TYPES[SNP_DSET]), MatchesPatternValidation(r'^rs[0-9]+$')], allow_empty=True),
    CHR_DSET: Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=True),
    BP_DSET: Column(BP_DSET, [CanConvertValidation(DSET_TYPES[BP_DSET]), InInclusiveRangeValidation(1, 999999999)], allow_empty=True),
    EFFECT_WEIGHT_DSET: Column(EFFECT_WEIGHT_DSET, [CanConvertValidation(DSET_TYPES[EFFECT_WEIGHT_DSET])], allow_empty=True),
    OR_DSET: Column(OR_DSET, [CanConvertValidation(DSET_TYPES[OR_DSET])], allow_empty=True),
    HR_DSET: Column(HR_DSET, [CanConvertValidation(DSET_TYPES[HR_DSET])], allow_empty=True),
    BETA_DSET: Column(BETA_DSET, [CanConvertValidation(DSET_TYPES[BETA_DSET])], allow_empty=True),
    EFFECT_DSET: Column(EFFECT_DSET, [MatchesPatternValidation(r'^[ACTGN]+$')], allow_empty=False),
    REF_DSET: Column(REF_DSET, [MatchesPatternValidation(r'^[ACTGN]+$')], allow_empty=True),
    FREQ_DSET: Column(FREQ_DSET, [CanConvertValidation(DSET_TYPES[FREQ_DSET])], allow_empty=True),
    LOCUS_DSET: Column(LOCUS_DSET, [CanConvertValidation(DSET_TYPES[LOCUS_DSET]), LeadingWhitespaceValidation(), TrailingWhitespaceValidation()], allow_empty=True)
}

SNP_VALIDATORS = {k:v for k,v in GENERIC_VALIDATORS.items()}
SNP_VALIDATORS[SNP_DSET] = Column(SNP_DSET, [CanConvertValidation(DSET_TYPES[SNP_DSET]), MatchesPatternValidation(r'^rs[0-9]+$')], allow_empty=False)

SNP_EMPTY_VALIDATORS = {k:v for k,v in GENERIC_VALIDATORS.items()}
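The validator dictionaries above are keyed by column-name constants (SNP_DSET, CHR_DSET, ...) defined elsewhere in that project, and the excerpt cuts off before they are used. Presumably they get assembled into a Schema and applied to the loaded file; a hedged sketch of that step, with the DataFrame name invented for illustration:

def build_schema(validators):
    # a Schema is just a list of Column objects
    return Schema(list(validators.values()))

errors = build_schema(SNP_VALIDATORS).validate(scoring_df)  # scoring_df: hypothetical DataFrame loaded elsewhere
for error in errors:
    print(error)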
Example #8
def clean(spreadsheet, rules, tempfolder, progress=None):

    if not os.path.exists(tempfolder):
        os.makedirs(tempfolder)
    filepath = tempfolder + '/temporary_file.xlsx'

    results = SpreadCheck(filepath)

    ################################
    #   LOAD SPREADSHEET TO DF     #
    ################################
    try:
        df = pd.read_csv(spreadsheet)
    except:
        try:
            df = pd.read_excel(spreadsheet)
        except:
            results.spreadsheet = None
            results.errors.append("ERROR: Unaccepted File Format")
            return results

    ################################
    #   SCHEMAS VALIDATION         #
    ################################
    schemas_list = []

    df.fillna("NULL", inplace=True)

    for field, rule in rules.items():
        schemas_list.append(
            Column(field, [InListValidation(list(rule['allowed_value_set']))]))

    schema = Schema(schemas_list)
    errors = schema.validate(df, columns=schema.get_column_names())

    # reset progress bar
    if progress:
        progress.config(value=0)

    # increment progress bar #1
    if progress:
        progress.step(20)

    ################################
    #   BUILD SHEET W FORMATTING   #
    ################################

    writer = pd.ExcelWriter(filepath, engine='xlsxwriter')

    # Skip row 1 headers so we can add them manually with formatting
    df.to_excel(writer,
                sheet_name='Sheet1',
                startrow=1,
                header=False,
                index=False)

    workbook = writer.book
    worksheet = writer.sheets['Sheet1']

    ### WORKBOOK FORMATS ###
    yellow_highlight = workbook.add_format({'bg_color': '#FFEB9C'})

    header = workbook.add_format({
        'bold': True,
        'text_wrap': True,
        'valign': 'top',
        'fg_color': '#D7E4BC',
        'border': 1
    })

    # increment progress bar #2
    if progress:
        progress.step(20)

    # Set column widths
    worksheet.set_column(0, len(df.columns) - 1, 22)
    worksheet.set_default_row(hide_unused_rows=True)

    # Write the column headers with the defined format.
    for col_num, value in enumerate(df.columns.values):
        worksheet.write(0, col_num, value, header)

    # increment progress bar #3
    if progress:
        progress.step(20)

    flag_rows = []
    df_length = len(df)

    for error in errors:

        # Catch Key errors, like rules_file, data_file column name mismatch
        try:
            row = error.row + 1
            column = df.columns.get_loc(error.column)
        except:
            results.spreadsheet = None
            results.errors.append("ERROR: " + str(error))
            return results

        # Catch other errors. Not sure what yet.
        try:
            # If an autocorrect mapping exists for value, replace it.
            if error.value in rules[error.column]['autocorrect_dict'].keys():
                worksheet.write(
                    row, column,
                    rules[error.column]['autocorrect_dict'][error.value])
                results.corrections += 1
            else:
                flag_rows.append(error.row)
                # If no autocorrect mapping exists, highlight and annotate the entry
                # Comments
                worksheet.write_comment(row, column, error.message)
                # Highlights
                worksheet.conditional_format(row, column, row, column, {
                    'type': 'no_errors',
                    'format': yellow_highlight
                })
                results.flags += 1
        except Exception as err:
            results.spreadsheet = None
            results.errors.append("ERROR: " + err)
            return results

    # increment progress bar #4
    if progress:
        progress.step(20)

    # Hide Rows that don't contain annotations
    for i in range(df_length + 1):
        if i not in flag_rows:
            worksheet.set_row(i + 1, None, None, {'hidden': True})

    # increment progress bar #5
    if progress:
        progress.step(20)

    writer.save()

    results.messages.append("SUCCESS:")
    results.messages.append("{} entries corrected".format(results.corrections))
    results.messages.append("{} entries flagged for review".format(
        results.flags))
    return results
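For orientation, clean() reads each entry of rules as a dict with an allowed_value_set and an autocorrect_dict (inferred from the lookups above), so a hypothetical call might look like:

rules = {
    'status': {
        'allowed_value_set': {'pending', 'paid', 'due', 'error'},
        'autocorrect_dict': {'payed': 'paid'},  # map common misspellings to the accepted value
    },
}
results = clean('input.csv', rules, '/tmp/spreadcheck')
print(results.messages)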
Example #9
from collections import defaultdict
import re

from pandas_schema import Column, Schema
from pandas_schema.validation import (LeadingWhitespaceValidation, TrailingWhitespaceValidation, 
                                      CanConvertValidation, MatchesPatternValidation, CustomSeriesValidation,
                                      InRangeValidation, InListValidation, DateFormatValidation)


study_schema = Schema([
    Column('study_id', [CustomSeriesValidation(lambda x: ~x.isnull(), 'A value is required for the study_id column.') &
                        ~InListValidation([''])]),
    Column('pi_name', [CustomSeriesValidation(lambda x: ~x.isnull(), 'A value is required for the pi_name column.') &
                        ~InListValidation([''])]),
    Column('sample_type', [InListValidation(['wmgx', 'wmtx', '16S', 'other'])]),
    Column('bioproject_accession', [InListValidation(['']) | MatchesPatternValidation(r'PRJ\w+\d+')]),
    Column('geo_loc_name', [InListValidation(['']) | MatchesPatternValidation(r'\w+:\w+:\w+')]),
    Column('analysis_desc', [InListValidation(['']) | CanConvertValidation(str)]),
    Column('sequencing_facility', [LeadingWhitespaceValidation()]),
    Column('env_biom', [MatchesPatternValidation(r'ENVO:\d+') | InListValidation([''])]),
    Column('env_feature', [MatchesPatternValidation(r'ENVO:\d+') | InListValidation([''])]),
    Column('env_material', [MatchesPatternValidation(r'ENVO:\d+') | InListValidation([''])]),
    Column('host_tissue_sampled', [InListValidation(['']) | MatchesPatternValidation(r'BTO:\d+')]),
    Column('animal_vendor', [LeadingWhitespaceValidation()]),
    Column('paired', [InListValidation(['true', 'false'])]),
    Column('paired_id', [InListValidation(['']) | MatchesPatternValidation(r'[a-zA-Z0-9_.]+')]),
    Column('pi_email', [CustomSeriesValidation(lambda x: ~x.isnull(), 'A value is required for the pi_email column.') &
                        ~InListValidation([''])])
])
Example #10
import pandas as pd
from pandas_schema import Column, Schema
from pandas_schema.validation import LeadingWhitespaceValidation, TrailingWhitespaceValidation, InRangeValidation, \
    DateFormatValidation, InListValidation

schema = Schema([
    Column('name',
           [LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation()]),
    Column('title',
           [LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation()]),
    Column('salary', [InRangeValidation(0, 33000)]),
    Column('sex', [InListValidation(['F', 'M'])]),
    Column('date', [DateFormatValidation('%Y-%m-%d')])
])

widths = [
    9,  # name
    19,  # title
    6,  # salary
    4,  # sex
    11,  # date
]

# read source data
test_data = pd.read_fwf("data/fixed_width.txt", widths=widths)
print('orig dataset')
print(test_data)

# data verification
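The excerpt stops at the verification step; a minimal continuation, following the validate-and-print pattern used in the other examples, would be:

errors = schema.validate(test_data)
for error in errors:
    print(error)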
Example #11

schema = Schema([
    Column('ID', []),
    Column('Title', []),
    Column('Authors', []),
    Column('Publication Venue', []),
    Column('Publication Date', [DateFormatValidation("%Y-%m-%d")]),
    Column('Abstract', []),
    Column('Link to PDF', []),
    Column('Type', []),
    Column('Keywords', []),
    Column('Tags', []),
    Column('Access', []),
    Column('Area', []),
    Column('Included in map', [InListValidation(["yes", "no"])]),
    Column('Ready for publication?', [InListValidation(["yes", "no"])]),
    Column('Validation', []),
    Column('Comment 1', []),
    Column('Author Comment 1', []),
    Column('Comment 2', []),
    Column('Author Comment 2', []),
    Column('Comment 3', []),
    Column('Author Comment 3', []),
    Column('Comment 4', []),
    Column('Author Comment 4', [])
])


def process_comments(row):
    row = row.tolist()
Example #12
import pandas as pd
from io import StringIO
from pandas_schema import Column, Schema
from pandas_schema.validation import LeadingWhitespaceValidation, TrailingWhitespaceValidation, CanConvertValidation, MatchesPatternValidation, InRangeValidation, InListValidation

schema = Schema([
    Column('Given Name',
           [LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation()]),
    Column('Family Name',
           [LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation()]),
    Column('Age', [InRangeValidation(0, 120)]),
    Column('Sex', [InListValidation(['Male', 'Female', 'Other'])]),
    Column('Customer ID', [MatchesPatternValidation(r'\d{4}[A-Z]{4}')])
])

test_data = pd.read_csv(
    StringIO('''Given Name,Family Name,Age,Sex,Customer ID
Gerald ,Hampton,82,Male,2582GABK
Yuuwa,Miyake,270,male,7951WVLW
Edyta,Majewska ,50,Female,775ANSID
'''))

errors = schema.validate(test_data)

for error in errors:
    print(error)
Example #13
import time

import pandas as pd
from pandas_schema import Column, Schema
from pandas_schema.validation import DateFormatValidation, MatchesPatternValidation, InListValidation

pattern_id = r'^-?\d{1,16}$'  # Number / integer - up to 16 digits
pattern_dec = r'^-?\d*\.\d{1,2}$'
pattern_geo = r'^-?\d*\.\d{1,20}$'  # geo location / decimal with up to 20 decimal places
pattern_date = r'%Y-%m-%d %H:%M:%S'  # Timestamp yyyy-MM-dd HH:mm:ss (in Zulu/UTC time zone) e.g. 2017-07-01 00:00:07

taxiRide_schema = Schema([
    Column('rideId', [MatchesPatternValidation(pattern_id)]),
    Column('isStart', [InListValidation(['START', 'END'])]),
    Column('endTime', [DateFormatValidation(pattern_date)]),
    Column('startTime', [DateFormatValidation(pattern_date)]),
    Column('startLon', [MatchesPatternValidation(pattern_geo)]),
    Column('startLat', [MatchesPatternValidation(pattern_geo)]),
    Column('endLon', [MatchesPatternValidation(pattern_geo)]),
    Column('endLat', [MatchesPatternValidation(pattern_geo)]),
    Column('passengerCnt', [MatchesPatternValidation(pattern_id)])
], ordered=True)
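As a quick smoke test of taxiRide_schema (a sketch; the single ride below is invented), with the columns listed in schema order since, if I read pandas_schema's ordered flag correctly, ordered=True matches columns by position rather than by header name:

ride = pd.DataFrame([{
    'rideId': '6',
    'isStart': 'START',
    'endTime': '2017-07-01 00:11:07',
    'startTime': '2017-07-01 00:00:07',
    'startLon': '-73.866135',
    'startLat': '40.771091',
    'endLon': '-73.961334',
    'endLat': '40.764912',
    'passengerCnt': '1',
}])

for error in taxiRide_schema.validate(ride):
    print(error)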


taxiFare_schema = Schema([
    Column('rideId', [MatchesPatternValidation(pattern_id)]),
    Column('taxiId', [MatchesPatternValidation(pattern_id)]),
    Column('driverId', [MatchesPatternValidation(pattern_id)]),
    Column('startTime', [DateFormatValidation(pattern_date)]),
    Column('paymentType', [InListValidation(['CSH', 'CRD', 'NOC', 'DIS', 'UNK'])]),
    Column('tip', [MatchesPatternValidation(pattern_dec)]),
Example #14
    'atrialfib',
    # ATCDs
    'stroke_pre',
    'tia_pre',
    'ich_pre',
    # Treatment
    'treat_antipatelet',
    'treat_anticoagulant',
    'ivt_start',  # needed to check if started before imaging
    'treat_ivt',
    'iat_start',  # needed to check if started before imaging
]

validation_schema = Schema([
    Column('age', [InRangeValidation(1, 120)]),
    Column('sex', [InListValidation(['m', 'f'])]),
    Column('height', [InRangeValidation(50, 300)]),
    Column('weight', [InRangeValidation(10, 400)]),
    Column('onset_known', [InListValidation(['yes', 'no', 'wake_up'])]),
    Column('Firstimage_date', [DateFormatValidation('%Y-%m-%d %H:%M:%S')]),
    Column('onset_time', [DateFormatValidation('%Y-%m-%d %H:%M:%S')]),
    Column('NIH admission', [InRangeValidation(0, 43)]),
    Column('bp_syst', [InRangeValidation(0, 300)]),
    Column('bp_diast', [InRangeValidation(0, 300)]),
    Column('glucose', [InRangeValidation(0.1, 30)]),
    Column('créatinine', [InRangeValidation(0.1, 1000)]),
    Column('hypertension', [InListValidation(['yes', 'no'])]),
    Column('diabetes', [InListValidation(['yes', 'no'])]),
    Column('hyperlipidemia', [InListValidation(['yes', 'no'])]),
    Column('smoking', [InListValidation(['yes', 'no'])]),
    Column('atrialfib', [InListValidation(['yes', 'no'])]),
Example #15
def validate(df):
    d_error = {}
    list_bu = [x[0] for x in BU_CHOICES]
    list_rd = [x[0] for x in RD_CHOICES]
    list_dept = [x[0] for x in DEPT_CHOICES]
    list_hplevel = [x[0] for x in HPLEVEL_CHOICES]
    list_province = [x[0] for x in PROVINCE_CHOICES]
    list_title = [x[0] for x in TITLE_CHOICES]

    NullValidation = CustomElementValidation(lambda d: d is not np.nan,
                                             "该字段不能为空")
    schema = Schema([
        Column("南北中国", [InListValidation(list_bu)]),
        Column("区域", [InListValidation(list_rd)]),
        Column("大区", [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(), NullValidation
        ]),
        Column("地区经理", [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(), NullValidation
        ]),
        Column("负责代表", [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(), NullValidation
        ]),
        Column(
            "医院编码",
            [
                LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation(),
                NullValidation,
                MatchesPatternValidation(r"^[H]{1}(\d){9}$"),
            ],
        ),
        Column("医院全称", [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(), NullValidation
        ]),
        Column("省/自治区/直辖市", [InListValidation(list_province)]),
        Column("是否双call", [InListValidation(["是", "否"])]),
        Column("医院级别", [InListValidation(list_hplevel)]),
        Column("开户进展", [InListValidation(["已开户", "未开户"])]),
        Column("客户姓名", [
            LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation(),
            IsDistinctValidation()
        ]),
        Column("所在科室", [InListValidation(list_dept)]),
        Column("职称", [InListValidation(list_title)]),
        Column("月出诊次数(半天计)",
               [CanConvertValidation(int),
                InRangeValidation(0, 63)]),
        Column("每半天\n门诊量", [CanConvertValidation(int),
                            InRangeValidation(0, )]),
        Column("相关病人\n比例(%)\n建议比例:40%-80%",
               [CanConvertValidation(int),
                InRangeValidation(0, 101)]),
        Column("备注"),
    ])
    errors = schema.validate(df.loc[:, COL])
    for error in errors:
        str_warning = str(error)
        for term in D_TRANSLATE:
            str_warning = str_warning.replace(term, D_TRANSLATE[term])
            findword = r": [0-9]\d*"
            str_warning = re.sub(findword, row_refined, str_warning)
        d_error[str_warning] = "<br>"

    d_error = {**d_error, **check_inconsist(df, "医院编码", "医院全称", "both")}
    d_error = {**d_error, **check_inconsist(df, "区域", "大区", "right")}
    d_error = {**d_error, **check_inconsist(df, "大区", "地区经理", "right")}
    d_error = {**d_error, **check_inconsist(df, "地区经理", "负责代表", "right")}
    d_error = {**d_error, **check_inconsist(df, "医院编码", "省/自治区/直辖市", "left")}
    d_error = {**d_error, **check_inconsist(df, "医院编码", "是否双call", "left")}
    d_error = {**d_error, **check_inconsist(df, "医院编码", "医院级别", "left")}
    d_error = {**d_error, **check_inconsist(df, "医院编码", "开户进展", "left")}
    d_error = {**d_error, **check_inconsist(df, "医院全称", "省/自治区/直辖市", "left")}
    d_error = {**d_error, **check_inconsist(df, "医院全称", "是否双call", "left")}
    d_error = {**d_error, **check_inconsist(df, "医院全称", "医院级别", "left")}
    d_error = {**d_error, **check_inconsist(df, "医院全称", "开户进展", "left")}

    d_error = {**d_error, **check_hplevel_with_dept(df)}  # check for inconsistencies between hospital level and department
    return d_error
Example #16
    Column("Chr15", [CanConvertValidation(float)]),
    Column("Chr16", [CanConvertValidation(float)]),
    Column("Chr17", [CanConvertValidation(float)]),
    Column("Chr18", [CanConvertValidation(float)]),
    Column("Chr19", [CanConvertValidation(float)]),
    Column("Chr20", [CanConvertValidation(float)]),
    Column("Chr21", [CanConvertValidation(float)]),
    Column("Chr22", [CanConvertValidation(float)]),
    Column("ChrX", [CanConvertValidation(float)]),
    Column("ChrY", [CanConvertValidation(float)]),
    Column("FF_Formatted", [CanConvertValidation(float)]),
    Column("FFY", [CanConvertValidation(float)]),
    Column("FFX", [CanConvertValidation(float)]),
    Column("DuplicationRate", [CanConvertValidation(float)]),
    Column("Bin2BinVariance", [CanConvertValidation(float)]),
    Column("UnfilteredCNVcalls", [CanConvertValidation(float)]),
    Column("CNVSegment", [InListValidation(["Found"])]),
    Column("Flowcell", []),
    Column("SequencingDate", []),
    Column("Median_13", [CanConvertValidation(float)]),
    Column("Median_18", [CanConvertValidation(float)]),
    Column("Median_21", [CanConvertValidation(float)]),
    Column("Median_X", [CanConvertValidation(float)]),
    Column("Median_Y", [CanConvertValidation(float)]),
    Column("Stdev_13", [CanConvertValidation(float)]),
    Column("Stdev_18", [CanConvertValidation(float)]),
    Column("Stdev_21", [CanConvertValidation(float)]),
    Column("Stdev_X", [CanConvertValidation(float)]),
    Column("Stdev_Y", [CanConvertValidation(float)]),
])
Example #17
def main():
    # Parse input arguments
    parser = get_parser()
    args = parser.parse_args()

    data_path = args.path_in

    path_tsv = os.path.join(data_path, 'participants.tsv')
    tsv_file = pd.read_csv(path_tsv, sep='\t')
    list_subj = [
        name for name in os.listdir(data_path) if
        os.path.isdir(os.path.join(data_path, name)) and name.startswith('sub')
    ]
    df = pd.DataFrame(tsv_file)
    list_tsv_participants = df['participant_id'].tolist()
    missing_subjects_tsv = list(set(list_subj) - set(list_tsv_participants))
    missing_subjects_folder = list(set(list_tsv_participants) - set(list_subj))

    if missing_subjects_tsv:
        # print ('Warning missing following subjects from participants.tsv : %s' %missing_subjects_tsv)
        print('\nWarning missing following subjects from participants.tsv: ')
        missing_subjects_tsv.sort()
        pprint(missing_subjects_tsv)
    if missing_subjects_folder:
        # print ('\nWarning missing data for subjects listed in participants.tsv : %s' %missing_subjects_folder)
        print(
            '\nWarning missing data for subjects listed in participants.tsv: ')
        missing_subjects_folder.sort()
        pprint(missing_subjects_folder)

    for dirName, subdirList, fileList in os.walk(data_path):
        for file in fileList:
            if file.endswith('.nii.gz'):
                originalFilePath = os.path.join(dirName, file)
                jsonSidecarPath = os.path.join(dirName,
                                               file.split(".")[0] + '.json')
                if not os.path.exists(jsonSidecarPath):
                    print("Missing jsonSidecar: " + jsonSidecarPath)

    # Checking participants.tsv contents
    schema = Schema([
        Column('participant_id',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('sex', [InListValidation(['M', 'F'])]),
        Column('age', [InRangeValidation(18, 60)]),
        Column('height', [MatchesPatternValidation(r"[0-9]|-")]),
        Column('weight', [MatchesPatternValidation(r"[0-9]|-")]),
        Column('date_of_scan', [
            DateFormatValidation('%Y-%m-%d') | MatchesPatternValidation(r"-")
        ]),
        Column('institution_id',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('institution',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('manufacturer',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('manufacturers_model_name',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('receive_coil_name',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('software_versions',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('researcher',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
    ])

    errors = schema.validate(tsv_file)
    print('\nChecking the contents of participants.tsv')
    if not errors:
        print("--> all good 👍")
    else:
        for error in errors:
            print(error)
Example #18
 Column(PVAL_DSET, [
     CanConvertValidation(DSET_TYPES[PVAL_DSET]),
     InInclusiveRangeValidation(0, 1)
 ],
        allow_empty=False),
 BETA_DSET:
 Column(BETA_DSET, [CanConvertValidation(float)], allow_empty=True),
 SNP_DSET:
 Column(SNP_DSET, [
     CanConvertValidation(DSET_TYPES[SNP_DSET]),
     MatchesPatternValidation(
         r'^chr[0-9XY]+_[0-9]+_[ACTGNactgn]+_[ACTGNactgn]+|LONG_STRING$')
 ],
        allow_empty=True),
 CHR_DSET:
 Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=False),
 BP_DSET:
 Column(BP_DSET, [
     CanConvertValidation(DSET_TYPES[BP_DSET]),
     InInclusiveRangeValidation(1, 999999999)
 ],
        allow_empty=False),
 EFFECT_DSET:
 Column(EFFECT_DSET,
        [MatchesPatternValidation(r'^[ACTGNactgn]+|LONG_STRING$')],
        allow_empty=True),
 OTHER_DSET:
 Column(OTHER_DSET,
        [MatchesPatternValidation(r'^[ACTGNactgn]+|LONG_STRING$')],
        allow_empty=True),
 FREQ_DSET:
Example #19
    def validate_and_annotate(self, file_path=''):
        ################################
        #   SCHEMAS VALIDATION    #
        ################################

        # FIX ME: this is a really janky way to do this. Passing empty string for file_path...
        # is a temp fix for review_changes re-validate data button causing an infinite loop when calling this with an argument :(
        if not file_path:
            file_path = self.dirname + '/for_review/intermediate_hospital_data.xlsx'

        try:
            df = pd.read_csv(file_path)
        except:
            try:
                df = pd.read_excel(file_path)
            except:
                print("UNACCEPTED FILE FORMAT")
                return

        df.fillna("NULL", inplace=True)

        #~ print(file_path)
        #~ print(df.info)

        schema = Schema([
            Column('N95PlanFitTested', [InListValidation(['Y', 'N', 'NULL'])]),
            Column('PARPsPlanTrained', [InListValidation(['Y', 'N', 'NULL'])]),
        ])

        errors = schema.validate(df, columns=schema.get_column_names())

        #######################################
        # Build excel worksheet w formatting
        #######################################
        save_path2 = self.dirname + '/for_review/'
        if not os.path.exists(save_path2):
            os.makedirs(save_path2)

        writer = pd.ExcelWriter(save_path2 + 'intermediate_hospital_data.xlsx',
                                engine='xlsxwriter')

        # Skip row 1 headers so we can add them manually with formatting
        df.to_excel(writer,
                    sheet_name='Sheet1',
                    startrow=1,
                    header=False,
                    index=False)

        workbook = writer.book
        worksheet = writer.sheets['Sheet1']

        ### WORKBOOK FORMATS ###
        yellow_highlight = workbook.add_format({'bg_color': '#FFEB9C'})

        header = workbook.add_format({
            'bold': True,
            'text_wrap': True,
            'valign': 'top',
            'fg_color': '#D7E4BC',
            'border': 1
        })
        ########################

        # Set column widths
        worksheet.set_column('A:II', 30)
        worksheet.set_default_row(hide_unused_rows=True)

        # Write the column headers with the defined format.
        for col_num, value in enumerate(df.columns.values):
            worksheet.write(0, col_num, value, header)

        # for storing error row numbers while we iterate thru error object
        # will use for hiding rows
        error_rows = []
        df_length = len(df)

        for error in errors:

            error_rows.append(error.row)

            row = error.row + 1
            column = df.columns.get_loc(error.column)

            # Comments
            worksheet.write_comment(row, column, error.message)

            # Highlights
            worksheet.conditional_format(row, column, row, column, {
                'type': 'no_errors',
                'format': yellow_highlight
            })

        #~ print(error_rows);

        # Hide Rows that don't contain errors
        for i in range(df_length + 1):
            if i not in error_rows:
                worksheet.set_row(i + 1, None, None, {'hidden': True})

        writer.save()

        # Pop up
        self.review_changes()
Example #20
    'smoking',
    'atrialfib',
    # ATCDs
    'stroke_pre',
    'tia_pre',
    'ich_pre',
    # general Treatment
    'treat_antipatelet',
    'treat_anticoagulant',
    # acute treatment
    'treat_ivt_before_ct',  # ie ivt was started before imaging
]

validation_schema = Schema([
    Column('age', [InRangeValidation(1, 120)]),
    Column('sex', [InListValidation([0, 1])]),
    Column('height', [InRangeValidation(50, 300)]),
    Column('weight', [InRangeValidation(10, 400)]),
    Column('onset_known_yes', [InListValidation([0, 1])]),
    Column('onset_known_wake_up', [InListValidation([0, 1])]),
    Column('onset_to_ct',
           [InRangeValidation(0, 43800)]),  # onset to CT time max is 1 month
    Column('NIH admission', [InRangeValidation(0, 43)]),
    Column('bp_syst', [InRangeValidation(0, 300)]),
    Column('bp_diast', [InRangeValidation(0, 300)]),
    Column('glucose', [InRangeValidation(0.1, 30)]),
    Column('créatinine', [InRangeValidation(0.1, 1000)]),
    Column('hypertension', [InListValidation([0, 1])]),
    Column('diabetes', [InListValidation([0, 1])]),
    Column('hyperlipidemia', [InListValidation([0, 1])]),
    Column('smoking', [InListValidation([0, 1])]),