def validate_variant(conn, args, filepath):
    """Validates input file for variant data

    This function validates that the contents of a file to contain variant
    data. If an error is encountered, throw an exception.

    Args:
        conn (psycopg2.extensions.connection): psycopg2 connection
        args (ArgumentParser namespace): user-defined arguments
        filepath (str): location of input file
    """
    # chromosome must be numeric; positions must be numeric and unique
    schema = Schema([
        Column('chr', [
            CanConvertValidation(int)
        ]),
        Column('pos', [
            CanConvertValidation(int),
            IsDistinctValidation()
        ])
    ])

    df = pd.read_csv(filepath, sep='\t', header=None)
    if len(df.columns) != 2:
        # BUG FIX: corrected typos in the user-facing message
        # ("Excepted" -> "Expected", "chromsome" -> "chromosome")
        raise Exception(f"Invalid file format. Expected 2 columns, found {len(df.columns)} columns. Columns should consist of chromosome number and SNP position. Filepath: {filepath}")
    df.columns = ['chr', 'pos']

    err = schema.validate(df)
    if err:
        for e in err:
            logging.error(f"Error encountered while validating: {filepath}")
            raise Exception(e)
def validate_results(conn, args, filepath):
    """Validates input file for GWAS result data

    This function validates that the contents of a file to contain GWAS
    result data. If an error is encountered, throw an exception.

    Args:
        conn (psycopg2.extensions.connection): psycopg2 connection
        args (ArgumentParser namespace): user-defined arguments
        filepath (str): location of input file
    """
    df = pd.read_csv(filepath)

    # Build one Column per input column; attach type checks only to the
    # columns recognized by fuzzy, case-insensitive name matching.
    schema_columns = []
    for col in df.columns:
        checks = []
        # Integer-like columns (SNP/chr/pos counts)
        if re.match("(SNP)|(chr)|(pos)|(nSNPs)", col, re.IGNORECASE):
            checks.append(CanConvertValidation(int))
        # Any p-value flavored column must be castable to a float
        if re.match("((null)?pval(ue)?)", col, re.IGNORECASE):
            checks.append(CanConvertValidation(float))
        schema_columns.append(Column(col, checks))

    errors = Schema(schema_columns).validate(df)
    if errors:
        for e in errors:
            logging.error(f"Error encountered while validating: {filepath}")
            raise Exception(e)
def validate_genotype(conn, args, filepath):
    """Validates input file for genotype data

    This function validates that the contents of a file to contain genotype
    data. If an error is encountered, throw an exception.

    Args:
        conn (psycopg2.extensions.connection): psycopg2 connection
        args (ArgumentParser namespace): user-defined arguments
        filepath (str): location of input file
    """
    # Allow for users to skip this validation step because it is time consuming
    if args.skip_genotype_validation is True:
        return

    schema_columns = [
        Column('row_number', [
            CanConvertValidation(int) & IsDistinctValidation()
        ])
    ]

    # Get the number of lines from the .pos counterpart file
    pos_filepath = '.'.join([filepath, 'pos'])
    if not os.path.exists(pos_filepath):
        # BUG FIX: message typo ("Count not" -> "Could not")
        raise FileNotFoundError(f"Could not locate the position counterpart file for {filepath}")
    nPositions = len(pd.read_csv(pos_filepath, header=None).index)

    for n in range(nPositions):
        schema_columns.append(
            Column(f'pos_{n}', [
                CanConvertValidation(int)
                # BUG FIX: `x.int in [-1,0,1,2]` raises AttributeError
                # (pandas Series has no `.int`); use an element-wise
                # membership test over the valid genotype codes instead.
                & CustomSeriesValidation(lambda x: x.isin([-1, 0, 1, 2]),
                                         'Incorrectly coded value.')
            ])
        )
    schema = Schema(schema_columns)

    df = pd.read_csv(filepath, sep='\t', header=None)

    err = schema.validate(df)
    if err:
        for e in err:
            logging.error(f"Error encountered while validating: {filepath}")
            raise Exception(e)
def __init__(self):
    """Build the validation schema for the CAERS adverse-event report file.

    Each Column corresponds to one expected input column; Columns built
    without validators accept any value. Date columns are validated by
    attempting to parse them with ``self.parse_date``.
    """
    self.schemas = Schema(
        [
            # Report identifier must be numeric
            Column("RA_Report #", [CanConvertValidation(int)]),
            Column("RA_CAERS Created Date", [CanCallValidation(self.parse_date)]),
            # Event start date is optional
            Column(
                "AEC_Event Start Date",
                [CanCallValidation(self.parse_date)],
                allow_empty=True,
            ),
            Column(
                "PRI_Product Role", [InListValidation(["Suspect", "Concomitant"])]
            ),
            # Free-form product / industry descriptors — no validation
            Column("PRI_Reported Brand/Product Name"),
            Column("PRI_FDA Industry Code"),
            Column("PRI_FDA Industry Name"),
            Column("CI_Age at Adverse Event"),
            Column(
                "CI_Age Unit",
                [
                    InListValidation(
                        ["Year(s)", "Decade(s)", "Month(s)", "Week(s)", "Day(s)"]
                    )
                ],
            ),
            Column("CI_Gender", [InListValidation(["Female", "Male"])]),
            Column("AEC_One Row Outcomes"),
            Column("SYM_One Row Coded Symptoms"),
        ]
    )
def compile_field_validator(self, field):
    """Build the list of validators for a single field definition.

    Args:
        field: mapping with keys 'source_field_type', 'min' and 'max'.

    Returns:
        list: validators for the field's type and (optional) numeric range.
    """
    validators = []
    source_type = field['source_field_type'].lower()

    # Type check based on the declared source type
    if source_type == 'int':
        validators.append(CanConvertValidation(int))
    elif source_type == 'float':
        validators.append(CanConvertValidation(float))

    # Range check: use whichever bounds are present, defaulting the
    # missing side to +/- infinity.
    has_min = pd.notnull(field['min'])
    has_max = pd.notnull(field['max'])
    if has_min and has_max:
        validators.append(InRangeValidation(field['min'], field['max']))
    elif has_min:
        validators.append(InRangeValidation(field['min'], math.inf))
    elif has_max:
        validators.append(InRangeValidation(-math.inf, field['max']))

    return validators
class AllowEmptyColumn(unittest.TestCase):
    """
    Test a column with one single validation that allows empty columns
    """
    NAME = 'col1'
    # allow_empty=True means empty cells bypass the int-conversion check
    col = Column(NAME, [CanConvertValidation(int)], allow_empty=True)
    ser = pd.Series([
        '',
    ])

    def test_outputs(self):
        # An empty cell in an allow_empty column must yield zero errors
        results = self.col.validate(self.ser)
        self.assertEqual(len(results), 0, 'allow_empty is not allowing empty columns')
def validate_phenotype(conn, args, filepath):
    """Validates input file for phenotype data

    This function validates that the contents of a file to contain phenotype
    data. If an error is encountered, throw an exception.

    Args:
        conn (psycopg2.extensions.connection): psycopg2 connection
        args (ArgumentParser namespace): user-defined arguments
        filepath (str): location of input file
    """
    df = pd.read_csv(filepath)
    nrows, ncols = df.shape
    nrows += 1  # include the header in the row count

    # The leading column must identify the genotype/pedigree/line
    if re.match('(genotype)|(pedigree)|(line)', df.columns[0], re.IGNORECASE) is None:
        raise Exception("Genotype/pedigree/line should be the first column in the phenotype file")

    # Normalize the first column's name for later reference
    df.rename(columns={f'{df.columns[0]}': 'genotype'}, inplace=True)

    # NOTE(tparker): Treating every phenotype as numeric may not always be
    # true. If any phenotypes are categorical or string-valued this would
    # fail; validating those may require a user-provided dtype list.
    schema_columns = [Column('genotype', [IsDistinctValidation()])]
    schema_columns.extend(
        Column(df.columns[i], [CanConvertValidation(float)])
        for i in range(1, ncols)
    )

    errors = Schema(schema_columns).validate(df)
    if errors:
        for e in errors:
            logging.error(f"Error encountered while validating: {filepath}")
            raise Exception(e)
class SingleValidationColumn(unittest.TestCase):
    """
    Test a column with one single validation
    """
    NAME = 'col1'
    # Non-numeric strings against an int-conversion check: every row errors
    col = Column(NAME, [CanConvertValidation(int)], allow_empty=False)
    ser = pd.Series(['a', 'b', 'c'])

    def test_name(self):
        self.assertEqual(self.col.name, self.NAME, 'A Column does not store its name correctly')

    def test_outputs(self):
        results = self.col.validate(self.ser)
        self.assertEqual(len(results), len(self.ser), 'A Column produces the wrong number of errors')
        # BUG FIX: the original looped over range(2) and never checked the
        # last row, despite asserting errors are reported "for every row".
        for i in range(len(self.ser)):
            self.assertTrue(any([r.row == i for r in results]),
                            'A Column does not report errors for every row')
def validate_population_structure(conn, args, filepath):
    """Validates input file for population structure data

    This function validates that the contents of a file to contain
    population structure data. If an error is encountered, throw an
    exception.

    Args:
        conn (psycopg2.extensions.connection): psycopg2 connection
        args (ArgumentParser namespace): user-defined arguments
        filepath (str): location of input file
    """
    df = pd.read_csv(filepath)
    nrows, ncols = df.shape
    nrows += 1  # include the header rows in the count
    logging.debug(f'Population structure columns: {df.columns}')
    logging.debug(f"Population structure dimensions: <{nrows}, {ncols}>")

    # Pedigrees must be unique; every remaining column must parse as float
    schema_columns = [Column('Pedigree', [IsDistinctValidation()])]
    schema_columns.extend(
        Column(df.columns[i], [CanConvertValidation(float)])
        for i in range(1, ncols)
    )

    errors = Schema(schema_columns).validate(df)
    if errors:
        for e in errors:
            logging.error(f"Error encountered while validating: {filepath}")
            raise Exception(e)
def validate_kinship(conn, args, filepath):
    """Validates input file for kinship data

    This function validates that the contents of a file to contain kinship
    data. If an error is encountered, throw an exception.

    Args:
        conn (psycopg2.extensions.connection): psycopg2 connection
        args (ArgumentParser namespace): user-defined arguments
        filepath (str): location of input file
    """
    df = pd.read_csv(filepath)
    nrows, ncols = df.shape
    # The first column header is blank by default; give it a usable name
    df.rename(columns={"Unnamed: 0": "line_name"}, inplace=True)
    nrows += 1  # include the header row in the count
    logging.debug(f"Dimensions of kinship matrix: <{nrows}, {ncols}>")

    # Line names must be unique; kinship coefficients must parse as floats
    schema_columns = [Column('line_name', [IsDistinctValidation()])]
    schema_columns.extend(
        Column(df.columns[i], [CanConvertValidation(float)])
        for i in range(1, ncols)
    )

    errors = Schema(schema_columns).validate(df)
    if errors:
        for e in errors:
            logging.error(f"Error encountered while validating: {filepath}")
            raise Exception(e)
# Genome build number -> assembly name
BUILD_MAP = {
    '28': 'NCBI28',
    '29': 'NCBI29',
    '30': 'NCBI30',
    '31': 'NCBI31',
    '33': 'NCBI33',
    '34': 'NCBI34',
    '35': 'NCBI35',
    '36': 'NCBI36',
    '37': 'GRCh37',
    '38': 'GRCh38',
}

# Accepted input file extensions (plain and gzipped variants)
VALID_FILE_EXTENSIONS = [
    ".txt", ".tsv", ".csv",
    ".tsv.gz", ".csv.gz",
    "gz", "gzip",
    ".tsv.gzip", ".csv.gzip",
]

# Baseline per-column validators; most columns tolerate empty cells.
GENERIC_VALIDATORS = {
    SNP_DSET: Column(
        SNP_DSET,
        [CanConvertValidation(DSET_TYPES[SNP_DSET]),
         MatchesPatternValidation(r'^rs[0-9]+$')],  # rsID format
        allow_empty=True),
    CHR_DSET: Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=True),
    BP_DSET: Column(
        BP_DSET,
        [CanConvertValidation(DSET_TYPES[BP_DSET]),
         InInclusiveRangeValidation(1, 999999999)],  # plausible base-pair range
        allow_empty=True),
    EFFECT_WEIGHT_DSET: Column(EFFECT_WEIGHT_DSET, [CanConvertValidation(DSET_TYPES[EFFECT_WEIGHT_DSET])], allow_empty=True),
    OR_DSET: Column(OR_DSET, [CanConvertValidation(DSET_TYPES[OR_DSET])], allow_empty=True),
    HR_DSET: Column(HR_DSET, [CanConvertValidation(DSET_TYPES[HR_DSET])], allow_empty=True),
    BETA_DSET: Column(BETA_DSET, [CanConvertValidation(DSET_TYPES[BETA_DSET])], allow_empty=True),
    EFFECT_DSET: Column(EFFECT_DSET, [MatchesPatternValidation(r'^[ACTGN]+$')], allow_empty=False),
    REF_DSET: Column(REF_DSET, [MatchesPatternValidation(r'^[ACTGN]+$')], allow_empty=True),
    FREQ_DSET: Column(FREQ_DSET, [CanConvertValidation(DSET_TYPES[FREQ_DSET])], allow_empty=True),
    LOCUS_DSET: Column(
        LOCUS_DSET,
        [CanConvertValidation(DSET_TYPES[LOCUS_DSET]),
         LeadingWhitespaceValidation(),
         TrailingWhitespaceValidation()],
        allow_empty=True),
}

# SNP files reuse the generic validators, but the SNP column is mandatory.
SNP_VALIDATORS = dict(GENERIC_VALIDATORS)
SNP_VALIDATORS[SNP_DSET] = Column(
    SNP_DSET,
    [CanConvertValidation(DSET_TYPES[SNP_DSET]),
     MatchesPatternValidation(r'^rs[0-9]+$')],
    allow_empty=False)
def validate(df):
    """Validate the uploaded dataframe and return a dict of warning strings.

    Returns:
        dict: keys are translated, human-readable warning messages; each
        value is the literal "<br>" used as an HTML line break downstream.
    """
    d_error = {}
    # Allowed values for each choice-restricted column
    list_bu = [x[0] for x in BU_CHOICES]
    list_rd = [x[0] for x in RD_CHOICES]
    list_dept = [x[0] for x in DEPT_CHOICES]
    list_hplevel = [x[0] for x in HPLEVEL_CHOICES]
    list_province = [x[0] for x in PROVINCE_CHOICES]
    list_title = [x[0] for x in TITLE_CHOICES]
    # Rejects NaN cells (message: "this field cannot be empty")
    NullValidation = CustomElementValidation(lambda d: d is not np.nan, "该字段不能为空")
    schema = Schema([
        Column("南北中国", [InListValidation(list_bu)]),
        Column("区域", [InListValidation(list_rd)]),
        Column("大区", [
            LeadingWhitespaceValidation(), TrailingWhitespaceValidation(), NullValidation
        ]),
        Column("地区经理", [
            LeadingWhitespaceValidation(), TrailingWhitespaceValidation(), NullValidation
        ]),
        Column("负责代表", [
            LeadingWhitespaceValidation(), TrailingWhitespaceValidation(), NullValidation
        ]),
        # Hospital code: an 'H' followed by exactly nine digits
        Column(
            "医院编码",
            [
                LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation(),
                NullValidation,
                MatchesPatternValidation(r"^[H]{1}(\d){9}$"),
            ],
        ),
        Column("医院全称", [
            LeadingWhitespaceValidation(), TrailingWhitespaceValidation(), NullValidation
        ]),
        Column("省/自治区/直辖市", [InListValidation(list_province)]),
        Column("是否双call", [InListValidation(["是", "否"])]),
        Column("医院级别", [InListValidation(list_hplevel)]),
        Column("开户进展", [InListValidation(["已开户", "未开户"])]),
        # Customer names must be unique
        Column("客户姓名", [
            LeadingWhitespaceValidation(), TrailingWhitespaceValidation(), IsDistinctValidation()
        ]),
        Column("所在科室", [InListValidation(list_dept)]),
        Column("职称", [InListValidation(list_title)]),
        Column("月出诊次数(半天计)", [CanConvertValidation(int), InRangeValidation(0, 63)]),
        Column("每半天\n门诊量", [CanConvertValidation(int), InRangeValidation(0, )]),
        Column("相关病人\n比例(%)\n建议比例:40%-80%", [CanConvertValidation(int), InRangeValidation(0, 101)]),
        Column("备注"),
    ])

    errors = schema.validate(df.loc[:, COL])
    for error in errors:
        str_warning = str(error)
        # Translate pandas_schema's English wording via the D_TRANSLATE map
        for term in D_TRANSLATE:
            str_warning = str_warning.replace(term, D_TRANSLATE[term])
        # Rewrite the 0-based row index into a user-facing row reference
        findword = r": [0-9]\d*"
        str_warning = re.sub(findword, row_refined, str_warning)
        d_error[str_warning] = "<br>"

    # Cross-column consistency checks between related fields
    d_error = {**d_error,
               **check_inconsist(df, "医院编码", "医院全称", "both")}
    d_error = {**d_error, **check_inconsist(df, "区域", "大区", "right")}
    d_error = {**d_error, **check_inconsist(df, "大区", "地区经理", "right")}
    d_error = {**d_error, **check_inconsist(df, "地区经理", "负责代表", "right")}
    d_error = {**d_error, **check_inconsist(df, "医院编码", "省/自治区/直辖市", "left")}
    d_error = {**d_error, **check_inconsist(df, "医院编码", "是否双call", "left")}
    d_error = {**d_error, **check_inconsist(df, "医院编码", "医院级别", "left")}
    d_error = {**d_error, **check_inconsist(df, "医院编码", "开户进展", "left")}
    d_error = {**d_error, **check_inconsist(df, "医院全称", "省/自治区/直辖市", "left")}
    d_error = {**d_error, **check_inconsist(df, "医院全称", "是否双call", "left")}
    d_error = {**d_error, **check_inconsist(df, "医院全称", "医院级别", "left")}
    d_error = {**d_error, **check_inconsist(df, "医院全称", "开户进展", "left")}
    # Check hospital level against department for contradictions
    d_error = {**d_error, **check_hplevel_with_dept(df)}

    return d_error
# Rejects cells containing the empty string
EmptyStringValidation = CustomElementValidation(lambda d: d != "", "This field cannot be empty")

# Schema for NIPT result rows; numeric QC metrics must parse as numbers.
nipt_results_schema = Schema([
    Column("SampleID", [TrailingWhitespaceValidation(), EmptyStringValidation]),
    Column("SampleType", []),
    Column("Description", []),
    Column("SampleProject", [TrailingWhitespaceValidation(), EmptyStringValidation]),
    Column("Index1", []),
    Column("Index2", []),
    Column("Library_nM", []),
    Column("QCFlag", []),
    # Z-scores for the screened chromosomes
    Column("Zscore_13", [CanConvertValidation(float)]),
    Column("Zscore_18", [CanConvertValidation(float)]),
    Column("Zscore_21", [CanConvertValidation(float)]),
    Column("Zscore_X", [CanConvertValidation(float)]),
    # Chromosomal ratios
    Column("Ratio_13", [CanConvertValidation(float)]),
    Column("Ratio_18", [CanConvertValidation(float)]),
    Column("Ratio_21", [CanConvertValidation(float)]),
    Column("Ratio_X", [CanConvertValidation(float)]),
    Column("Ratio_Y", [CanConvertValidation(float)]),
    Column("MappedReads", [CanConvertValidation(int)]),
    Column("GC_Dropout", [CanConvertValidation(float)]),
    Column("AT_Dropout", [CanConvertValidation(float)]),
    Column("Chr1_Ratio", [CanConvertValidation(float)]),
    Column("Chr2_Ratio", [CanConvertValidation(float)]),
    Column("Chr3_Ratio", [CanConvertValidation(float)]),
    Column("Chr4_Ratio", [CanConvertValidation(float)]),
    # NOTE(review): schema continues beyond this excerpt
    # Tail of the genome build map (NCBI/GRC assembly names);
    # NOTE(review): the dict opening lies outside this excerpt
    '34': 'NCBI34',
    '35': 'NCBI35',
    '36': 'NCBI36',
    '37': 'GRCh37',
    '38': 'GRCh38'
}

# Accepted input file extensions (plain and gzipped variants)
VALID_FILE_EXTENSIONS = [
    ".txt", ".tsv", ".csv", ".tsv.gz", ".csv.gz", "gz", "gzip", ".tsv.gzip", ".csv.gzip"
]

# Baseline per-column validators; these columns tolerate empty cells.
GENERIC_VALIDATORS = {
    SNP_DSET: Column(SNP_DSET, [
        CanConvertValidation(DSET_TYPES[SNP_DSET]),
        MatchesPatternValidation(r'^rs[0-9]+$')  # rsID format
    ], allow_empty=True),
    CHR_DSET: Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=True),
    BP_DSET: Column(BP_DSET, [
        CanConvertValidation(DSET_TYPES[BP_DSET]),
        InInclusiveRangeValidation(1, 999999999)  # plausible base-pair range
    ], allow_empty=True),
    EFFECT_WEIGHT_DSET: Column(EFFECT_WEIGHT_DSET, [CanConvertValidation(DSET_TYPES[EFFECT_WEIGHT_DSET])], allow_empty=True),
    # NOTE(review): dict continues beyond this excerpt
        # Dates must be strictly ordered and contain no duplicates;
        # NOTE(review): the validator's opening lies outside this excerpt
        lambda x: x.is_monotonic_increasing and x.is_unique,
        'date is not monotonic')
])

# Whitespace hygiene applied to every value column
default_value_validators = [
    LeadingWhitespaceValidation(),
    TrailingWhitespaceValidation()
]

# One schema per time-series key; cumulative counts must never decrease.
schemas_by_key = {
    'cases': Schema([
        date_validator,
        Column('cases', [
            *default_value_validators,
            CanConvertValidation(int) & CustomSeriesValidation(
                lambda x: x.is_monotonic_increasing, 'cases is not monotonic')
        ])
    ]),
    'deaths': Schema([
        date_validator,
        Column('deaths', [
            *default_value_validators,
            CanConvertValidation(int) & CustomSeriesValidation(
                lambda x: x.is_monotonic_increasing, 'deaths is not monotonic')
        ])
    ]),
    # Recovered counts are not required to be monotonic
    'recovered': Schema([date_validator, Column('recovered', [*default_value_validators])]),
#VALIDATORS = { # PVAL_DSET: Column(PVAL_DSET, [CanConvertValidation(DSET_TYPES[PVAL_DSET]), InInclusiveRangeValidation(0, 1)], allow_empty=False), # OR_DSET: Column(OR_DSET, [CanConvertValidation(DSET_TYPES[OR_DSET])], allow_empty=True), # RANGE_U_DSET: Column(RANGE_U_DSET, [CanConvertValidation(float)], allow_empty=True), # RANGE_L_DSET: Column(RANGE_L_DSET, [CanConvertValidation(float)], allow_empty=True), # BETA_DSET: Column(BETA_DSET, [CanConvertValidation(float)], allow_empty=True), # SE_DSET: Column(SE_DSET, [CanConvertValidation(float)], allow_empty=True), # EFFECT_DSET: Column(EFFECT_DSET, [MatchesPatternValidation(r'^[ACTGNactgn]+$')], allow_empty=True), # OTHER_DSET: Column(OTHER_DSET, [MatchesPatternValidation(r'^[ACTGNactgn]+$')], allow_empty=True), # FREQ_DSET: Column(FREQ_DSET, [CanConvertValidation(float)], allow_empty=True) #} SNP_VALIDATORS = { SNP_DSET: Column(SNP_DSET, [ CanConvertValidation(DSET_TYPES[SNP_DSET]), MatchesPatternValidation(r'^rs[0-9]+$') ], allow_empty=False), CHR_DSET: Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=True), BP_DSET: Column(BP_DSET, [ CanConvertValidation(DSET_TYPES[BP_DSET]), InInclusiveRangeValidation(1, 999999999) ], allow_empty=True), PVAL_DSET: Column(PVAL_DSET, [ CanConvertValidation(DSET_TYPES[PVAL_DSET]), InInclusiveRangeValidation(0, 1)
    # Tail of the genome build map (NCBI/GRC assembly names);
    # NOTE(review): the dict opening lies outside this excerpt
    '34': 'NCBI34',
    '35': 'NCBI35',
    '36': 'NCBI36',
    '37': 'GRCh37',
    '38': 'GRCh38'
}

VALIDATORS = {
    # how do we handle the values that are like chr:bp:allele:snp?
    SNP_DSET: Column(SNP_DSET, [MatchesPatternValidation(r'rs[0-9]+') ]),
    # p-values live in [0, 1]
    PVAL_DSET: Column(
        PVAL_DSET,
        [CanConvertValidation(float), InInclusiveRangeValidation(0, 1)]
        #CustomElementValidation(lambda s: float(s) >= 0 and float(s) <= 1, 'outside the range of 0 to 1')]
    ),
    CHR_DSET: Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=True),
    BP_DSET: Column(
        BP_DSET,
        [CanConvertValidation(int) & InInclusiveRangeValidation(1, 999999999)],
        allow_empty=True),
    OR_DSET: Column(OR_DSET, [CanConvertValidation(float)], allow_empty=True),
    RANGE_U_DSET: Column(RANGE_U_DSET, [CanConvertValidation(float)], allow_empty=True),
    RANGE_L_DSET:
    # NOTE(review): definition continues beyond this excerpt
# Validation primitives from pandas_schema plus a project range helper
from pandas_schema.validation import MatchesPatternValidation, InRangeValidation, InListValidation, CustomSeriesValidation, CustomElementValidation, CanConvertValidation, IsDtypeValidation, CanCallValidation
from validate.helpers import InInclusiveRangeValidation
from validate.common_constants import *

# Columns expected in files to be loaded
VALID_COLS = TO_LOAD_DSET_HEADERS_DEFAULT

# Accepted chromosome identifiers; presumably 23/24/25 are numeric codes
# for the sex/mitochondrial chromosomes -- TODO confirm with the loader
VALID_CHROMOSOMES = [
    '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
    '11', '12', '13', '14', '15', '16', '17', '18', '19', '20',
    '21', '22', '23', '24', '25', 'X', 'Y'
]

VALIDATORS = {
    # p-values are mandatory and must live in [0, 1]
    PVAL_DSET: Column(PVAL_DSET, [
        CanConvertValidation(DSET_TYPES[PVAL_DSET]),
        InInclusiveRangeValidation(0, 1)
    ], allow_empty=False),
    BETA_DSET: Column(BETA_DSET, [CanConvertValidation(float)], allow_empty=True),
    # Variant IDs in chr_pos_ref_alt form (or the LONG_STRING placeholder)
    SNP_DSET: Column(SNP_DSET, [
        CanConvertValidation(DSET_TYPES[SNP_DSET]),
        MatchesPatternValidation(
            r'^chr[0-9XY]+_[0-9]+_[ACTGNactgn]+_[ACTGNactgn]+|LONG_STRING$')
    ], allow_empty=True),
    CHR_DSET: Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=False),
    BP_DSET:
    # NOTE(review): definition continues beyond this excerpt
from pandas_schema import Column, Schema
from pandas_schema.validation import MatchesPatternValidation, CanConvertValidation, CustomSeriesValidation
import pandas as pd

# A cell passes when it converts to an integer, OR when it is longer than
# one character AND matches the pattern 'a'.
is_integer = CanConvertValidation(int)
is_long_enough = CustomSeriesValidation(
    lambda x: x.str.len() > 1, 'Doesn\'t have more than 1 character')
contains_a = MatchesPatternValidation('a')

schema = Schema([
    Column('col1', [is_integer | (is_long_enough & contains_a)])
])

test_data = pd.DataFrame({'col1': ['an', '13', 'a', '8', 'the']})

# Report every failing cell value
for error in schema.validate(test_data):
    print('"{}" failed!'.format(error.value))
from pandas_schema import Column, Schema
from pandas_schema.validation import (LeadingWhitespaceValidation, TrailingWhitespaceValidation, CanConvertValidation, MatchesPatternValidation, CustomSeriesValidation, InRangeValidation, InListValidation, DateFormatValidation)

# Schema for the study metadata sheet. Required text fields combine a
# non-null check with "not the empty string" (~InListValidation(['']));
# optional fields accept '' or a pattern match.
study_schema = Schema([
    Column('study_id', [CustomSeriesValidation(lambda x: ~x.isnull(), 'A value is required for the study_id column.') & ~InListValidation([''])]),
    Column('pi_name', [CustomSeriesValidation(lambda x: ~x.isnull(), 'A value is required for the pi_name column.') & ~InListValidation([''])]),
    Column('sample_type', [InListValidation(['wmgx', 'wmtx', '16S', 'other'])]),
    # NCBI BioProject accession, e.g. PRJNA-style identifiers
    Column('bioproject_accession', [InListValidation(['']) | MatchesPatternValidation(r'PRJ\w+\d+')]),
    Column('geo_loc_name', [InListValidation(['']) | MatchesPatternValidation(r'\w+:\w+:\w+')]),
    Column('analysis_desc', [InListValidation(['']) | CanConvertValidation(str)]),
    Column('sequencing_facility', [LeadingWhitespaceValidation()]),
    # Environment/tissue fields reference ENVO / BTO ontology terms
    Column('env_biom', [MatchesPatternValidation(r'ENVO:\d+') | InListValidation([''])]),
    Column('env_feature', [MatchesPatternValidation(r'ENVO:\d+') | InListValidation([''])]),
    Column('env_material', [MatchesPatternValidation(r'ENVO:\d+') | InListValidation([''])]),
    Column('host_tissue_sampled', [InListValidation(['']) | MatchesPatternValidation(r'BTO:\d+')]),
    Column('animal_vendor', [LeadingWhitespaceValidation()]),
    Column('paired', [InListValidation(['true', 'false'])]),
    Column('paired_id', [InListValidation(['']) | MatchesPatternValidation(r'[a-zA-Z0-9_.]+')]),
    Column('pi_email', [CustomSeriesValidation(lambda x: ~x.isnull(), 'A value is required for the pi_email column.') & ~InListValidation([''])])
])

# Schema for the per-sample sheet
sample_schema = Schema([
    Column('host_subject_id', [MatchesPatternValidation(r'\w+', message='Host Subject ID may only contain alphanumeric characters.')]),
    # NOTE(review): schema continues beyond this excerpt