Example #1
def validate_data(df):
    schema = Schema([
        Column('brand', null_validation + string_validation),
        Column('gear', null_validation + string_validation),
        Column('model', null_validation + string_validation),
        Column('price',
               null_validation + int_validation + price_min_validation +
               price_max_validation),
        Column('fuel', null_validation + string_validation),
        Column('mileage',
               null_validation + float_validation + mileage_max_validation),
        Column(
            'hp', null_validation + float_validation + hp_min_validation +
            hp_max_validation),
        Column('type', null_validation + string_validation),
        Column('geo', null_validation + string_validation),
        Column('model_year', null_validation + float_validation),
    ])
    try:
        errors = schema.validate(df)
        for e in errors:
            print(e)
    except Exception:
        return False
    return not errors
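
The validator lists used above (null_validation, string_validation, int_validation, and friends) are defined outside this snippet. A minimal sketch of what they might look like, assuming each is a plain Python list of pandas_schema validations so that + concatenates them; the bodies below are illustrative guesses, not the original definitions:

import pandas as pd
from pandas_schema.validation import CustomElementValidation

def _is_int(value):
    # True when the value can be parsed as an integer
    try:
        int(value)
        return True
    except (TypeError, ValueError):
        return False

# Plain lists, so `+` builds a combined list of validations for a Column
null_validation = [CustomElementValidation(lambda x: not pd.isnull(x), 'this field cannot be null')]
string_validation = [CustomElementValidation(lambda x: isinstance(x, str), 'this field must be a string')]
int_validation = [CustomElementValidation(_is_int, 'this field must be an integer')]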
Example #2

 def validate_data(self):
     self.setup_field_validation()
     if not self.open_file_and_check_for_squareness():
         logger.error(
             "Please fix the table. Some rows have different numbers of columns to the header"
         )
         logger.info(
             "Rows with different numbers of columns to the header are not validated"
         )
     for chunk in self.df_iterator():
         to_validate = chunk[self.cols_to_read]
         to_validate.columns = self.cols_to_validate  # sets the headers to standard format if needed
         # validate the snp column if present
         if SNP_DSET in self.header:
             self.schema = Schema(
                 [SNP_VALIDATORS[h] for h in self.cols_to_validate])
             errors = self.schema.validate(to_validate)
             self.store_errors(errors, self.snp_errors)
         if CHR_DSET in self.header and BP_DSET in self.header:
             self.schema = Schema(
                 [POS_VALIDATORS[h] for h in self.cols_to_validate])
             errors = self.schema.validate(to_validate)
             self.store_errors(errors, self.pos_errors)
         self.process_errors()
         if len(self.bad_rows) >= self.error_limit:
             break
     if not self.bad_rows:
         logger.info("File is valid")
         return True
     else:
         logger.info(
             "File is invalid - {} bad rows, limit set to {}".format(
                 len(self.bad_rows), self.error_limit))
         return False
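
Two details worth noting in this example: validating chunk by chunk keeps memory use bounded on large files, and the error_limit check stops the loop early once enough bad rows have accumulated to declare the file invalid.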
Example #3
def validate_results(conn, args, filepath):
  """Validates input file for GWAS result data

  This function validates that the contents of a file contain GWAS result data.
  If an error is encountered, throw an exception.

  Args:
    conn (psycopg2.extensions.connection): psycopg2 connection
    args (ArgumentParser namespace): user-defined arguments
    filepath (str): location of input file
  
  """
  df = pd.read_csv(filepath)
  # For each column, add it to the schema, and then for known ones, add the 
  # schema validation. Use fuzzy comparisons when possible
  schema_columns = []
  for col in df.columns:
    validators = []
    if re.match("(SNP)|(chr)|(pos)|(nSNPs)", col, re.IGNORECASE):
      validators.append(CanConvertValidation(int))
    # Look for any of the p-values and make sure that they can be cast as a float
    if re.match("((null)?pval(ue)?)", col, re.IGNORECASE):
      validators.append(CanConvertValidation(float))
    
    schema_columns.append(Column(col, validators))
  schema = Schema(schema_columns)

  err = schema.validate(df)
  if err:
    for e in err:
      logging.error(f"Error encountered while validating {filepath}: {e}")
    raise Exception(f"Validation failed: {filepath}")
Example #4
    def test_schema(self):
        """
        Test this validation inside a schema, to ensure we get helpful error messages.
        In particular, we want to make sure that a ValidationWarning without a row number won't break the schema
        """
        df = pd.DataFrame(
            data={
                'wrong_dtype1': ['not_an_int'],
                'wrong_dtype2': [123],
                'wrong_dtype3': [12.5]
            })

        schema = Schema([
            Column('wrong_dtype1', [IsDtypeValidation(dtype('int64'))]),
            Column('wrong_dtype2', [IsDtypeValidation(dtype('float64'))]),
            Column('wrong_dtype3', [IsDtypeValidation(dtype('int64'))]),
        ])

        errors = schema.validate(df)

        self.assertEqual(
            sorted([str(x) for x in errors]),
            sorted([
                'The column wrong_dtype1 has a dtype of object which is not a subclass of the required type int64',
                'The column wrong_dtype2 has a dtype of int64 which is not a subclass of the required type float64',
                'The column wrong_dtype3 has a dtype of float64 which is not a subclass of the required type int64'
            ]))
Example #5
def validate_line(conn, args, filepath):
  """Validates input file for line data

  This function validates that the contents of a file contain line data.
  If an error is encountered, throw an exception.

  Args:
    conn (psycopg2.extensions.connection): psycopg2 connection
    args (ArgumentParser namespace): user-defined arguments
    filepath (str): location of input file
  
  """
  schema = Schema([
    Column('line_name', [
      IsDistinctValidation()
    ])
  ])

  df = pd.read_csv(filepath, header=None)

  if len(df.columns) != 1:
    raise Exception(f"Invalid file format. Excepted 1 column found {len(df.columns)} columns. This file should be a single column of each line. Each entry should be distinct.")
  
  df.columns = [ 'line_name' ]
  err = schema.validate(df)

  if err:
    for e in err:
      logging.error(f"Error encountered while validating {filepath}: {e}")
    raise Exception(f"Validation failed: {filepath}")
Example #6
    def __init__(
        self,
        args: Namespace,
        sources: Dict[str, Any],
        schema: List[Tuple[str, np.generic]],
        destinations: Dict[str, Any],
        stage: str,
        task: str,
    ):
        """Initiate parameters and client libraries for ETL task.

        :param args: args passed from command line,
        see `get_arg_parser()`
        :param sources: data source to be extracted,
        specified in task config, see `configs/*.py`
        :param schema: the target schema to load to.
        :param destinations: destinations to load data to,
        specified in task config, see `configs/*.py`
        :param stage: the stage of the loaded data, could be staging/production.
        :param task: the name of the task.
        """
        # Clear cached files
        if args.rm:
            for source in sources:
                files = []
                files += glob.glob(
                    get_path_format(True).format(
                        prefix=destinations["fs"]["prefix"],
                        stage="raw",
                        task=args.task,
                        source=source,
                    ))
                files += glob.glob(
                    get_path_format(True).format(
                        prefix=destinations["fs"]["prefix"],
                        stage=stage,
                        task=args.task,
                        source=source,
                    ))
                for f in files:
                    log.info("Removing cached file: %s" % f)
                    os.remove(f)
        self.task = task
        self.stage = stage
        self.args = args
        self.period = args.period
        self.current_date = args.date
        self.last_month = lookback_dates(args.date, args.period)
        self.sources = sources
        coltypes = []
        for name, np_type in schema:
            coltypes.append(Column(name, [IsDtypeValidation(np_type)]))
        self.schema = Schema(coltypes)
        self.raw_schema = schema
        self.destinations = destinations
        self.raw = dict()
        self.extracted_base = dict()
        self.extracted = dict()
        self.transformed = dict()
        self.gcs = storage.Client()
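
For reference, the schema argument here is a list of (column name, numpy type) pairs, each of which becomes a Column carrying an IsDtypeValidation. A hypothetical example of what a task config might supply (the names are invented):

import numpy as np

SCHEMA = [
    ("client_id", np.dtype("int64")),
    ("country", np.dtype("object")),
    ("amount", np.dtype("float64")),
]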
Example #7
def validate_variant(conn, args, filepath):
  """Validates input file for variant data

  This function validates that the contents of a file contain variant data.
  If an error is encountered, throw an exception.

  Args:
    conn (psycopg2.extensions.connection): psycopg2 connection
    args (ArgumentParser namespace): user-defined arguments
    filepath (str): location of input file
  
  """
  schema = Schema([
    Column('chr', [
      CanConvertValidation(int)
    ]),
    Column('pos', [
      CanConvertValidation(int),
      IsDistinctValidation()
    ])
  ])

  df = pd.read_csv(filepath, sep='\t', header=None)

  if len(df.columns) != 2:
    raise Exception(f"Invalid file format. Excepted 2 columns, found {len(df.columns)} columns. Columns should consist of chromsome number and SNP position. Filepath: {filepath}")

  df.columns = ['chr', 'pos']
  err = schema.validate(df)

  if err:
    for e in err:
      logging.error(f"Error encountered while validating {filepath}: {e}")
    raise Exception(f"Validation failed: {filepath}")
Example #8
def function_validation(filename, cand_stage):
    df = pd.read_excel(filename, sheet_name='Mobilizer')
    df = df.fillna('')
    #print(df.columns)
    df['date_age'] = df['Age*'].astype(str) + df['Date of Birth*'].astype(str)

    if cand_stage == '1':
        schema = Schema([
            #nan check column non mandate
            Column('Candidate Photo', null_validation),
            Column('Middle Name', null_validation),
            Column('Last Name', null_validation),
            Column('Secondary Contact  No', null_validation),
            Column('Email id', null_validation),
            Column('Present Panchayat', null_validation),
            Column('Present Taluk/Block', null_validation),
            Column('Present Address line1', null_validation),
            Column('Present Address line2', null_validation),
            Column('Present Village', null_validation),
            Column('Permanent Address line1', null_validation),
            Column('Permanent Address line2', null_validation),
            Column('Permanent Village', null_validation),
            Column('Permanent Panchayat', null_validation),
            Column('Permanent Taluk/Block', null_validation),
            #str+null check
            Column('Fresher/Experienced?*', str_validation + null_validation),
            Column('Salutation*', str_validation + null_validation),
            Column('First Name*', str_validation + null_validation),
            Column('Gender*', str_validation + null_validation),
            Column('Marital Status*', str_validation + null_validation),
            Column('Caste*', str_validation + null_validation),
            Column('Disability Status*', str_validation + null_validation),
            Column('Religion*', str_validation + null_validation),
            Column('Source of Information*', str_validation + null_validation),
            Column('Present District*', str_validation + null_validation),
            Column('Present State*', str_validation + null_validation),
            Column('Present Country*', str_validation + null_validation),
            Column('Permanent District*', str_validation + null_validation),
            Column('Permanent State*', str_validation + null_validation),
            Column('Permanent Country*', str_validation + null_validation),
            #pincode check
            Column('Present Pincode*', pincode_validation + null_validation),
            Column('Permanent Pincode*', pincode_validation + null_validation),
            #mobile number check
            Column('Primary contact  No*', mob_validation + null_validation),
            #date of birth and age pass(null check)
            Column('Date of Birth*', null_validation),
            Column('Age*', null_validation),
            Column('date_age', dob_validation)
        ])
    else:
        raise ValueError(f"Unsupported cand_stage: {cand_stage}")

    errors = schema.validate(df)
    errors_index_rows = [e.row for e in errors]

    pd.DataFrame({'col': errors}).to_csv('errors.csv')
    df_clean = df.drop(index=errors_index_rows)
    df_clean.to_csv('clean_data.csv', index=False)
    return len(errors_index_rows)
Example #9
def validate_genotype(conn, args, filepath):
  """Validates input file for genotype data

  This function validates that the contents of a file contain genotype data.
  If an error is encountered, throw an exception.

  Args:
    conn (psycopg2.extensions.connection): psycopg2 connection
    args (ArgumentParser namespace): user-defined arguments
    filepath (str): location of input file
  
  """
  # Allow for users to skip this validation step because it is time consuming
  if args.skip_genotype_validation:
    return

  
  schema_columns = [
    Column('row_number', [
      CanConvertValidation(int) &
      IsDistinctValidation()
    ])
  ]

  # Get the number of lines from the .pos counterpart file
  pos_filepath = '.'.join([filepath, 'pos'])
  if not os.path.exists(pos_filepath):
    raise FileNotFoundError(f"Could not locate the position counterpart file for {filepath}")
  nPositions = len(pd.read_csv(pos_filepath, header=None).index)

  for n in range(nPositions):
    schema_columns.append(
      Column(f'pos_{n}', [
        CanConvertValidation(int) &
        CustomSeriesValidation(lambda s: s.isin([-1, 0, 1, 2]),
                               'Incorrectly coded value.')
      ])
    )

  schema = Schema(schema_columns)

  df = pd.read_csv(filepath, sep='\t', header=None)

  err = schema.validate(df)
  
  if err:
    for e in err:
      logging.error(f"Error encountered while validating {filepath}: {e}")
    raise Exception(f"Validation failed: {filepath}")
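
Note the `&` above: pandas_schema lets you combine validations with `&` (both must pass) and `|` (either may pass), as Example #26 further below also demonstrates.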
Example #10
def defSchema():
    print('Define expected Schema')
    schema = Schema([
        Column(name='id',
               validations=[IsDtypeValidation(np.object_)],
               allow_empty=False),
        Column(name='comment_text',
               validations=[IsDtypeValidation(np.object_)],
               allow_empty=False),
        Column(name='toxic',
               validations=[InListValidation([0, 1])],
               allow_empty=False),
        Column(name='severe_toxic',
               validations=[InListValidation([0, 1])],
               allow_empty=False),
        Column(name='obscene',
               validations=[InListValidation([0, 1])],
               allow_empty=False),
        Column(name='threat',
               validations=[InListValidation([0, 1])],
               allow_empty=False),
        Column(name='insult',
               validations=[InListValidation([0, 1])],
               allow_empty=False),
        Column(name='identity_hate',
               validations=[InListValidation([0, 1])],
               allow_empty=False)
    ])
    return schema
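
A possible call site for this helper; the file name is a placeholder:

import pandas as pd

df = pd.read_csv('train.csv')  # hypothetical input file
errors = defSchema().validate(df)
for e in errors:
    print(e)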
Example #11
 def __init__(self):
     self.schemas = Schema(
         [
             Column("RA_Report #", [CanConvertValidation(int)]),
             Column("RA_CAERS Created Date", [CanCallValidation(self.parse_date)]),
             Column(
                 "AEC_Event Start Date",
                 [CanCallValidation(self.parse_date)],
                 allow_empty=True,
             ),
             Column(
                 "PRI_Product Role", [InListValidation(["Suspect", "Concomitant"])]
             ),
             Column("PRI_Reported Brand/Product Name"),
             Column("PRI_FDA Industry Code"),
             Column("PRI_FDA Industry Name"),
             Column("CI_Age at Adverse Event"),
             Column(
                 "CI_Age Unit",
                 [
                     InListValidation(
                         ["Year(s)", "Decade(s)", "Month(s)", "Week(s)", "Day(s)"]
                     )
                 ],
             ),
             Column("CI_Gender", [InListValidation(["Female", "Male"])]),
             Column("AEC_One Row Outcomes"),
             Column("SYM_One Row Coded Symptoms"),
         ]
     )
Example #12
    def compile_source_data_validation_schema(self, dataset):
        field_names = []
        field_schemas = []

        if dataset is not None:
            sub_map = Mapper.filter_mapper(self.mapper_df, dataset)
        else:
            sub_map = self.mapper_df

        sub_map = sub_map[sub_map['allow_missing'].str.lower() != 'y'].dropna(
            subset=['source_field_name', 'source_field_type'])

        for idx, field in sub_map.iterrows():

            field_validator = self.compile_field_validator(field)
            if field_validator:
                field_names.append(field['source_field_name'])

                field_schemas.append(
                    Column(field['source_field_name'], field_validator))

        if not field_schemas:
            return None, []

        schema = Schema(field_schemas)
        return schema, field_names
Example #13
class ClassifierParser(BaseParser):
    """
    Implementation of classifier dao Parser.

    The classifier output tables contain the output data from geniepy after the
    classifiers have calculated desired predictions.
    """

    default_type: DataType = None
    scraper = None
    """No online sources for classifiers output."""
    schema: Schema = Schema([
        Column("digest"),
        Column(PCPCLSFR_NAME, [IsDtypeValidation(np.float64)]),
        Column(CTCLSFR_NAME, [IsDtypeValidation(np.float64)]),
    ])

    def fetch(self, chunksize: int) -> Generator[DataFrame, None, None]:
        """No online sources to fetch from for classifiers outputs."""
        raise NotImplementedError("Classifier Output Parser has no Scrapers")

    @staticmethod
    def parse(data, dtype=DataType.CSV_STR) -> DataFrame:
        """
        Parser function from base class.

        Raises:
            NotImplementedError -- Function not implemented since classifiers return
                dataframes that only need to be validated.
        """
        raise NotImplementedError("Classifier Output Parser has no Scrapers")
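
A sketch of how the class-level schema might be exercised; the frame below is made up, and PCPCLSFR_NAME / CTCLSFR_NAME are the column-name constants imported by the surrounding module:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "digest": ["ab12"],
    PCPCLSFR_NAME: np.array([0.97], dtype=np.float64),
    CTCLSFR_NAME: np.array([0.03], dtype=np.float64),
})
errors = ClassifierParser.schema.validate(df)  # empty list when all dtypes match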
Example #14
def validate_phenotype(conn, args, filepath):
  """Validates input file for phenotype data

  This function validates that the contents of a file contain phenotype data.
  If an error is encountered, throw an exception.

  Args:
    conn (psycopg2.extensions.connection): psycopg2 connection
    args (ArgumentParser namespace): user-defined arguments
    filepath (str): location of input file
  
  """
  df = pd.read_csv(filepath)
  nrows, ncols = df.shape
  nrows += 1 # include the header in the row count

  if re.match('(genotype)|(pedigree)|(line)', df.columns[0], re.IGNORECASE) is None:
    raise Exception("Genotype/pedigree/line should be the first column in the phenotype file")


  # Rename the first column of data to be the genotypes/lines
  df.rename(columns={f'{df.columns[0]}': 'genotype'}, inplace=True)

  schema_columns = [
    Column('genotype', [
      IsDistinctValidation()
    ])
  ]

  for n in range(1, ncols):
    schema_columns.append(
      Column(df.columns[n], [
        # NOTE(tparker): This may not always be true. If there are any phenotypes
        # listed as categories or strings, this would fail. Find out all the
        # possible phenotype values; it may be difficult to validate input data
        # without a user-provided dtype list.
        CanConvertValidation(float)
      ])
    )

  schema = Schema(schema_columns)
  err = schema.validate(df)

  if err:
    for e in err:
      logging.error(f"Error encountered while validating {filepath}: {e}")
    raise Exception(f"Validation failed: {filepath}")
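
If some phenotypes really are categorical, one option, sketched here with an invented category list, is to accept either a numeric value or a known label:

from pandas_schema import Column
from pandas_schema.validation import CanConvertValidation, InListValidation

CATEGORIES = ['low', 'medium', 'high']  # hypothetical; ideally user-provided
flexible_column = Column('plant_height', [CanConvertValidation(float) | InListValidation(CATEGORIES)])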
Example #15
    def check_join_cols(df1, df2, on):

        schema = Schema([
            Column(
                col,
                [
                    LeadingWhitespaceValidation(),
                    TrailingWhitespaceValidation(),
                    IsDistinctValidation()
                ],
            ) for col in on
        ])
        results = [schema.validate(df) for df in [df1[on], df2[on]]]

        if any(results):
            print("The following issues exist in the index:")
            for error in itertools.chain(*results):
                print(error)
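
A possible call, assuming check_join_cols is reachable as a plain function; the frames are made up, and the trailing space in right['id'] is what the check reports:

import pandas as pd

left = pd.DataFrame({'id': ['1', '2'], 'x': ['a', 'b']})
right = pd.DataFrame({'id': ['1 ', '2'], 'y': ['c', 'd']})
check_join_cols(left, right, on=['id'])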
Example #16
    def _validate(self, diagnosis_df):
        schema = Schema([
            Column('visit_dt', [
                MatchesPatternValidation(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:00$')
            ]),
            Column('sex', [InListValidation(['M', 'K'])]),
            Column('icd10', [
                MatchesPatternValidation(r'^[CDIJKMNRZ]\d{1,2}\.?\d{0,2}$')
            ])
        ])

        errors = schema.validate(diagnosis_df)

        for error in errors:
            self.Logger.error(error)

        if errors:
            raise SystemExit(1)  # abort: the diagnosis data failed validation
Example #17
class UnorderedSchema(unittest.TestCase):
    schema = Schema(
        [Column('a'),
         Column('b', [LeadingWhitespaceValidation()])],
        ordered=False)

    def test_fields(self):
        self.assertEqual(len(self.schema.columns), 2,
                         'The schema is not storing all of its columns')
        self.assertEqual(
            self.schema.ordered, False,
            'The schema is not storing the correct value of ordered')

    def test_validate_valid(self):
        df = pd.DataFrame({'a': ['1', '2', '3'], 'b': ['1', '2', '3']})
        results = self.schema.validate(df)
        self.assertEqual(len(results), 0,
                         'A correct data frame should have no errors')

    def test_validate_invalid(self):
        df = pd.DataFrame({'a': [' 1', '2', '3'], 'b': [' 1', '2', '3']})
        results = self.schema.validate(df)
        self.assertEqual(len(results), 1,
                         'An incorrect data frame should report errors')

    def test_mixed_columns(self):
        """
        Tests that when ordered=False, the schema columns are associated with data frame columns by name, not position.
        In this case, the schema's column order is [a, b], while the data frame's order is [b, a]. There is an error in
        column b in the data frame (leading whitespace), and a validation on column b in the schema.

        Schema         a                b (validation)
        Data Frame     b (error)        a

        Thus there will only be an error if column b in the schema is linked to column b in the data frame,
        as is correct behaviour.
        """

        df = pd.read_csv(StringIO('''
b,a
 1,1
2,3
3,3
        '''),
                         sep=',',
                         header=0,
                         dtype=str)
        results = self.schema.validate(df)

        self.assertEqual(len(results), 1, 'There should be 1 error')
        self.assertEqual(results[0].row, 0)
        self.assertEqual(
            results[0].column, 'b',
            'The Schema object is not associating columns and column schemas by name'
        )
Example #18
class StockPrice:
    """
    Model representing stock prices over time
    """

    fieldSchema = Schema([
        Column('open', [IsDtypeValidation(np.float64)]),
        Column('close', [IsDtypeValidation(np.float64)]),
        Column('high', [IsDtypeValidation(np.float64)]),
        Column('low', [IsDtypeValidation(np.float64)]),
        Column('volume', [IsDtypeValidation(np.int64)])
    ])
Example #19
def validate_population_structure(conn, args, filepath):
  """Validates input file for population structure data

  This function validates that the contents of a file contain population
  structure data. If an error is encountered, throw an exception.

  Args:
    conn (psycopg2.extensions.connection): psycopg2 connection
    args (ArgumentParser namespace): user-defined arguments
    filepath (str): location of input file
  
  """
  df = pd.read_csv(filepath)
  nrows, ncols = df.shape
  nrows += 1 # include the header rows in the count
  logging.debug(f'Population structure columns: {df.columns}')
  logging.debug(f"Population structure dimensions: <{nrows}, {ncols}>")


  schema_columns = [
    Column('Pedigree', [
      IsDistinctValidation()
    ])
  ]

  for n in range(1, ncols):
    schema_columns.append(Column(df.columns[n], [
      CanConvertValidation(float)
    ]))

  schema = Schema(schema_columns)
  err = schema.validate(df)

  if err:
    for e in err:
      logging.error(f"Error encountered while validating {filepath}: {e}")
    raise Exception(f"Validation failed: {filepath}")
Example #20
def validate_kinship(conn, args, filepath):
  """Validates input file for kinship data

  This function validates that the contents of a file contain kinship data.
  If an error is encountered, throw an exception.

  Args:
    conn (psycopg2.extensions.connection): psycopg2 connection
    args (ArgumentParser namespace): user-defined arguments
    filepath (str): location of input file
  
  """
  df = pd.read_csv(filepath)
  nrows, ncols = df.shape
  df.rename(columns = {"Unnamed: 0": "line_name"}, inplace=True) # since column name is blank by default, rename it for later reference
  nrows += 1 # include the header row in the count
  logging.debug(f"Dimensions of kinship matrix: <{nrows}, {ncols}>")

  schema_columns = [
    Column('line_name', [
      IsDistinctValidation()
    ])
  ]

  for n in range(1, ncols):
    schema_columns.append(Column(df.columns[n], [
      CanConvertValidation(float)
    ]))

  schema = Schema(schema_columns)

  err = schema.validate(df)

  if err:
    for e in err:
      logging.error(f"Error encountered while validating {filepath}: {e}")
    raise Exception(f"Validation failed: {filepath}")
Example #21
    def create_schema(self) -> Schema:
        """ Create Pandas schema with all the necessary validation rules read in from config """
        col_list = []
        for column in self.__spreadsheet_def.keys():
            validators = [
                LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()
            ]

            mandatory_field_flag = self.__spreadsheet_def.is_mandatory(column)

            # Special cases for checking institutions/countries...
            if column == 'submitting_institution':
                validators.append(
                    InListValidation([i.name for i in self.__institutions]))
            elif column == 'country':
                validators.append(
                    InListValidation([i.country for i in self.__institutions]))
            else:
                # Regex validation
                if self.__spreadsheet_def.get_regex(column):
                    validators.append(
                        MatchesPatternValidation(
                            self.__spreadsheet_def.get_regex(column),
                            message=self.__spreadsheet_def.
                            get_regex_validation_message(column)))

                # Validate allowed values
                elif self.__spreadsheet_def.get_allowed_values(column):
                    validators.append(
                        InListValidation(
                            self.__spreadsheet_def.get_allowed_values(column),
                            case_sensitive=False))

                # Field length validation
                max_len = self.__spreadsheet_def.get_max_length(column)
                if max_len and max_len > 0:
                    validators.append(
                        _StringLengthValidation(
                            'field length is greater than {} characters'.
                            format(str(max_len)), max_len))

            # Mandatory field validation
            col_list.append(
                Column(self.__spreadsheet_def.get_column_name(column),
                       validators,
                       allow_empty=not mandatory_field_flag))

        return Schema(col_list)
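
_StringLengthValidation is a custom validation defined elsewhere in the module; a minimal sketch of one way to write it, assuming it subclasses pandas_schema's CustomSeriesValidation (argument order matches the call above: message first, then the length limit):

from pandas_schema.validation import CustomSeriesValidation

class _StringLengthValidation(CustomSeriesValidation):
    """Flags values whose string representation exceeds max_length."""

    def __init__(self, message: str, max_length: int):
        self.max_length = max_length
        super().__init__(lambda s: s.astype(str).str.len() <= max_length, message)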
Example #22
 def __init__(self):
     self.schemas = Schema(
         [
             Column(
                 "Given Name",
                 [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
             ),
             Column(
                 "Family Name",
                 [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
             ),
             Column("Age", [InRangeValidation(0, 120)]),
             Column("Sex", [InListValidation(["Male", "Female", "Other"])]),
             Column("Customer ID", [MatchesPatternValidation(r"\d{4}[A-Z]{4}")]),
         ]
     )
Example #23
 def __init__(self):
     self.schemas = Schema(
         [
             Column("id"),
             Column(
                 "payer_name",
                 [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
             ),
             Column("document_amount"),
             Column("payed_amount"),
             Column("payer_id_number"),
             Column(
                 "payer_address",
                 [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
             ),
             Column("barcode"),
             Column("typable_line"),
             Column("number"),
             Column(
                 "document_number",
                 [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
             ),
             Column("due_date", [DateFormatValidation("%m/%d/%y")]),
             Column(
                 "city",
                 [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
             ),
             Column(
                 "state",
                 [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()],
             ),
             Column("zip_code"),
             Column("bank_answer_date"),
             Column("pdf_upload_date"),
             Column(
                 "status", [InListValidation(["pending", "paid", "due", "error"])]
             ),
             Column("callback"),
             Column("object_id"),
             Column("extra"),
         ]
     )
Example #24
    def run(self, repo_uri: str = None, query: str = None, schema: Schema = None, layer_query: bool = None, **kwargs: Any):
        """  

        Args:

        Returns:
            - No return
        """
        assert repo_uri, 'Must specify repo_uri.'
        repo_info = parse_repo(repo_uri)

        repo = Repository(namespace=repo_info.namespace, repository=repo_info.repository)
        data = sql_to_df(query, repository=repo, use_lq=layer_query)        

        if schema is not None:
            errors = schema.validate(data)
            if errors:
                raise SchemaValidationError(errors)
        
        return data
Example #25
class OrderedSchema(unittest.TestCase):
    schema = Schema(
        [Column('a', [LeadingWhitespaceValidation()]),
         Column('b')],
        ordered=True)

    def test_mixed_columns(self):
        """
        Tests that when ordered=True, the schema columns are associated with data frame columns by position, not name.

        In this case, the schema's column order is [a, b], while the data frame's order is [b, a]. There is an error in
        column b in the data frame (leading whitespace), and a validation on column a in the schema.

        Schema         a (validation)   b
        Data Frame     b (error)        a

        Thus there will only be an error if column b in the schema is linked to column a in the data frame,
        as is correct behaviour when ordered=True.
        """
        df = pd.read_csv(StringIO('''
b,a
 1,1
2,3
3,3
        '''),
                         sep=',',
                         header=0,
                         dtype=str)
        results = self.schema.validate(df)

        self.assertEqual(len(results), 1, 'There should be 1 error')
        self.assertEqual(results[0].row, 0)
        self.assertEqual(
            results[0].column, 'b',
            'The Schema object is not associating columns and column schemas by position'
        )
Example #26
from pandas_schema import Column, Schema
from pandas_schema.validation import MatchesPatternValidation, CanConvertValidation, CustomSeriesValidation
import pandas as pd

schema = Schema([
    Column('col1', [
        CanConvertValidation(int) | (CustomSeriesValidation(
            lambda x: x.str.len() > 1, 'Doesn\'t have more than 1 character')
                                     & MatchesPatternValidation('a'))
    ])
])

test_data = pd.DataFrame({'col1': ['an', '13', 'a', '8', 'the']})

errors = schema.validate(test_data)

for error in errors:
    print('"{}" failed!'.format(error.value))
Example #27
class UnorderedSchema(unittest.TestCase):
    schema = Schema(
        [Column('a'),
         Column('b', [LeadingWhitespaceValidation()])],
        ordered=False)

    def test_fields(self):
        self.assertEqual(len(self.schema.columns), 2,
                         'The schema is not storing all of its columns')
        self.assertEqual(
            self.schema.ordered, False,
            'The schema is not storing the correct value of ordered')

    def test_validate_valid(self):
        df = pd.DataFrame({'a': ['1', '2', '3'], 'b': ['1', '2', '3']})
        results = self.schema.validate(df)
        self.assertEqual(len(results), 0,
                         'A correct data frame should have no errors')

    def test_validate_invalid(self):
        df = pd.DataFrame({'a': [' 1', '2', '3'], 'b': [' 1', '2', '3']})
        results = self.schema.validate(df)
        self.assertEqual(len(results), 1,
                         'An incorrect data frame should report errors')

    def test_mixed_columns(self):
        """
        Tests that when ordered=False, the schema columns are 
        associated with data frame columns by name, not position.
        In this case, the schema's column order is [a, b], while
         the data frame's order is [b, a]. There is an error in
        column b in the data frame (leading whitespace), and a 
        validation on column b in the schema.

        Schema         a                b (validation)
        Data Frame     b (error)        a

        Thus there will only be an error if column b in the schema 
        is linked to column b in the data frame, as is correct
        behaviour.
        """

        df = pd.read_csv(StringIO('''
b,a
 1,1
2,3
3,3
        '''),
                         sep=',',
                         header=0,
                         dtype=str)
        results = self.schema.validate(df)

        self.assertEqual(len(results), 1, 'There should be 1 error')
        self.assertEqual(results[0].row, 0)
        self.assertEqual(
            results[0].column, 'b',
            'The Schema object is not associating columns and column schemas by name'
        )

    def test_column_subset_detect(self):
        """
        Tests that when ordered=False, validation is possible by
        passing a subset of the columns contained in the schema

        Schema         a*                b (validation)
        Data Frame     b (error)        a not passed

        column* is not being passed

        Thus there will only be an error if column b in the schema
        is linked to column b in the data frame, as is correct 
        behaviour
        """

        df = pd.read_csv(StringIO('''
b,a
 1,1
2,3
3,3
        '''),
                         sep=',',
                         header=0,
                         dtype=str)

        results = self.schema.validate(df, columns=['b'])

        self.assertEqual(len(results), 1, 'There should be 1 error')
        self.assertEqual(results[0].row, 0)
        self.assertEqual(
            results[0].column, 'b',
            'The Schema object is not associating columns and column schemas by name'
        )

    def test_column_subset_detect_empty(self):
        """
        Tests that when ordered=False, validation is possible by
        passing a subset of the columns contained in the schema

        Schema         a                b* (validation)
        Data Frame     b (error)        a

        column* is not being passed

        There will be an error if other than zero errors are found.
        """

        df = pd.read_csv(StringIO('''
b,a
 1,1
2,3
3,3
        '''),
                         sep=',',
                         header=0,
                         dtype=str)
        # should detect no errors
        results_empty = self.schema.validate(df, columns=['a'])

        self.assertEqual(len(results_empty), 0, 'There should be no errors')

    def test_column_subset_error(self):
        """
        Tests that when ordered=False, validation is possible by
        passing a subset of the columns contained in the schema

        Schema         a                b (validation)
        Data Frame     b (error)        a 

        There will be an error if a column different than 'a' or 'b' is passed
        """

        df = pd.read_csv(StringIO('''
b,a
 1,1
2,3
3,3
        '''),
                         sep=',',
                         header=0,
                         dtype=str)

        # should raise a PanSchArgumentError
        self.assertRaises(PanSchArgumentError,
                          self.schema.validate,
                          df,
                          columns=['c'])
Example #28
    def validate_and_annotate(self, file_path=''):
        ################################
        #   SCHEMAS VALIDATION    #
        ################################

        # FIX ME: this is a really janky way to do this. Passing empty string for file_path...
        # is a temp fix for review_changes re-validate data button causing an infinite loop when calling this with an argument :(
        if not file_path:
            file_path = self.dirname + '/for_review/intermediate_hospital_data.xlsx'

        try:
            df = pd.read_csv(file_path)
        except Exception:
            try:
                df = pd.read_excel(file_path)
            except Exception:
                print("UNACCEPTED FILE FORMAT")
                return

        df.fillna("NULL", inplace=True)

        #~ print(file_path)
        #~ print(df.info)

        schema = Schema([
            Column('N95PlanFitTested', [InListValidation(['Y', 'N', 'NULL'])]),
            Column('PARPsPlanTrained', [InListValidation(['Y', 'N', 'NULL'])]),
        ])

        errors = schema.validate(df, columns=schema.get_column_names())

        #######################################
        # Build excel worksheet w formatting
        #######################################
        save_path2 = self.dirname + '/for_review/'
        if not os.path.exists(save_path2):
            os.makedirs(save_path2)

        writer = pd.ExcelWriter(save_path2 + 'intermediate_hospital_data.xlsx',
                                engine='xlsxwriter')

        # Skip row 1 headers so we can add manually with formatting
        df.to_excel(writer,
                    sheet_name='Sheet1',
                    startrow=1,
                    header=False,
                    index=False)

        workbook = writer.book
        worksheet = writer.sheets['Sheet1']

        ### WORKBOOK FORMATS ###
        yellow_highlight = workbook.add_format({'bg_color': '#FFEB9C'})

        header = workbook.add_format({
            'bold': True,
            'text_wrap': True,
            'valign': 'top',
            'fg_color': '#D7E4BC',
            'border': 1
        })
        ########################

        # Set column widths
        worksheet.set_column('A:II', 30)
        worksheet.set_default_row(hide_unused_rows=True)

        # Write the column headers with the defined format.
        for col_num, value in enumerate(df.columns.values):
            worksheet.write(0, col_num, value, header)

        # for storing error row numbers while we iterate thru error object
        # will use for hiding rows
        error_rows = []
        df_length = len(df)

        for error in errors:

            error_rows.append(error.row)

            row = error.row + 1
            column = df.columns.get_loc(error.column)

            # Comments
            worksheet.write_comment(row, column, error.message)

            # Highlights
            worksheet.conditional_format(row, column, row, column, {
                'type': 'no_errors',
                'format': yellow_highlight
            })

        #~ print(error_rows);

        # Hide Rows that don't contain errors
        for i in range(df_length + 1):
            if i not in error_rows:
                worksheet.set_row(i + 1, None, None, {'hidden': True})

        writer.save()

        # Pop up
        self.review_changes()
Example #29
import pandas as pd
from pandas_schema import Column, Schema
from pandas_schema.validation import LeadingWhitespaceValidation, TrailingWhitespaceValidation, InRangeValidation, \
    DateFormatValidation, InListValidation

schema = Schema([
    Column('name',
           [LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation()]),
    Column('title',
           [LeadingWhitespaceValidation(),
            TrailingWhitespaceValidation()]),
    Column('salary', [InRangeValidation(0, 33000)]),
    Column('sex', [InListValidation(['F', 'M'])]),
    Column('date', [DateFormatValidation('%Y-%m-%d')])
])

widths = [
    9,  # name
    19,  # title
    6,  # salary
    4,  # sex
    11,  # date
]

# read source data
test_data = pd.read_fwf("data/fixed_width.txt", widths=widths)
print('orig dataset')
print(test_data)

# data verification
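
The listing breaks off after this comment; presumably it finishes the verification the comment announces, along the lines of the other examples:

errors = schema.validate(test_data)

for error in errors:
    print(error)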
Example #30
def main():
    # Parse input arguments
    parser = get_parser()
    args = parser.parse_args()

    data_path = args.path_in

    path_tsv = os.path.join(data_path, 'participants.tsv')
    tsv_file = pd.read_csv(path_tsv, sep='\t')
    list_subj = [
        name for name in os.listdir(data_path) if
        os.path.isdir(os.path.join(data_path, name)) and name.startswith('sub')
    ]
    df = pd.DataFrame(tsv_file)
    list_tsv_participants = df['participant_id'].tolist()
    missing_subjects_tsv = list(set(list_subj) - set(list_tsv_participants))
    missing_subjects_folder = list(set(list_tsv_participants) - set(list_subj))

    if missing_subjects_tsv:
        print('\nWarning: the following subjects are missing from participants.tsv:')
        missing_subjects_tsv.sort()
        pprint(missing_subjects_tsv)
    if missing_subjects_folder:
        print('\nWarning: missing data for the following subjects listed in participants.tsv:')
        missing_subjects_folder.sort()
        pprint(missing_subjects_folder)

    for dirName, subdirList, fileList in os.walk(data_path):
        for file in fileList:
            if file.endswith('.nii.gz'):
                originalFilePath = os.path.join(dirName, file)
                jsonSidecarPath = os.path.join(dirName,
                                               file.split(".")[0] + '.json')
                if not os.path.exists(jsonSidecarPath):
                    print("Missing jsonSidecar: " + jsonSidecarPath)

    # Checking participants.tsv contents
    schema = Schema([
        Column('participant_id',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('sex', [InListValidation(['M', 'F'])]),
        Column('age', [InRangeValidation(18, 60)]),
        Column('height', [MatchesPatternValidation(r"[0-9]|-")]),
        Column('weight', [MatchesPatternValidation(r"[0-9]|-")]),
        Column('date_of_scan', [
            DateFormatValidation('%Y-%m-%d') | MatchesPatternValidation(r"-")
        ]),
        Column('institution_id',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('institution',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('manufacturer',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('manufacturers_model_name',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('receive_coil_name',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('software_versions',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
        Column('researcher',
               [LeadingWhitespaceValidation(),
                TrailingWhitespaceValidation()]),
    ])

    errors = schema.validate(tsv_file)
    print('\nChecking the contents of participants.tsv')
    if not errors:
        print("--> all good 👍")
    else:
        for error in errors:
            print(error)