def do_validate(self, df):
    """
    Validations:
    - Check that all columns names (old and new) are not empty strings

    The keys of `self.mapping` are reported as the new column names and the
    values as the old ones. `df` is not inspected by this check.

    Raises:
        YeastValidationError: if any key or value of the mapping is not a
            string, or is empty / whitespace-only.
    """
    for c in self.mapping:
        # strip() has to be called: comparing the bound method itself with ''
        # is always False and would let blank names through.
        if not isinstance(c, str) or c.strip() == '':
            raise YeastValidationError(
                f'New column name "{c}" should be a non empty string')

    for c in self.mapping.values():
        if not isinstance(c, str) or c.strip() == '':
            raise YeastValidationError(
                f'Old column name "{c}" should be a non empty string')
def do_validate(self, df):
    """
    Validations:
    - Every column named in the mapping must exist on the DataFrame
    - Every requested data type must be a key of the type mapper

    Entries are checked one at a time in mapping order (column first, then
    type); the first failing entry raises YeastValidationError.
    """
    for c, t in self.mapping.items():
        column_missing = c not in df.columns
        if column_missing:
            raise YeastValidationError(
                f'Column {c} not found on the DataFrame')

        type_known = t in self.type_mapper.keys()
        if not type_known:
            raise YeastValidationError(
                f'Data type {t} not available. Choose from: {self.type_mapper.keys()}'
            )
def do_validate(self, df):
    """
    - Every column produced by resolving the selector must be a string
    - Every one of those columns must be present on the df

    Note that only the type of each name is checked here; an empty string
    is rejected only if it is not a column of the df.
    """
    columns = self.resolve_selector(self.selector, df)

    for c in columns:
        if not isinstance(c, str):
            raise YeastValidationError('Invalid column names')

    # Collect the absent names directly; an empty list means all were found.
    missing_columns = [c for c in columns if c not in df.columns]
    if missing_columns:
        raise YeastValidationError(f'The following columns are missing: {missing_columns}')
def do_validate(self, df):
    """
    Validations:
    - The configured case (`self.case`) must be one of `self.cases`

    `df` is not inspected by this check.
    """
    if self.case in self.cases:
        return
    raise YeastValidationError(f'Invalid format names. Choose between: {self.cases}')
def do_validate(self, gdf):
    """
    - The incoming object must be a DataFrameGroupBy, i.e. a GroupByStep
      was applied before this step
    """
    if isinstance(gdf, DataFrameGroupBy):
        return
    raise YeastValidationError(
        'This step must be executed after a GroupByStep(...)')
def do_validate(self, df):
    """
    Perform custom validations. Expected signature (step/self, dataframe)

    - Each of to_prepare, to_bake and to_validate must be None or a callable
    - When to_validate is set, it is invoked as to_validate(self, df) so it
      can run its own checks; its return value is ignored
    """
    hook_checks = (
        ('to_prepare',
         'to_prepare must be a function like: to_prepare(step, df)'),
        ('to_bake',
         'to_bake must be a function like: to_bake(step, df)'),
        ('to_validate',
         'to_validate must be a function like: to_validate(step, df)'),
    )
    # Hooks are read lazily, one per iteration, so the first invalid hook
    # is reported before the later attributes are touched.
    for attribute, message in hook_checks:
        hook = getattr(self, attribute)
        if hook is not None and not callable(hook):
            raise YeastValidationError(message)

    custom_validation = self.to_validate
    if custom_validation:
        custom_validation(self, df)
def do_validate(self, df):
    """
    - Every column obtained by resolving the selector must exist on the df
    """
    columns = self.resolve_selector(self.selector, df)

    # Gather the names that are absent from the df in one pass.
    missing_columns = [c for c in columns if c not in df.columns]
    if missing_columns:
        raise YeastValidationError(f'The following columns are missing: {missing_columns}')
def do_validate(self, df):
    """
    Validations:
    - The expression must be a string containing more than whitespace

    `df` is not inspected by this check.
    """
    is_text = isinstance(self.expression, str)
    if is_text and self.expression.strip() != '':
        return
    raise YeastValidationError(
        'The expression must be a non empty string')
def do_validate(self, df):
    """
    - Mapping must be a string or a dict
    - All columns on the mapping must exist on the df

    A string mapping names a single column; a dict mapping names one column
    per key. Subclasses of dict and str (e.g. OrderedDict) are accepted,
    consistently with the isinstance-based branches below.

    Raises:
        YeastValidationError: if the mapping has the wrong type or refers to
            a column that is not present on the df.
    """
    mapping = self.mapping

    # isinstance instead of an exact type() comparison, so dict/str
    # subclasses are not rejected before the column checks run.
    if not isinstance(mapping, (dict, str)):
        raise YeastValidationError(
            'The replacement mapping must be a string or dict')

    if isinstance(mapping, str):
        if mapping not in df.columns:
            raise YeastValidationError(
                f'The following column is missing: {mapping}')
        return

    missing_columns = [c for c in mapping if c not in df.columns]
    if missing_columns:
        raise YeastValidationError(
            f'The following columns are missing: {missing_columns}')
def do_validate(self, df):
    """
    - All keys should be valid column names
    - All values should be callables or transformers or list of those things

    A key is valid when it is a string that is not empty / whitespace-only.
    A value is accepted when it is a Transformer or a LambdaType instance,
    or a list/tuple whose elements all are. `df` is not inspected.

    Raises:
        YeastValidationError: on the first invalid key or unrecognized
            transformer.
    """
    for column in self.transformers:
        # `or`, not `and`: a non-string key must be rejected without calling
        # .strip() on it, and a blank string key must be rejected as well.
        if not isinstance(column, str) or column.strip() == '':
            raise YeastValidationError(
                f'"{column}" is not a valid column name')

    for item in self.transformers.values():
        # Treat a single transformer as a one-element group so both shapes
        # go through the same check.
        candidates = item if type(item) in (list, tuple) else [item]
        for subitem in candidates:
            if not isinstance(subitem, Transformer) and not isinstance(
                    subitem, LambdaType):
                raise YeastValidationError(
                    f"Transformer {subitem} not recognized")
def do_validate(self, df):
    """
    - Check if the df contains all listed columns

    The selector is resolved against the df first and the resolved value is
    stored back on `self.selector` before the presence check runs.
    """
    self.selector = self.resolve_selector(self.selector, df)

    matches = [c in df.columns for c in self.selector]
    if all(matches):
        return

    # Pair each name with its lookup result and keep the ones not found.
    missing_columns = [
        c for c, found in zip(self.selector, matches) if not found
    ]
    raise YeastValidationError(
        f'The following columns are missing: {missing_columns}')
def do_validate(self, x):
    """
    Check the merge configuration against the incoming left-hand object `x`.

    - `how` must be one of the supported join types (`self.how_types`)
    - The right side (`self.y`) must be a DataFrame or a Recipe
    - The left side (`x`) must be a DataFrame
    - Every column in `by` must be on the left side
    - Every column in `by` must be on the right side (only checked when the
      right side is a DataFrame)
    - The `df` parameter is only supported when the right side is a Recipe

    Checks run in the order listed; the first failure raises
    YeastValidationError.
    """
    # Join type
    if self.how not in self.how_types:
        raise YeastValidationError(
            f'Join type "{self.how}" not between: {self.how_types}')

    # Right side: DataFrame or Recipe
    right_is_frame = isinstance(self.y, DataFrame)
    if not (right_is_frame or isinstance(self.y, Recipe)):
        raise YeastValidationError(
            'We are expecting a DataFrame or a Recipe or Right')

    # Left side: the output of the previous step
    if not isinstance(x, DataFrame):
        raise YeastValidationError(
            "Previous Step didn't return a DataFrame")

    # Join keys, when given, must exist on both sides
    if self.by:
        missing = [c for c in self.by if c not in x.columns]
        if missing:
            raise YeastValidationError(
                f"Columns {missing} not found on the x side of the merge")

        # The right-hand columns can only be inspected for a DataFrame.
        if right_is_frame:
            missing = [c for c in self.by if c not in self.y.columns]
            if missing:
                raise YeastValidationError(
                    f"Columns {missing} not found on the right side of the merge"
                )

    # self.df only makes sense together with a Recipe on the right
    if self.df is not None and not isinstance(self.y, Recipe):
        raise YeastValidationError(
            f"'df' parameter is only supported if right is a Recipe")
def do_validate(self, df):
    """
    - Both the 'name' and the 'rank' columns must be present on the df
    """
    has_required = 'name' in df.columns and 'rank' in df.columns
    if not has_required:
        raise YeastValidationError('Name or Rank not found')
def validate_columns_but_fail(step, df):
    """
    Custom validation hook: require an 'age' column on the df.

    `step` is accepted to match the (step, df) hook signature and is unused.
    Raises YeastValidationError when the column is absent.
    """
    if 'age' in df.columns:
        return
    raise YeastValidationError('Age not found')