Example #2
 def test_unique_values(self):
     """Test if the validator checks for the unique values."""
     validator = DataFrameValidator(
         data=self.iris_dframe,
         column_rules={'Species': self.species_rules})
     cleaned = validator.clean()
     self.assertItemsEqual(cleaned.Species.unique(),
                           ['setosa', 'versicolor', 'virginica'])
 def test_drop_duplicates(self):
     """Test if the DataFrameValidator is dropping duplicates properly."""
     col_rules = self.basespecs['iris'].get('column_rules')
     data = self.iris_dframe.copy()
     _data = pd.concat((data, data))
     validator = DataFrameValidator(data=_data, column_rules=col_rules)
     cleaned = validator.clean()
     self.assertDataFrameEqual(cleaned, data.drop_duplicates())
 def test_column_exclude_rules(self):
     """Test if the validator drops values excluded from columns."""
     col_rules = deepcopy(self.basespecs['iris']['column_rules'])
     col_rules['Species']['exclude'] = ['virginica', 'versicolor']
     dframe_val = DataFrameValidator(data=self.iris_dframe.copy(),
                                     column_rules=col_rules,
                                     rules={'drop_duplicates': False})
     cleaned_species = dframe_val.clean()['Species']
     self.assertItemsEqual(cleaned_species.unique().tolist(), ['setosa'])
     self.assertEqual(cleaned_species.shape[0], 50)
 def test_column_rules(self):
     """Test if the DataFrame validator reads and enforces the column rules
     properly.
     """
     dframe_val = DataFrameValidator(data=self.iris_dframe.copy(),
                        column_rules=self.basespecs['iris']['column_rules'])
     cleaned = dframe_val.clean()
     self.assertDataFrameEqual(cleaned, self.iris_dframe.drop_duplicates())
     dframe_val = DataFrameValidator(data=self.pa_dframe.copy(),
             column_rules=self.basespecs['person_activity']['column_rules'])
     cleaned = dframe_val.clean()
     self.assertDataFrameEqual(cleaned, self.pa_dframe.drop_duplicates())
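A note on fixtures: self.species_rules and self.basespecs are defined elsewhere in the pysemantic test suite and are not shown on this page. As a hedged sketch of the pattern these tests exercise, column rules are plain dictionaries; the unique_values key below is inferred from what the tests assert, not quoted from the schema documentation.

    import pandas as pd
    from pysemantic.validator import DataFrameValidator

    # Hypothetical column rules shaped after the assertions above: the allowed
    # values for the Species column, with everything else dropped.
    species_rules = {'unique_values': ['setosa', 'versicolor', 'virginica']}

    df = pd.DataFrame({'Species': ['setosa', 'setosa', 'virginica', 'weed']})
    validator = DataFrameValidator(data=df,
                                   column_rules={'Species': species_rules},
                                   rules={'drop_duplicates': True})
    print(validator.clean())  # the duplicate and the bogus 'weed' row are gone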
Example #8
    def load_dataset(self, dataset_name):
        """Load and return a dataset.

        :param dataset_name: Name of the dataset
        :type dataset_name: str
        :return: A pandas DataFrame containing the dataset.
        :rtype: pandas.DataFrame
        :Example:

        >>> demo_project = Project('pysemantic_demo')
        >>> iris = demo_project.load_dataset('iris')
        >>> type(iris)
        pandas.core.frame.DataFrame
        """
        validator = self.validators[dataset_name]
        column_rules = self.column_rules.get(dataset_name, {})
        df_rules = self.df_rules.get(dataset_name, {})
        parser_args = validator.get_parser_args()
        df_rules.update(validator.df_rules)
        logger.info("Attempting to load dataset {} with args:".format(
                                                                 dataset_name))
        if validator.is_spreadsheet:
            parser_args.pop('usecols', None)
        logger.info(json.dumps(parser_args, cls=TypeEncoder))
        if isinstance(parser_args, dict):
            with ParseErrorHandler(parser_args, self) as handler:
                df = handler.load()
            if df is None:
                raise ParserArgumentError("No valid parser arguments were " +
                                          "inferred from the schema.")
            if validator.is_spreadsheet and isinstance(validator.sheetname,
                                                       list):
                df = pd.concat(df.itervalues(), axis=0)
            logger.info("Success!")
            df_validator = DataFrameValidator(data=df, rules=df_rules,
                                              column_rules=column_rules)
            logger.info("Commence cleaning dataset:")
            logger.info("DataFrame rules:")
            logger.info(json.dumps(df_rules, cls=TypeEncoder))
            logger.info("Column rules:")
            logger.info(json.dumps(column_rules, cls=TypeEncoder))
            return df_validator.clean()
        else:
            dfs = []
            for argset in parser_args:
                with ParseErrorHandler(argset, self) as handler:
                    _df = handler.load()
                df_validator = DataFrameValidator(data=_df,
                                                  column_rules=column_rules)
                dfs.append(df_validator.clean())
            df = pd.concat(dfs, axis=0)
            return df.set_index(np.arange(df.shape[0]))
    def test_bad_unique_values(self):
        """Test if the validator drops values not specified in the schema."""
        # Add some bogus values
        noise = np.random.choice(['lily', 'petunia'], size=(50,))
        species = np.hstack((self.iris_dframe.Species.values, noise))
        np.random.shuffle(species)
        species = pd.Series(species)

        validator = DataFrameValidator(
            data=pd.DataFrame({'Species': species}),
            column_rules={'Species': self.species_rules})
        cleaned = validator.clean()
        self.assertItemsEqual(cleaned.Species.unique(),
                              ['setosa', 'versicolor', 'virginica'])
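In load_dataset above, get_parser_args returns a single dict for a one-file dataset and a list of dicts when the dataset spans several files; each file is cleaned separately and the resulting frames are stacked. The closing set_index(np.arange(df.shape[0])) call simply rebuilds a fresh 0..n-1 index after the concatenation. A minimal sketch of that tail end, using nothing beyond pandas and numpy:

    import numpy as np
    import pandas as pd

    # Two already-cleaned frames standing in for the per-file results.
    dfs = [pd.DataFrame({'x': [1, 2]}), pd.DataFrame({'x': [3]})]
    df = pd.concat(dfs, axis=0)
    df = df.set_index(np.arange(df.shape[0]))  # same as reset_index(drop=True)

Note that these examples come from a Python 2 codebase: df.itervalues() is the Python 2 dict iterator (df.values() in Python 3), and assertItemsEqual in the tests became assertCountEqual in Python 3's unittest.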
Example #10
 def test_colnames_as_list(self):
     """Test if the column names option works when provided as a list."""
     schema = deepcopy(self.basespecs['iris'])
     schema['header'] = 0
     ideal = ['a', 'b', 'c', 'd', 'e']
     schema['column_names'] = ideal
     validator = SchemaValidator(specification=schema)
     df = pd.read_csv(**validator.get_parser_args())
     rules = {}
     rules.update(validator.df_rules)
     df_val = DataFrameValidator(data=df, rules=rules)
     data = df_val.clean()
     self.assertItemsEqual(data.columns, ideal)
 def test_colnames_as_callable(self):
     """Test if the column names option works when provided as a callable."""
     translator = lambda x: "_".join([s.lower() for s in x.split()])
     self.basespecs['iris']['column_names'] = translator
     schema_val = SchemaValidator(specification=self.basespecs['iris'])
     parser_args = schema_val.get_parser_args()
     df = pd.read_csv(**parser_args)
     rules = {}
     rules.update(schema_val.df_rules)
     df_val = DataFrameValidator(data=df, rules=rules)
     data = df_val.clean()
     ideal = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
              'species']
     self.assertItemsEqual(data.columns, ideal)
 def test_colnames_as_dict(self):
     """Test if column names gotten from SchemaValidator are implemented."""
     namemap = {'Sepal Length': 'slength', 'Sepal Width': 'swidth',
                'Petal Width': 'pwidth', 'Petal Length': 'plength',
                'Species': 'spcs'}
     self.basespecs['iris']['column_names'] = namemap
     schema_val = SchemaValidator(specification=self.basespecs['iris'])
     parser_args = schema_val.get_parser_args()
     df = pd.read_csv(**parser_args)
     rules = {}
     rules.update(schema_val.df_rules)
     df_val = DataFrameValidator(data=df, rules=rules)
     data = df_val.clean()
     self.assertItemsEqual(data.columns, namemap.values())
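Taken together, the tests above cover the three accepted shapes of column_names. A hedged summary of how each might appear in a schema specification (the key name follows these tests, not official documentation):

    schema = {}  # stand-in for a single dataset's specification

    schema['column_names'] = ['a', 'b', 'c', 'd', 'e']    # list: rename by position
    schema['column_names'] = {'Sepal Length': 'slength'}  # dict: explicit mapping
    schema['column_names'] = lambda x: "_".join(
        s.lower() for s in x.split())                     # callable: applied per name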
Example #16
    def load_dataset(self, dataset_name):
        """Load and return a dataset.

        :param dataset_name: Name of the dataset
        :type dataset_name: str
        :return: A pandas DataFrame containing the dataset.
        :rtype: pandas.DataFrame
        :Example:

        >>> demo_project = Project('pysemantic_demo')
        >>> iris = demo_project.load_dataset('iris')
        >>> type(iris)
        pandas.core.frame.DataFrame
        """
        validator = self.validators[dataset_name]
        column_rules = self.column_rules.get(dataset_name, {})
        df_rules = self.df_rules.get(dataset_name, {})
        parser_args = validator.get_parser_args()
        df_rules.update(validator.df_rules)
        logger.info("Attempting to load dataset {} with args:".format(
                                                                 dataset_name))
        logger.info(json.dumps(parser_args, cls=TypeEncoder))
        if isinstance(parser_args, dict):
            df = self._load(parser_args)
            if validator.is_spreadsheet and isinstance(validator.sheetname,
                                                       list):
                df = pd.concat(df.itervalues(), axis=0)
            logger.info("Success!")
            df_validator = DataFrameValidator(data=df, rules=df_rules,
                                              column_rules=column_rules)
            logger.info("Commence cleaning dataset:")
            logger.info("DataFrame rules:")
            logger.info(json.dumps(df_rules, cls=TypeEncoder))
            logger.info("Column rules:")
            logger.info(json.dumps(column_rules, cls=TypeEncoder))
            return df_validator.clean()
        else:
            dfs = []
            for argset in parser_args:
                self._update_parser(argset)
                _df = self.parser(**argset)
                df_validator = DataFrameValidator(data=_df,
                                                  column_rules=column_rules)
                dfs.append(df_validator.clean())
            df = pd.concat(dfs, axis=0)
            return df.set_index(np.arange(df.shape[0]))
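The is_spreadsheet branch above handles schemas that name several sheets: pandas' Excel reader, when given a list of sheet names, returns a dict mapping each name to a DataFrame, which load_dataset then flattens into one frame. A small sketch with a hypothetical data.xlsx (modern pandas spells the argument sheet_name; itervalues() above is the Python 2 equivalent of .values()):

    import pandas as pd

    # read_excel with a list of sheets returns {sheet_name: DataFrame, ...}.
    sheets = pd.read_excel('data.xlsx', sheet_name=['first', 'second'])
    df = pd.concat(sheets.values(), axis=0)  # flatten, as the branch above does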
Example #18
    def load_dataset(self, dataset_name):
        """Load and return the dataset.

        :param dataset_name: Name of the dataset
        :type dataset_name: str
        :return: A pandas DataFrame containing the dataset.
        :rtype: pandas.DataFrame
        """
        validator = self.validators[dataset_name]
        column_rules = self.column_rules.get(dataset_name, {})
        df_rules = self.df_rules.get(dataset_name, {})
        args = validator.get_parser_args()
        if isinstance(args, dict):
            df = self._load(args)
            df_validator = DataFrameValidator(data=df, rules=df_rules,
                                              column_rules=column_rules)
            return df_validator.clean()
        else:
            dfs = []
            for argset in args:
                self._update_parser(argset)
                _df = self.parser(**argset)
                df_validator = DataFrameValidator(data=_df,
                                                  column_rules=column_rules)
                dfs.append(df_validator.clean())
            return pd.concat(dfs, axis=0)
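Unlike the longer variants above, this version returns pd.concat(dfs, axis=0) directly, so the combined frame keeps each file's original row labels and can end up with duplicate index values; the other versions rebuild the index afterwards with set_index(np.arange(df.shape[0])), which is equivalent to reset_index(drop=True).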
Example #20
    def load_dataset(self, dataset_name):
        """Load and return a dataset.

        :param dataset_name: Name of the dataset
        :type dataset_name: str
        :return: A pandas DataFrame containing the dataset.
        :rtype: pandas.DataFrame
        :Example:

        >>> demo_project = Project('pysemantic_demo')
        >>> iris = demo_project.load_dataset('iris')
        >>> type(iris)
        pandas.core.frame.DataFrame
        """
        if dataset_name not in self.validators:
            self._init_validate(dataset_name)
        validator = self.validators[dataset_name]
        column_rules = self.column_rules.get(dataset_name, {})
        df_rules = self.df_rules.get(dataset_name, {})
        parser_args = validator.get_parser_args()
        df_rules.update(validator.df_rules)
        logger.info("Attempting to load dataset {} with args:".format(
            dataset_name))
        if validator.is_spreadsheet:
            parser_args.pop('usecols', None)
        logger.info(json.dumps(parser_args, cls=TypeEncoder))
        if isinstance(parser_args, dict):
            if validator.is_mysql or validator.is_postgresql:
                if not (parser_args.get('table_name') or
                        parser_args.get('query')):
                    raise ParserArgumentError(
                        "No table_name or query was provided for the "
                        "SQL configuration.")
                elif validator.sql_validator.chunksize is not None:
                    df = self._sql_iterator(parser_args)
                else:
                    df = self._sql_read(parser_args)
            else:
                with ParseErrorHandler(parser_args, self) as handler:
                    df = handler.load()
            if df is None:
                raise ParserArgumentError("No valid parser arguments were " +
                                          "inferred from the schema.")
            if validator.is_spreadsheet and isinstance(validator.sheetname,
                                                       list):
                df = pd.concat(df.itervalues(), axis=0)
            logger.info("Success!")
            df_validator = DataFrameValidator(data=df, rules=df_rules,
                                              column_rules=column_rules)
            logger.info("Commence cleaning dataset:")
            logger.info("DataFrame rules:")
            logger.info(json.dumps(df_rules, cls=TypeEncoder))
            logger.info("Column rules:")
            logger.info(json.dumps(column_rules, cls=TypeEncoder))

            return df_validator.clean()
        else:
            dfs = []
            for argset in parser_args:
                with ParseErrorHandler(argset, self) as handler:
                    _df = handler.load()
                df_validator = DataFrameValidator(data=_df,
                                                  column_rules=column_rules)
                dfs.append(df_validator.clean())
            df = pd.concat(dfs, axis=0)
            return df.set_index(np.arange(df.shape[0]))
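For completeness, the usage pattern from the docstring, spelled out as a hedged end-to-end snippet (it assumes a project named pysemantic_demo has already been registered with pysemantic):

    from pysemantic import Project

    demo_project = Project('pysemantic_demo')  # assumes the demo project exists
    iris = demo_project.load_dataset('iris')   # parsed, validated, and cleaned
    print(type(iris))                          # pandas.core.frame.DataFrame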