Beispiel #1
0
 def test_relpath(self):
     """Test if specifying datapaths relative to the schema works.

     Hands a random integer frame to ``DummyProjectFactory`` and checks
     that loading the "data" dataset gives back an equal frame.
     """
     values = np.random.randint(low=1, high=10, size=(10, 2))
     expected = pd.DataFrame(values, columns=["a", "b"])
     # Duplicate rows are likely with such a small value range, so the
     # schema switches duplicate dropping off.
     rules = {"drop_duplicates": False}
     schema = {'data': {"dataframe_rules": rules}}
     with DummyProjectFactory(schema, expected) as project:
         self.assertDataFrameEqual(project.load_dataset("data"), expected)
Beispiel #2
0
 def test_exclude_cols(self):
     """Check that columns named in ``exclude_columns`` are not loaded."""
     here = op.abspath(op.dirname(__file__))
     iris_path = op.join(here, "testdata", "iris.csv")
     iris = pd.read_csv(iris_path)
     schema = {"data": {'exclude_columns': ['Species']}}
     with DummyProjectFactory(schema, iris) as project:
         result = project.load_dataset("data")
         self.assertNotIn('Species', result.columns)
Beispiel #3
0
 def test_column_postprocessors(self):
     """Check that postprocessors attached to a column are applied.

     ``_dummy_postproc`` is registered for the ``Species`` column; the
     test expects "setosa" to be absent from that column afterwards.
     """
     testdata_dir = op.join(op.abspath(op.dirname(__file__)), "testdata")
     iris = pd.read_csv(op.join(testdata_dir, "iris.csv"))
     species_rules = {'postprocessors': [_dummy_postproc]}
     schema = {"data": {'column_rules': {'Species': species_rules}}}
     with DummyProjectFactory(schema, iris) as project:
         species = project.load_dataset("data")['Species']
         self.assertNotIn("setosa", species.unique())
Beispiel #4
0
    def test_integer_col_na_values(self):
        """Test if the Loader can load columns with integers and NAs.

        This is necessary because NaNs cannot be represented by integers:
        both columns are declared as ``int`` in the schema, contain one
        empty cell, and are expected to come back with a float dtype.
        """
        # Build a real list rather than relying on ``map`` returning one,
        # so the item assignment below works on Python 2 and Python 3.
        values = [str(i) for i in range(20)]
        values[13] = ""  # a single empty cell per column
        df = pd.DataFrame.from_dict(dict(a=values, b=values))
        specs = dict(delimiter=',', dtypes={'a': int, 'b': int})
        schema = dict(data=specs)
        with DummyProjectFactory(schema, df) as project:
            df = project.load_dataset("data")
            self.assertEqual(df['a'].dtype, float)
            self.assertEqual(df['b'].dtype, float)
Beispiel #5
0
 def test_min_dropna_on_cols(self):
     """Check that a ``min`` rule on a column also gets rid of NaNs.

     Half of the rows hold values below 0.1; one NaN is planted in each
     column. The loaded frame must contain no nulls and ``col_a`` must
     stay above the configured minimum.
     """
     low = np.random.rand(10, 2) / 10
     high = np.random.rand(10, 2)
     values = np.vstack([low, high])
     np.random.shuffle(values)
     frame = pd.DataFrame(values, columns=["col_a", "col_b"])
     frame.loc[3, "col_a"] = np.nan
     frame.loc[7, "col_b"] = np.nan
     col_rules = {"col_a": {"min": 0.1}}
     schema = {'data': {"column_rules": col_rules}}
     with DummyProjectFactory(schema, frame) as project:
         result = project.load_dataset("data")
         has_nulls = np.any(pd.isnull(result))
         self.assertFalse(has_nulls)
         self.assertGreater(result['col_a'].min(), 0.1)
Beispiel #6
0
 def test_index_col(self):
     """Test if specifying the index_col works.

     Loads the iris data with ``Species`` as the index and checks that
     each of the three species labels selects exactly 50 rows.
     """
     df = pd.read_csv(self.expected_specs['iris']['filepath_or_buffer'])
     specs = {
         "data": {
             'index_col': 'Species',
             'dataframe_rules': {
                 'drop_duplicates': False
             }
         }
     }
     with DummyProjectFactory(specs, df) as project:
         df = project.load_dataset('data')
         for specie in ['setosa', 'versicolor', 'virginica']:
             # ``.loc`` is the explicit label-based indexer; the
             # deprecated ``.ix`` is no longer available in recent pandas.
             self.assertEqual(df.loc[specie].shape[0], 50)
Beispiel #7
0
 def test_indexcol_not_in_usecols(self):
     """Check that an index column missing from ``use_columns`` is kept.

     ``Species`` is requested as the index but is not listed among the
     columns to use; it must still become the index, and only the two
     listed columns may remain as regular columns.
     """
     iris = pd.read_csv(self.data_specs['iris']['path'])
     spec = {
         'index_col': 'Species',
         'use_columns': ['Sepal Length', 'Petal Width'],
     }
     with DummyProjectFactory({'data': spec}, iris) as project:
         loaded = project.load_dataset("data")
         self.assertEqual(loaded.index.name, "Species")
         expected_columns = ['Sepal Length', 'Petal Width']
         self.assertItemsEqual(loaded.columns, expected_columns)
Beispiel #8
0
 def test_multiindex(self):
     """Test if providing a list of indices in the schema returns a proper
     multiindexed dataframe.

     The loaded index must be a two-level ``MultiIndex`` whose levels
     carry the same unique values as the matching source columns.
     """
     orgdf = pd.read_table(
         self.expected_specs['person_activity']['filepath_or_buffer'])
     index_cols = ['sequence_name', 'tag']
     schema = {"data": {'index_col': index_cols, 'delimiter': '\t'}}
     with DummyProjectFactory(schema, orgdf, sep="\t") as project:
         df = project.load_dataset('data')
         # assertIsInstance reports the actual type on failure.
         self.assertIsInstance(df.index, pd.MultiIndex)
         self.assertEqual(len(df.index.levels), 2)
         for col in index_cols:
             x = orgdf[col].unique().tolist()
             y = df.index.get_level_values(col).unique().tolist()
             self.assertItemsEqual(x, y)
Beispiel #9
0
 def test_load_dataset_wrong_dtypes_in_spec(self):
     """Test if the Loader can safely load columns that have a wrongly
     specified data type in the schema.

     Exactly one warning, of category ``UserWarning``, is expected while
     the dataset is loaded.
     """
     # Make a file with two columns, both specified as integers in the
     # dtypes, but one has random string types.
     x = np.random.randint(0, 10, size=(100, 2))
     dframe = pd.DataFrame(x, columns=['a', 'b'])
     _ix = np.random.randint(0, 100, size=(5, ))
     # Cast to object first so the strings fit the column, then write in
     # a single ``.loc`` call. Chained indexing (``dframe['b'][_ix]``)
     # may assign to a temporary copy and leave the frame unchanged.
     dframe['b'] = dframe['b'].astype(object)
     dframe.loc[_ix, 'b'] = "aa"
     specs = dict(delimiter=',', dtypes={'a': int, 'b': int})
     schema = dict(data=specs)
     with DummyProjectFactory(schema, dframe) as project:
         # NOTE(review): no ``warnings.simplefilter("always")`` is set
         # here, so a warning already raised from the same location
         # earlier in the run may be suppressed -- confirm if flaky.
         with warnings.catch_warnings(record=True) as catcher:
             project.load_dataset("data")
             self.assertEqual(len(catcher), 1)
             self.assertTrue(
                 issubclass(catcher[-1].category, UserWarning))
Beispiel #10
0
 def test_min_nan(self):
     """Check that ``min`` rules cope with NaNs in the data.

     A headerless single-column dataset with one NaN is loaded under a
     minimum rule; column 0 of the result must not contain any nulls.
     """
     series = pd.Series(np.random.rand(10))
     series.loc[3] = np.nan
     min_rule = {'min': 0.2}
     spec = {'header': None, 'column_rules': {'0': min_rule}}
     with DummyProjectFactory({'data': spec}, series) as project:
         loaded = project.load_dataset("data")
         first_column_nulls = pd.isnull(loaded[0])
         self.assertFalse(np.any(first_column_nulls))
Beispiel #11
0
 def test_global_na_reps(self):
     """Test if specifying a global NA value for a dataset works.

     The marker "foobar" is written to a few diagonal cells and declared
     as ``na_values``; the loaded frame must contain exactly that many
     nulls.
     """
     df = pd.DataFrame(np.random.rand(10, 10))
     ix = np.random.randint(0, df.shape[0], size=(5, ))
     # De-duplicate so the number of marked cells equals ``ix.shape[0]``.
     ix = np.unique(ix)
     # Iterate over the positions directly instead of indexing through
     # ``xrange``, which does not exist on Python 3.
     for pos in ix:
         df.iloc[pos, pos] = "foobar"
     schema = {
         "data": {
             'na_values': "foobar",
             'dataframe_rules': {
                 'drop_na': False,
                 'drop_duplicates': False
             }
         }
     }
     with DummyProjectFactory(schema, df) as project:
         df = project.load_dataset("data")
         self.assertEqual(pd.isnull(df).sum().sum(), ix.shape[0])
Beispiel #12
0
 def test_na_reps_list(self):
     """Check that ``na_values`` given as a list is honoured.

     The same rows receive "foo" in column 0 and "bar" in column 1, so
     the loaded frame should hold two nulls per marked row.
     """
     frame = pd.DataFrame(np.random.rand(10, 2))
     rows = np.unique(np.random.randint(0, frame.shape[0], size=(5, )))
     frame.iloc[rows, 0] = "foo"
     frame.iloc[rows, 1] = "bar"
     rules = {'drop_na': False, 'drop_duplicates': False}
     spec = {'na_values': ["foo", "bar"], 'dataframe_rules': rules}
     expected_nulls = rows.shape[0] * 2
     with DummyProjectFactory({"data": spec}, frame) as project:
         loaded = project.load_dataset("data")
         null_count = pd.isnull(loaded).sum().sum()
         self.assertEqual(null_count, expected_nulls)
Beispiel #13
0
 def test_nrows_shuffling(self):
     """Test if the shuffle parameter works with the nrows parameter.

     The index holds five integers followed by five letters. With
     ``count: 5`` and ``shuffle: True`` the test expects none of the
     letter labels in the result and an index that is not simply the
     first five integers in their original order.
     """
     X = np.c_[np.arange(10), np.arange(10)]
     # ``list(range(5))`` keeps the concatenation valid on Python 3,
     # where ``range`` is not a list.
     ix = list(range(5)) + "a b c d e".split()
     df = pd.DataFrame(X, index=ix)
     schema = {
         'data': {
             "index_col": "index",
             'nrows': {
                 'count': 5,
                 "shuffle": True
             }
         }
     }
     with DummyProjectFactory(schema, df, index_label="index") as project:
         df = project.load_dataset("data")
         for row_label in "a b c d e".split():
             self.assertNotIn(row_label, df.index)
         self.assertFalse(np.all(df.index == list(range(5))))
Beispiel #14
0
 def test_index_column_rules(self):
     """Check that column rules given for an index column are enforced.

     ``Species`` is both the index and the target of a regex rule; the
     test expects "virginica" to be missing from the loaded index.
     """
     iris = pd.read_csv(self.data_specs['iris']['path'])
     species_rule = {'regex': '.*e.*'}
     spec = {
         'index_col': 'Species',
         'dataframe_rules': {'drop_duplicates': False},
         'column_rules': {'Species': species_rule},
     }
     with DummyProjectFactory({'data': spec}, iris) as project:
         loaded = project.load_dataset("data")
         index_name = loaded.index.name
         self.assertEqual(index_name.lower(), 'species')
         self.assertNotIn("virginica", loaded.index.unique())
Beispiel #15
0
 def test_min_max_datetime(self):
     """Check that ``min``/``max`` rules are applied to datetime columns.

     A full year of daily dates is loaded with date bounds on the ``day``
     column; the smallest and largest loaded dates must equal the bounds.
     """
     days = pd.date_range("01/01/2015", "12/31/2015")
     noise = np.random.rand(365)
     frame = pd.DataFrame.from_dict({'day': days, 'data': noise})
     day_rules = {'min': '04/01/2015', 'max': "11/30/2015"}
     spec = {'parse_dates': 'day', 'column_rules': {'day': day_rules}}
     with DummyProjectFactory({'data': spec}, frame) as project:
         loaded_days = project.load_dataset('data')['day']
         earliest = pd.to_datetime("04/01/2015")
         self.assertEqual(loaded_days.min(), earliest)
         latest = pd.to_datetime("11/30/2015")
         self.assertEqual(loaded_days.max(), latest)
Beispiel #16
0
 def test_index_column_exclude(self):
     """Test if values are excluded from index column if so specified.

     Two of the ten index values are listed under ``exclude``; the loaded
     frame must have eight rows and one column, keep ``index`` as its
     index name, and no longer contain the excluded labels.
     """
     df = pd.DataFrame.from_dict({
         'index': np.arange(10),
         'col_a': np.arange(10)
     })
     schema = {
         'data': {
             'index_col': 'index',
             'column_rules': {
                 'index': {
                     'exclude': [1, 2]
                 }
             }
         }
     }
     with DummyProjectFactory(schema, df) as project:
         df = project.load_dataset("data")
         # Compare the shape as an ordered tuple: an order-insensitive
         # comparison would also accept a transposed (1, 8) frame.
         self.assertEqual(df.shape, (8, 1))
         self.assertEqual(df.index.name, "index")
         self.assertNotIn(1, df.index)
         self.assertNotIn(2, df.index)