def test_load_dataset_wrong_dtypes_in_spec(self):
    """Test if the Loader can safely load columns that have a wrongly
    specified data type in the schema.
    """
    # Make a file with two columns, both specified as integers in the
    # dtypes, but one has random string types.
    x = np.random.randint(0, 10, size=(100, 2))
    dframe = pd.DataFrame(x, columns=['a', 'b'])
    tempdir = tempfile.mkdtemp()
    outfile = op.join(tempdir, "testdata.csv")
    _ix = np.random.randint(0, 100, size=(5,))
    dframe['b'][_ix] = "aa"
    dframe.to_csv(outfile, index=False)
    specs = dict(delimiter=',', dtypes={'a': int, 'b': int}, path=outfile)
    specfile = op.join(tempdir, "dict.yaml")
    with open(specfile, "w") as fileobj:
        yaml.dump({'testdata': specs}, fileobj, Dumper=Dumper,
                  default_flow_style=False)
    pr.add_project("wrong_dtype", specfile)
    try:
        _pr = pr.Project("wrong_dtype")
        with warnings.catch_warnings(record=True) as catcher:
            dframe = _pr.load_dataset("testdata")
            assert len(catcher) == 1
            assert issubclass(catcher[-1].category, UserWarning)
    finally:
        pr.remove_project("wrong_dtype")
        shutil.rmtree(tempdir)
def test_add(self):
    """Test if the `add` subcommand can add projects to the config file."""
    try:
        cmd = ['semantic', 'add', 'dummy_added_project', '/tmp/dummy.yaml']
        subprocess.check_call(cmd, env=self.testenv)
        projects = pr.get_projects()
        self.assertIn(("dummy_added_project", "/tmp/dummy.yaml"), projects)
    finally:
        pr.remove_project("dummy_added_project")
def test_remove(self):
    """Test if the remove subcommand can remove projects."""
    pr.add_project("dummy_project_2", "/foo/baz.yaml")
    try:
        cmd = ['semantic', 'remove', 'dummy_project_2']
        subprocess.check_call(cmd, env=self.testenv)
        projects = pr.get_projects()
        proj_names = [p[0] for p in projects]
        self.assertNotIn("dummy_project_2", proj_names)
    finally:
        pr.remove_project("dummy_project_2")
def test_relative_path(self):
    """Check if the set-schema and add subcommands convert relative paths
    from the cmdline to absolute paths in the config file.
    """
    try:
        cmd = ['semantic', 'set-schema', 'dummy_project', './foo.yaml']
        subprocess.check_call(cmd, env=self.testenv)
        self.assertTrue(op.isabs(pr.get_default_specfile('dummy_project')))
        pr.remove_project("dummy_project")
        cmd = ['semantic', 'add', 'dummy_project', './foo.yaml']
        subprocess.check_call(cmd, env=self.testenv)
        self.assertTrue(op.isabs(pr.get_default_specfile('dummy_project')))
    finally:
        pr.remove_project("dummy_project")
def _remove_project(project_name, project_files=None):
    """Remove a project from the config file, along with any files or
    directories created for it."""
    pr.remove_project(project_name)
    if project_files is not None:
        if hasattr(project_files, "__iter__"):
            for path in project_files:
                if op.isfile(path):
                    os.unlink(path)
                elif op.isdir(path):
                    shutil.rmtree(path)
        else:
            if op.isfile(project_files):
                os.unlink(project_files)
            elif op.isdir(project_files):
                shutil.rmtree(project_files)
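# A minimal usage sketch for the helper above (hypothetical test body, not
# part of the suite): register a throwaway project, then let _remove_project
# clean up both the config entry and the temporary artifacts in one call.
#
#     specfile = op.join(tempdir, "dict.yaml")
#     pr.add_project("scratch_project", specfile)
#     try:
#         pr.Project("scratch_project").load_dataset("testdata")
#     finally:
#         _remove_project("scratch_project", [specfile, tempdir])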
def test_indexcol_not_in_usecols(self):
    """Test if the specified index column is added to the usecols
    argument."""
    schema = {'iris': {'path': self.data_specs['iris']['path'],
                       'index_col': 'Species',
                       'use_columns': ['Sepal Length', 'Petal Width']}}
    with tempfile.NamedTemporaryFile(delete=False) as f_schema:
        yaml.dump(schema, f_schema, Dumper=Dumper, default_flow_style=False)
    pr.add_project("testindex_usecols", f_schema.name)
    try:
        project = pr.Project("testindex_usecols")
        df = project.load_dataset("iris")
        self.assertEqual(df.index.name, "Species")
        self.assertItemsEqual(df.columns, ['Sepal Length', 'Petal Width'])
    finally:
        pr.remove_project("testindex_usecols")
        os.unlink(f_schema.name)
def test_index_column_rules(self):
    """Test if column rules specified for index columns are enforced."""
    schema = {'iris': {'path': self.data_specs['iris']['path'],
                       'index_col': 'Species',
                       'dataframe_rules': {'drop_duplicates': False},
                       'column_rules': {'Species': {'regex': '.*e.*'}}}}
    with tempfile.NamedTemporaryFile(delete=False) as f_schema:
        yaml.dump(schema, f_schema, Dumper=Dumper, default_flow_style=False)
    pr.add_project("index_col_rules", f_schema.name)
    try:
        project = pr.Project("index_col_rules")
        df = project.load_dataset("iris")
        self.assertEqual(df.index.name.lower(), 'species')
        self.assertNotIn("virginica", df.index.unique())
        self.assertItemsEqual(df.shape, (100, 4))
    finally:
        pr.remove_project("index_col_rules")
        os.unlink(f_schema.name)
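# For reference, the schema dict above serializes to roughly the following
# YAML (keys sorted alphabetically by yaml.dump; the path is whatever
# self.data_specs points at, shown here as a placeholder):
#
#     iris:
#       column_rules:
#         Species:
#           regex: .*e.*
#       dataframe_rules:
#         drop_duplicates: false
#       index_col: Species
#       path: /path/to/iris.csv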
def cli(arguments):
    """cli - The main CLI argument parser.

    :param arguments: command line arguments, as parsed by docopt
    :type arguments: dict
    :return: None
    """
    if arguments.get("list", False):
        if arguments["--project"] is None:
            pr.view_projects()
        else:
            proj_name = arguments.get("--project")
            dataset_names = pr.get_datasets(proj_name)
            for name in dataset_names:
                print name
    elif arguments.get("add", False):
        proj_name = arguments.get("PROJECT_NAME")
        proj_spec = arguments.get("PROJECT_SPECFILE")
        proj_spec = op.abspath(proj_spec)
        pr.add_project(proj_name, proj_spec)
    elif arguments.get("remove", False):
        proj_name = arguments.get("PROJECT_NAME")
        if arguments["--dataset"] is None:
            if not pr.remove_project(proj_name):
                print "Removing the project {0} failed.".format(proj_name)
        else:
            pr.remove_dataset(proj_name, arguments["--dataset"])
    elif arguments.get("set-schema", False):
        try:
            proj_name = arguments.get("PROJECT_NAME")
            proj_spec = arguments.get("SCHEMA_FPATH")
            proj_spec = op.abspath(proj_spec)
            pr.set_schema_fpath(proj_name, proj_spec)
        except MissingProject:
            msg = """Project {} not found in the configuration. Please use
            $ semantic add
            to register the project.""".format(arguments.get("PROJECT_NAME"))
            print msg
    elif arguments.get("set-specs", False):
        proj_name = arguments.get("PROJECT_NAME")
        dataset_name = arguments.get("--dataset")
        newspecs = {}
        if arguments.get("--path", False):
            newspecs["path"] = arguments.get("--path")
        if arguments.get("--dlm", False):
            newspecs["delimiter"] = arguments.get("--dlm")
        pr.set_schema_specs(proj_name, dataset_name, **newspecs)
    elif arguments.get("add-dataset", False):
        proj_name = arguments.get("--project")
        dataset_name = arguments.get("DATASET_NAME")
        specs = dict(path=arguments["--path"], delimiter=arguments["--dlm"])
        pr.add_dataset(proj_name, dataset_name, specs)
    elif arguments.get("export", False):
        project = pr.Project(arguments.get("PROJECT_NAME"))
        project.export_dataset(arguments.get("--dataset"),
                               outpath=arguments.get("OUTPATH"))
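# The ``arguments`` dict consumed above is produced by docopt. A usage string
# consistent with the keys the handlers read would look roughly like the
# following; this is inferred from the function body, not necessarily the
# project's actual docstring:
#
#     Usage:
#         semantic list [--project=<name>]
#         semantic add PROJECT_NAME PROJECT_SPECFILE
#         semantic remove PROJECT_NAME [--dataset=<name>]
#         semantic set-schema PROJECT_NAME SCHEMA_FPATH
#         semantic set-specs PROJECT_NAME --dataset=<name> [--path=<path>] [--dlm=<delimiter>]
#         semantic add-dataset DATASET_NAME --project=<name> --path=<path> --dlm=<delimiter>
#         semantic export PROJECT_NAME OUTPATH [--dataset=<name>]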
def test_invalid_literals(self):
    """Test if columns containing invalid literals are parsed safely."""
    tempdir = tempfile.mkdtemp()
    schema_fpath = op.join(tempdir, "schema.yml")
    data_fpath = op.join(tempdir, "data.csv")
    data = pd.DataFrame.from_dict(dict(col_a=range(10)))
    data['col_b'] = ["x"] * 10
    data.to_csv(data_fpath, index=False)
    schema = {'dataset': {'path': data_fpath,
                          'dtypes': {'col_a': int, 'col_b': int}}}
    with open(schema_fpath, "w") as fout:
        yaml.dump(schema, fout, Dumper=Dumper, default_flow_style=False)
    pr.add_project("invalid_literal", schema_fpath)
    try:
        pr.Project("invalid_literal").load_dataset('dataset')
    finally:
        shutil.rmtree(tempdir)
        pr.remove_project("invalid_literal")
def test_load_excel_multisheet(self):
    """Test combining multiple sheets into a single dataframe."""
    tempdir = tempfile.mkdtemp()
    spreadsheet = op.join(tempdir, "multifile_iris.xlsx")
    iris = self.project.load_dataset("iris")
    with pd.ExcelWriter(spreadsheet) as writer:
        iris.to_excel(writer, "iris1", index=False)
        iris.to_excel(writer, "iris2", index=False)
    schema = {'iris': {'path': spreadsheet, 'sheetname': ['iris1', 'iris2'],
                       'dataframe_rules': {'drop_duplicates': False}}}
    schema_fpath = op.join(tempdir, "multi_iris.yaml")
    with open(schema_fpath, "w") as fout:
        yaml.dump(schema, fout, Dumper=Dumper, default_flow_style=False)
    pr.add_project("multi_iris", schema_fpath)
    try:
        ideal = pd.concat((iris, iris), axis=0)
        actual = pr.Project('multi_iris').load_dataset("iris")
        self.assertDataFrameEqual(ideal, actual)
    finally:
        pr.remove_project("multi_iris")
        shutil.rmtree(tempdir)
def test_relpath(self):
    """Test if specifying datapaths relative to the schema works."""
    df = pd.DataFrame(np.random.randint(low=1, high=10, size=(10, 2)),
                      columns="a b".split())
    tempdir = tempfile.mkdtemp()
    data_dir = op.join(tempdir, "data")
    os.mkdir(data_dir)
    schema_fpath = op.join(tempdir, "schema.yml")
    data_fpath = op.join(data_dir, "data.csv")
    df.to_csv(data_fpath, index=False)
    schema = {'data': {'path': op.join("data", "data.csv"),
                       "dataframe_rules": {"drop_duplicates": False}}}
    with open(schema_fpath, "w") as fout:
        yaml.dump(schema, fout, Dumper=Dumper, default_flow_style=False)
    pr.add_project("relpath", schema_fpath)
    try:
        loaded = pr.Project("relpath").load_dataset("data")
        self.assertDataFrameEqual(loaded, df)
    finally:
        pr.remove_project("relpath")
        shutil.rmtree(tempdir)
def test_index_column_exclude(self):
    """Test if values are excluded from the index column if so specified."""
    tempdir = tempfile.mkdtemp()
    schema_fpath = op.join(tempdir, "schema.yml")
    data_fpath = op.join(tempdir, "data.csv")
    df = pd.DataFrame.from_dict({'index': np.arange(10),
                                 'col_a': np.arange(10)})
    df.to_csv(data_fpath, index=False)
    schema = {'data': {'path': data_fpath, 'index_col': 'index',
                       'column_rules': {'index': {'exclude': [1, 2]}}}}
    with open(schema_fpath, "w") as fout:
        yaml.dump(schema, fout, Dumper=Dumper, default_flow_style=False)
    pr.add_project("index_exclude", schema_fpath)
    try:
        df = pr.Project("index_exclude").load_dataset("data")
        self.assertItemsEqual(df.shape, (8, 1))
        self.assertEqual(df.index.name, "index")
        self.assertNotIn(1, df.index)
        self.assertNotIn(2, df.index)
    finally:
        pr.remove_project("index_exclude")
        shutil.rmtree(tempdir)
def test_error_bad_lines_correction(self):
    """Test if the correction for bad lines works."""
    tempdir = tempfile.mkdtemp()
    iris_path = op.join(op.abspath(op.dirname(__file__)), "testdata",
                        "iris.csv")
    with open(iris_path, "r") as fid:
        iris_lines = fid.readlines()
    outpath = op.join(tempdir, "bad_iris.csv")
    iris_lines[50] = iris_lines[50].rstrip() + ",0,23,\n"
    with open(outpath, 'w') as fid:
        fid.writelines(iris_lines)
    data_dict = op.join(tempdir, "dummy_project.yaml")
    specs = {'bad_iris': {'path': outpath}}
    with open(data_dict, "w") as fid:
        yaml.dump(specs, fid, Dumper=Dumper, default_flow_style=False)
    pr.add_project('dummy_project', data_dict)
    try:
        project = pr.Project('dummy_project')
        df = project.load_dataset('bad_iris')
        self.assertItemsEqual(df.shape, (146, 5))
    finally:
        shutil.rmtree(tempdir)
        pr.remove_project('dummy_project')
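# The behaviour exercised above corresponds to pandas' bad-line handling in
# read_csv: rows with too many fields can be skipped instead of raising a
# parser error. A standalone illustration (assuming the loader falls back to
# skipping malformed rows):
#
# >>> pd.read_csv(outpath, error_bad_lines=False, warn_bad_lines=True)
#
# The row padded with ",0,23," is dropped and a warning is emitted.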
def test_nrows_shuffling(self):
    """Test if the shuffle parameter works with the nrows parameter."""
    tempdir = tempfile.mkdtemp()
    schema_fpath = op.join(tempdir, "schema.yml")
    data_fpath = op.join(tempdir, "data.csv")
    X = np.c_[np.arange(10), np.arange(10)]
    ix = range(5) + "a b c d e".split()
    df = pd.DataFrame(X, index=ix)
    df.to_csv(data_fpath, index_label="index")
    schema = {'data': {'path': data_fpath, "index_col": "index",
                       'nrows': {'count': 5, "shuffle": True}}}
    with open(schema_fpath, "w") as fout:
        yaml.dump(schema, fout, Dumper=Dumper, default_flow_style=False)
    pr.add_project("nrows_shuffle", schema_fpath)
    try:
        df = pr.Project("nrows_shuffle").load_dataset("data")
        for row_label in "a b c d e".split():
            self.assertNotIn(row_label, df.index)
        self.assertFalse(np.all(df.index == range(5)))
    finally:
        pr.remove_project("nrows_shuffle")
        shutil.rmtree(tempdir)
def test_integer_col_na_values(self):
    """Test if the Loader can load columns with integers and NAs.

    This is necessary because NaNs cannot be represented by integers."""
    x = map(str, range(20))
    x[13] = ""
    df = pd.DataFrame.from_dict(dict(a=x, b=x))
    tempdir = tempfile.mkdtemp()
    outfile = op.join(tempdir, "testdata.csv")
    df.to_csv(outfile, index=False)
    specfile = op.join(tempdir, "dict.yaml")
    specs = dict(delimiter=',', dtypes={'a': int, 'b': int}, path=outfile)
    with open(specfile, "w") as fileobj:
        yaml.dump({'testdata': specs}, fileobj, Dumper=Dumper,
                  default_flow_style=False)
    pr.add_project("wrong_dtype", specfile)
    try:
        _pr = pr.Project("wrong_dtype")
        df = _pr.load_dataset("testdata")
        self.assertEqual(df['a'].dtype, float)
        self.assertEqual(df['b'].dtype, float)
    finally:
        pr.remove_project("wrong_dtype")
        shutil.rmtree(tempdir)
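# Why the dtype falls back to float here: pandas has no NA representation
# for integer dtypes, so a column declared as int that contains a blank cell
# is promoted to float64 with NaN filling the gap. A quick standalone check,
# independent of the loader:
#
# >>> from StringIO import StringIO
# >>> pd.read_csv(StringIO("a,b\n1,1\n,2\n3,3"))['a'].dtype
# dtype('float64')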
def test_global_na_reps(self):
    """Test if specifying a global NA value for a dataset works."""
    tempdir = tempfile.mkdtemp()
    df = pd.DataFrame(np.random.rand(10, 10))
    ix = np.random.randint(0, df.shape[0], size=(5,))
    ix = np.unique(ix)
    for i in xrange(ix.shape[0]):
        df.iloc[ix[i], ix[i]] = "foobar"
    fpath = op.join(tempdir, "test_na.csv")
    df.to_csv(fpath, index=False)
    schema = {'path': fpath, 'na_values': "foobar",
              'dataframe_rules': {'drop_na': False,
                                  'drop_duplicates': False}}
    schema_fpath = op.join(tempdir, "test_na.yaml")
    with open(schema_fpath, "w") as fid:
        yaml.dump({'test_na': schema}, fid, Dumper=Dumper,
                  default_flow_style=False)
    pr.add_project("test_na", schema_fpath)
    try:
        df = pr.Project("test_na").load_dataset("test_na")
        self.assertEqual(pd.isnull(df).sum().sum(), ix.shape[0])
    finally:
        pr.remove_project("test_na")
        shutil.rmtree(tempdir)
def test_na_reps_list(self):
    """Test if NA values work when specified as a list."""
    tempdir = tempfile.mkdtemp()
    df = pd.DataFrame(np.random.rand(10, 2))
    ix = np.random.randint(0, df.shape[0], size=(5,))
    ix = np.unique(ix)
    df.iloc[ix, 0] = "foo"
    df.iloc[ix, 1] = "bar"
    fpath = op.join(tempdir, "test_na.csv")
    df.to_csv(fpath, index=False)
    schema = {'path': fpath, 'na_values': ["foo", "bar"],
              'dataframe_rules': {'drop_na': False,
                                  'drop_duplicates': False}}
    schema_fpath = op.join(tempdir, "test_na.yaml")
    with open(schema_fpath, "w") as fid:
        yaml.dump({'test_na': schema}, fid, Dumper=Dumper,
                  default_flow_style=False)
    pr.add_project("test_na", schema_fpath)
    try:
        df = pr.Project("test_na").load_dataset("test_na")
        self.assertEqual(pd.isnull(df).sum().sum(), ix.shape[0] * 2)
    finally:
        pr.remove_project("test_na")
        shutil.rmtree(tempdir)
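# The ``na_values`` schema entry in the two tests above mirrors the
# identically named pandas.read_csv argument, which accepts either a scalar
# or a list (a sketch, assuming the loader forwards the value unchanged):
#
# >>> pd.read_csv(fpath, na_values="foobar")        # single global NA marker
# >>> pd.read_csv(fpath, na_values=["foo", "bar"])  # list of NA markers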
def test_remove_project(self):
    """Test if removing a project works properly."""
    self.assertTrue(pr.remove_project("test_project"))
    self.assertRaises(NoSectionError, pr.get_default_specfile,
                      "test_project")
def tearDown(self):
    pr.remove_project("dummy_project")