def test_update_dataset(self):
    """Test if the update_dataset method works."""
    tempdir = tempfile.mkdtemp()
    _pr = pr.Project("pysemantic")
    iris = _pr.load_dataset("iris")
    x = np.random.random((150,))
    y = np.random.random((150,))
    iris['x'] = x
    iris['y'] = y
    org_cols = iris.columns.tolist()
    outpath = op.join(tempdir, "iris.csv")
    with open(TEST_DATA_DICT, "r") as fid:
        org_specs = yaml.load(fid, Loader=Loader)
    try:
        _pr.update_dataset("iris", iris, path=outpath, sep='\t')
        _pr = pr.Project("pysemantic")
        iris = _pr.load_dataset("iris")
        self.assertItemsEqual(org_cols, iris.columns.tolist())
        iris_validator = _pr.validators['iris']
        updated_args = iris_validator.parser_args
        self.assertEqual(updated_args['dtype']['x'], float)
        self.assertEqual(updated_args['dtype']['y'], float)
        self.assertEqual(updated_args['sep'], '\t')
        self.assertEqual(updated_args['filepath_or_buffer'], outpath)
    finally:
        shutil.rmtree(tempdir)
        with open(TEST_DATA_DICT, "w") as fid:
            yaml.dump(org_specs, fid, Dumper=Dumper,
                      default_flow_style=False)
def test_load_dataset_missing_nrows(self):
    """Test if the project loads datasets properly if the nrows
    parameter is not provided in the schema.
    """
    # Modify the schema to remove the nrows key.
    with open(TEST_DATA_DICT, "r") as fileobj:
        org_specs = yaml.load(fileobj, Loader=Loader)
    new_specs = deepcopy(org_specs)
    for dataset_specs in new_specs.itervalues():
        if "nrows" in dataset_specs:
            del dataset_specs['nrows']
    with open(TEST_DATA_DICT, "w") as fileobj:
        yaml.dump(new_specs, fileobj, Dumper=Dumper,
                  default_flow_style=False)
    try:
        _pr = pr.Project("pysemantic")
        dframe = pd.read_csv(**self.expected_specs['iris'])
        loaded = _pr.load_dataset("iris")
        self.assertDataFrameEqual(dframe, loaded)
        dframe = pd.read_table(**self.expected_specs['person_activity'])
        loaded = _pr.load_dataset("person_activity")
        self.assertDataFrameEqual(loaded, dframe)
    finally:
        with open(TEST_DATA_DICT, "w") as fileobj:
            yaml.dump(org_specs, fileobj, Dumper=Dumper,
                      default_flow_style=False)
def setUp(self):
    iris_specs = _get_iris_args()
    copied_iris_specs = deepcopy(iris_specs)
    copied_iris_specs.update({
        'filepath_or_buffer': iris_specs['filepath_or_buffer'].replace(
            "iris", "iris2")
    })
    multi_iris_specs = [iris_specs, copied_iris_specs]
    person_activity_specs = _get_person_activity_args()
    random_row_iris_specs = {
        'nrows': {
            'random': True,
            'count': 50
        },
        'error_bad_lines': False,
        'filepath_or_buffer': op.join(op.abspath(op.dirname(__file__)),
                                      "testdata", "iris.csv")
    }
    expected = {
        'iris': iris_specs,
        'person_activity': person_activity_specs,
        'multi_iris': multi_iris_specs,
        'random_row_iris': random_row_iris_specs
    }
    self.expected_specs = expected
    self.project = pr.Project(project_name="pysemantic")
def test_load_excel_multisheet(self):
    """Test combining multiple sheets into a single dataframe."""
    tempdir = tempfile.mkdtemp()
    spreadsheet = op.join(tempdir, "multifile_iris.xlsx")
    iris = self.project.load_dataset("iris")
    with pd.ExcelWriter(spreadsheet) as writer:
        iris.to_excel(writer, "iris1", index=False)
        iris.to_excel(writer, "iris2", index=False)
    schema = {
        'iris': {
            'path': spreadsheet,
            'sheetname': ['iris1', 'iris2'],
            'dataframe_rules': {
                'drop_duplicates': False
            }
        }
    }
    schema_fpath = op.join(tempdir, "multi_iris.yaml")
    with open(schema_fpath, "w") as fout:
        yaml.dump(schema, fout, Dumper=Dumper, default_flow_style=False)
    pr.add_project("multi_iris", schema_fpath)
    try:
        ideal = pd.concat((iris, iris), axis=0)
        actual = pr.Project('multi_iris').load_dataset("iris")
        self.assertDataFrameEqual(ideal, actual)
    finally:
        _remove_project("multi_iris", tempdir)
def test_integer_col_na_values(self):
    """Test if the loader can load columns with integers and NAs.

    This is necessary because NaNs cannot be represented by integers."""
    x = map(str, range(20))
    x[13] = ""
    df = pd.DataFrame.from_dict(dict(a=x, b=x))
    tempdir = tempfile.mkdtemp()
    outfile = op.join(tempdir, "testdata.csv")
    df.to_csv(outfile, index=False)
    specfile = op.join(tempdir, "dict.yaml")
    specs = dict(delimiter=',', dtypes={'a': int, 'b': int}, path=outfile)
    with open(specfile, "w") as fileobj:
        yaml.dump({'testdata': specs}, fileobj, Dumper=yaml.CDumper,
                  default_flow_style=False)
    pr.add_project("wrong_dtype", specfile)
    try:
        _pr = pr.Project("wrong_dtype")
        df = _pr.load_dataset("testdata")
        self.assertEqual(df['a'].dtype, float)
        self.assertEqual(df['b'].dtype, float)
    finally:
        pr.remove_project("wrong_dtype")
        shutil.rmtree(tempdir)
def test_load_dataset_wrong_dtypes_in_spec(self):
    """Test if the loader can safely load columns that have a wrongly
    specified data type in the schema.
    """
    # Make a file with two columns, both specified as integers in the
    # dtypes, but one has random string values.
    x = np.random.randint(0, 10, size=(100, 2))
    dframe = pd.DataFrame(x, columns=['a', 'b'])
    tempdir = tempfile.mkdtemp()
    outfile = op.join(tempdir, "testdata.csv")
    _ix = np.random.randint(0, 100, size=(5,))
    dframe['b'][_ix] = "aa"
    dframe.to_csv(outfile, index=False)
    specs = dict(delimiter=',', dtypes={'a': int, 'b': int}, path=outfile)
    specfile = op.join(tempdir, "dict.yaml")
    with open(specfile, "w") as fileobj:
        yaml.dump({'testdata': specs}, fileobj, Dumper=yaml.CDumper,
                  default_flow_style=False)
    pr.add_project("wrong_dtype", specfile)
    try:
        _pr = pr.Project("wrong_dtype")
        with warnings.catch_warnings(record=True) as catcher:
            dframe = _pr.load_dataset("testdata")
            assert len(catcher) == 1
            assert issubclass(catcher[-1].category, UserWarning)
    finally:
        pr.remove_project("wrong_dtype")
        shutil.rmtree(tempdir)
def test_random_row_selection(self):
    """Test if a random sample of rows is drawn when the nrows
    parameter asks for one."""
    iris_specs = pr.get_schema_specs("pysemantic", "iris")
    iris_specs['nrows'] = dict(random=True, count=50)
    project = pr.Project(schema={'iris': iris_specs})
    loaded = project.load_dataset('iris')
    self.assertEqual(loaded.shape[0], 50)
    ideal_ix = np.arange(50)
    self.assertFalse(np.all(loaded.index.values == ideal_ix))
def cli(arguments):
    """cli - The main CLI argument parser.

    :param arguments: command line arguments, as parsed by docopt
    :type arguments: dict
    :return: None
    """
    if arguments.get("list", False):
        if arguments['--project'] is None:
            pr.view_projects()
        else:
            proj_name = arguments.get('--project')
            dataset_names = pr.get_datasets(proj_name)
            for name in dataset_names:
                print name
    elif arguments.get("add", False):
        proj_name = arguments.get("PROJECT_NAME")
        proj_spec = arguments.get("PROJECT_SPECFILE")
        proj_spec = op.abspath(proj_spec)
        pr.add_project(proj_name, proj_spec)
    elif arguments.get("remove", False):
        proj_name = arguments.get("PROJECT_NAME")
        if arguments['--dataset'] is None:
            if not pr.remove_project(proj_name):
                print "The project {0} doesn't exist.".format(proj_name)
        else:
            pr.remove_dataset(proj_name, arguments['--dataset'])
    elif arguments.get("set-schema", False):
        try:
            proj_name = arguments.get("PROJECT_NAME")
            proj_spec = arguments.get("SCHEMA_FPATH")
            proj_spec = op.abspath(proj_spec)
            pr.set_schema_fpath(proj_name, proj_spec)
        except MissingProject:
            msg = """Project {} not found in the configuration. Please use
            $ semantic add
            to register the project.""".format(arguments.get("PROJECT_NAME"))
            print msg
    elif arguments.get("set-specs", False):
        proj_name = arguments.get("PROJECT_NAME")
        dataset_name = arguments.get("--dataset")
        newspecs = {}
        if arguments.get("--path", False):
            newspecs['path'] = arguments.get("--path")
        if arguments.get("--dlm", False):
            newspecs['delimiter'] = arguments.get("--dlm")
        pr.set_schema_specs(proj_name, dataset_name, **newspecs)
    elif arguments.get("add-dataset", False):
        proj_name = arguments.get('--project')
        dataset_name = arguments.get("DATASET_NAME")
        specs = dict(path=arguments["--path"], delimiter=arguments["--dlm"])
        pr.add_dataset(proj_name, dataset_name, specs)
    elif arguments.get("export", False):
        project = pr.Project(arguments.get("PROJECT_NAME"))
        project.export_dataset(arguments.get("--dataset"),
                               outpath=arguments.get("OUTPATH"))
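# A minimal driver sketch for cli(), assuming the module docstring holds
# the docopt usage spec, as is conventional for docopt-based scripts; the
# real entry point may be wired up differently.
if __name__ == "__main__":
    from docopt import docopt
    cli(docopt(__doc__))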
def test_nrows_callable(self):
    """Check if specifying the nrows argument as a callable works."""
    # The callable keeps only the even-numbered rows of the dataset.
    nrows = lambda x: np.remainder(x, 2) == 0
    iris_specs = pr.get_schema_specs("pysemantic", "iris")
    iris_specs['nrows'] = nrows
    project = pr.Project(schema={'iris': iris_specs})
    loaded = project.load_dataset('iris')
    self.assertEqual(loaded.shape[0], 75)
    ideal_ix = np.arange(150, step=2)
    np.testing.assert_allclose(ideal_ix, loaded.index.values)
def test_row_selection_random_range(self):
    """Check if rows within a given range can be selected from the
    dataset in random order."""
    iris_specs = pr.get_schema_specs("pysemantic", "iris")
    iris_specs['nrows'] = {'range': [25, 75], 'random': True}
    iris_specs['header'] = 0
    del iris_specs['dtypes']
    iris_specs['column_names'] = colnames(iris_specs['path'])
    project = pr.Project(schema={'iris': iris_specs})
    loaded = project.load_dataset('iris')
    self.assertEqual(loaded.shape[0], 50)
    ideal_ix = np.arange(50)
    self.assertFalse(np.all(loaded.index.values == ideal_ix))
def test_random_row_selection_within_range(self):
    """Check if randomly selecting rows within a range works."""
    iris_specs = pr.get_schema_specs("pysemantic", "iris")
    iris_specs['nrows'] = {'range': [25, 75], 'count': 10, 'random': True}
    iris_specs['header'] = 0
    del iris_specs['dtypes']
    iris_specs['column_names'] = colnames(iris_specs['path'])
    project = pr.Project(schema={'iris': iris_specs})
    loaded = project.load_dataset('iris')
    self.assertEqual(loaded.shape[0], 10)
    ix = loaded.index.values
    self.assertTrue(ix.max() <= 50)
def setUp(self):
    iris_specs = {
        'sep': ',',
        'dtype': {
            'Petal Length': float,
            'Sepal Width': float,
            'Petal Width': float,
            'Sepal Length': float,
            'Species': str
        },
        'usecols': ['Petal Length', 'Sepal Length', 'Sepal Width',
                    'Petal Width', 'Species'],
        'nrows': 150,
        'filepath_or_buffer': op.join(op.abspath(op.dirname(__file__)),
                                      "testdata", "iris.csv")
    }
    copied_iris_specs = deepcopy(iris_specs)
    copied_iris_specs.update({
        'filepath_or_buffer': iris_specs['filepath_or_buffer'].replace(
            "iris", "iris2")
    })
    multi_iris_specs = [iris_specs, copied_iris_specs]
    person_activity_specs = {
        'sep': '\t',
        'dtype': {
            'activity': str,
            'sequence_name': str,
            'tag': str,
            'x': float,
            'y': float,
            'z': float,
        },
        'usecols': ['activity', 'sequence_name', 'tag', 'x', 'y', 'z',
                    'date'],
        'parse_dates': ['date'],
        'nrows': 100,
        'filepath_or_buffer': op.join(op.abspath(op.dirname(__file__)),
                                      "testdata", "person_activity.tsv")
    }
    expected = {
        'iris': iris_specs,
        'person_activity': person_activity_specs,
        'multi_iris': multi_iris_specs
    }
    self.expected_specs = expected
    self.project = pr.Project(project_name="pysemantic")
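# For orientation: specs like iris_specs above are the pandas parser
# arguments that pysemantic derives from a data-dictionary entry. A
# schema entry that produces such arguments can be registered the same
# way the temporary projects elsewhere in these tests do it. A minimal
# sketch, with hypothetical file names (the full schema grammar may
# support more keys than shown here):
#
#     specs = dict(path="testdata/iris.csv", delimiter=',', nrows=150,
#                  dtypes={'Species': str, 'Sepal Length': float})
#     with open("dict.yaml", "w") as fileobj:
#         yaml.dump({'iris': specs}, fileobj, Dumper=Dumper,
#                   default_flow_style=False)
#     pr.add_project("demo_project", "dict.yaml")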
def test_export_dataset_csv(self):
    """Test if the default csv exporter works."""
    tempdir = tempfile.mkdtemp()
    project = pr.Project("pysemantic")
    try:
        dataset = "iris"
        outpath = op.join(tempdir, dataset + ".csv")
        project.export_dataset(dataset, outpath=outpath)
        self.assertTrue(op.exists(outpath))
        loaded = pd.read_csv(outpath)
        self.assertDataFrameEqual(loaded, project.load_dataset(dataset))
    finally:
        shutil.rmtree(tempdir)
def test_update_dataset_deleted_columns(self):
    """Test if the update_dataset method removes column specifications."""
    tempdir = tempfile.mkdtemp()
    _pr = pr.Project("pysemantic")
    iris = _pr.load_dataset("iris")
    outpath = op.join(tempdir, "iris.csv")
    with open(TEST_DATA_DICT, "r") as fid:
        org_specs = yaml.load(fid, Loader=Loader)
    try:
        del iris['Species']
        _pr.update_dataset("iris", iris, path=outpath)
        pr_reloaded = pr.Project("pysemantic")
        iris_reloaded = pr_reloaded.load_dataset("iris")
        self.assertNotIn("Species", iris_reloaded.columns)
        self.assertNotIn("Species", pr_reloaded.column_rules["iris"])
    finally:
        shutil.rmtree(tempdir)
        with open(TEST_DATA_DICT, "w") as fid:
            yaml.dump(org_specs, fid, Dumper=Dumper,
                      default_flow_style=False)
def test_export_dataset_hdf(self):
    """Test if exporting the dataset to hdf works."""
    tempdir = tempfile.mkdtemp()
    project = pr.Project("pysemantic")
    try:
        for dataset in project.datasets:
            if dataset not in ("bad_iris", "random_row_iris"):
                outpath = op.join(tempdir, dataset + ".h5")
                project.export_dataset(dataset, outpath=outpath)
                self.assertTrue(op.exists(outpath))
                group = r'/{0}/{1}'.format(project.project_name, dataset)
                loaded = pd.read_hdf(outpath, group)
                self.assertDataFrameEqual(loaded,
                                          project.load_dataset(dataset))
    finally:
        shutil.rmtree(tempdir)
def test_add_dataset(self):
    """Test if the add-dataset subcommand adds datasets to projects."""
    tempdir = tempfile.mkdtemp()
    outfile = op.join(tempdir, "testdata.csv")
    dframe = pd.DataFrame(np.random.random((10, 2)), columns=['a', 'b'])
    dframe.to_csv(outfile, index=False)
    cmd = ("semantic add-dataset testdata --project pysemantic --path {}"
           " --dlm ,")
    cmd = cmd.format(outfile).split(" ")
    try:
        subprocess.check_call(cmd, env=self.testenv)
        _pr = pr.Project("pysemantic")
        self.assertIn("testdata", _pr.datasets)
        specs = dict(path=outfile, delimiter=',')
        actual = pr.get_schema_specs("pysemantic", "testdata")
        self.assertKwargsEqual(specs, actual)
    finally:
        pr.remove_dataset("pysemantic", "testdata")
        shutil.rmtree(tempdir)
def test_init_project_yaml_dump(self):
    """Test initialization of Project class with the raw yaml dump."""
    project_specs = pr.get_schema_specs('pysemantic')
    project = pr.Project(schema=project_specs)
    loaded = project.load_datasets()
    self.assertItemsEqual(loaded.keys(),
                          ('iris', 'person_activity', 'multi_iris',
                           'bad_iris', 'random_row_iris'))
    dframe = pd.read_csv(**self.expected_specs['iris'])
    self.assertDataFrameEqual(loaded['iris'], dframe)
    dframe = pd.read_csv(**self.expected_specs['person_activity'])
    self.assertDataFrameEqual(loaded['person_activity'], dframe)
    dframes = [pd.read_csv(**args)
               for args in self.expected_specs['multi_iris']]
    dframes = [x.drop_duplicates() for x in dframes]
    dframe = pd.concat(dframes)
    dframe.set_index(np.arange(dframe.shape[0]), inplace=True)
    self.assertDataFrameEqual(loaded['multi_iris'], dframe)
def test_regex_separator(self):
    """Test if the project properly loads a dataset when it encounters
    regex separators.
    """
    tempdir = tempfile.mkdtemp()
    outfile = op.join(tempdir, "sample.txt")
    data = ["col1"] + map(str, range(10))
    with open(outfile, "w") as fileobj:
        fileobj.write("\n".join(data))
    specs = dict(path=outfile, delimiter=r'\n', dtypes={'col1': int})
    pr.add_dataset("pysemantic", "sample_dataset", specs)
    try:
        _pr = pr.Project("pysemantic")
        with warnings.catch_warnings(record=True) as catcher:
            dframe = _pr.load_dataset("sample_dataset")
            assert len(catcher) == 2
            assert issubclass(catcher[1].category, ParserWarning)
        data.remove("col1")
        self.assertItemsEqual(map(int, data), dframe['col1'].tolist())
    finally:
        pr.remove_dataset("pysemantic", "sample_dataset")
        shutil.rmtree(tempdir)
def test_reload_data_dict(self):
    """Test if the reload_data_dict method works."""
    project = pr.Project("pysemantic")
    tempdir = tempfile.mkdtemp()
    datapath = op.join(tempdir, "data.csv")
    ideal = pd.DataFrame(np.random.randint(0, 9, size=(10, 5)),
                         columns=map(str, range(5)))
    ideal.to_csv(datapath, index=False)
    with open(TEST_DATA_DICT, "r") as fid:
        specs = yaml.load(fid, Loader=Loader)
    specs['fakedata'] = dict(path=datapath)
    with open(TEST_DATA_DICT, "w") as fid:
        yaml.dump(specs, fid, Dumper=Dumper)
    try:
        project.reload_data_dict()
        actual = project.load_dataset("fakedata")
        self.assertDataFrameEqual(ideal, actual)
    finally:
        shutil.rmtree(tempdir)
        del specs['fakedata']
        with open(TEST_DATA_DICT, "w") as fid:
            yaml.dump(specs, fid, Dumper=Dumper)
def test_error_bad_lines_correction(self):
    """Test if the correction for bad lines works."""
    iris_path = op.join(op.abspath(op.dirname(__file__)), "testdata",
                        "iris.csv")
    with open(iris_path, "r") as fid:
        iris_lines = fid.readlines()
    tempdir = tempfile.mkdtemp()
    outpath = op.join(tempdir, "bad_iris.csv")
    # Corrupt one line by appending extra fields.
    iris_lines[50] = iris_lines[50].rstrip() + ",0,23,\n"
    with open(outpath, 'w') as fid:
        fid.writelines(iris_lines)
    data_dict = op.join(tempdir, "dummy_project.yaml")
    specs = {'bad_iris': {'path': outpath}}
    with open(data_dict, "w") as fid:
        yaml.dump(specs, fid, Dumper=Dumper, default_flow_style=False)
    pr.add_project('dummy_project', data_dict)
    try:
        project = pr.Project('dummy_project')
        df = project.load_dataset('bad_iris')
        self.assertItemsEqual(df.shape, (147, 5))
    finally:
        _remove_project("dummy_project", tempdir)
def __enter__(self):
    """Register the dummy project and return a Project instance for it."""
    pr.add_project("dummy_project", self.schema_fpath)
    return pr.Project("dummy_project")
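def __exit__(self, exc_type, exc_value, traceback):
    # A minimal sketch of the matching teardown, assuming the context
    # manager should unregister the project it added in __enter__; the
    # real implementation may also clean up temporary files.
    pr.remove_project("dummy_project")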
def test_load_excel_sheetname(self):
    """Test if specifying the sheetname loads the correct dataframe."""
    xl_project = pr.Project("test_excel")
    ideal_iris = self.project.load_dataset("iris")
    actual_iris = xl_project.load_dataset("iris_renamed")
    self.assertDataFrameEqual(ideal_iris, actual_iris)
def test_load_excel(self):
    """Test if excel spreadsheets are read properly from the schema."""
    xl_project = pr.Project("test_excel")
    ideal_iris = self.project.load_dataset("iris")
    actual_iris = xl_project.load_dataset("iris")
    self.assertDataFrameEqual(ideal_iris, actual_iris)
def test_na_reps(self):
    """Test if the NA representations are parsed properly."""
    project = pr.Project("pysemantic")
    loaded = project.load_dataset("bad_iris")
    self.assertItemsEqual(loaded.shape, (300, 5))