def test_integer_col_na_values(self):
        """Test if the loader can load columns with integers and NAs.

        This is necessary because NaNs cannot be represented by integers."""
        x = list(map(str, range(20)))
        x[13] = ""
        df = pd.DataFrame.from_dict(dict(a=x, b=x))
        tempdir = tempfile.mkdtemp()
        outfile = op.join(tempdir, "testdata.csv")
        df.to_csv(outfile, index=False)
        specfile = op.join(tempdir, "dict.yaml")
        specs = dict(delimiter=',', dtypes={'a': int, 'b': int}, path=outfile)
        with open(specfile, "w") as fileobj:
            yaml.dump({'testdata': specs}, fileobj, Dumper=Dumper,
                      default_flow_style=False)
        pr.add_project("wrong_dtype", specfile)
        try:
            _pr = pr.Project("wrong_dtype")
            df = _pr.load_dataset("testdata")
            self.assertEqual(df['a'].dtype, float)
            self.assertEqual(df['b'].dtype, float)
        finally:
            pr.remove_project("wrong_dtype")
            shutil.rmtree(tempdir)
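The behaviour checked here comes straight from pandas: an integer column that contains a missing value is promoted to float, because NumPy integer arrays have no representation for NaN. A minimal standalone illustration, independent of pysemantic:

from io import StringIO

import pandas as pd

buf = StringIO("a,b\n1,1\n,2\n3,3\n")
frame = pd.read_csv(buf)
print(frame['a'].dtype)  # float64: the blank field forces promotion
print(frame['b'].dtype)  # int64: no missing values, stays integer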
 def test_load_dataset_wrong_dtypes_in_spec(self):
     """Test if the Loader can safely load columns that have a wrongly
     specified data type in the schema.
     """
     # Make a file with two columns, both specified as integers in the
     # dtypes, but one has random string types.
     x = np.random.randint(0, 10, size=(100, 2))
     dframe = pd.DataFrame(x, columns=['a', 'b'])
     tempdir = tempfile.mkdtemp()
     outfile = op.join(tempdir, "testdata.csv")
     _ix = np.random.randint(0, 100, size=(5,))
     dframe.loc[_ix, 'b'] = "aa"
     dframe.to_csv(outfile, index=False)
     specs = dict(delimiter=',', dtypes={'a': int, 'b': int}, path=outfile)
     specfile = op.join(tempdir, "dict.yaml")
     with open(specfile, "w") as fileobj:
         yaml.dump({'testdata': specs}, fileobj, Dumper=Dumper,
                   default_flow_style=False)
     pr.add_project("wrong_dtype", specfile)
     try:
         _pr = pr.Project("wrong_dtype")
         with warnings.catch_warnings(record=True) as catcher:
             dframe = _pr.load_dataset("testdata")
             assert len(catcher) == 1
             assert issubclass(catcher[-1].category, UserWarning)
     finally:
         pr.remove_project("wrong_dtype")
         shutil.rmtree(tempdir)
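The single UserWarning captured above implies a fallback of roughly this shape: attempt the declared dtype, and on failure warn and keep the column as parsed. A sketch of that pattern (illustrative only, not pysemantic's actual implementation):

import warnings

def coerce_or_warn(series, dtype):
    """Coerce a column to its declared dtype, warning instead of failing."""
    try:
        return series.astype(dtype)
    except (ValueError, TypeError):
        warnings.warn("column {0!r} could not be coerced to {1!r}".format(
            series.name, dtype), UserWarning)
        return series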
 def test_add(self):
     """Test if the `add` subcommand can add projects to the config file."""
     try:
         cmd = ['semantic', 'add', 'dummy_added_project', '/tmp/dummy.yaml']
         subprocess.check_call(cmd, env=self.testenv)
         projects = pr.get_projects()
         self.assertIn(("dummy_added_project", "/tmp/dummy.yaml"), projects)
     finally:
         pr.remove_project("dummy_added_project")
 def test_remove(self):
     """Test if the remove subcommand can remove projects."""
     pr.add_project("dummy_project_2", "/foo/baz.yaml")
     try:
         cmd = ['semantic', 'remove', 'dummy_project_2']
         subprocess.check_call(cmd, env=self.testenv)
         projects = pr.get_projects()
         proj_names = [p[0] for p in projects]
         self.assertNotIn("dummy_project_2", proj_names)
     finally:
         pr.remove_project("dummy_project_2")
 def test_relative_path(self):
     """Check if the set-schema and add subcommands convert relative paths
     from the cmdline to absolute paths in the config file.
     """
     try:
         cmd = ['semantic', 'set-schema', 'dummy_project', './foo.yaml']
         subprocess.check_call(cmd, env=self.testenv)
         self.assertTrue(op.isabs(pr.get_default_specfile('dummy_project')))
         pr.remove_project("dummy_project")
         cmd = ['semantic', 'add', 'dummy_project', './foo.yaml']
         subprocess.check_call(cmd, env=self.testenv)
         self.assertTrue(op.isabs(pr.get_default_specfile('dummy_project')))
     finally:
         pr.remove_project("dummy_project_1")
def _remove_project(project_name, project_files=None):
    pr.remove_project(project_name)
    if project_files is not None:
        # Accept a single path or an iterable of paths; checking for str
        # explicitly avoids iterating over the characters of a lone path.
        if isinstance(project_files, str):
            project_files = [project_files]
        for path in project_files:
            if op.isfile(path):
                os.unlink(path)
            elif op.isdir(path):
                shutil.rmtree(path)
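A typical call, with illustrative paths, removes the project entry and its files in one go:

_remove_project("wrong_dtype", ["/tmp/dict.yaml", "/tmp/testdata.csv"])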
 def test_indexcol_not_in_usecols(self):
     """
     Test if the specified index column is added to the usecols
     argument."""
     schema = {'iris': {'path': self.data_specs['iris']['path'],
                        'index_col': 'Species',
                        'use_columns': ['Sepal Length', 'Petal Width']}}
     with tempfile.NamedTemporaryFile(mode="w", delete=False) as f_schema:
         yaml.dump(schema, f_schema, Dumper=Dumper, default_flow_style=False)
     pr.add_project("testindex_usecols", f_schema.name)
     try:
         project = pr.Project("testindex_usecols")
         df = project.load_dataset("iris")
         self.assertEqual(df.index.name, "Species")
         self.assertItemsEqual(df.columns, ['Sepal Length', 'Petal Width'])
     finally:
         pr.remove_project("testindex_usecols")
         os.unlink(f_schema.name)
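The behaviour under test maps onto a pandas constraint: when usecols and index_col are both given by name, the index column must be listed in usecols, so the loader has to append it before reading. The raw call it reduces to (iris_path stands in for the dataset's path):

df = pd.read_csv(iris_path,
                 usecols=['Sepal Length', 'Petal Width', 'Species'],
                 index_col='Species')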
 def test_index_column_rules(self):
     """Test if column rules specified for index columns are enforced."""
     schema = {'iris': {'path': self.data_specs['iris']['path'],
                        'index_col': 'Species',
                        'dataframe_rules': {'drop_duplicates': False},
                        'column_rules': {'Species': {'regex': '.*e.*'}}}}
     with tempfile.NamedTemporaryFile(mode="w", delete=False) as f_schema:
         yaml.dump(schema, f_schema, Dumper=Dumper, default_flow_style=False)
     pr.add_project("index_col_rules", f_schema.name)
     try:
         project = pr.Project("index_col_rules")
         df = project.load_dataset("iris")
         self.assertEqual(df.index.name.lower(), 'species')
         self.assertNotIn("virginica", df.index.unique())
         self.assertItemsEqual(df.shape, (100, 4))
     finally:
         pr.remove_project("index_col_rules")
         os.unlink(f_schema.name)
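In plain pandas terms the regex rule amounts to filtering the column before it becomes the index; of the three iris species only 'virginica' lacks an 'e', which is why exactly 100 rows survive. Roughly (illustrative sketch, not the library's code):

df = df[df['Species'].str.contains('.*e.*', regex=True)]
df = df.set_index('Species')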
def cli(arguments):
    """cli - The main CLI argument parser.

    :param arguments: command line arguments, as parsed by docopt
    :type arguments: dict
    :return: None
    """
    if arguments.get("list", False):
        if arguments["--project"] is None:
            pr.view_projects()
        else:
            proj_name = arguments.get("--project")
            dataset_names = pr.get_datasets(proj_name)
            for name in dataset_names:
                print(name)
    elif arguments.get("add", False):
        proj_name = arguments.get("PROJECT_NAME")
        proj_spec = arguments.get("PROJECT_SPECFILE")
        proj_spec = op.abspath(proj_spec)
        pr.add_project(proj_name, proj_spec)
    elif arguments.get("remove", False):
        proj_name = arguments.get("PROJECT_NAME")
        if arguments["--dataset"] is None:
            if not pr.remove_project(proj_name):
                print "Removing the project {0} failed.".format(proj_name)
        else:
            pr.remove_dataset(proj_name, arguments["--dataset"])
    elif arguments.get("set-schema", False):
        try:
            proj_name = arguments.get("PROJECT_NAME")
            proj_spec = arguments.get("SCHEMA_FPATH")
            proj_spec = op.abspath(proj_spec)
            pr.set_schema_fpath(proj_name, proj_spec)
        except MissingProject:
            msg = """Project {} not found in the configuration. Please use
            $ semantic add
            to register the project.""".format(
                arguments.get("PROJECT_NAME")
            )
            print(msg)
    elif arguments.get("set-specs", False):
        proj_name = arguments.get("PROJECT_NAME")
        dataset_name = arguments.get("--dataset")
        newspecs = {}
        if arguments.get("--path", False):
            newspecs["path"] = arguments.get("--path")
        if arguments.get("--dlm", False):
            newspecs["delimiter"] = arguments.get("--dlm")
        pr.set_schema_specs(proj_name, dataset_name, **newspecs)
    elif arguments.get("add-dataset", False):
        proj_name = arguments.get("--project")
        dataset_name = arguments.get("DATASET_NAME")
        specs = dict(path=arguments["--path"], delimiter=arguments["--dlm"])
        pr.add_dataset(proj_name, dataset_name, specs)
    elif arguments.get("export", False):
        project = pr.Project(arguments.get("PROJECT_NAME"))
        project.export_dataset(arguments.get("--dataset"),
                               outpath=arguments.get("OUTPATH"))
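For reference, the argument keys consumed above imply a docopt usage string along these lines (reconstructed for illustration; the authoritative version lives in the package's CLI entry point):

USAGE = """semantic

Usage:
    semantic list [--project=<name>]
    semantic add PROJECT_NAME PROJECT_SPECFILE
    semantic remove PROJECT_NAME [--dataset=<name>]
    semantic set-schema PROJECT_NAME SCHEMA_FPATH
    semantic set-specs PROJECT_NAME --dataset=<name> [--path=<path>] [--dlm=<dlm>]
    semantic add-dataset DATASET_NAME --project=<name> --path=<path> --dlm=<dlm>
    semantic export PROJECT_NAME OUTPATH [--dataset=<name>]
"""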
 def test_invalid_literals(self):
     """Test if columns containing invalid literals are parsed safely."""
     tempdir = tempfile.mkdtemp()
     schema_fpath = op.join(tempdir, "schema.yml")
     data_fpath = op.join(tempdir, "data.csv")
     data = pd.DataFrame.from_dict(dict(col_a=range(10)))
     data['col_b'] = ["x"] * 10
     data.to_csv(data_fpath, index=False)
     schema = {'dataset': {'path': data_fpath, 'dtypes': {'col_a': int,
                                                          'col_b': int}}}
     with open(schema_fpath, "w") as fin:
         yaml.dump(schema, fin, Dumper=Dumper, default_flow_style=False)
     pr.add_project("invalid_literal", schema_fpath)
     try:
         pr.Project("invalid_literal").load_dataset('dataset')
     finally:
         shutil.rmtree(tempdir)
         pr.remove_project("invalid_literal")
 def test_load_excel_multisheet(self):
     """Test combining multiple sheets into a single dataframe."""
     tempdir = tempfile.mkdtemp()
     spreadsheet = op.join(tempdir, "multifile_iris.xlsx")
     iris = self.project.load_dataset("iris")
     with pd.ExcelWriter(spreadsheet) as writer:
         iris.to_excel(writer, "iris1", index=False)
         iris.to_excel(writer, "iris2", index=False)
     schema = {'iris': {'path': spreadsheet, 'sheetname': ['iris1', 'iris2'],
                        'dataframe_rules': {'drop_duplicates': False}}}
     schema_fpath = op.join(tempdir, "multi_iris.yaml")
     with open(schema_fpath, "w") as fout:
         yaml.dump(schema, fout, Dumper=Dumper, default_flow_style=False)
     pr.add_project("multi_iris", schema_fpath)
     try:
         ideal = pd.concat((iris, iris), axis=0)
         actual = pr.Project('multi_iris').load_dataset("iris")
         self.assertDataFrameEqual(ideal, actual)
     finally:
         pr.remove_project("multi_iris")
         shutil.rmtree(tempdir)
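The plain-pandas equivalent of that schema reads each listed sheet and stacks the frames vertically. The schema key sheetname mirrors the older pandas spelling; current pandas calls it sheet_name, and a list value returns a dict of frames:

sheets = pd.read_excel(spreadsheet, sheet_name=['iris1', 'iris2'])
combined = pd.concat(list(sheets.values()), axis=0)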
 def test_relpath(self):
     """Test if specifying datapaths relative to schema workds."""
     df = pd.DataFrame(np.random.randint(low=1, high=10, size=(10, 2)),
                       columns="a b".split())
     tempdir = tempfile.mkdtemp()
     data_dir = op.join(tempdir, "data")
     os.mkdir(data_dir)
     schema_fpath = op.join(tempdir, "schema.yml")
     data_fpath = op.join(data_dir, "data.csv")
     df.to_csv(data_fpath, index=False)
     schema = {'data': {'path': op.join("data", "data.csv"),
                        "dataframe_rules": {"drop_duplicates": False}}}
     with open(schema_fpath, "w") as fin:
         yaml.dump(schema, fin, Dumper=Dumper, default_flow_style=False)
     pr.add_project("relpath", schema_fpath)
     try:
         loaded = pr.Project("relpath").load_dataset("data")
         self.assertDataFrameEqual(loaded, df)
     finally:
         pr.remove_project("relpath")
         shutil.rmtree(tempdir)
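The resolution rule being exercised reduces to joining a relative datapath onto the schema file's directory. A sketch of that logic (illustrative, not the library's implementation):

def resolve_datapath(schema_fpath, datapath):
    """Interpret a relative datapath as relative to the schema file."""
    if op.isabs(datapath):
        return datapath
    return op.join(op.dirname(op.abspath(schema_fpath)), datapath)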
 def test_index_column_exclude(self):
     """Test if values are excluded from index column if so specified."""
     tempdir = tempfile.mkdtemp()
     schema_fpath = op.join(tempdir, "schema.yml")
     data_fpath = op.join(tempdir, "data.csv")
     df = pd.DataFrame.from_dict({'index': np.arange(10), 'col_a':
                                  np.arange(10)})
     df.to_csv(data_fpath, index=False)
     schema = {'data': {'path': data_fpath, 'index_col': 'index',
                        'column_rules': {'index': {'exclude': [1, 2]}}}}
     with open(schema_fpath, "w") as fin:
         yaml.dump(schema, fin, Dumper=Dumper, default_flow_style=False)
     pr.add_project("index_exclude", schema_fpath)
     try:
         df = pr.Project("index_exclude").load_dataset("data")
         self.assertItemsEqual(df.shape, (8, 1))
         self.assertEqual(df.index.name, "index")
         self.assertNotIn(1, df.index)
         self.assertNotIn(2, df.index)
     finally:
         pr.remove_project("index_exclude")
         shutil.rmtree(tempdir)
 def test_error_bad_lines_correction(self):
     """test if the correction for bad lines works."""
     tempdir = tempfile.mkdtemp()
     iris_path = op.join(op.abspath(op.dirname(__file__)), "testdata",
                         "iris.csv")
     with open(iris_path, "r") as fid:
         iris_lines = fid.readlines()
     outpath = op.join(tempdir, "bad_iris.csv")
     iris_lines[50] = iris_lines[50].rstrip() + ",0,23,\n"
     with open(outpath, 'w') as fid:
         fid.writelines(iris_lines)
     data_dict = op.join(tempdir, "dummy_project.yaml")
     specs = {'bad_iris': {'path': outpath}}
     with open(data_dict, "w") as fid:
         yaml.dump(specs, fid, Dumper=Dumper, default_flow_style=False)
     pr.add_project('dummy_project', data_dict)
     try:
         project = pr.Project('dummy_project')
         df = project.load_dataset('bad_iris')
         self.assertItemsEqual(df.shape, (146, 5))
     finally:
         shutil.rmtree(tempdir)
         pr.remove_project('dummy_project')
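The correction being tested corresponds to pandas' bad-line handling: the row that gained three extra fields is skipped instead of aborting the parse. In raw pandas this is error_bad_lines=False on older versions, or on pandas >= 1.3:

df = pd.read_csv(outpath, on_bad_lines='skip')  # drops the malformed row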
 def test_nrows_shuffling(self):
     """test_relpath"""
     """Test if the shuffle parameter works with the nrows parameter."""
     tempdir = tempfile.mkdtemp()
     schema_fpath = op.join(tempdir, "schema.yml")
     data_fpath = op.join(tempdir, "data.csv")
     X = np.c_[np.arange(10), np.arange(10)]
     ix = list(range(5)) + "a b c d e".split()
     df = pd.DataFrame(X, index=ix)
     df.to_csv(data_fpath, index_label="index")
     schema = {'data': {'path': data_fpath, "index_col": "index",
                        'nrows': {'count': 5, "shuffle": True}}}
     with open(schema_fpath, "w") as fin:
         yaml.dump(schema, fin, Dumper=Dumper, default_flow_style=False)
     pr.add_project("nrows_shuffle", schema_fpath)
     try:
         df = pr.Project("nrows_shuffle").load_dataset("data")
         for row_label in "a b c d e".split():
             self.assertNotIn(row_label, df.index)
         self.assertFalse(np.all(df.index == range(5)))
     finally:
         pr.remove_project("nrows_shuffle")
         shutil.rmtree(tempdir)
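The nrows spec with count and shuffle combines two steps that are easy to state in plain pandas: read only the first count rows, then permute their order. An equivalent sketch (illustrative):

subset = pd.read_csv(data_fpath, index_col='index', nrows=5)
subset = subset.sample(frac=1)  # returns the same five rows, shuffled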
 def test_global_na_reps(self):
     """Test is specifying a global NA value for a dataset works."""
     tempdir = tempfile.mkdtemp()
     df = pd.DataFrame(np.random.rand(10, 10))
     ix = np.random.randint(0, df.shape[0], size=(5,))
     ix = np.unique(ix)
     for i in range(ix.shape[0]):
         df.iloc[ix[i], ix[i]] = "foobar"
     fpath = op.join(tempdir, "test_na.csv")
     df.to_csv(fpath, index=False)
     schema = {'path': fpath, 'na_values': "foobar",
               'dataframe_rules': {'drop_na': False,
                                   'drop_duplicates': False}}
     schema_fpath = op.join(tempdir, "test_na.yaml")
     with open(schema_fpath, "w") as fid:
         yaml.dump({'test_na': schema}, fid, Dumper=Dumper,
                   default_flow_style=False)
     pr.add_project("test_na", schema_fpath)
     try:
         df = pr.Project("test_na").load_dataset("test_na")
         self.assertEqual(pd.isnull(df).sum().sum(), ix.shape[0])
     finally:
         pr.remove_project("test_na")
         shutil.rmtree(tempdir)
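The na_values schema key maps directly onto the pandas read_csv argument of the same name, so at the dataset level the behaviour is equivalent to:

df = pd.read_csv(fpath, na_values='foobar')  # 'foobar' cells become NaN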
 def test_na_reps_list(self):
     """Test if NA values work when specified as a list."""
     tempdir = tempfile.mkdtemp()
     df = pd.DataFrame(np.random.rand(10, 2))
     ix = np.random.randint(0, df.shape[0], size=(5,))
     ix = np.unique(ix)
     df.iloc[ix, 0] = "foo"
     df.iloc[ix, 1] = "bar"
     fpath = op.join(tempdir, "test_na.csv")
     df.to_csv(fpath, index=False)
     schema = {'path': fpath, 'na_values': ["foo", "bar"],
               'dataframe_rules': {'drop_na': False,
                                   'drop_duplicates': False}}
     schema_fpath = op.join(tempdir, "test_na.yaml")
     with open(schema_fpath, "w") as fid:
         yaml.dump({'test_na': schema}, fid, Dumper=Dumper,
                   default_flow_style=False)
     pr.add_project("test_na", schema_fpath)
     try:
         df = pr.Project("test_na").load_dataset("test_na")
         self.assertEqual(pd.isnull(df).sum().sum(), ix.shape[0] * 2)
     finally:
         pr.remove_project("test_na")
         shutil.rmtree(tempdir)
 def test_remove_project(self):
     """Test if removing a project works properly."""
     self.assertTrue(pr.remove_project("test_project"))
     self.assertRaises(NoSectionError, pr.get_default_specfile,
                       "test_project")
 def tearDown(self):
     pr.remove_project("dummy_project")