def test_updated_file(session_db, tmpdir, caplog):
    """Check that build_index only re-indexes a file after it has been touched.

    The last step indexes with prune="flag" and expects no re-index, plus a
    logged hint telling the user to set prune to 'delete'.
    """
    session, _ = session_db

    # Work with a concrete pathlib.Path so the filesystem operations below work
    workdir = Path(tmpdir)
    filename = "test1.nc"
    source = Path("test/data/indexing/longnames/output000/") / filename
    target = workdir / filename
    shutil.copy(str(source), str(target))

    # The first pass picks up the freshly copied file
    assert database.build_index(str(workdir), session) == 1

    # Nothing changed on disk, so a second pass should index nothing
    assert database.build_index(str(workdir), session) == 0

    # Touching the file after a pause should trigger a re-index
    time.sleep(1)
    target.touch()
    assert database.build_index(str(workdir), session) == 1

    # Flagging as missing does not remove the file from the database,
    # so the touched file should not be re-indexed here
    time.sleep(1)
    target.touch()
    with caplog.at_level(logging.WARNING):
        flagged = database.build_index(str(workdir), session, prune="flag")
        assert flagged == 0
        assert "Set prune to 'delete' to reindex updated files" in caplog.text
def test_delete_experiment(session_db):
    """Test that an experiment and all of its associated data can be deleted."""
    session, _ = session_db
    database.build_index("test/data/indexing/longnames", session)

    def find_longnames():
        # Fetch the "longnames" experiment row, or None when it does not exist
        return (
            session.query(database.NCExperiment)
            .filter(database.NCExperiment.experiment == "longnames")
            .one_or_none()
        )

    # make sure we actually did index something
    assert find_longnames() is not None

    database.delete_experiment("longnames", session)
    assert find_longnames() is None

    # check that all files are removed
    file_count = session.query(sa.func.count(database.NCFile.id)).scalar()
    assert file_count == 0

    # make sure all ncvars are removed
    var_count = session.query(sa.func.count(database.NCVar.id)).scalar()
    assert var_count == 0
def test_multiple_experiments(session_db):
    """Index two experiments in a single build_index call and expect both."""
    session, _ = session_db

    # The experiments hold duplicate data and therefore push against
    # some unique constraints
    experiment_dirs = [
        "test/data/indexing/multiple/experiment_a",
        "test/data/indexing/multiple/experiment_b",
    ]
    database.build_index(experiment_dirs, session)

    assert session.query(database.NCExperiment).count() == 2
def test_reindex_noupdate(session_db):
    """A second build_index over an unchanged directory indexes nothing."""
    session, db = session_db
    expt_dir = "test/data/indexing/broken_file"

    database.build_index(expt_dir, session)
    assert db.check()

    # re-run the index, make sure we don't re-index anything
    assert database.build_index(expt_dir, session) == 0
def test_distributed(client, session_db):
    """Build the index with a client passed in and expect one experiment."""
    session, db = session_db

    database.build_index("test/data/indexing/broken_file", session, client)
    assert db.check()

    experiments = session.query(database.NCExperiment).all()
    assert len(experiments) == 1
def test_reindex_force(session_db):
    """With force=True an already indexed file is indexed again."""
    session, db = session_db
    expt_dir = "test/data/indexing/broken_file"

    database.build_index(expt_dir, session)
    assert db.check()

    # re-run the index with force, make sure the file is re-indexed
    assert database.build_index(expt_dir, session, force=True) == 1
def test_update_nonew(session_db):
    """Re-indexing with prune="flag" and no new files indexes nothing."""
    session, db = session_db
    expt_dir = "test/data/indexing/broken_file"

    database.build_index(expt_dir, session)
    assert db.check()

    # re-run the index, make sure we don't re-index anything
    assert database.build_index(expt_dir, session, prune="flag") == 0
def test_update_newfile(session_db, tmpdir):
    """Index a directory, add a second file, then index again with update=True."""
    # NOTE(review): another test_update_newfile is defined later in this file
    # and rebinds the name, so this version is presumably never collected.
    session, _ = session_db
    data_dir = "test/data/indexing/longnames/output000/"

    shutil.copy(data_dir + "test1.nc", str(tmpdir / "test1.nc"))
    database.build_index(str(tmpdir), session)

    # add another file
    shutil.copy(data_dir + "test2.nc", str(tmpdir / "test2.nc"))
    database.build_index(str(tmpdir), session, update=True)
def test_update_newfile(session_db, tmpdir):
    """Adding one file to an indexed directory indexes exactly that file."""
    session, _ = session_db
    data_dir = "test/data/indexing/longnames/output000/"

    shutil.copy(data_dir + "test1.nc", str(tmpdir / "test1.nc"))
    database.build_index(str(tmpdir), session)

    # add another file
    shutil.copy(data_dir + "test2.nc", str(tmpdir / "test2.nc"))
    newly_indexed = database.build_index(str(tmpdir), session)
    assert newly_indexed == 1
def test_getvar_with_metadata(session_db):
    """Check the attrs on a variable loaded from the metadata experiment."""
    session, _ = session_db
    database.build_index("test/data/indexing/metadata", session)

    expected_attrs = {
        "long_name": "Test Variable",
        "contact": "The ACCESS Oracle",
        "email": "*****@*****.**",
        "created": "2018-01-01",
    }

    with querying.getvar("metadata", "test", session, decode_times=False) as var:
        for key, value in expected_attrs.items():
            assert var.attrs[key] == value
        # the description is only checked for presence, not content
        assert "description" in var.attrs
def test_single_broken(session_db):
    """Index the single_broken_file experiment and check the row counts."""
    session, _ = session_db
    database.build_index("test/data/indexing/single_broken_file", session)

    # ncfiles table -- should have two entries
    file_count = session.query(func.count(database.NCFile.id)).scalar()
    assert file_count == 2

    # ncvars table -- should have a single entry
    var_count = session.query(func.count(database.NCVar.id)).scalar()
    assert var_count == 1
def test_metadata(session_db):
    """Check contact, created and description of the indexed metadata experiment."""
    # NOTE(review): another test_metadata is defined later in this file and
    # rebinds the name, so this version is presumably never collected.
    session, _ = session_db
    database.build_index("test/data/indexing/metadata", session)

    # query metadata
    contact, created, description = session.query(
        database.NCExperiment.contact,
        database.NCExperiment.created,
        database.NCExperiment.description,
    ).one()

    assert contact == "The ACCESS Oracle"
    assert created == datetime(2018, 1, 1)
    assert len(description) > 0
def test_longnames(session_db):
    """Two NCVar rows in the longnames experiment map to one CFVariable."""
    session, _ = session_db
    database.build_index("test/data/indexing/longnames", session)

    # ncvars table -- should have two entries
    var_count = session.query(func.count(database.NCVar.id)).scalar()
    assert var_count == 2

    # generic table -- should only be a single variable
    cf_variables = session.query(database.CFVariable).all()
    assert len(cf_variables) == 1
    assert cf_variables[0].long_name == "Test Variable"
def test_missing_time_bounds(session_db):
    """Index the time_bounds experiment and expect one present file."""
    session, _ = session_db
    database.build_index("test/data/indexing/time_bounds", session)

    # Should have one experiment
    assert session.query(database.NCExperiment).count() == 1

    # And one correctly indexed (present) file
    files = session.query(database.NCFile).all()
    assert len(files) == 1
    assert files[0].present
def test_same_expt_name(session_db):
    """Two directories sharing a final path component index as two experiments."""
    session, _ = session_db

    # Different root directories, but the same final path component
    # (experiment name)
    experiment_dirs = [
        "test/data/indexing/multiple/experiment_a",
        "test/data/indexing/alternate/experiment_a",
    ]
    database.build_index(experiment_dirs, session)

    # the indexing shouldn't fail, and we should have two distinct
    # experiments with the same name
    same_name = (
        session.query(database.NCExperiment)
        .filter(database.NCExperiment.experiment == "experiment_a")
        .all()
    )
    assert len(same_name) == 2
    assert same_name[0].root_dir != same_name[1].root_dir
def test_metadata(session_db):
    """Test that metadata.yaml is read for an experiment during indexing"""
    session, _ = session_db
    database.build_index("test/data/indexing/metadata", session)

    # query metadata
    contact, created, description = session.query(
        database.NCExperiment.contact,
        database.NCExperiment.created,
        database.NCExperiment.description,
    ).one()

    assert contact == "The ACCESS Oracle"
    assert created == "2018-01-01"
    assert len(description) > 0
def test_following_symlinks(session_db):
    """A symlinked experiment is only indexed when followsymlinks=True."""
    session, _ = session_db
    expt_dir = "test/data/indexing/symlinked/experiment_a"

    # Indexing symlinked experiment should fail with default arguments
    database.build_index(expt_dir, session)
    assert session.query(database.NCExperiment).count() == 0

    # Now specify to follow symlinks
    database.build_index(expt_dir, session, followsymlinks=True)
    assert session.query(database.NCExperiment).count() == 1
def test_prune_missing_experiment(session_db):
    """Pruning an experiment that is not in the database raises RuntimeError."""
    session, db = session_db
    database.build_index("test/data/indexing/broken_file", session)
    assert db.check()

    # check that we have one file
    files = session.query(database.NCFile).all()
    assert len(files) == 1

    # prune an experiment name that was never indexed
    experiment = "incorrect_experiment"

    # The pattern needs a "{}" placeholder: without one, str.format() drops
    # its argument and the experiment name is never checked.
    # NOTE(review): assumes the error message ends with the experiment name;
    # confirm against database.prune_experiment.
    expected_message = "No such experiment: {}".format(experiment)
    with pytest.raises(RuntimeError, match=expected_message):
        database.prune_experiment(experiment, session)
def test_prune_broken(session_db):
    """Pruning the broken_file experiment leaves no NCFile rows."""
    session, db = session_db
    database.build_index("test/data/indexing/broken_file", session)
    assert db.check()

    # check that we have one file
    before = session.query(database.NCFile).all()
    assert len(before) == 1

    # prune experiment
    database.prune_experiment("broken_file", session)

    # now the database should be empty
    after = session.query(database.NCFile).all()
    assert len(after) == 0
def test_time_dimension(session_db):
    """Check file times and time variables for the time experiment."""
    session, db = session_db
    database.build_index("test/data/indexing/time", session)

    times = session.query(database.NCFile.time_start, database.NCFile.time_end)
    assert times.count() == 5  # should pick up 5 files

    # Use .is_(None) to build a SQL "IS NULL" test. A Python "column is None"
    # is evaluated immediately to False, which would turn the filter into a
    # constant false predicate and make the check below pass vacuously.
    missing_times = times.filter(
        database.NCFile.time_start.is_(None) | database.NCFile.time_end.is_(None)
    )
    assert missing_times.count() == 0  # all files should have times populated

    # there should be 5 separate time variables
    assert session.query(database.CFVariable).count() == 5

    # each file should have exactly one time dimension
    vars_per_file = (
        session.query(func.count(database.NCFile.ncvars))
        .join(database.NCFile.ncvars)
        .group_by(database.NCFile.id)
    )
    for row in vars_per_file.all():
        assert row[0] == 1
def test_index_with_prune_delete(session_db, tmpdir):
    """Re-indexing with prune="delete" drops the row of a removed file."""
    session, _ = session_db
    expt_dir = tmpdir / "expt"
    expt_dir.mkdir()
    indexed_file = expt_dir / "test1.nc"

    # copy the file to a new experiment directory and index
    shutil.copy("test/data/indexing/longnames/output000/test1.nc", str(indexed_file))
    database.build_index(str(expt_dir), session)

    # check that we have a valid file
    present_files = session.query(database.NCFile).filter(database.NCFile.present).all()
    assert len(present_files) == 1

    # remove the file and build with pruning
    os.remove(indexed_file)
    database.build_index(str(expt_dir), session, prune="delete")

    # now there should be no files at all
    assert session.query(database.NCFile).one_or_none() is None
def test_index_attributes(session_db):
    """Check the ncattributes index plus a subset of file and variable attrs."""
    session, _ = session_db
    database.build_index("test/data/querying", session)

    # the first index reported for ncattributes should be the one on ncvar_id
    expected_index = {
        "name": "ix_ncattributes_ncvar_id",
        "column_names": ["ncvar_id"],
        "unique": 0,
    }
    indexes = inspect(session.get_bind()).get_indexes("ncattributes")
    assert indexes[0] == expected_index

    ncfile = "output000/ocean.nc"

    # check that we have the right attributes for a file (just use a subset)
    file_row = (
        session.query(database.NCFile)
        .filter(database.NCFile.ncfile == ncfile)
        .one()
    )
    expected_file_attrs = {
        "filename": "ocean.nc",
        "title": "MOM5",
        "grid_type": "mosaic",
        "grid_tile": "1",
    }
    for key, value in expected_file_attrs.items():
        assert key in file_row.attrs
        assert file_row.attrs[key] == value

    # and check a particular variable
    var_row = (
        session.query(database.NCVar)
        .join(database.NCFile)
        .filter(database.NCFile.ncfile == ncfile)
        .filter(database.NCVar.varname == "temp")
        .one()
    )
    expected_var_attrs = {
        "long_name": "Potential temperature",
        "cell_methods": "time: mean",
        "coordinates": "geolon_t geolat_t",
    }
    for key, value in expected_var_attrs.items():
        assert key in var_row.attrs
        assert var_row.attrs[key] == value
def test_prune_nodelete(session_db, tmpdir):
    """Pruning with delete=False keeps the row but marks it not present."""
    session, _ = session_db
    expt_dir = tmpdir / "expt"
    expt_dir.mkdir()
    indexed_file = expt_dir / "test1.nc"

    # copy the file to a new experiment directory and index
    shutil.copy("test/data/indexing/longnames/output000/test1.nc", str(indexed_file))
    database.build_index(str(expt_dir), session)

    # check that we have a valid file
    present_files = session.query(database.NCFile).filter(database.NCFile.present).all()
    assert len(present_files) == 1

    # remove the file and prune
    os.remove(indexed_file)
    database.prune_experiment("expt", session, delete=False)

    # now we should still have one file, but now not present
    remaining = session.query(database.NCFile).one_or_none()
    assert remaining is not None
    assert not remaining.present
def test_empty_file(session_db):
    """Index the empty_file experiment: one file, not present, one variable."""
    session, db = session_db
    n_indexed = database.build_index("test/data/indexing/empty_file", session)

    # as with test_broken, we should have seen a single file,
    # but it should be marked as empty
    assert db.check()
    assert n_indexed == 1

    files = session.query(database.NCFile).all()
    assert len(files) == 1
    assert not files[0].present

    # but there should be a valid variable
    var_count = (
        session.query(func.count(database.NCVar.id))
        .filter(database.NCVar.varname == "ty_trans_rho")
        .scalar()
    )
    assert var_count == 1
def test_broken(session_db):
    """Index the broken_file experiment: one file, not present, no variables."""
    session, db = session_db
    n_indexed = database.build_index("test/data/indexing/broken_file", session)

    # make sure the database was created
    assert db.check()

    # we indexed a single file
    assert n_indexed == 1

    # ncfiles table -- should have a single file, marked as empty
    files = session.query(database.NCFile).all()
    assert len(files) == 1
    assert not files[0].present

    # ncvars table -- should be empty
    var_count = session.query(func.count(database.NCVar.id)).scalar()
    assert var_count == 0
def test_unreadable(session_db, unreadable_dir):
    """Indexing the unreadable_dir fixture emits a UserWarning."""
    session, _ = session_db
    expected_warning = "Some files or directories could not be read"

    # only the warning is checked; the number of indexed files is not
    with pytest.warns(UserWarning, match=expected_warning):
        database.build_index(str(unreadable_dir), session)
def test_get_experiments_metadata(session_db):
    """Test that get_experiments returns metadata correctly"""
    session, _ = session_db
    database.build_index("test/data/indexing/metadata", session)

    # For each optional column: the keyword that switches it on (which is also
    # the column name) and the values expected for the one indexed experiment.
    expected_columns = {
        "contact": ["The ACCESS Oracle"],
        "email": ["*****@*****.**"],
        "url": ["https://github.com/COSIMA/oracle"],
        "description": [
            (
                "Attempted spinup, using salt flux fix "
                "https://arccss.slack.com/archives/C6PP0GU9Y/p1515460656000124 "
                "and https://github.com/mom-ocean/MOM5/pull/208/commits/9f4ee6f8b72b76c96a25bf26f3f6cdf773b424d2 "
                "from the start. Used mushy ice from July year 1 onwards to avoid vertical thermo error in cice "
                "https://arccss.slack.com/archives/C6PP0GU9Y/p1515842016000079"
            )
        ],
        "notes": [
            (
                "Stripy salt restoring: "
                "https://github.com/OceansAus/access-om2/issues/74 tripole seam bug: "
                "https://github.com/OceansAus/access-om2/issues/86 requires dt=300s "
                "in May, dt=240s in Aug to maintain CFL in CICE near tripoles (storms "
                "in those months in 8485RYF); all other months work with dt=400s"
            )
        ],
        "created": ["2018-01-01"],
    }

    for column, values in expected_columns.items():
        result = querying.get_experiments(session, **{column: True})
        expected = pd.DataFrame.from_dict(
            {"experiment": ["metadata"], column: values, "ncfiles": [1]}
        )
        assert_frame_equal(result, expected)

    # Won't try and match a path that can change on different platforms
    result = querying.get_experiments(session, root_dir=True)
    assert result.shape == (1, 3)

    # Won't try and match everything, just check dimensions are correct
    result = querying.get_experiments(session, all=True)
    assert result.shape == (1, 9)

    # Test turning off returning experiment (bit dumb, but hey ...)
    result = querying.get_experiments(session, experiment=False)
    expected = pd.DataFrame.from_dict({"ncfiles": [1]})
    assert_frame_equal(result, expected)
def test_broken_metadata(session_db):
    """Indexing the broken_metadata experiment still indexes its one file."""
    session, _ = session_db
    n_indexed = database.build_index("test/data/indexing/broken_metadata", session)
    assert n_indexed == 1
def test_get_experiments_with_keywords(session_db):
    """Test retrieval of experiments with keyword filtering"""
    session, _ = session_db
    database.build_index("test/data/metadata/keywords", session)
    database.build_index("test/data/metadata/keywords2", session)

    both = ["keywords", "keywords2"]

    # (keywords argument, experiments expected back), checked in this order
    matching_cases = [
        # keyword common to both experiments
        ("cosima", both),
        # keyword common to both experiments using wildcard
        ("cos%", both),
        ("%-%", both),
        ("access-om2%", ["keywords"]),
        # keyword in only one experiment
        ("another-keyword", ["keywords2"]),
        ("ryf9091", ["keywords"]),
        # an array of keywords that match only one experiment
        (["cosima", "another-keyword"], ["keywords2"]),
    ]
    for keywords, experiments in matching_cases:
        result = querying.get_experiments(session, keywords=keywords)
        expected = pd.DataFrame.from_dict(
            {"experiment": list(experiments), "ncfiles": [1] * len(experiments)}
        )
        assert_frame_equal(result, expected)

    # keyword lists for which an empty frame is expected
    empty_cases = [
        # keywords that will not match any one experiment
        ["another-keyword", "ryf9091"],
        # a non-existent keyword along with one present: should return
        # nothing as no experiment contains it
        ["ryf9091", "not-a-keyword"],
        # only a non-existent keyword
        ["not-a-keyword"],
        # only a non-existent wildcard keyword
        ["z%"],
    ]
    for keywords in empty_cases:
        result = querying.get_experiments(session, keywords=keywords)
        expected = pd.DataFrame(columns=["experiment", "ncfiles"])
        assert_frame_equal(result, expected)