Example #1
    def set_database_client(self, database_host, database_port=None):
        """
        Set a database client by database_host (and optionally database_port)

        :param database_host: the host address of the database client
        :type database_host: :class:`str`
        :param database_port: the port of the database client
        :type database_port: :class:`str`
        """
        database_host_has_port = False
        database_address = database_host
        # check if port is already in the database_host address
        if ":" in database_host:
            database_host_has_port = True
        # add port
        if not database_host_has_port and database_port:
            database_address += ":" + database_port
        # sanity check
        temp_db_client = self._db_client
        try:
            self._db_client = DBClient(database_address)
            self._db_client.server_info()
        except Exception as err:
            # restore the _db_client
            self._db_client = temp_db_client
            raise MsPASSError(
                "Runntime error: cannot create a database client with: " +
                database_address,
                "Fatal",
            )
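
A minimal usage sketch for the method above, assuming a Client instance named client already exists and a MongoDB server is reachable at the placeholder address below (the host and port values are illustrative, not taken from the original code):

# hypothetical host/port; if the connection fails, MsPASSError is raised and
# the previous database client is restored, as in the except branch above
client.set_database_client("db.example.org", database_port="27017")
db = client.get_database("my_project")  # Database created from the new client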
Example #2
    def setup_class(self):
        self.client = DBClient("localhost")
        self.client.drop_database("test_manager")
        db = Database(self.client, "test_manager")
        db["history_global"].drop_indexes()
        # clean up the database locally
        for col_name in db.list_collection_names():
            db[col_name].delete_many({})

        self.manager = GlobalHistoryManager(db,
                                            "test_job",
                                            collection="history_global")
Example #3
    def setup_class(self):
        client = DBClient("localhost")
        self.db = Database(client, "test_dbclean")

        self.test_ts = get_live_timeseries()
        site_id = ObjectId()
        channel_id = ObjectId()
        source_id = ObjectId()
        self.db["site"].insert_one({
            "_id": site_id,
            "net": "net",
            "sta": "sta",
            "loc": "loc",
            "lat": 1.0,
            "lon": 1.0,
            "elev": 2.0,
            "starttime": datetime.utcnow().timestamp(),
            "endtime": datetime.utcnow().timestamp(),
        })
        self.db["channel"].insert_one({
            "_id":
            channel_id,
            "net":
            "net1",
            "sta":
            "sta1",
            "loc":
            "loc1",
            "chan":
            "chan",
            "lat":
            1.1,
            "lon":
            1.1,
            "elev":
            2.1,
            "starttime":
            datetime.utcnow().timestamp(),
            "endtime":
            datetime.utcnow().timestamp(),
            "edepth":
            3.0,
            "vang":
            1.0,
            "hang":
            1.0,
        })
        self.db["source"].insert_one({
            "_id": source_id,
            "lat": 1.2,
            "lon": 1.2,
            "time": datetime.utcnow().timestamp(),
            "depth": 3.1,
            "magnitude": 1.0,
        })
        self.test_ts["site_id"] = site_id
        self.test_ts["source_id"] = source_id
        self.test_ts["channel_id"] = channel_id
Example #4
class TestDBClient:
    def setup_class(self):
        self.c1 = DBClient("mongodb://localhost/my_database")
        self.c2 = DBClient("localhost")

    def test_init(self):
        assert self.c1._DBClient__default_database_name == "my_database"

    def test_getitem(self):
        assert self.c1["my_database"].name == "my_database"
        assert self.c2["my_db"].name == "my_db"

    def test_get_default_database(self):
        assert self.c1.get_default_database().name == "my_database"
        with pytest.raises(pymongo.errors.ConfigurationError,
                           match="No default database"):
            self.c2.get_default_database()

    def test_get_database(self):
        assert self.c1.get_database().name == "my_database"
        assert self.c2.get_database("my_db").name == "my_db"
        with pytest.raises(pymongo.errors.ConfigurationError,
                           match="No default database"):
            self.c2.get_database()
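
The assertions above hinge on how DBClient (a pymongo-style client) resolves its default database: a mongodb:// URI that names a database carries a default, while a bare hostname does not. A small sketch of that distinction, assuming a MongoDB server on localhost:

c1 = DBClient("mongodb://localhost/my_database")
print(c1.get_default_database().name)   # "my_database", taken from the URI

c2 = DBClient("localhost")
# no default database here: get_default_database() raises
# pymongo.errors.ConfigurationError, so the name must be given explicitly
print(c2.get_database("my_db").name)    # "my_db"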
Example #5
class TestManager:
    def setup_class(self):
        self.client = DBClient("localhost")
        self.client.drop_database("test_manager")
        db = Database(self.client, "test_manager")
        db["history_global"].drop_indexes()
        # clean up the database locally
        for col_name in db.list_collection_names():
            db[col_name].delete_many({})

        self.manager = GlobalHistoryManager(db,
                                            "test_job",
                                            collection="history_global")

    def test_init(self):
        assert self.manager.job_name == "test_job"
        assert self.manager.collection == "history_global"
        assert self.manager.history_db.name == "test_manager"

    def test_logging(self):
        alg_id = ObjectId()
        manager_db = Database(self.client, "test_manager")
        manager_db["history_global"].delete_many({})
        self.manager.logging(alg_id, "test_alg_name", "test_parameter")
        res = manager_db["history_global"].find_one(
            {"job_name": self.manager.job_name})
        assert res["job_id"] == self.manager.job_id
        assert res["job_name"] == self.manager.job_name
        assert res["alg_name"] == "test_alg_name"
        assert res["alg_id"] == alg_id
        assert res["parameters"] == "test_parameter"
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 1)
        # clean up
        manager_db["history_global"].delete_many({})

    def test_mspass_map(self, spark_context):
        l = [get_live_timeseries() for i in range(5)]
        # add net, sta, chan, loc to avoid metadata serialization problem
        for i in range(5):
            l[i]["chan"] = "HHZ"
            l[i]["loc"] = "test_loc"
            l[i]["net"] = "test_net"
            l[i]["sta"] = "test_sta"
            l[i].set_as_origin("test", "0", str(i), AtomicType.TIMESERIES)
        # test mspass_map for spark
        spark_res = spark_map(l, self.manager, spark_context)

        manager_db = Database(self.client, "test_manager")
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 1)
        res = manager_db["history_global"].find_one(
            {"job_name": self.manager.job_name})
        assert res["job_id"] == self.manager.job_id
        assert res["job_name"] == self.manager.job_name
        assert res["alg_name"] == "filter"
        assert (
            res["parameters"] ==
            '{"arg_0": "bandpass", "freqmin": "1", "freqmax": "5", "object_history": "True"}'
        )
        spark_alg_id = res["alg_id"]

        # test mspass_map for dask
        dask_res = dask_map(l, self.manager)

        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 2)
        assert (manager_db["history_global"].count_documents(
            {"alg_id": spark_alg_id}) == 2)
        docs = manager_db["history_global"].find({"alg_id": spark_alg_id})
        assert docs[0]["job_id"] == docs[1]["job_id"] == self.manager.job_id
        assert docs[0]["job_name"] == docs[1][
            "job_name"] == self.manager.job_name
        assert docs[0]["alg_name"] == docs[1]["alg_name"] == "filter"
        assert (
            docs[0]["parameters"] == docs[1]["parameters"] ==
            '{"arg_0": "bandpass", "freqmin": "1", "freqmax": "5", "object_history": "True"}'
        )
        assert not docs[0]["time"] == docs[1]["time"]

        # same alg + parameters combination -> same alg_id
        dask_res = dask_map(l, self.manager)
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 3)
        assert (manager_db["history_global"].count_documents(
            {"alg_id": spark_alg_id}) == 3)

        # SPARK test user provided alg_name and parameter(exist)
        spark_alg_name = "filter"
        spark_alg_parameters = "bandpass,freqmin=1,freqmax=5,object_history=True"
        spark_res = spark_map(
            l,
            self.manager,
            spark_context,
            alg_name=spark_alg_name,
            parameters=spark_alg_parameters,
        )
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 4)
        assert (manager_db["history_global"].count_documents(
            {"alg_id": spark_alg_id}) == 4)

        # SPARK test user provided alg_name and parameter(new)
        spark_alg_name = "new_filter"
        spark_alg_parameters = "bandpass,freqmin=1,freqmax=5,object_history=True"
        spark_res = spark_map(
            l,
            self.manager,
            spark_context,
            alg_name=spark_alg_name,
            parameters=spark_alg_parameters,
        )
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 5)
        assert (manager_db["history_global"].count_documents(
            {"alg_name": "new_filter"}) == 1)
        res = manager_db["history_global"].find_one({"alg_name": "new_filter"})
        assert res["job_id"] == self.manager.job_id
        assert res["job_name"] == self.manager.job_name
        assert res["alg_name"] == "new_filter"
        assert (
            res["parameters"] ==
            '{"arg_0": "bandpass", "freqmin": "1", "freqmax": "5", "object_history": "True"}'
        )
        new_spark_alg_id = res["alg_id"]
        assert (manager_db["history_global"].count_documents(
            {"alg_id": new_spark_alg_id}) == 1)

        # DASK test user provided alg_name and parameter(exist)
        dask_alg_name = "filter"
        dask_alg_parameters = "bandpass,freqmin=1,freqmax=5,object_history=True"
        dask_res = dask_map(l,
                            self.manager,
                            alg_name=dask_alg_name,
                            parameters=dask_alg_parameters)
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 6)
        assert (manager_db["history_global"].count_documents(
            {"alg_id": spark_alg_id}) == 5)

        # DASK test user provided alg_name and parameter(new)
        dask_alg_name = "new_filter_2"
        dask_alg_parameters = "bandpass,freqmin=1,freqmax=5,object_history=True"
        dask_res = dask_map(l,
                            self.manager,
                            alg_name=dask_alg_name,
                            parameters=dask_alg_parameters)
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 7)
        assert (manager_db["history_global"].count_documents(
            {"alg_name": "new_filter_2"}) == 1)
        res = manager_db["history_global"].find_one(
            {"alg_name": "new_filter_2"})
        assert res["job_id"] == self.manager.job_id
        assert res["job_name"] == self.manager.job_name
        assert res["alg_name"] == "new_filter_2"
        assert (
            res["parameters"] ==
            '{"arg_0": "bandpass", "freqmin": "1", "freqmax": "5", "object_history": "True"}'
        )
        new_dask_alg_id = res["alg_id"]
        assert (manager_db["history_global"].count_documents(
            {"alg_id": new_dask_alg_id}) == 1)

        manager_db["history_object"].delete_many({})
        # test spark mspass_map for save_data
        data = spark_context.parallelize(l)
        data_map = data.mspass_map(manager_db.save_data,
                                   global_history=self.manager)
        save_list = data_map.collect()
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 8)
        assert (manager_db["history_global"].count_documents(
            {"alg_name": "save_data"}) == 1)
        # check object history after save_data
        manager_db["history_object"].count_documents({}) == 5
        manager_db["wf_TimeSeries"].count_documents({}) == 5
        history_object_docs = manager_db["history_object"].find({})
        idx = 0
        doc_alg_id = None
        doc_ids = []
        for doc in history_object_docs:
            if not doc_alg_id:
                doc_alg_id = doc["alg_id"]
            else:
                assert doc_alg_id == doc["alg_id"]
            doc_ids.append(doc["_id"])
            assert doc["alg_name"] == "save_data"
            idx += 1
        assert sorted(doc_ids) == ["0", "1", "2", "3", "4"]

        # test spark mspass_map for read_data
        save_l = [res[1] for res in save_list]
        data = spark_context.parallelize(save_l)
        data_map = data.mspass_map(manager_db.read_data,
                                   global_history=self.manager)
        read_list = data_map.collect()
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 9)
        assert (manager_db["history_global"].count_documents(
            {"alg_name": "read_data"}) == 1)

        manager_db["history_object"].delete_many({})
        manager_db["wf_TimeSeries"].delete_many({})
        # test dask mspass_map for save_data
        data = daskbag.from_sequence(l)
        data_map = data.mspass_map(manager_db.save_data,
                                   global_history=self.manager)
        save_list = data_map.compute()
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 10)
        assert (manager_db["history_global"].count_documents(
            {"alg_name": "save_data"}) == 2)
        res = manager_db["history_global"].find({"alg_name": "save_data"})
        assert res[0]["job_id"] == res[1]["job_id"] == self.manager.job_id
        assert res[0]["job_name"] == res[1]["job_name"] == self.manager.job_name
        assert res[0]["alg_name"] == res[1]["alg_name"] == "save_data"
        assert (res[0]["parameters"] == res[1]["parameters"] ==
                '{"object_history": "False"}')
        assert res[0]["alg_id"] == res[1]["alg_id"]
        # check object history after save_data
        manager_db["history_object"].count_documents({}) == 5
        manager_db["wf_TimeSeries"].count_documents({}) == 5
        history_object_docs = manager_db["history_object"].find({})
        idx = 0
        doc_alg_id = None
        doc_ids = []
        for doc in history_object_docs:
            if not doc_alg_id:
                doc_alg_id = doc["alg_id"]
            else:
                assert doc_alg_id == doc["alg_id"]
            doc_ids.append(doc["_id"])
            assert doc["alg_name"] == "save_data"
            idx += 1
        assert sorted(doc_ids) == ["0", "1", "2", "3", "4"]

        # test dask mspass_map for read_data
        save_l = [res[1] for res in save_list]
        data = daskbag.from_sequence(save_l)
        data_map = data.mspass_map(manager_db.read_data,
                                   global_history=self.manager)
        read_list = data_map.compute()
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 11)
        assert (manager_db["history_global"].count_documents(
            {"alg_name": "read_data"}) == 2)
        res = manager_db["history_global"].find({"alg_name": "read_data"})
        assert res[0]["job_id"] == res[1]["job_id"] == self.manager.job_id
        assert res[0]["job_name"] == res[1]["job_name"] == self.manager.job_name
        assert res[0]["alg_name"] == res[1]["alg_name"] == "read_data"
        assert (res[0]["parameters"] == res[1]["parameters"] ==
                '{"object_history": "False"}')
        assert res[0]["alg_id"] == res[1]["alg_id"]

    def test_mspass_reduce(self, spark_context):
        manager_db = Database(self.client, "test_manager")
        manager_db["history_global"].delete_many({})

        l = [get_live_timeseries() for i in range(5)]
        # test mspass_reduce for spark
        spark_res = spark_reduce(l, self.manager, spark_context)
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 1)
        assert manager_db["history_global"].count_documents(
            {"alg_name": "stack"}) == 1
        res = manager_db["history_global"].find_one({"alg_name": "stack"})
        assert res["job_id"] == self.manager.job_id
        assert res["job_name"] == self.manager.job_name
        assert res["alg_name"] == "stack"
        assert res["parameters"] == '{"object_history": "True", "alg_id": "2"}'
        spark_alg_id = res["alg_id"]

        # test mspass_reduce for dask
        dask_res = dask_reduce(l, self.manager)
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 2)
        assert manager_db["history_global"].count_documents(
            {"alg_name": "stack"}) == 2
        assert (manager_db["history_global"].count_documents(
            {"alg_id": spark_alg_id}) == 1)

        docs = manager_db["history_global"].find({"alg_name": "stack"})
        for doc in docs:
            if doc["alg_id"] == spark_alg_id:
                continue
            res = doc
        assert res["job_id"] == self.manager.job_id
        assert res["job_name"] == self.manager.job_name
        assert res["alg_name"] == "stack"
        assert res["parameters"] == '{"object_history": "True", "alg_id": "3"}'
        # different alg -> different alg_id
        assert not res["alg_id"] == spark_alg_id
        dask_alg_id = res["alg_id"]

        # same alg + parameters combination -> same alg_id
        dask_res = dask_reduce(l, self.manager)
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 3)
        assert manager_db["history_global"].count_documents(
            {"alg_name": "stack"}) == 3
        assert (manager_db["history_global"].count_documents(
            {"alg_id": dask_alg_id}) == 2)
        docs = manager_db["history_global"].find({"alg_id": dask_alg_id})
        doc1 = docs[0]
        doc2 = docs[1]
        assert not doc1["time"] == doc2["time"]
        assert doc1["job_id"] == doc2["job_id"]
        assert doc1["job_name"] == doc2["job_name"]
        assert doc1["alg_name"] == doc2["alg_name"]
        assert doc1["parameters"] == doc2["parameters"]

        # SPARK test user provided alg_name and parameter(exist)
        spark_alg_name = "stack"
        spark_alg_parameters = "object_history=True,alg_id=2"
        spark_res = spark_reduce(
            l,
            self.manager,
            spark_context,
            alg_name=spark_alg_name,
            parameters=spark_alg_parameters,
        )
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 4)
        assert manager_db["history_global"].count_documents(
            {"alg_name": "stack"}) == 4
        assert (manager_db["history_global"].count_documents({
            "alg_name":
            "stack",
            "parameters":
            '{"object_history": "True", "alg_id": "3"}',
        }) == 2)

        # SPARK test user provided alg_name and parameter(new)
        spark_alg_name = "new_stack"
        spark_alg_parameters = "object_history=True,alg_id=2"
        spark_res = spark_reduce(
            l,
            self.manager,
            spark_context,
            alg_name=spark_alg_name,
            parameters=spark_alg_parameters,
        )
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 5)
        assert (manager_db["history_global"].count_documents(
            {"alg_name": "new_stack"}) == 1)
        res = manager_db["history_global"].find_one({"alg_name": "new_stack"})
        assert res["job_id"] == self.manager.job_id
        assert res["job_name"] == self.manager.job_name
        assert res["alg_name"] == "new_stack"
        assert res["parameters"] == '{"object_history": "True", "alg_id": "2"}'

        # DASK test user provided alg_name and parameter(exist)
        dask_alg_name = "stack"
        dask_alg_parameters = "object_history=True,alg_id=3"
        dask_res = dask_map(l,
                            self.manager,
                            alg_name=dask_alg_name,
                            parameters=dask_alg_parameters)
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 6)
        assert manager_db["history_global"].count_documents(
            {"alg_name": "stack"}) == 5
        assert (manager_db["history_global"].count_documents({
            "alg_name":
            "stack",
            "parameters":
            '{"object_history": "True", "alg_id": "3"}',
        }) == 3)

        # DASK test user provided alg_name and parameter(new)
        dask_alg_name = "new_stack"
        dask_alg_parameters = "object_history=True,alg_id=3"
        dask_res = dask_map(l,
                            self.manager,
                            alg_name=dask_alg_name,
                            parameters=dask_alg_parameters)
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 7)
        assert (manager_db["history_global"].count_documents({
            "alg_name":
            "new_stack",
            "parameters":
            '{"object_history": "True", "alg_id": "3"}',
        }) == 1)
        res = manager_db["history_global"].find_one({
            "alg_name":
            "new_stack",
            "parameters":
            '{"object_history": "True", "alg_id": "3"}',
        })
        assert res["job_id"] == self.manager.job_id
        assert res["job_name"] == self.manager.job_name
        assert res["alg_name"] == "new_stack"
        assert res["parameters"] == '{"object_history": "True", "alg_id": "3"}'

    def test_mspass_map_with_filePath(self, spark_context):
        # test mspass_map for spark (file input)
        # data input of RFdecon, needed for parallelization
        d = [get_live_seismogram(71, 2.0) for i in range(5)]
        for i in range(5):
            d[i].t0 = -5

        # parameters string
        pfPath = "python/mspasspy/data/pf/RFdeconProcessor.pf"
        pf = AntelopePf(pfPath)
        pf_dict = AntelopePf2dict(pf)
        parameter_dict = collections.OrderedDict()
        parameter_dict["alg"] = "LeastSquares"
        parameter_dict["pf"] = pf_dict
        parameter_dict["object_history"] = "True"
        gTree = ParameterGTree(parameter_dict)
        json_params = json.dumps(gTree.asdict())

        data = spark_context.parallelize(d)
        spark_res = data.mspass_map(
            RFdecon,
            alg="LeastSquares",
            pf=pfPath,
            object_history=True,
            global_history=self.manager,
            alg_name=None,
            parameters=None,
        ).collect()
        manager_db = Database(self.client, "test_manager")
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 8)
        res = manager_db["history_global"].find_one({"alg_name": "RFdecon"})
        assert res["job_id"] == self.manager.job_id
        assert res["job_name"] == self.manager.job_name
        assert res["alg_name"] == "RFdecon"
        assert res["parameters"] == json_params
        spark_alg_id = res["alg_id"]

        # test mspass_map for dask
        ddb = daskbag.from_sequence(d)
        dask_res = ddb.mspass_map(
            RFdecon,
            alg="LeastSquares",
            pf=pfPath,
            object_history=True,
            global_history=self.manager,
            alg_name=None,
            parameters=None,
        ).compute()

        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 9)
        assert (manager_db["history_global"].count_documents(
            {"alg_id": spark_alg_id}) == 2)
        docs = manager_db["history_global"].find({"alg_id": spark_alg_id})
        assert docs[0]["job_id"] == docs[1]["job_id"] == self.manager.job_id
        assert docs[0]["job_name"] == docs[1][
            "job_name"] == self.manager.job_name
        assert docs[0]["alg_name"] == docs[1]["alg_name"] == "RFdecon"
        assert docs[0]["parameters"] == docs[1]["parameters"] == json_params
        assert not docs[0]["time"] == docs[1]["time"]

        # same alg + parameters combination -> same alg_id
        ddb = daskbag.from_sequence(d)
        dask_res = ddb.mspass_map(
            RFdecon,
            alg="LeastSquares",
            pf=pfPath,
            object_history=True,
            global_history=self.manager,
            alg_name=None,
            parameters=None,
        ).compute()
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 10)
        assert (manager_db["history_global"].count_documents(
            {"alg_id": spark_alg_id}) == 3)

        # SPARK test user provided alg_name and parameter(exist)
        spark_alg_name = "RFdecon"
        spark_alg_parameters = (
            "alg=LeastSquares, pf={pfPath}, object_history=True".format(
                pfPath=pfPath))
        data = spark_context.parallelize(d)
        spark_res = data.mspass_map(
            RFdecon,
            alg="LeastSquares",
            pf=pfPath,
            object_history=True,
            global_history=self.manager,
            alg_name=spark_alg_name,
            parameters=spark_alg_parameters,
        ).collect()
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 11)
        assert (manager_db["history_global"].count_documents(
            {"alg_id": spark_alg_id}) == 4)

        # SPARK test user provided alg_name and parameter(new)
        spark_alg_name = "RFdecon_2"
        spark_alg_parameters = (
            "alg=LeastSquares, pf={pfPath}, object_history=True".format(
                pfPath=pfPath))
        data = spark_context.parallelize(d)
        spark_res = data.mspass_map(
            RFdecon,
            alg="LeastSquares",
            pf=pfPath,
            object_history=True,
            global_history=self.manager,
            alg_name=spark_alg_name,
            parameters=spark_alg_parameters,
        ).collect()
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 12)
        assert (manager_db["history_global"].count_documents(
            {"alg_name": "RFdecon_2"}) == 1)
        res = manager_db["history_global"].find_one({"alg_name": "RFdecon_2"})
        assert res["job_id"] == self.manager.job_id
        assert res["job_name"] == self.manager.job_name
        assert res["alg_name"] == "RFdecon_2"
        assert res["parameters"] == json_params
        new_spark_alg_id = res["alg_id"]
        assert (manager_db["history_global"].count_documents(
            {"alg_id": new_spark_alg_id}) == 1)

        # DASK test user provided alg_name and parameter(exist)
        dask_alg_name = "RFdecon"
        dask_alg_parameters = (
            "alg=LeastSquares, pf={pfPath}, object_history=True".format(
                pfPath=pfPath))
        ddb = daskbag.from_sequence(d)
        dask_res = ddb.mspass_map(
            RFdecon,
            alg="LeastSquares",
            pf=pfPath,
            object_history=True,
            global_history=self.manager,
            alg_name=dask_alg_name,
            parameters=dask_alg_parameters,
        ).compute()
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 13)
        assert (manager_db["history_global"].count_documents(
            {"alg_id": spark_alg_id}) == 5)

        # DASK test user provided alg_name and parameter(new)
        dask_alg_name = "RFdecon_3"
        dask_alg_parameters = (
            "alg=LeastSquares, pf={pfPath}, object_history=True".format(
                pfPath=pfPath))
        ddb = daskbag.from_sequence(d)
        dask_res = ddb.mspass_map(
            RFdecon,
            alg="LeastSquares",
            pf=pfPath,
            object_history=True,
            global_history=self.manager,
            alg_name=dask_alg_name,
            parameters=dask_alg_parameters,
        ).compute()
        assert (manager_db["history_global"].count_documents(
            {"job_name": self.manager.job_name}) == 14)
        assert (manager_db["history_global"].count_documents(
            {"alg_name": "RFdecon_3"}) == 1)
        res = manager_db["history_global"].find_one({"alg_name": "RFdecon_3"})
        assert res["job_id"] == self.manager.job_id
        assert res["job_name"] == self.manager.job_name
        assert res["alg_name"] == "RFdecon_3"
        assert res["parameters"] == json_params
        new_dask_alg_id = res["alg_id"]
        assert (manager_db["history_global"].count_documents(
            {"alg_id": new_dask_alg_id}) == 1)

    def test_get_alg_id(self):
        manager_db = Database(self.client, "test_manager")
        assert not self.manager.get_alg_id("aaa", "bbb")
        res = manager_db["history_global"].find_one({
            "alg_name":
            "new_stack",
            "parameters":
            '{"object_history": "True", "alg_id": "3"}',
        })
        assert (self.manager.get_alg_id(
            "new_stack",
            '{"object_history": "True", "alg_id": "3"}') == res["alg_id"])

    def test_get_alg_list(self):
        assert (len(
            self.manager.get_alg_list(self.manager.job_name,
                                      job_id=self.manager.job_id)) == 14)

    def test_set_alg_name_and_parameters(self):
        manager_db = Database(self.client, "test_manager")
        assert (manager_db["history_global"].count_documents({
            "alg_name":
            "stack",
            "parameters":
            '{"object_history": "True", "alg_id": "3"}',
        }) == 3)
        res = manager_db["history_global"].find_one({
            "alg_name":
            "stack",
            "parameters":
            '{"object_history": "True", "alg_id": "3"}',
        })
        alg_id = res["alg_id"]
        self.manager.set_alg_name_and_parameters(alg_id, "test_alg_name",
                                                 "test_parameters")
        assert (manager_db["history_global"].count_documents({
            "alg_name":
            "stack",
            "parameters":
            '{"object_history": "True", "alg_id": "3"}',
        }) == 0)
        assert (manager_db["history_global"].count_documents({
            "alg_name":
            "test_alg_name",
            "parameters":
            "test_parameters"
        }) == 3)
        res = manager_db["history_global"].find_one({
            "alg_name":
            "test_alg_name",
            "parameters":
            "test_parameters"
        })
        assert res["alg_id"] == alg_id

    def test_object_history(self, spark_context):
        manager_db = Database(self.client, "test_manager")
        manager_db["history_global"].delete_many({})
        manager_db["history_object"].delete_many({})
        l = [get_live_timeseries() for i in range(2)]
        # add net, sta, chan, loc to avoid metadata serialization problem
        for i in range(2):
            l[i]["chan"] = "HHZ"
            l[i]["loc"] = "test_loc"
            l[i]["net"] = "test_net"
            l[i]["sta"] = "test_sta"
        spark_res = spark_map(l, self.manager, spark_context)
        assert manager_db["history_global"].count_documents(
            {"alg_name": "filter"}) == 1
        res = manager_db["history_global"].find_one({"alg_name": "filter"})
        alg_id = res["alg_id"]
        # check status of the mspass objects
        for ts in spark_res:
            assert ts.number_of_stages() == 1
            assert ts.current_nodedata().algorithm == "filter"
            assert ts.current_nodedata().algid == str(alg_id)
            assert ts.is_volatile()

        save_res = manager_db.save_data(spark_res[0],
                                        alg_name="filter",
                                        alg_id=str(alg_id))
        # hardcode net, sta, chan, loc to avoid a serialization problem here; they are read-only metadata keys -> non-fatal keys = 4
        assert save_res.live
        assert manager_db["history_object"].count_documents(
            {"alg_name": "filter"}) == 1
        doc = manager_db["history_object"].find_one({"alg_name": "filter"})
        assert doc
        assert doc["_id"] == spark_res[0].current_nodedata().uuid
        assert doc["wf_TimeSeries_id"] == spark_res[0]["_id"]
        assert doc["alg_id"] == str(alg_id)
        assert doc["alg_name"] == "filter"
Example #6
def main(args=None):
    """ """
    if args is None:
        args = sys.argv[1:]
    parser = argparse.ArgumentParser(
        prog="dbclean",
        usage=
        "%(prog)s dbname collection [-ft] [-d k1 ...] [-r kold:knew ... ] [-v] [-h]",
        description="MsPASS program to fix most errors detected by dbverify",
    )
    parser.add_argument("dbname",
                        metavar="dbname",
                        type=str,
                        help="MongoDB database name to be fixed")
    parser.add_argument(
        "collection",
        metavar="collection",
        type=str,
        help="MongoDB collection name to be fixed",
    )
    parser.add_argument(
        "-ft",
        "--fixtypes",
        action="store_true",
        help="Enable automatic type mismatch repair",
    )
    parser.add_argument(
        "-d",
        "--delete",
        nargs="*",
        default=[],
        help="List of keys of key-value pairs to be deleted from all documents",
    )
    parser.add_argument(
        "-r",
        "--rename",
        nargs="*",
        default=[],
        help=
        "Change the keys of documents using pattern defined in args of form oldkey:newkey",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="When used be echo each fix - default works silently",
    )

    args = parser.parse_args(args)
    dbname = args.dbname
    collection = args.collection
    fixtypes = args.fixtypes
    delete = args.delete
    rename = args.rename
    verbose = args.verbose

    # not a very robust way to detect this condition but it should work
    # it is not robust because it assumes a behavior in argparse for
    # args with a list
    if len(delete) > 0:
        enable_deletion = True
    else:
        enable_deletion = False
    if len(rename) > 0:
        enable_rename = True
    else:
        enable_rename = False
    if not (fixtypes or enable_deletion or enable_rename):
        print("Usage error:  you must define at least one clean operation")
        print("Type:  dbclean --help to get usage help")
        exit(-1)

    if enable_rename:
        rename_map = rename_list_to_dict(rename)

    dbclient = DBClient()
    db = Database(dbclient, dbname)
    print("Starting processing of ", collection,
          " collection of database named=", dbname)

    # Intentionally do the delete and rename operations before
    # a type check to allow cleaning any keys. The set of dicts below
    # accumulate counts of edits for each key

    if enable_deletion:
        delcounts = db._delete_attributes(collection, delete, verbose=verbose)
        print("delete processing compeleted on collection=", collection)
        print("Number of documents changed for each key requested follow:")
        print(json_util.dumps(delcounts, indent=4))
    if enable_rename:
        repcounts = db._rename_attributes(collection,
                                          rename_map,
                                          verbose=verbose)
        print("rename processing compeleted on collection=", collection)
        print("Here is the set of changes requested:")
        print(json_util.dumps(rename_map))
        print("Number of documents changed for each key requested follow:")
        print(json_util.dumps(repcounts, indent=4))
    if fixtypes:
        fixcounts = db._fix_attribute_types(collection, verbose=verbose)
        print("fixtype processing compeleted on collection=", collection)
        print("Keys of documents changed and number changed follow:")
        print(json_util.dumps(fixcounts, indent=4))
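
For reference, the parser above implies invocations like the following. Passing an argument list directly to main() mirrors what a console script does with sys.argv[1:]; the database, collection, and key names are placeholders (wf_TimeSeries is simply reused from the tests above as an example collection):

# hypothetical invocations; at least one of -ft, -d, or -r must be given
main(["mydb", "wf_TimeSeries", "-ft", "-v"])            # fix type mismatches, verbose
main(["mydb", "wf_TimeSeries", "-d", "badkey"])         # delete a key from all documents
main(["mydb", "wf_TimeSeries", "-r", "oldkey:newkey"])  # rename a key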
Example #7
    def setup_class(self):
        self.c1 = DBClient("mongodb://localhost/my_database")
        self.c2 = DBClient("localhost")
Example #8
    def __init__(
        self,
        database_host=None,
        scheduler=None,
        scheduler_host=None,
        job_name="mspass",
        database_name="mspass",
        schema=None,
        collection=None,
    ):
        # job_name should be a string
        if database_host is not None and not type(database_host) is str:
            raise MsPASSError(
                "database_host should be a string but " +
                str(type(database_host)) + " is found.",
                "Fatal",
            )
        if scheduler is not None and scheduler != "dask" and scheduler != "spark":
            raise MsPASSError(
                "scheduler should be either dask or spark but " +
                str(scheduler) + " is found.",
                "Fatal",
            )
        if scheduler_host is not None and not type(scheduler_host) is str:
            raise MsPASSError(
                "scheduler_host should be a string but " +
                str(type(scheduler_host)) + " is found.",
                "Fatal",
            )
        if job_name is not None and not type(job_name) is str:
            raise MsPASSError(
                "job_name should be a string but " + str(type(job_name)) +
                " is found.",
                "Fatal",
            )
        if database_name is not None and not type(database_name) is str:
            raise MsPASSError(
                "database_name should be a string but " +
                str(type(database_name)) + " is found.",
                "Fatal",
            )
        # collection should be a string
        if collection is not None and type(collection) is not str:
            raise MsPASSError(
                "collection should be a string but " + str(type(collection)) +
                " is found.",
                "Fatal",
            )

        # check env variables
        MSPASS_DB_ADDRESS = os.environ.get("MSPASS_DB_ADDRESS")
        MONGODB_PORT = os.environ.get("MONGODB_PORT")
        MSPASS_SCHEDULER = os.environ.get("MSPASS_SCHEDULER")
        MSPASS_SCHEDULER_ADDRESS = os.environ.get("MSPASS_SCHEDULER_ADDRESS")
        DASK_SCHEDULER_PORT = os.environ.get("DASK_SCHEDULER_PORT")
        SPARK_MASTER_PORT = os.environ.get("SPARK_MASTER_PORT")

        # create a database client
        # priority: parameter -> env -> default
        database_host_has_port = False
        if database_host:
            database_address = database_host
            # check if database_host contains port number already
            if ":" in database_address:
                database_host_has_port = True

        elif MSPASS_DB_ADDRESS:
            database_address = MSPASS_DB_ADDRESS
        else:
            database_address = "localhost"
        # add port
        if not database_host_has_port and MONGODB_PORT:
            database_address += ":" + MONGODB_PORT

        try:
            self._db_client = DBClient(database_address)
            self._db_client.server_info()
        except Exception as err:
            raise MsPASSError(
                "Runntime error: cannot create a database client with: " +
                database_address,
                "Fatal",
            )

        # set default database name
        self._default_database_name = database_name
        self._default_schema = schema
        self._default_collection = collection

        # create a Global History Manager
        if schema:
            global_history_manager_db = Database(self._db_client,
                                                 database_name,
                                                 db_schema=schema)
        else:
            global_history_manager_db = Database(self._db_client,
                                                 database_name)
        self._global_history_manager = GlobalHistoryManager(
            global_history_manager_db, job_name, collection=collection)

        # set scheduler
        if scheduler:
            self._scheduler = scheduler
        elif MSPASS_SCHEDULER:
            self._scheduler = MSPASS_SCHEDULER
        else:
            self._scheduler = "dask"

        # scheduler configuration
        if self._scheduler == "spark":
            scheduler_host_has_port = False
            if scheduler_host:
                self._spark_master_url = scheduler_host
                # add spark:// prefix if not exist
                if "spark://" not in scheduler_host:
                    self._spark_master_url = "spark://" + self._spark_master_url
                # check if spark host address contains port number already
                if self._spark_master_url.count(":") == 2:
                    scheduler_host_has_port = True

            elif MSPASS_SCHEDULER_ADDRESS:
                self._spark_master_url = MSPASS_SCHEDULER_ADDRESS
                # add spark:// prefix if not exist
                if "spark://" not in MSPASS_SCHEDULER_ADDRESS:
                    self._spark_master_url = "spark://" + self._spark_master_url
            else:
                self._spark_master_url = "local"

            # add port number
            # 1. not the default 'local'
            # 2. scheduler_host and does not contain port number
            # 3. SPARK_MASTER_PORT exists
            if ((scheduler_host or MSPASS_SCHEDULER_ADDRESS)
                    and not scheduler_host_has_port and SPARK_MASTER_PORT):
                self._spark_master_url += ":" + SPARK_MASTER_PORT

            # sanity check
            try:
                spark = (SparkSession.builder.appName("mspass").master(
                    self._spark_master_url).getOrCreate())
                self._spark_context = spark.sparkContext
            except Exception as err:
                raise MsPASSError(
                    "Runntime error: cannot create a spark configuration with: "
                    + self._spark_master_url,
                    "Fatal",
                )

        elif self._scheduler == "dask":
            # if no scheduler_host is defined and MSPASS_SCHEDULER_ADDRESS is not set, use a local cluster to create a client
            if not scheduler_host and not MSPASS_SCHEDULER_ADDRESS:
                self._dask_client = DaskClient()
            else:
                scheduler_host_has_port = False
                # set host
                if scheduler_host:
                    self._dask_client_address = scheduler_host
                    # check if scheduler_host contains port number already
                    if ":" in scheduler_host:
                        scheduler_host_has_port = True
                else:
                    self._dask_client_address = MSPASS_SCHEDULER_ADDRESS

                # add port
                if not scheduler_host_has_port and DASK_SCHEDULER_PORT:
                    self._dask_client_address += ":" + DASK_SCHEDULER_PORT
                else:
                    # use port 8786 by default if not specified
                    self._dask_client_address += ":8786"
                # sanity check
                try:
                    self._dask_client = DaskClient(self._dask_client_address)
                except Exception as err:
                    raise MsPASSError(
                        "Runntime error: cannot create a dask client with: " +
                        self._dask_client_address,
                        "Fatal",
                    )
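
A construction sketch for the class this __init__ belongs to (the full class appears in the next example), assuming a MongoDB server and, for the second call, a Spark master are reachable; all host names below are placeholders:

# defaults: MongoDB on localhost (or MSPASS_DB_ADDRESS) and a local dask cluster
client = Client()

# explicit configuration with hypothetical hosts
spark_client = Client(
    database_host="db.example.org:27017",
    scheduler="spark",
    scheduler_host="spark-master.example.org",
    job_name="demo_job",
)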
Example #9
class Client:
    """
    A client-side representation of MsPASS.

    This is the only client users should use in MsPASS. The client manages all the other clients or instances.
    It creates and manages a Database client.
    It creates and manages a Global History Manager.
    It creates and manages a scheduler (Spark/Dask).

    For the address and port of each client/instance, we first check the user-specified parameters; if they are
    not given, we search the environment variable values; failing that, we fall back to the default settings.
    """
    def __init__(
        self,
        database_host=None,
        scheduler=None,
        scheduler_host=None,
        job_name="mspass",
        database_name="mspass",
        schema=None,
        collection=None,
    ):
        # job_name should be a string
        if database_host is not None and not type(database_host) is str:
            raise MsPASSError(
                "database_host should be a string but " +
                str(type(database_host)) + " is found.",
                "Fatal",
            )
        if scheduler is not None and scheduler != "dask" and scheduler != "spark":
            raise MsPASSError(
                "scheduler should be either dask or spark but " +
                str(scheduler) + " is found.",
                "Fatal",
            )
        if scheduler_host is not None and not type(scheduler_host) is str:
            raise MsPASSError(
                "scheduler_host should be a string but " +
                str(type(scheduler_host)) + " is found.",
                "Fatal",
            )
        if job_name is not None and not type(job_name) is str:
            raise MsPASSError(
                "job_name should be a string but " + str(type(job_name)) +
                " is found.",
                "Fatal",
            )
        if database_name is not None and not type(database_name) is str:
            raise MsPASSError(
                "database_name should be a string but " +
                str(type(database_name)) + " is found.",
                "Fatal",
            )
        # collection should be a string
        if collection is not None and type(collection) is not str:
            raise MsPASSError(
                "collection should be a string but " + str(type(collection)) +
                " is found.",
                "Fatal",
            )

        # check env variables
        MSPASS_DB_ADDRESS = os.environ.get("MSPASS_DB_ADDRESS")
        MONGODB_PORT = os.environ.get("MONGODB_PORT")
        MSPASS_SCHEDULER = os.environ.get("MSPASS_SCHEDULER")
        MSPASS_SCHEDULER_ADDRESS = os.environ.get("MSPASS_SCHEDULER_ADDRESS")
        DASK_SCHEDULER_PORT = os.environ.get("DASK_SCHEDULER_PORT")
        SPARK_MASTER_PORT = os.environ.get("SPARK_MASTER_PORT")

        # create a database client
        # priority: parameter -> env -> default
        database_host_has_port = False
        if database_host:
            database_address = database_host
            # check if database_host contains port number already
            if ":" in database_address:
                database_host_has_port = True

        elif MSPASS_DB_ADDRESS:
            database_address = MSPASS_DB_ADDRESS
        else:
            database_address = "localhost"
        # add port
        if not database_host_has_port and MONGODB_PORT:
            database_address += ":" + MONGODB_PORT

        try:
            self._db_client = DBClient(database_address)
            self._db_client.server_info()
        except Exception as err:
            raise MsPASSError(
                "Runntime error: cannot create a database client with: " +
                database_address,
                "Fatal",
            )

        # set default database name
        self._default_database_name = database_name
        self._default_schema = schema
        self._default_collection = collection

        # create a Global History Manager
        if schema:
            global_history_manager_db = Database(self._db_client,
                                                 database_name,
                                                 db_schema=schema)
        else:
            global_history_manager_db = Database(self._db_client,
                                                 database_name)
        self._global_history_manager = GlobalHistoryManager(
            global_history_manager_db, job_name, collection=collection)

        # set scheduler
        if scheduler:
            self._scheduler = scheduler
        elif MSPASS_SCHEDULER:
            self._scheduler = MSPASS_SCHEDULER
        else:
            self._scheduler = "dask"

        # scheduler configuration
        if self._scheduler == "spark":
            scheduler_host_has_port = False
            if scheduler_host:
                self._spark_master_url = scheduler_host
                # add spark:// prefix if not exist
                if "spark://" not in scheduler_host:
                    self._spark_master_url = "spark://" + self._spark_master_url
                # check if spark host address contains port number already
                if self._spark_master_url.count(":") == 2:
                    scheduler_host_has_port = True

            elif MSPASS_SCHEDULER_ADDRESS:
                self._spark_master_url = MSPASS_SCHEDULER_ADDRESS
                # add spark:// prefix if not exist
                if "spark://" not in MSPASS_SCHEDULER_ADDRESS:
                    self._spark_master_url = "spark://" + self._spark_master_url
            else:
                self._spark_master_url = "local"

            # add port number
            # 1. not the default 'local'
            # 2. scheduler_host and does not contain port number
            # 3. SPARK_MASTER_PORT exists
            if ((scheduler_host or MSPASS_SCHEDULER_ADDRESS)
                    and not scheduler_host_has_port and SPARK_MASTER_PORT):
                self._spark_master_url += ":" + SPARK_MASTER_PORT

            # sanity check
            try:
                spark = (SparkSession.builder.appName("mspass").master(
                    self._spark_master_url).getOrCreate())
                self._spark_context = spark.sparkContext
            except Exception as err:
                raise MsPASSError(
                    "Runntime error: cannot create a spark configuration with: "
                    + self._spark_master_url,
                    "Fatal",
                )

        elif self._scheduler == "dask":
            # if no scheduler_host is defined and MSPASS_SCHEDULER_ADDRESS is not set, use a local cluster to create a client
            if not scheduler_host and not MSPASS_SCHEDULER_ADDRESS:
                self._dask_client = DaskClient()
            else:
                scheduler_host_has_port = False
                # set host
                if scheduler_host:
                    self._dask_client_address = scheduler_host
                    # check if scheduler_host contains port number already
                    if ":" in scheduler_host:
                        scheduler_host_has_port = True
                else:
                    self._dask_client_address = MSPASS_SCHEDULER_ADDRESS

                # add port
                if not scheduler_host_has_port and DASK_SCHEDULER_PORT:
                    self._dask_client_address += ":" + DASK_SCHEDULER_PORT
                else:
                    # use port 8786 by default if not specified
                    self._dask_client_address += ":8786"
                # sanity check
                try:
                    self._dask_client = DaskClient(self._dask_client_address)
                except Exception as err:
                    raise MsPASSError(
                        "Runntime error: cannot create a dask client with: " +
                        self._dask_client_address,
                        "Fatal",
                    )

    def get_database_client(self):
        """
        Get the database client used by this MsPASS client

        :return: the database client (:class:`DBClient`) used to connect to MongoDB
        """
        return self._db_client

    def get_database(self, database_name=None):
        """
        Get a database by database_name; if database_name is not specified, use the default one

        :param database_name: the name of database
        :type database_name: :class:`str`
        :return: :class:`mspasspy.db.database.Database`
        """
        if not database_name:
            return Database(self._db_client, self._default_database_name)
        return Database(self._db_client, database_name)

    def get_global_history_manager(self):
        """
        Get the global history manager with this client

        :return: :class:`mspasspy.global_history.manager.GlobalHistoryManager`
        """
        return self._global_history_manager

    def get_scheduler(self):
        """
        Get the scheduler (Spark/Dask) used by this client

        :return: :class:`pyspark.SparkContext`/:class:`dask.distributed.Client`
        """
        if self._scheduler == "spark":
            return self._spark_context
        else:
            return self._dask_client

    def set_database_client(self, database_host, database_port=None):
        """
        Set a database client by database_host (and optionally database_port)

        :param database_host: the host address of the database client
        :type database_host: :class:`str`
        :param database_port: the port of the database client
        :type database_port: :class:`str`
        """
        database_host_has_port = False
        database_address = database_host
        # check if port is already in the database_host address
        if ":" in database_host:
            database_host_has_port = True
        # add port
        if not database_host_has_port and database_port:
            database_address += ":" + database_port
        # sanity check
        temp_db_client = self._db_client
        try:
            self._db_client = DBClient(database_address)
            self._db_client.server_info()
        except Exception as err:
            # restore the _db_client
            self._db_client = temp_db_client
            raise MsPASSError(
                "Runntime error: cannot create a database client with: " +
                database_address,
                "Fatal",
            )

    def set_global_history_manager(self,
                                   history_db,
                                   job_name,
                                   collection=None):
        """
        Set a global history manager by history_db and job_name (and optionally collection)

        :param history_db: the database to be used by the global history manager
        :type history_db: :class:`mspasspy.db.database.Database`
        :param job_name: the job name to be recorded by the global history manager
        :type job_name: :class:`str`
        :param collection: the collection name to be used in history_db
        :type collection: :class:`str`
        """
        if not isinstance(history_db, Database):
            raise TypeError(
                "history_db should be a mspasspy.db.Database but " +
                str(type(history_db)) + " is found.")
        if not type(job_name) is str:
            raise TypeError("job_name should be a string but " +
                            str(type(job_name)) + " is found.")
        if collection is not None and type(collection) is not str:
            raise TypeError("collection should be a string but " +
                            str(type(collection)) + " is found.")

        self._global_history_manager = GlobalHistoryManager(
            history_db, job_name, collection=collection)

    def set_scheduler(self, scheduler, scheduler_host, scheduler_port=None):
        """
        Set a scheduler by scheduler type and scheduler_host (and optionally scheduler_port)

        :param scheduler: the scheduler type, should be either dask or spark
        :type scheduler: :class:`str`
        :param scheduler_host: the host address of the scheduler
        :type scheduler_host: :class:`str`
        :param scheduler_port: the port of the scheduler
        :type scheduler_port: :class:`str`
        """
        if scheduler != "dask" and scheduler != "spark":
            raise MsPASSError(
                "scheduler should be either dask or spark but " +
                str(scheduler) + " is found.",
                "Fatal",
            )

        prev_scheduler = self._scheduler
        self._scheduler = scheduler
        if scheduler == "spark":
            scheduler_host_has_port = False

            self._spark_master_url = scheduler_host
            # add the spark:// prefix if not already present
            if not scheduler_host.startswith("spark://"):
                self._spark_master_url = "spark://" + self._spark_master_url
            # check if spark host address contains port number already
            if self._spark_master_url.count(":") == 2:
                scheduler_host_has_port = True

            # add port
            if not scheduler_host_has_port and scheduler_port:
                self._spark_master_url += ":" + scheduler_port

            # sanity check
            prev_spark_context = None
            prev_spark_conf = None
            if hasattr(self, "_spark_context"):
                prev_spark_context = self._spark_context
                prev_spark_conf = self._spark_context.getConf()
            try:
                if hasattr(self, "_spark_context") and isinstance(
                        self._spark_context, SparkContext):
                    # update the configuration
                    spark_conf = self._spark_context._conf.setMaster(
                        self._spark_master_url)
                else:
                    spark_conf = (SparkConf().setAppName("mspass").setMaster(
                        self._spark_master_url))
                # stopping the previous spark context here is intentionally disabled
                # FIXME: if the new context does not start, we should not have stopped the previous one
                # if prev_spark_context:
                #    prev_spark_context.stop()
                # create a new spark context; this may raise and fall through to the exception handler
                spark = SparkSession.builder.config(
                    conf=spark_conf).getOrCreate()
                self._spark_context = spark.sparkContext
            except Exception as err:
                # restore the spark context from the previous spark configuration
                if prev_spark_conf:
                    self._spark_context = SparkContext.getOrCreate(
                        conf=prev_spark_conf)
                # restore the scheduler type
                if self._scheduler == "spark" and prev_scheduler == "dask":
                    self._scheduler = prev_scheduler
                raise MsPASSError(
                    "Runntime error: cannot create a spark configuration with: "
                    + self._spark_master_url,
                    "Fatal",
                )
            # drop the previous dask client now that the spark context was set successfully
            if hasattr(self, "_dask_client"):
                del self._dask_client

        elif scheduler == "dask":
            scheduler_host_has_port = False
            self._dask_client_address = scheduler_host
            # check if scheduler_host contains port number already
            if ":" in scheduler_host:
                scheduler_host_has_port = True

            # add port
            if not scheduler_host_has_port:
                if scheduler_port:
                    self._dask_client_address += ":" + scheduler_port
                else:
                    # use port 8786 by default if not specified
                    self._dask_client_address += ":8786"

            # sanity check
            prev_dask_client = None
            if hasattr(self, "_dask_client"):
                prev_dask_client = self._dask_client
            try:
                # create a new dask client
                self._dask_client = DaskClient(self._dask_client_address)
            except Exception as err:
                # restore the dask client if exists
                if prev_dask_client:
                    self._dask_client = prev_dask_client
                # restore the scheduler type
                if self._scheduler == "dask" and prev_scheduler == "spark":
                    self._scheduler = prev_scheduler
                raise MsPASSError(
                    "Runntime error: cannot create a dask client with: " +
                    self._dask_client_address,
                    "Fatal",
                )
            # remove the previous spark context now that the new dask client was set successfully
            if hasattr(self, "_spark_context"):
                del self._spark_context
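
A minimal usage sketch of the three setters above (the host names, ports, job name, and the `client` object are all hypothetical; it assumes `client` is an instance of the surrounding class, that `Database` and `DBClient` are imported as in the other examples, and that MongoDB and a dask scheduler are actually running at those addresses):

# hypothetical addresses; each must point at a running service
client.set_database_client("localhost", database_port="27017")
history_db = Database(DBClient("localhost:27017"), "history_db")
client.set_global_history_manager(history_db, "demo_job", collection="history_global")
client.set_scheduler("dask", "localhost", scheduler_port="8786")

Note that the ports are passed as strings because each setter builds the full address by string concatenation.
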
Exemple #10
0
def main(args=None):
    # As a script meant to be run from the shell, we let any functions below
    # that throw an exception do so, and assume they will write a message
    # that can help debug what went wrong.
    if args is None:
        args = sys.argv[1:]
    parser = argparse.ArgumentParser(
        prog="dbverify",
        usage=
        "%(prog)s dbname [-t TEST -c [collection ...] -n [normalize ... ] -error_limit n -v]",
        description="MsPASS database verify program",
    )
    parser.add_argument(
        "dbname",
        metavar="dbname",
        type=str,
        help="MongoDB database name on which to run tests",
    )
    parser.add_argument(
        "-t",
        "--test",
        action="store",
        type=str,
        default="normalization",
        help="Select which test to run.  " +
        "Current options:  normalization, required, schema_check",
    )
    parser.add_argument(
        "-c",
        "--collection",
        action="store",
        nargs="*",
        default=["wf_TimeSeries"],
        help="Collection(s) on which the test is to be run.  " +
        "Only schema_check supports multiple collections in one run",
    )
    parser.add_argument(
        "-n",
        "--normalize",
        nargs="*",
        default=["site_id", "channel_id", "source_id"],
        help="List of normalization keys to test\n" +
        "(Used only for -test normalization option",
    )
    parser.add_argument(
        "-r",
        "--require",
        nargs="*",
        default=[],
        help="List of keys of required attributes for required test",
    )
    parser.add_argument(
        "-e",
        "--error_limit",
        action="store",
        type=int,
        default=1000,
        help="Set error limit - stop checking when this many errors are found\n"
        + "Default is 1000",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help=
        "When used print offending values.  Otherwise just return a summary",
    )

    args = parser.parse_args(args)
    test_to_run = args.test
    dbname = args.dbname
    dbclient = DBClient()
    db = Database(dbclient, dbname)
    col_to_test = args.collection
    normalize = args.normalize
    reqlist = args.require
    verbose = args.verbose
    elimit = args.error_limit

    # If python had a switch-case statement it would be used here.  This
    # is the list of known tests.  The program intentionally runs only one
    # test per execution to keep the output more readable.
    if test_to_run == "normalization":
        if len(col_to_test) > 1:
            print(
                "WARNING:  normalization test can only be run on one collection at a time"
            )
            print("Parsed a list with the following contents:  ", col_to_test)
            print("Running test on the first item in that list")
        col = col_to_test[0]
        if not isinstance(col, str):
            print("Invalid value parsed for -c option=", col)
            sys.exit(-1)
        run_check_links(db, col, normalize, elimit, verbose)
    elif test_to_run == "required":
        if len(col_to_test) > 1:
            print(
                "WARNING:  required test can only be run on one collection at a time"
            )
            print("Parsed a list with the following contents:  ", col_to_test)
            print("Running test on the first item in that list")
        col = col_to_test[0]
        if not isinstance(col, str):
            print("Invalid value parsed for -c option=", col)
            sys.exit(-1)
        if len(reqlist) == 0:
            # Depends on the default being an empty list.  For the default
            # case run this small function.  get_required is currently a
            # function defined above that returns a constant list of values
            # for each known collection.  It may eventually be replaced by a
            # function that uses the schema.
            required_list = get_required(col)
        else:
            required_list = reqlist
        run_check_required(db, col, required_list, elimit, verbose)
    elif test_to_run == "schema_check":
        for col in col_to_test:
            run_check_attribute_types(db, col, elimit, verbose)
    else:
        print("Unrecognized value for --test value parsed=", test_to_run)
        print("Must be one of:  normalization, required, or schema_check")