Example #1
def crds(authenticated_user):
    get_local_bucket(empty=True)
    creds = StorageCreds(user=authenticated_user, service_url="storage")
    drive = Drive(creds=creds, name="test_drive")
    filepath = os.path.join(
        os.path.dirname(__file__),
        "../../../tests/data/proc_test_data/CRDS/bsd.picarro.1minute.248m.dat",
    )
    filemeta = drive.upload(filepath)

    par = PAR(location=filemeta.location(), user=authenticated_user)

    hugs = Service(service_url="hugs")
    par_secret = hugs.encrypt_data(par.secret())

    auth = Authorisation(resource="process", user=authenticated_user)

    args = {
        "authorisation": auth.to_data(),
        "par": {
            "data": par.to_data()
        },
        "par_secret": {
            "data": par_secret
        },
        "data_type": "CRDS",
        "source_name": "bsd.picarro.1minute.248m",
    }

    hugs.call_function(function="process", args=args)
Example #2
def process(args):
    """ Take a PAR from an uploaded file and process the data

        Args:
            args (dict): Dictionary of JSON serialised objects to be
            used by processing functions
        Returns:
            dict: Dictionary of results of processing
    """
    data_type = args["data_type"]

    data_par = PAR.from_data(args["par"]["data"])
    data_secret = args["par_secret"]["data"]

    auth = args["authorisation"]
    authorisation = Authorisation.from_data(auth)

    # Verify that this process has authorisation to be called
    authorisation.verify("process")

    hugs = get_this_service(need_private_access=True)

    data_secret = hugs.decrypt_data(data_secret)
    data_filename = data_par.resolve(secret=data_secret)
    # Here we're downloading the data to the tmp directory
    # It would be good if we could load it directly from the object store
    data_file = data_filename.download(dir="/tmp")

    if data_type == "GC":
        precision_par = PAR.from_data(args["par"]["precision"])
        precision_secret = args["par_secret"]["precision"]
        precision_secret = hugs.decrypt_data(precision_secret)
        precision_filename = precision_par.resolve(precision_secret)
        precision_file = precision_filename.download(dir="/tmp")
        site = args["site"]
        instrument = args["instrument"]

        data_file = data_file, precision_file
    else:
        site = None
        instrument = None

    if "overwrite" in args:
        overwrite = args["overwrite"]
    else:
        overwrite = False

    source_name = args["source_name"]

    results = process_data(
        data_file=data_file,
        source_name=source_name,
        data_type=data_type,
        site=site,
        instrument_name=instrument,
        overwrite=overwrite,
    )

    return {"results": results}
Example #3
    def load(self, par_uid, secret=None):
        """Load and return the PAR and identifiers associated with
           the passed UID, locked with the passed secret
        """
        # validate that the UID actually looks like a UID. This
        # should prevent attacks that try weird UIDs
        from Acquire.ObjectStore import validate_is_uid \
            as _validate_is_uid
        _validate_is_uid(par_uid)

        from Acquire.ObjectStore import ObjectStore as _ObjectStore
        from Acquire.Service import get_service_account_bucket \
            as _get_service_account_bucket

        try:
            key = "%s/%s" % (_par_root, par_uid)
            bucket = _get_service_account_bucket()
            data = _ObjectStore.get_object_from_json(bucket, key)

            from Acquire.Client import PAR as _PAR
            import json as _json
            par = _PAR.from_data(data["par"])
            identifiers = _json.loads(data["identifiers"])

            if secret != data["secret"]:
                raise PermissionError()
        except Exception:
            raise PermissionError("There is no valid PAR at ID '%s'" % par_uid)

        if par.expired():
            raise PermissionError(
                "There is no valid PAR at ID '%s' as it has expired" % par_uid)

        return (par, identifiers)
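
For context, load assumes the registry entry was written as a JSON object with "par", "identifiers" and "secret" fields under a "_par_root/<uid>" key. Below is a minimal, hypothetical sketch of the counterpart write; the real PARRegistry.register (see Example #6) does more, so treat this as illustrative only.

from Acquire.ObjectStore import ObjectStore, create_uuid
from Acquire.Service import get_service_account_bucket
import json

_par_root = "par_registry"   # placeholder; load() above uses the module's own _par_root

def register_sketch(par, identifiers, secret):
    """Hypothetical inverse of load(): store a PAR so that
       load(par_uid, secret) can later find and unlock it
    """
    par_uid = create_uuid()
    data = {"par": par.to_data(),
            "identifiers": json.dumps(identifiers),
            "secret": secret}
    key = "%s/%s" % (_par_root, par_uid)
    bucket = get_service_account_bucket()
    ObjectStore.set_object_from_json(bucket, key, data)
    return par_uid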
Example #4
def test_process_CRDS(authenticated_user, tempdir):
    creds = StorageCreds(user=authenticated_user, service_url="storage")
    drive = Drive(creds=creds, name="test_drive")
    filepath = os.path.join(
        os.path.dirname(__file__),
        "../../../tests/data/proc_test_data/CRDS/bsd.picarro.1minute.248m.dat",
    )
    filemeta = drive.upload(filepath)

    Path("/tmp/bsd.picarro.1minute.248m.dat").unlink(missing_ok=True)

    par = PAR(location=filemeta.location(), user=authenticated_user)

    hugs = Service(service_url="hugs")
    par_secret = hugs.encrypt_data(par.secret())

    auth = Authorisation(resource="process", user=authenticated_user)

    args = {
        "authorisation": auth.to_data(),
        "par": {
            "data": par.to_data()
        },
        "par_secret": {
            "data": par_secret
        },
        "data_type": "CRDS",
        "source_name": "bsd.picarro.1minute.248m",
    }

    response = hugs.call_function(function="process", args=args)

    expected_keys = [
        "bsd.picarro.1minute.248m_ch4",
        "bsd.picarro.1minute.248m_co",
        "bsd.picarro.1minute.248m_co2",
    ]

    results = response["results"]["bsd.picarro.1minute.248m.dat"]

    assert sorted(results.keys()) == expected_keys
Example #5
import argparse
import json
import subprocess
from pathlib import Path

from Acquire.Client import PAR


def run():
    parser = argparse.ArgumentParser(
        description="Run and watch a job on a HPC resource")
    parser.add_argument("j", help="JSON data filename")
    args = parser.parse_args()

    json_filename = args.j
    with open(json_filename, "r") as f:
        job_data = json.load(f)

    par_data = job_data["par"]
    par_secret = job_data["par_secret"]

    try:
        compilation_command = job_data["compilation_command"]
    except KeyError:
        compilation_command = None

    run_command = job_data["run_command"]

    # Make the output folder
    fpath = Path(__file__).resolve().parent.joinpath("output")
    fpath.mkdir(parents=True)

    par = PAR.from_data(data=par_data)
    drive = par.resolve(secret=par_secret)

    # Download any data files and move them to the input folders
    files = drive.list_files()
    for f in files:
        filename = f.filename()
        drive.download(filename=filename)

    # Split the compilation command
    if compilation_command is not None:
        cmd_list = compilation_command.split()
        # Run the compilation command (the application code lives in "app");
        # capture stderr so we can report a useful error on failure
        res = subprocess.run(cmd_list, stderr=subprocess.PIPE)

        if res.returncode != 0:
            raise subprocess.CalledProcessError(res.returncode, cmd_list,
                                                stderr=res.stderr)

    run_command = run_command.split()
    # Run the actual code
    runcmd_res = subprocess.run(run_command, stderr=subprocess.PIPE)

    if runcmd_res.returncode != 0:
        raise subprocess.CalledProcessError(runcmd_res.returncode, run_command,
                                            stderr=runcmd_res.stderr)

    # Upload everything in the output directory to the cloud drive
    drive.upload("output")
Example #6
def run(args):
    """Create a new PAR based on the information supplied
       in the passed half-created PAR, and the supplied
       Authorisation, and register it with the PARRegistry.
       This returns the UID under which the PAR has been
       registered
    """

    auth = Authorisation.from_data(args["authorisation"])
    par = PAR.from_data(args["par"])
    secret = args["secret"]

    registry = PARRegistry()
    par_uid = registry.register(par=par, authorisation=auth, secret=secret)

    result = {"par_uid": par_uid}

    return result
Example #7
def run(args):
    """This function receives the request to submit a job on the
       compute service. It processes the request and performs
       everything needed to actually submit the job. This
       returns confirmation that the job has been submitted, as
       well as some metadata about the job
    """
    worksheet_uid = str(args["worksheet_uid"])
    request = RunRequest.from_data(args["request"])
    par = PAR.from_data(args["par"])
    secret = args["secret"]
    cheque = Cheque.from_data(args["cheque"])

    job = ComputeJob.submit(worksheet_uid=worksheet_uid,
                            request=request,
                            par=par,
                            secret=secret,
                            cheque=cheque)

    return {"uid": job.uid()}
Example #8
    def from_data(data):
        """Return a ComputeJob constructed from a json-deserialised
           dictionary
        """
        if data is None or len(data) == 0:
            return ComputeJob()

        from Acquire.ObjectStore import string_to_list as _string_to_list
        from Acquire.Access import RunRequest as _RunRequest
        from Acquire.Client import PAR as _PAR
        from Acquire.Accounting import CreditNote as _CreditNote

        job = ComputeJob()

        job._uid = str(data["uid"])
        job._credit_notes = _string_to_list(data["credit_notes"], _CreditNote)
        job._par = _PAR.from_data(data["par"])
        job._secret = str(data["secret"])
        job._request = _RunRequest.from_data(data["request"])

        return job
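
Acquire's convention pairs from_data with a to_data serialiser; assuming ComputeJob follows it, a round trip would look like this sketch.

def roundtrip_compute_job(job):
    """Sketch: serialise and reconstruct a ComputeJob. Assumes a
       to_data() counterpart exists, per the usual Acquire convention
    """
    data = job.to_data()
    restored = ComputeJob.from_data(data)
    assert restored.uid() == job.uid()
    return restored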
Example #9
def test_process_GC(authenticated_user, tempdir):
    creds = StorageCreds(user=authenticated_user, service_url="storage")
    drive = Drive(creds=creds, name="test_drive")
    data_filepath = os.path.join(
        os.path.dirname(__file__),
        "../../../tests/data/proc_test_data/GC/capegrim-medusa.18.C",
    )
    precision_filepath = os.path.join(
        os.path.dirname(__file__),
        "../../../tests/data/proc_test_data/GC/capegrim-medusa.18.precisions.C",
    )

    Path("/tmp/capegrim-medusa.18.C").unlink(missing_ok=True)
    Path("/tmp/capegrim-medusa.18.precisions.C").unlink(missing_ok=True)

    data_meta = drive.upload(data_filepath)
    precision_meta = drive.upload(precision_filepath)

    data_par = PAR(location=data_meta.location(), user=authenticated_user)
    precision_par = PAR(location=precision_meta.location(),
                        user=authenticated_user)

    hugs = Service(service_url="hugs")
    data_secret = hugs.encrypt_data(data_par.secret())
    precision_secret = hugs.encrypt_data(precision_par.secret())

    auth = Authorisation(resource="process", user=authenticated_user)

    args = {
        "authorisation": auth.to_data(),
        "par": {
            "data": data_par.to_data(),
            "precision": precision_par.to_data()
        },
        "par_secret": {
            "data": data_secret,
            "precision": precision_secret
        },
        "data_type": "GCWERKS",
        "source_name": "capegrim-medusa",
        "site": "CGO",
        "instrument": "medusa",
    }

    response = hugs.call_function(function="process", args=args)

    result_keys = (sorted(
        response["results"]["capegrim-medusa.18.C"].keys()))[:8]

    expected_keys = [
        "capegrim-medusa.18_C4F10",
        "capegrim-medusa.18_C6F14",
        "capegrim-medusa.18_CCl4",
        "capegrim-medusa.18_CF4",
        "capegrim-medusa.18_CFC-11",
        "capegrim-medusa.18_CFC-112",
        "capegrim-medusa.18_CFC-113",
        "capegrim-medusa.18_CFC-114",
    ]

    assert result_keys == expected_keys
Example #10
    def create_job(
        self,
        auth_user,
        requirements,
        key_password,
        data_files,
        hugs_url=None,
        storage_url=None,
    ):
        """ Create a job

            Args:
                auth_user (Acquire.User): Authenticated Acquire user

                The following keys are required:
                    "hostname", "username", "name", "run_command", "partition", "n_nodes", "n_tasks_per_node",
                    "n_cpus_per_task", "memory_req", "job_duration"
                where partition must be one of:
                    "cpu_test", "dcv", "gpu", "gpu_veryshort", "hmem", "serial", "test", "veryshort"

                Example:
                    requirements = {"hostname": hostname, "username": username, "name": "test_job,
                                    "n_nodes": 2,"n_tasks_per_node": 2,
                                    "n_cpus_per_task": 2, "memory": "128G", ...}

                requirements (dict): Dictionary containing job details and requested resources
                key_password (str): Password for private key used to access the HPC
                data_files (dict): Data file(s) to be uploaded to the cloud drive to
                run the simulation. Simulation code files should be given in the "app" key and data
                files in the "data" key

                TODO - having to pass in a password and get it through to Paramiko seems
                long winded, is there a better way to do this?

                hugs_url (str): URL of HUGS service
                storage_url (str): URL of storage service
            Returns:
                dict: Dictionary containing information regarding job running on resource
                This will contain the PAR for access for data upload and download.
        """
        from Acquire.Client import (
            Drive,
            Service,
            PAR,
            Authorisation,
            StorageCreds,
            Location,
            ACLRule,
        )
        from Acquire.ObjectStore import create_uuid
        import datetime
        import os

        if self._service is None:
            raise PermissionError("Cannot use a null service")

        if storage_url is None:
            storage_url = self._service_url + "/storage"

        if hugs_url is None:
            hugs_url = self._service_url + "/hugs"

        if not isinstance(data_files["app"], list):
            data_files["app"] = [data_files["app"]]

        try:
            if not isinstance(data_files["data"], list):
                data_files["data"] = [data_files["data"]]
        except KeyError:
            pass

        # Get an authorisation to pass to the service
        hugs = Service(service_url=hugs_url)
        # Credentials to create the cloud storage drive
        creds = StorageCreds(user=auth_user, service_url=storage_url)

        # Append a shortened UUID to the job name to ensure we don't create multiple drives with the same name
        short_uuid = create_uuid(short_uid=True)

        job_name = requirements["name"]
        job_name = f"{job_name.lower()}_{short_uuid}"

        # Create a cloud drive for the input and output data to be written to
        drive = Drive(creds=creds, name=job_name)

        # Check the size of the files and if we want to use the chunk uploader
        # Now we want to upload the files to the cloud drive we've created for this job
        chunk_limit = 50 * 1024 * 1024

        # Store the metadata for the uploaded files
        uploaded_files = {"app": {}, "data": {}}
        # These probably won't be very big so don't check their size
        for f in data_files["app"]:
            file_meta = drive.upload(f, dir="app")
            uploaded_files["app"][f] = file_meta

        # We might not have any data files to upload
        try:
            for f in data_files["data"]:
                filesize = os.path.getsize(f)

                if filesize < chunk_limit:
                    file_meta = drive.upload(f, dir="data")
                else:
                    file_meta = drive.chunk_upload(f, dir="data")

                uploaded_files["data"][f] = file_meta
        except KeyError:
            pass

        auth = Authorisation(resource="job_runner", user=auth_user)
        # Create a PAR with a long lifetime here and return a version to the user
        # and another to the server to allow writing of result data
        drive_guid = drive.metadata().guid()
        location = Location(drive_guid=drive_guid)

        # Read the duration from the requirements dictionary

        # TODO - add in some reading of the duration
        # try:
        #     duration = requirements["duration"]
        #     par_expiry = datetime.datetime

        par_lifetime = datetime.datetime.now() + datetime.timedelta(days=1)

        # Create an ACL rule for this PAR so we can read and write to it
        aclrule = ACLRule.owner()
        par = PAR(
            location=location,
            user=auth_user,
            aclrule=aclrule,
            expires_datetime=par_lifetime,
        )

        par_secret = par.secret()
        encrypted_par_secret = hugs.encrypt_data(par_secret)

        # Encrypt the password we use to decrypt the private key used to access the HPC cluster
        # TODO - is this a sensible way of doing this?
        encrypted_password = hugs.encrypt_data(key_password)

        par_data = par.to_data()

        args = {}
        args["authorisation"] = auth.to_data()
        args["par"] = par_data
        args["par_secret"] = encryped_par_secret
        args["requirements"] = requirements
        args["key_password"] = encrypted_password

        function_response = self._service.call_function(function="job_runner",
                                                        args=args)

        response = {}
        response["function_response"] = function_response
        response["par"] = par_data
        response["par_secret"] = par_secret
        response["upload_data"] = uploaded_files

        return response
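
A sketch of how create_job might be called, using the required requirements keys from the docstring; the hostname, filenames, passwords and the runner instance are all placeholders.

requirements = {
    "hostname": "hpc.example.org",   # placeholder values throughout
    "username": "jbloggs",
    "name": "test_job",
    "run_command": "./run_model.sh",
    "partition": "cpu_test",
    "n_nodes": 2,
    "n_tasks_per_node": 2,
    "n_cpus_per_task": 2,
    "memory_req": "128G",
    "job_duration": "01:00:00",
}

data_files = {"app": ["model.py"], "data": ["input.nc"]}

# "runner" is an instance of the class defining create_job
response = runner.create_job(
    auth_user=authenticated_user,
    requirements=requirements,
    key_password=key_password,
    data_files=data_files,
)

par_data = response["par"]   # PAR for uploading/downloading job data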
Example #11
def test_drive_par(authenticated_user, tempdir):
    drive_name = "test å∫ç∂ pars"
    creds = StorageCreds(user=authenticated_user, service_url="storage")

    drive = Drive(name=drive_name, creds=creds)

    drive.upload(filename=__file__, uploaded_name="tmp_test.py")

    downloaded_name = drive.download(filename="tmp_test.py", dir=tempdir)

    assert (_same_file(__file__, downloaded_name))

    drive_guid = drive.metadata().guid()

    location = Location(drive_guid=drive_guid)

    par = PAR(location=location,
              user=authenticated_user,
              aclrule=ACLRule.reader())

    par_drive = par.resolve()
    assert (par_drive.metadata().acl() == ACLRule.reader())
    assert (par_drive.metadata().uid() == drive.metadata().uid())

    files = par_drive.list_files()
    assert (len(files) == 1)
    assert (files[0].filename() == "tmp_test.py")

    downloaded_name = files[0].open().download(dir=tempdir, force_par=True)

    assert (_same_file(__file__, downloaded_name))

    par2 = PAR(location=location,
               user=authenticated_user,
               aclrule=ACLRule.writer())

    par_drive = par2.resolve()

    assert (par_drive.metadata().acl() == ACLRule.writer())
    assert (par_drive.metadata().uid() == drive.metadata().uid())

    files = par_drive.list_files()
    assert (len(files) == 1)
    assert (files[0].filename() == "tmp_test.py")

    par_drive.upload(filename=__file__, uploaded_name="tmp_test2.py")

    files = par_drive.list_files()
    assert (len(files) == 2)
    files = {f.filename(): f for f in files}

    assert ("tmp_test.py" in files)
    assert ("tmp_test2.py" in files)

    downloaded_name = files["tmp_test2.py"].open().download(dir=tempdir)

    assert (_same_file(__file__, downloaded_name))

    par = PAR(location=files["tmp_test.py"].location(),
              user=authenticated_user,
              aclrule=ACLRule.reader())

    par_file = par.resolve()

    assert (par_file.metadata().acl() == ACLRule.reader())

    downloaded_name = par_file.download(dir=tempdir)

    assert (_same_file(__file__, downloaded_name))

    with pytest.raises(PermissionError):
        par_file.upload(__file__)

    par = PAR(location=files["tmp_test.py"].location(),
              user=authenticated_user,
              aclrule=ACLRule.writer())

    par_file = par.resolve()

    assert (par_file.metadata().acl() == ACLRule.writer())

    par_file.upload(__file__)
Example #12
File: _process.py Project: hugs-cloud/hugs
    def process_files(
        self,
        user,
        files,
        data_type,
        source_name=None,
        overwrite=False,
        hugs_url=None,
        storage_url=None,
        datasource=None,
        site=None,
        instrument=None,
    ):
        """ Process the passed file(s)

            Args:
                user (User): Authenticated Acquire User
                files (str, list): Path of files to be processed
                data_type (str): Type of data to be processed (CRDS, GC etc)
                hugs_url (str): URL of HUGS service. Currently used for testing
                datasource (str): Datasource name or UUID
                This may be removed in the future.
                storage_url (str): URL of storage service. Currently used for testing
                This may be removed in the future.
                site (str, default=None): Name of site, three letter code or long name
                instrument (str, default=None): If no instrument name is passed we will attempt
                to find it from the filename.
            Returns:
                dict: UUIDs of Datasources storing data of processed files keyed by filename
        """
        data_type = data_type.upper()

        if self._service is None:
            raise PermissionError("Cannot use a null service")

        if not isinstance(files, list):
            files = [files]

        if data_type == "GC":
            if not all(isinstance(item, tuple) for item in files):
                raise TypeError(
                    "If data type is GC, a list of tuples of data and precision filenames must be passed"
                )

            files = [(Path(f), Path(p)) for f, p in files]
        else:
            files = [Path(f) for f in files]

        if storage_url is None:
            storage_url = self._service_url + "/storage"

        if hugs_url is None:
            hugs_url = self._service_url + "/hugs"

        # # Take the filename without the file extension
        # source_name = [os.path.splitext((filepath.name).split("/")[-1])[0] for filepath in files]

        hugs = Service(service_url=hugs_url)
        creds = StorageCreds(user=user, service_url=storage_url)
        drive = Drive(creds=creds, name="test_drive")
        auth = Authorisation(resource="process", user=user)

        # Here we'll need special cases for different data types, as GC requires
        # both the data file and the precision data, and they need to be kept
        # together for use in processing.
        # We can maybe reconsider the way this is done if there end up being a lot
        # of cases and it gets a bit clunky
        results = {}
        for file in files:
            if data_type == "GC":
                if source_name is None:
                    source_name = file[0].stem

                if site is None:
                    site = source_name.split(".")[0]
                    if "-" in site and data_type == "GC":
                        site = site.split("-")[0]

                filemeta = drive.upload(file[0])
                par = PAR(location=filemeta.location(), user=user)
                par_secret = hugs.encrypt_data(par.secret())

                prec_meta = drive.upload(file[1])
                prec_par = PAR(location=prec_meta.location(), user=user)
                prec_par_secret = hugs.encrypt_data(prec_par.secret())

                args = {
                    "authorisation": auth.to_data(),
                    "par": {"data": par.to_data(), "precision": prec_par.to_data()},
                    "par_secret": {"data": par_secret, "precision": prec_par_secret},
                    "data_type": data_type,
                    "datasource": datasource,
                    "source_name": source_name,
                    "overwrite": overwrite,
                    "site": site,
                    "instrument": instrument,
                }
            else:
                filemeta = drive.upload(file)
                par = PAR(location=filemeta.location(), user=user)
                par_secret = hugs.encrypt_data(par.secret())

                args = {
                    "authorisation": auth.to_data(),
                    "par": {"data": par.to_data()},
                    "par_secret": {"data": par_secret},
                    "data_type": data_type,
                    "datasource": datasource,
                    "source_name": source_name,
                    "overwrite": overwrite,
                }

            # If we try to upload many files we don't want it to fail if a single
            # file contains overlapping data
            try:
                response = self._service.call_function(function="process", args=args)
                results.update(response["results"])
            except ValueError as err:
                # For GC data "file" is a (data, precision) tuple
                key = file[0].name if data_type == "GC" else file.name
                results[key] = err

        return results
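
A sketch of calling process_files for GC data, which requires (data, precision) tuples; the processor instance and file paths are placeholders, with filenames following Example #9.

# "processor" is an instance of the class defining process_files
results = processor.process_files(
    user=authenticated_user,
    files=[("capegrim-medusa.18.C", "capegrim-medusa.18.precisions.C")],
    data_type="GC",
    site="CGO",
    instrument="medusa",
)
# results maps each filename to the UUIDs of the Datasources created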