Example No. 1
def test_create_condensed_dataset_job(enqueue_job_mock, create_with_upload,
                                      monkeypatch, ckan_config, tmpdir):
    monkeypatch.setitem(ckan_config, 'ckan.storage_path', str(tmpdir))
    monkeypatch.setattr(ckan.lib.uploader, '_storage_path', str(tmpdir))

    user = factories.User()
    owner_org = factories.Organization(users=[{
        'name': user['id'],
        'capacity': 'admin'
    }])
    # Note: `call_action` bypasses authorization, which is why we
    # explicitly set `ignore_auth` to False in the context below.
    # create a dataset
    create_context = {
        'ignore_auth': False,
        'user': user['name'],
        'api_version': 3
    }
    dataset = make_dataset(create_context,
                           owner_org,
                           with_resource=False,
                           activate=False)
    content = (data_dir / "calibration_beads_47.rtdc").read_bytes()
    result = create_with_upload(
        content,
        'test.rtdc',
        url="upload",
        package_id=dataset["id"],
        context=create_context,
    )
    path = dcor_shared.get_resource_path(result["id"])
    cond = path.with_name(path.name + "_condensed.rtdc")
    # the original uploaded file must exist
    assert path.exists()
    # the condensed file must have been created by the background job
    assert cond.exists()
Example No. 2
def dataset_state(key, data, errors, context):
    """If a dataset does not have any resources, it must be a draft"""
    data_dict = df.unflatten(data)

    if data[key] == "active":
        if "resources" not in data_dict or len(data_dict["resources"]) == 0:
            # The user wants to activate the dataset although it does not
            # contain any resources. This is not possible!
            raise toolkit.Invalid(
                "Cannot set state of dataset to 'active', because it does not "
                "contain any resources!")
        else:
            # Do not allow activating a dataset without at least one valid
            # .rtdc resource.
            # Note that DCOR-Aid first checks whether resource upload is
            # complete before uploading. If someone writes their own script
            # for uploading, they also have to use package_revise *after*
            # uploading the resources to set the state to "active".
            for res in data_dict["resources"]:
                if res.get("mimetype") == "RT-DC":
                    rp = dcor_shared.get_resource_path(res["id"])
                    try:
                        with dclab.IntegrityChecker(rp) as ic:
                            # `sanity_check` returns a list of integrity
                            # violations; an empty list means this is a
                            # valid .rtdc resource and we can stop looking.
                            insane = ic.sanity_check()
                            if not insane:
                                break
                    except ValueError:
                        # the file could not be opened (unknown file format)
                        pass
            else:
                # the loop completed without finding a valid resource
                raise toolkit.Invalid(
                    "Before activating a dataset, make sure that it contains "
                    "a valid .rtdc resource!")
Example No. 3
def generate_condensed_resource_job(resource, override=False):
    """Generates a condensed version of the dataset"""
    path = get_resource_path(resource["id"])
    if resource["mimetype"] in DC_MIME_TYPES:
        wait_for_resource(path)
        cond = path.with_name(path.name + "_condensed.rtdc")
        if not cond.exists() or override:
            with CKANResourceFileLock(
                    resource_id=resource["id"],
                    locker_id="DCOR_generate_condensed") as fl:
                # The CKANResourceFileLock creates a lock file if not present
                # and sets `is_locked` to True if the lock was acquired.
                # If the lock could not be acquired, another process is
                # already condensing this resource, so we can simply skip it.
                # This guards against an automated background job for
                # generating missing condensed files causing several
                # processes to condense the same resource at once.
                if fl.is_locked:
                    # run in subprocess to circumvent memory leak
                    # https://github.com/ZELLMECHANIK-DRESDEN/dclab/issues/138
                    # condense(path_out=cond, path_in=path, check_suffix=False)
                    p = multiprocessing.Process(target=condense,
                                                args=(cond, path, False))
                    p.start()
                    p.join()
                    return True
    return False
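The lock-file semantics described in the comment can be illustrated with a small, self-contained sketch. This is not the actual `dcor_shared` implementation; `SimpleFileLock` and its details are merely an illustration of the pattern:

import os


class SimpleFileLock:
    """Create a lock file atomically; if it exists, someone else holds it"""

    def __init__(self, lock_path):
        self.lock_path = lock_path
        self.is_locked = False

    def __enter__(self):
        try:
            # O_EXCL makes creation fail if the file already exists, so
            # exactly one process can acquire the lock at a time.
            fd = os.open(self.lock_path, os.O_CREAT | os.O_EXCL)
            os.close(fd)
            self.is_locked = True
        except FileExistsError:
            # another process already holds the lock
            self.is_locked = False
        return self

    def __exit__(self, *exc):
        if self.is_locked:
            os.remove(self.lock_path)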
Example No. 4
def create_preview_job(resource, override=False):
    """Generate a *_preview.png file for a DC resource"""
    path = get_resource_path(resource["id"])
    wait_for_resource(path)
    mtype = resource.get('mimetype', '')
    if mtype in DC_MIME_TYPES:
        # only do this for rtdc data
        jpgpath = path.with_name(path.name + "_preview.jpg")
        if not jpgpath.exists() or override:
            generate_preview(path, jpgpath)
            return True
    return False
Example No. 5
def set_sha256_job(resource):
    """Computes the sha256 hash and writes it to the resource metadata"""
    sha = str(resource.get("sha256", ""))  # can be bool sometimes
    if len(sha) != 64:  # only compute if necessary
        path = get_resource_path(resource["id"])
        wait_for_resource(path)
        file_hash = hashlib.sha256()
        with open(path, "rb") as fd:
            while True:
                data = fd.read(2**20)  # read in 1 MiB chunks
                if not data:
                    break
                file_hash.update(data)
        sha256sum = file_hash.hexdigest()
        patch_resource_noauth(package_id=resource["package_id"],
                              resource_id=resource["id"],
                              data_dict={"sha256": sha256sum})
        return True
    return False
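In CKAN, jobs like `set_sha256_job` are normally executed via the background job queue. Here is a minimal sketch of how such a job might be enqueued from a resource hook; the `after_create` wiring shown is illustrative and not necessarily how DCOR does it:

import ckan.plugins.toolkit as toolkit


def after_create(context, resource):
    # hand the hash computation off to a CKAN background worker
    toolkit.enqueue_job(set_sha256_job, [resource],
                        title="set_sha256_job {}".format(resource["id"]))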
Example No. 6
def set_format_job(resource):
    """Writes the correct format to the resource metadata"""
    mimetype = resource.get("mimetype")
    rformat = resource.get("format")
    if mimetype in DC_MIME_TYPES and rformat in [mimetype, None]:
        # only act if the format is unset or still equals the mimetype;
        # if it is already something specific like "RT-FDC", there is
        # nothing to do
        path = get_resource_path(resource["id"])
        wait_for_resource(path)
        with dclab.rtdc_dataset.check.IntegrityChecker(path) as ic:
            if ic.has_fluorescence:
                fmt = "RT-FDC"
            else:
                fmt = "RT-DC"
        if rformat != fmt:  # only update if necessary
            patch_resource_noauth(package_id=resource["package_id"],
                                  resource_id=resource["id"],
                                  data_dict={"format": fmt})
            return True
    return False
Example No. 7
def test_create_preview_job(enqueue_job_mock, create_with_upload, monkeypatch,
                            ckan_config, tmpdir):
    monkeypatch.setitem(ckan_config, 'ckan.storage_path', str(tmpdir))
    monkeypatch.setattr(ckan.lib.uploader, '_storage_path', str(tmpdir))

    user = factories.User()
    owner_org = factories.Organization(users=[{
        'name': user['id'],
        'capacity': 'admin'
    }])
    # Note: `call_action` bypasses authorization, which is why we
    # explicitly set `ignore_auth` to False in the context below.
    # create a dataset
    create_context = {
        'ignore_auth': False,
        'user': user['name'],
        'api_version': 3
    }
    dataset = make_dataset(create_context,
                           owner_org,
                           with_resource=False,
                           activate=False)
    path = data_dir / "calibration_beads_47.rtdc"
    content = path.read_bytes()
    result = create_with_upload(
        content,
        'test.rtdc',
        url="upload",
        package_id=dataset["id"],
        context=create_context,
    )
    resource_path = dcor_shared.get_resource_path(result["id"])
    assert resource_path.exists()
    preview_path = resource_path.with_name(resource_path.name + "_preview.jpg")
    # wait up to 10 s for the background job to create the preview
    for ii in range(100):
        if not preview_path.exists():
            time.sleep(0.1)
        else:
            assert preview_path.stat().st_size > 1000
            break
    else:
        raise ValueError("Preview generation timed out after 10s!")
Example No. 8
def set_dc_config_job(resource):
    """Store all DC config metadata"""
    if (resource.get('mimetype') in DC_MIME_TYPES
            and resource.get("dc:setup:channel width", None) is None):
        # (a missing channel width indicates that the DC configuration
        # has not been written to the resource metadata yet)
        path = get_resource_path(resource["id"])
        wait_for_resource(path)
        data_dict = {}
        with dclab.new_dataset(path) as ds:
            for sec in dclab.dfn.CFG_METADATA:
                if sec in ds.config:
                    for key in dclab.dfn.config_keys[sec]:
                        if key in ds.config[sec]:
                            dckey = 'dc:{}:{}'.format(sec, key)
                            value = ds.config[sec][key]
                            data_dict[dckey] = value
        patch_resource_noauth(package_id=resource["package_id"],
                              resource_id=resource["id"],
                              data_dict=data_dict)
        return True
    return False
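The nested loop above flattens the dclab configuration into `dc:<section>:<key>` entries. The resulting `data_dict` looks roughly like this (the keys follow dclab's configuration layout, but the values are made up for illustration):

data_dict = {
    "dc:experiment:date": "2020-01-01",
    "dc:setup:channel width": 20.0,
    "dc:setup:flow rate": 0.04,
}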
Example No. 9
def resource_has_condensed(resource_id):
    """Check whether a condensed version of the resource exists"""
    rpath = dcor_shared.get_resource_path(resource_id)
    return rpath.with_name(rpath.stem + "_condensed.rtdc").exists()
Example No. 10
def dcserv(context, data_dict=None):
    """Serve DC data as json via the CKAN API

    Required parameters are 'id' (resource id) and
    'query' ('feature', 'feature_list', 'metadata', 'size',
    'trace', 'trace_list', 'valid').

    In case 'query=feature', the parameter 'feature' must
    be set to a valid feature name (e.g. 'feature=deform') and,
    for non-scalar features only, 'event' (event index within
    the dataset) must be set (e.g. 'event=47').
    In case 'query=trace', the parameters 'trace' and 'event' must be set.
    In case 'query=valid', the result is True if the resource exists
    and is a valid RT-DC dataset.

    The "result" value will either be a dictionary
    resembling RTDCBase.config (query=metadata),
    a list of available features (query=feature_list),
    or the requested data converted to a list (use
    numpy.asarray to convert back to a numpy array).
    """
    # Check required parameters
    if data_dict is None:
        data_dict = {}
    if "query" not in data_dict:
        raise logic.ValidationError("Please specify 'query' parameter!")
    if "id" not in data_dict:
        raise logic.ValidationError("Please specify 'id' parameter!")

    # Perform all authorization checks for the resource
    logic.check_access("resource_show",
                       context=context,
                       data_dict={"id": data_dict["id"]})

    query = data_dict["query"]
    res_id = data_dict["id"]
    path = get_resource_path(res_id)

    # Check whether we actually have an .rtdc dataset
    if not is_rtdc_resource(res_id):
        raise logic.ValidationError(
            f"Resource ID {res_id} must be an .rtdc dataset!")

    if query == "feature":
        data = get_feature_data(data_dict, path)
    elif query == "feature_list":
        data = get_feature_list(path)
    elif query == "metadata":
        with dclab.rtdc_dataset.fmt_hdf5.RTDC_HDF5(path) as ds:
            data = json.loads(ds.config.tojson())
    elif query == "size":
        with dclab.rtdc_dataset.fmt_hdf5.RTDC_HDF5(path) as ds:
            data = len(ds)
    elif query == "trace":
        warnings.warn("A dc_serve client is using the 'trace' query!",
                      DeprecationWarning)
        # backwards-compatibility
        data_dict["query"] = "feature"
        data_dict["feature"] = "trace"
        data = get_feature_data(data_dict, path)
    elif query == "trace_list":
        with dclab.rtdc_dataset.fmt_hdf5.RTDC_HDF5(path) as ds:
            if "trace" in ds:
                data = sorted(ds["trace"].keys())
            else:
                data = []
    elif query == "valid":
        data = path.exists()
    else:
        raise logic.ValidationError(f"Invalid query parameter '{query}'!")

    return data
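Since `dcserv` is exposed as a regular CKAN API action, it can be queried over HTTP like any other action. A hypothetical client-side call; the server URL, API token, and resource id are placeholders:

import requests

r = requests.get(
    "https://dcor.example.org/api/3/action/dcserv",
    params={"id": "<resource-id>", "query": "feature_list"},
    headers={"Authorization": "<api-token>"},
)
r.raise_for_status()
feature_list = r.json()["result"]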