def to_cwl_tool_object(tool_path=None, tool_object=None, persisted_tool=None, strict_cwl_validation=True):
    schema_loader = _schema_loader(strict_cwl_validation)
    if tool_path is not None:
        cwl_tool = schema_loader.tool(path=tool_path)
    elif tool_object is not None:
        # Allow loading tools from YAML...
        from ruamel import yaml as ryaml
        import json
        as_str = json.dumps(tool_object)
        tool_object = ryaml.round_trip_load(as_str)
        from schema_salad import sourceline
        from schema_salad.ref_resolver import file_uri
        uri = file_uri(os.getcwd()) + "/"
        sourceline.add_lc_filename(tool_object, uri)
        tool_object, _ = schema_loader.raw_document_loader.resolve_all(tool_object, uri)
        raw_process_reference = schema_loader.raw_process_reference_for_object(tool_object, uri=uri)
        cwl_tool = schema_loader.tool(
            raw_process_reference=raw_process_reference,
        )
    else:
        cwl_tool = ToolProxy.from_persistent_representation(persisted_tool)

    if isinstance(cwl_tool, int):
        raise Exception("Failed to load tool.")

    raw_tool = cwl_tool.tool
    # Apply Galaxy hacks to CWL tool representation to bridge semantic differences
    # between Galaxy and cwltool.
    _hack_cwl_requirements(cwl_tool)
    check_requirements(raw_tool)
    return cwl_tool_object_to_proxy(cwl_tool, tool_path=tool_path)
def _to_cwl_tool_object(tool_path=None, tool_object=None, cwl_tool_object=None, raw_process_reference=None,
                        strict_cwl_validation=True, tool_directory=None, uuid=None):
    if uuid is None:
        uuid = str(uuid4())
    schema_loader = _schema_loader(strict_cwl_validation)
    if raw_process_reference is None and tool_path is not None:
        assert cwl_tool_object is None
        assert tool_object is None

        raw_process_reference = schema_loader.raw_process_reference(tool_path)
        cwl_tool = schema_loader.tool(
            raw_process_reference=raw_process_reference,
        )
    elif tool_object is not None:
        assert raw_process_reference is None
        assert cwl_tool_object is None

        # Allow loading tools from YAML...
        from ruamel import yaml as ryaml
        import json
        as_str = json.dumps(tool_object)
        tool_object = ryaml.round_trip_load(as_str)
        from schema_salad import sourceline
        from schema_salad.ref_resolver import file_uri
        path = tool_directory
        if path is None:
            path = os.getcwd()
        uri = file_uri(path) + "/"
        sourceline.add_lc_filename(tool_object, uri)
        # tool_object, _ = schema_loader.raw_document_loader.resolve_all(tool_object, uri, checklinks=False)
        raw_process_reference = schema_loader.raw_process_reference_for_object(tool_object, uri=uri)
        cwl_tool = schema_loader.tool(
            raw_process_reference=raw_process_reference,
        )
    else:
        cwl_tool = cwl_tool_object

    if isinstance(cwl_tool, int):
        raise Exception("Failed to load tool.")

    raw_tool = cwl_tool.tool
    # Apply Galaxy hacks to CWL tool representation to bridge semantic differences
    # between Galaxy and cwltool.
    _hack_cwl_requirements(cwl_tool)
    check_requirements(raw_tool)
    return _cwl_tool_object_to_proxy(cwl_tool, uuid, raw_process_reference=raw_process_reference, tool_path=tool_path)
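# --- Illustrative sketch (not from the original sources) of the pattern shared
# by the two loaders above: a plain Python dict carries no line/column
# information, so it is round-tripped through JSON into ruamel's CommentedMap,
# after which add_lc_filename can tag every node with a (here synthetic)
# filename for later error reporting.  Assumes ruamel.yaml and schema-salad
# are installed; "inline-tool.yaml" is an arbitrary label.
import json

from ruamel import yaml as ryaml
from schema_salad.sourceline import add_lc_filename

tool = {"cwlVersion": "v1.0", "class": "CommandLineTool", "inputs": [], "outputs": []}
data = ryaml.round_trip_load(json.dumps(tool))  # CommentedMap, not a plain dict
add_lc_filename(data, "inline-tool.yaml")
print(data.lc.filename)  # -> inline-tool.yaml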
def _document_load_by_url(loader, url, loadingOptions):
    # type: (_Loader, str, LoadingOptions) -> Any
    if url in loadingOptions.idx:
        return _document_load(loader, loadingOptions.idx[url], url, loadingOptions)

    text = loadingOptions.fetcher.fetch_text(url)
    if isinstance(text, bytes):
        textIO = StringIO(text.decode("utf-8"))
    else:
        textIO = StringIO(text)
    textIO.name = str(url)
    result = yaml.main.round_trip_load(textIO, preserve_quotes=True)
    add_lc_filename(result, url)

    loadingOptions.idx[url] = result
    loadingOptions = LoadingOptions(copyfrom=loadingOptions, fileuri=url)

    return _document_load(loader, result, url, loadingOptions)
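# --- Illustrative sketch (hypothetical names, not part of the loader above):
# the idx dict acts as a memo so that a URL fetched once is parsed once, and
# every later reference to it resolves to the same shared object.
from typing import Any, Callable, Dict

_idx: Dict[str, Any] = {}

def load_once(url: str, fetch: Callable[[str], Any]) -> Any:
    """Fetch and parse url on first use; afterwards serve the cached parse."""
    if url not in _idx:
        _idx[url] = fetch(url)
    return _idx[url]

first = load_once("https://example.org/tool.cwl", lambda u: {"id": u})
again = load_once("https://example.org/tool.cwl", lambda u: {"id": u})
assert first is again  # same cached object, not a re-parse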
def validate_js_expressions_strict(tool: dict):
    """
    Run the cwltool function to validate JS expressions in an app.

    :param tool: CWL dict
    """
    data = ruamel.yaml.load(ruamel.yaml.dump(tool), ruamel.yaml.RoundTripLoader)
    schema = get_schema(tool['cwlVersion'])[1].names[tool['class']]
    add_lc_filename(data, data.get('label', 'input_document'))
    jshint_options = {
        "includewarnings": [
            "W117",  # <VARIABLE> not defined
            "W104", "W119"  # using ES6 features
        ],
        "strict": "implied",
        "esversion": 5
    }
    validate_js_expressions(data, schema, jshint_options)
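# Hypothetical call to the strict validator above.  `tool` would be a CWL
# document already parsed into a dict; the function re-dumps and
# round-trip-loads it so that jshint findings can point at line numbers.
#
#   tool = ruamel.yaml.round_trip_load(open("tool.cwl").read())
#   validate_js_expressions_strict(tool)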
def run(
    sourceIO: IO[str], output_dir: str, output_format: str, mainfile: Optional[str], pretty: bool
) -> None:
    """Loop over the provided packed CWL document and split it up."""
    source = yaml.main.round_trip_load(sourceIO, preserve_quotes=True)
    add_lc_filename(source, sourceIO.name)
    if "$graph" not in source:
        print("No $graph, so not for us.")
        return
    version = source.pop("cwlVersion")

    def my_represent_none(
        self: Any, data: Any
    ) -> Any:  # pylint: disable=unused-argument
        """Force clean representation of 'null'."""
        return self.represent_scalar("tag:yaml.org,2002:null", "null")

    yaml.representer.RoundTripRepresenter.add_representer(type(None), my_represent_none)

    for entry in source["$graph"]:
        entry_id = entry.pop("id")[1:]
        entry["cwlVersion"] = version
        imports = rewrite(entry, entry_id)
        if imports:
            for import_name in imports:
                rewrite_types(entry, f"#{import_name}", False)
        if entry_id == "main":
            if mainfile is None:
                entry_id = f"unpacked_{os.path.basename(sourceIO.name)}"
            else:
                entry_id = mainfile

        output_file = os.path.join(output_dir, entry_id)
        if output_format == "json":
            json_dump(entry, output_file)
        elif output_format == "yaml":
            yaml_dump(entry, output_file, pretty)
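# --- Standalone, runnable illustration of the my_represent_none trick used
# above: by default ruamel emits an empty scalar for None, so registering a
# custom representer forces an explicit "null" in the output.
import sys

from ruamel import yaml

def _represent_none(representer, data):
    return representer.represent_scalar("tag:yaml.org,2002:null", "null")

yaml.representer.RoundTripRepresenter.add_representer(type(None), _represent_none)
yaml.main.round_trip_dump({"value": None}, sys.stdout)  # prints: value: null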
def load_schema_generate_form():
    # At startup, we need to load the metadata schema from the uploader module,
    # so we can make a form for it.
    if os.path.isfile("bh20sequploader/bh20seq-schema.yml"):
        METADATA_SCHEMA = yaml.round_trip_load(
            open("bh20sequploader/bh20seq-schema.yml", "r").read())
        METADATA_OPTION_DEFINITIONS = yaml.safe_load(
            open("bh20sequploader/bh20seq-options.yml", "r").read())
    else:
        METADATA_SCHEMA = yaml.round_trip_load(
            pkg_resources.resource_stream("bh20sequploader", "bh20seq-schema.yml"))
        METADATA_OPTION_DEFINITIONS = yaml.safe_load(
            pkg_resources.resource_stream("bh20sequploader", "bh20seq-options.yml"))

    METADATA_SCHEMA["name"] = "bh20seq-schema.yml"
    add_lc_filename(METADATA_SCHEMA, "bh20seq-schema.yml")
    metaschema_names, _metaschema_doc, metaschema_loader = schema_salad.schema.get_metaschema()
    schema_doc, schema_metadata = metaschema_loader.resolve_ref(METADATA_SCHEMA, "")
    return generate_form(schema_doc, METADATA_OPTION_DEFINITIONS)
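# --- Generic sketch of the lookup order above (hypothetical helper name):
# prefer the schema file from a source checkout when present, otherwise read
# the copy bundled with the installed package via pkg_resources (setuptools).
import os
import pkg_resources

def open_schema(package: str = "bh20sequploader", name: str = "bh20seq-schema.yml"):
    path = os.path.join(package, name)
    if os.path.isfile(path):
        return open(path, "r")  # development checkout
    return pkg_resources.resource_stream(package, name)  # installed package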
def main(args):
    (document_loader, avsc_names, schema_metadata,
     metaschema_loader) = schema_salad.schema.load_schema(args.schema)

    with open(args.metadata) as f:
        metadata_contents = ruamel.yaml.round_trip_load(f)

    for metadata_content in metadata_contents:
        metadata_content["id"] = "http://example.org/id"
        add_lc_filename(metadata_content, metadata_content["id"])
        doc, metadata = schema_salad.schema.load_and_validate(
            document_loader, avsc_names, metadata_content, True)

        with open(args.shex) as f:
            shex = f.read()

        g = schema_salad.jsonld_context.makerdf("workflow", doc, document_loader.ctx)
        validation_result, reason = evaluate(g, shex, doc["id"], "sample_name")
        if not validation_result:
            print(reason)
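# --- Hypothetical wiring for main() above, matching the attributes it reads
# (args.schema, args.metadata, args.shex); the file names are placeholders.
import argparse

parser = argparse.ArgumentParser(
    description="Validate metadata against a schema-salad schema and a ShEx shape.")
parser.add_argument("schema", help="schema-salad schema, e.g. bh20seq-schema.yml")
parser.add_argument("metadata", help="YAML file containing a list of metadata documents")
parser.add_argument("shex", help="ShEx shape expression file")
# main(parser.parse_args())  # left commented: the input files are placeholders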
def validate_upload(api, collection, validated_project, fastq_project, fastq_workflow_uuid):
    col = arvados.collection.Collection(collection["uuid"])

    # Validate the collection here.  Check metadata, etc.
    valid = True

    if "metadata.yaml" not in col:
        logging.warn("Upload '%s' missing metadata.yaml", collection["name"])
        valid = False
    else:
        try:
            metadata_content = ruamel.yaml.round_trip_load(col.open("metadata.yaml"))
            metadata_content["id"] = "http://arvados.org/keep:%s/metadata.yaml" % collection["portable_data_hash"]
            sample_id = metadata_content["sample"]["sample_id"]
            add_lc_filename(metadata_content, metadata_content["id"])
            valid = qc_metadata(metadata_content) and valid
        except Exception as e:
            logging.warn(e)
            valid = False
        if not valid:
            logging.warn("Failed metadata qc")

    if valid:
        try:
            tgt = None
            paired = {"reads_1.fastq": "reads.fastq", "reads_1.fastq.gz": "reads.fastq.gz"}
            for n in ("sequence.fasta", "reads.fastq", "reads.fastq.gz",
                      "reads_1.fastq", "reads_1.fastq.gz"):
                if n not in col:
                    continue
                with col.open(n, 'rb') as qf:
                    tgt = qc_fasta(qf)[0]
                    if tgt != n and tgt != paired.get(n):
                        logging.info("Expected %s but magic says it should be %s", n, tgt)
                        valid = False
                    elif tgt in ("reads.fastq", "reads.fastq.gz",
                                 "reads_1.fastq", "reads_1.fastq.gz"):
                        start_fastq_to_fasta(api, collection, fastq_project,
                                             fastq_workflow_uuid, n, sample_id)
                        return False
            if tgt is None:
                valid = False
                logging.warn("Upload '%s' does not contain sequence.fasta, reads.fastq or reads_1.fastq",
                             collection["name"])
        except ValueError as v:
            valid = False

    dup = api.collections().list(filters=[
        ["owner_uuid", "=", validated_project],
        ["portable_data_hash", "=", col.portable_data_hash()]]).execute()
    if dup["items"]:
        # This exact collection has been uploaded before.
        valid = False
        logging.warn("Upload '%s' is duplicate" % collection["name"])

    if valid:
        logging.info("Added '%s' to validated sequences" % collection["name"])
        # Move it to the "validated" project to be included in the next analysis.
        api.collections().update(
            uuid=collection["uuid"],
            body={
                "owner_uuid": validated_project,
                "name": "%s (%s)" % (collection["name"], time.asctime(time.gmtime()))}).execute()
    else:
        # It is invalid; suggest deleting it (actual deletion is disabled).
        logging.warn("Suggest deleting '%s'" % collection["name"])
        #api.collections().delete(uuid=collection["uuid"]).execute()

    return valid
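# --- Isolated, runnable sketch of the filename check above: qc_fasta reports
# what the file content looks like (tgt), and a paired-read upload named
# reads_1.fastq is accepted where plain reads.fastq is expected.
paired = {"reads_1.fastq": "reads.fastq", "reads_1.fastq.gz": "reads.fastq.gz"}

def content_matches(filename: str, detected: str) -> bool:
    """True when the detected content type matches the filename, allowing a
    paired-read name to stand in for the unpaired one."""
    return detected == filename or detected == paired.get(filename)

assert content_matches("reads_1.fastq", "reads.fastq")
assert not content_matches("sequence.fasta", "reads.fastq")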
(document_loader, avsc_names, schema_metadata,
 metaschema_loader) = schema_salad.schema.load_schema(metadataSchema)

for item in validated:
    pdh = item["portable_data_hash"]
    uuid = item["uuid"]
    try:
        subject = "http://covid19.genenetwork.org/resource/%s" % uuid
        with arvados.collection.CollectionReader(pdh, api_client=api, keep_client=keepclient) as col:
            with col.open("metadata.yaml", "rt") as md:
                metadata_content = ruamel.yaml.round_trip_load(md)
            metadata_content["id"] = subject
            add_lc_filename(metadata_content, metadata_content["id"])
            doc, metadata = schema_salad.schema.load_and_validate(
                document_loader, avsc_names, metadata_content, False, False)
            g = schema_salad.jsonld_context.makerdf(subject, doc, document_loader.ctx)

            with col.open("sequence.fasta", "rt") as fa:
                label = fa.readline().strip()
                merged_metadata.write("<%s> <http://biohackathon.org/bh20-seq-schema/original_fasta_label> \"%s\" .\n"
                                      % (subject, label[1:].replace('"', '\\"')))
                merged_metadata.write("<%s> <http://biohackathon.org/bh20-seq-schema/collection_pdh> \"%s\" .\n"
                                      % (subject, pdh))
                merged_metadata.write("<%s> <http://biohackathon.org/bh20-seq-schema/collection_version> \"%s\" .\n"
                                      % (subject, item["version"]))
                skip = (subject in blacklist or label[1:] in blacklist)
                if skip:
                    merged_metadata.write("<%s> <http://biohackathon.org/bh20-seq-schema/excluded_from_graph> \"true\"^^<http://www.w3.org/2001/XMLSchema#boolean> .\n"
                                          % subject)
                if not skip:
                    relabeled_fasta.write(">" + subject + "\n")
                    data = fa.read(8096)
                    while data:
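# --- Tiny helper mirroring the hand-rolled N-Triples emission above
# (hypothetical name): embedded double quotes in the FASTA label must be
# backslash-escaped before the literal is written out.
def ntriple(subject: str, predicate: str, value: str) -> str:
    return '<%s> <%s> "%s" .\n' % (subject, predicate, value.replace('"', '\\"'))

print(ntriple("http://example.org/s",
              "http://biohackathon.org/bh20-seq-schema/original_fasta_label",
              'isolate "X-1"'))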
def run(args: argparse.Namespace) -> int:
    """Primary processing loop."""
    imports: Set[str] = set()
    for document in args.inputs:
        _logger.info("Processing %s.", document)
        with open(document) as doc_handle:
            result = yaml.main.round_trip_load(doc_handle, preserve_quotes=True)
        add_lc_filename(result, document)
        version = result.get("cwlVersion", None)
        if version in ("draft-3", "cwl:draft-3", "v1.0", "v1.1"):
            result = cwlupgrader.upgrade_document(result, False, False, args.dir, imports)
        else:
            _logger.error(
                "Sorry, CWL version %s in %s is not supported by this tool.",
                version,
                document,
            )
            return -1
        uri = Path(document).resolve().as_uri()
        if not args.no_expression_refactoring:
            refactored, _ = cwl_v1_2_expression_refactor.traverse(
                load_document_by_yaml(result, uri),
                not args.etools,
                False,
                args.skip_some1,
                args.skip_some2,
            )
            if not isinstance(refactored, MutableSequence):
                result = save(
                    refactored,
                    base_url=refactored.loadingOptions.fileuri
                    if refactored.loadingOptions.fileuri
                    else "",
                )
                # ^^ Setting the base_url and keeping the default value for
                # relative_uris=True means that the IDs in the generated
                # JSON/YAML are kept clean of the path to the input document.
            else:
                result = [
                    save(result_item, base_url=result_item.loadingOptions.fileuri)
                    for result_item in refactored
                ]
        if "$graph" in result:
            packed = result
        else:
            with tempfile.TemporaryDirectory() as tmpdirname:
                path = Path(tmpdirname) / Path(document).name
                with open(path, "w") as handle:
                    yaml.main.round_trip_dump(result, handle)
                # TODO: replace the cwltool-based packing with a parser_v1_2-based packer.
                runtimeContext = RuntimeContext()
                loadingContext = LoadingContext()
                use_standard_schema("v1.2")
                # loadingContext.construct_tool_object = workflow.default_make_tool
                # loadingContext.resolver = tool_resolver
                loadingContext.do_update = False
                uri, tool_file_uri = resolve_tool_uri(
                    str(path),
                    resolver=loadingContext.resolver,
                    fetcher_constructor=loadingContext.fetcher_constructor,
                )
                loadingContext, workflowobj, uri = fetch_document(uri, loadingContext)
                loadingContext, uri = resolve_and_validate_document(
                    loadingContext,
                    workflowobj,
                    uri,
                    preprocess_only=True,
                    skip_schemas=True,
                )
                packed = print_pack(loadingContext, uri)
        output = Path(args.dir) / Path(document).name
        with open(output, "w", encoding="utf-8") as output_filehandle:
            output_filehandle.write(packed)
    return 0
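# --- Isolated sketch of the version gate above: only documents declaring a
# pre-v1.2 cwlVersion are routed through cwl-upgrader.
UPGRADABLE_VERSIONS = ("draft-3", "cwl:draft-3", "v1.0", "v1.1")

def needs_upgrade(doc: dict) -> bool:
    return doc.get("cwlVersion") in UPGRADABLE_VERSIONS

assert needs_upgrade({"cwlVersion": "v1.0"})
assert not needs_upgrade({"cwlVersion": "v1.2"})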
def validate_upload(self, collection, revalidate):
    if not revalidate and collection["properties"].get("status") in ("validated", "rejected"):
        return False

    with arvados.collection.CollectionReader(collection["uuid"],
                                             api_client=self.api,
                                             keep_client=self.keepclient) as col:
        # validate the collection here. Check metadata, etc.
        logging.info("Validating upload '%s' (%s)" % (collection["name"], collection["uuid"]))

        errors = []

        if collection["owner_uuid"] != self.validated_project:
            dup = self.api.collections().list(filters=[
                ["owner_uuid", "=", self.validated_project],
                ["portable_data_hash", "=", col.portable_data_hash()]]).execute()
            if dup["items"]:
                # This exact collection has been uploaded before.
                errors.append("Duplicate of %s" % ([d["uuid"] for d in dup["items"]]))

        if not errors:
            if "metadata.yaml" not in col:
                errors.append("%s missing metadata.yaml" % collection["name"])
            else:
                try:
                    with col.open("metadata.yaml") as md:
                        metadata_content = ruamel.yaml.round_trip_load(md)
                    metadata_content["id"] = "http://covid19.genenetwork.org/resource/%s" % collection["uuid"]
                    sample_id = metadata_content["sample"]["sample_id"]
                    add_lc_filename(metadata_content, metadata_content["id"])
                    valid = qc_metadata(metadata_content)
                    if not valid:
                        errors.append("Failed metadata qc")
                except Exception as e:
                    errors.append(str(e))

        existing = self.api.collections().list(filters=[
            ["owner_uuid", "=", self.validated_project],
            ["properties.sequence_label", "=", sample_id]]).execute()

        if not errors:
            try:
                tgt = None
                paired = {"reads_1.fastq": "reads.fastq", "reads_1.fastq.gz": "reads.fastq.gz"}
                for n in ("sequence.fasta", "reads.fastq", "reads.fastq.gz",
                          "reads_1.fastq", "reads_1.fastq.gz"):
                    if n not in col:
                        continue
                    with col.open(n, 'rb') as qf:
                        tgt, seqlabel, seq_type = qc_fasta(qf)
                        if tgt != n and tgt != paired.get(n):
                            errors.append("Expected %s but magic says it should be %s" % (n, tgt))
                        elif tgt in ("reads.fastq", "reads.fastq.gz",
                                     "reads_1.fastq", "reads_1.fastq.gz"):
                            self.start_fastq_to_fasta(collection, n, sample_id)
                            return False
                        # If it is a FASTA
                        if sample_id != seqlabel:
                            errors.append("Expected sample_id == seqlabel, but %s != %s"
                                          % (sample_id, seqlabel))
                if tgt is None and len(existing["items"]) == 0:
                    errors.append("Upload '%s' does not contain sequence.fasta, reads.fastq or reads_1.fastq"
                                  % collection["name"])
            except Exception as v:
                errors.append(str(v))

        if errors:
            # It is invalid
            logging.warn("'%s' (%s) has validation errors: %s" %
                         (collection["name"], collection["uuid"], "\n".join(errors)))
            collection["properties"]["status"] = "rejected"
            collection["properties"]["errors"] = errors
            self.api.collections().update(
                uuid=collection["uuid"],
                body={"properties": collection["properties"]}).execute()
            return False

        update_from = None
        if existing["items"]:
            # "collection" is the newly uploaded one we're looking at
            update_from = collection
            collection = existing["items"][0]
            collection["properties"] = update_from["properties"]

        if "errors" in collection["properties"]:
            del collection["properties"]["errors"]
        collection["properties"]["status"] = "validated"
        collection["properties"]["sequence_label"] = sample_id

        if update_from:
            with arvados.collection.Collection(collection["uuid"],
                                               api_client=self.api,
                                               keep_client=self.keepclient) as update_existing_col:
                update_existing_col.copy("metadata.yaml", "metadata.yaml",
                                         source_collection=col, overwrite=True)
                update_existing_col.save(properties=collection["properties"])
            self.api.collections().delete(uuid=update_from["uuid"]).execute()
            logging.info("Updated '%s' in validated sequences" % collection["name"])
        else:
            # Move it to the "validated" project to be included in the next analysis
            self.api.collections().update(
                uuid=collection["uuid"],
                body={
                    "owner_uuid": self.validated_project,
                    "name": "%s (%s)" % (collection["name"], time.asctime(time.gmtime())),
                    "properties": collection["properties"]}).execute()
            logging.info("Added '%s' to validated sequences" % collection["name"])

        return True