Example #1
	def generate_workflow(self):
		"Generate a workflow (DAX, config files, and replica catalog)"
		ts = datetime.utcnow().strftime('%Y%m%dT%H%M%SZ')
		dax = ADAG("mgrast-prod-%s" % ts)
		
		# These are all the global input files for the workflow
		metagenome = File(self.mgfile)
		self.add_replica(self.mgfile, os.path.abspath(self.mgfile))

		# QC job
		qcJob = Job("wrapper-qc", node_label="wrapper-qc")

		qcJob.addArguments("-input", self.mgfile)
		qcJob.addArguments("-format", self.file_format)
		qcJob.addArguments("-out_prefix", "075")
		qcJob.addArguments("-assembled", self.assembled)
		qcJob.addArguments("-filter_options", self.filter_options)
		qcJob.addArguments("-proc", "8")

		qcJob.uses(metagenome, link=Link.INPUT)
		qcJob.uses("075.assembly.coverage", link=Link.OUTPUT, transfer=False)
		qcJob.uses("075.qc.stats", link=Link.OUTPUT, transfer=False)
		qcJob.uses("075.upload.stats", link=Link.OUTPUT, transfer=False)

		qcJob.profile("globus", "maxwalltime", "60")
		qcJob.profile("globus", "hostcount", "8")
		qcJob.profile("globus", "count", "8")
		dax.addJob(qcJob)

		# Preprocess Job
		preprocessJob = Job("wrapper-preprocess", node_label="wrapper-preprocess")
		preprocessJob.addArguments("-input", self.mgfile)
		preprocessJob.addArguments("-format", self.file_format)
		preprocessJob.addArguments("-out_prefix", "100.preprocess")
		preprocessJob.addArguments("-filter_options", self.filter_options)
		
		preprocessJob.uses(metagenome, link=Link.INPUT)
		preprocessJob.uses("100.preprocess.passed.fna", link=Link.OUTPUT, transfer=False)
		preprocessJob.uses("100.preprocess.removed.fna", link=Link.OUTPUT, transfer=False)

		preprocessJob.profile("globus", "maxwalltime", "20")
		dax.addJob(preprocessJob)

		# Dereplicate Job
		dereplicateJob = Job("wrapper-dereplicate", node_label="wrapper-dereplicate")
		dereplicateJob.addArguments("-input=100.preprocess.passed.fna")
		dereplicateJob.addArguments("-out_prefix=150.dereplication")
		dereplicateJob.addArguments("-prefix_length=%s" % self.prefix_length)
		dereplicateJob.addArguments("-dereplicate=%s" % self.dereplicate)
		dereplicateJob.addArguments("-memory=10")

		dereplicateJob.uses("100.preprocess.passed.fna", link=Link.INPUT)
		dereplicateJob.uses("150.dereplication.passed.fna", link=Link.OUTPUT, transfer=False)
		dereplicateJob.uses("150.dereplication.removed.fna", link=Link.OUTPUT, transfer=False)

		dereplicateJob.profile("globus", "maxwalltime", "10")
		dax.addJob(dereplicateJob)
		dax.depends(dereplicateJob, preprocessJob)

		# Bowtie Screen Job
		bowtieJob = Job("wrapper-bowtie-screen", node_label="wrapper-bowtie-screen")
		bowtieJob.addArguments("-input=150.dereplication.passed.fna")
		bowtieJob.addArguments("-output=299.screen.passed.fna")
		bowtieJob.addArguments("-index=%s" % self.screen_indexes)
		bowtieJob.addArguments("-bowtie=%s" % self.bowtie)
		bowtieJob.addArguments("-proc=8")

		bowtieJob.uses("150.dereplication.passed.fna", link=Link.INPUT)
		bowtieJob.uses("299.screen.passed.fna", link=Link.OUTPUT, transfer=False)

		bowtieJob.profile("globus", "maxwalltime", "30")
		bowtieJob.profile("globus", "hostcount", "8")
		bowtieJob.profile("globus", "count", "8")
		dax.addJob(bowtieJob)
		dax.depends(bowtieJob, dereplicateJob)

		# Genecalling Job
		geneJob = Job("wrapper-genecalling", node_label="wrapper-genecalling")
		geneJob.addArguments("-input=299.screen.passed.fna")
		geneJob.addArguments("-out_prefix=350.genecalling.coding")
		geneJob.addArguments("-type=%s" % self.fgs_type)
		geneJob.addArguments("-size=100")
		geneJob.addArguments("-proc=8")

		geneJob.uses("299.screen.passed.fna", link=Link.INPUT)
		geneJob.uses("350.genecalling.coding.faa", link=Link.OUTPUT, transfer=False)
		geneJob.uses("350.genecalling.coding.fna", link=Link.OUTPUT, transfer=False)

		geneJob.profile("globus", "maxwalltime", "30")
		geneJob.profile("globus", "hostcount", "8")
		geneJob.profile("globus", "count", "8")
		dax.addJob(geneJob)
		dax.depends(geneJob, bowtieJob)

		# Cluster (Genecalling) Job
		cluster1Job = Job("wrapper-cluster", node_label="wrapper-cluster")
		cluster1Job.addArguments("-input=350.genecalling.coding.faa")
		cluster1Job.addArguments("-out_prefix=550.cluster")
		cluster1Job.addArguments("-aa")
		cluster1Job.addArguments("-pid=%s" % self.aa_pid)
		cluster1Job.addArguments("-memory=20")

		cluster1Job.uses("350.genecalling.coding.faa", link=Link.INPUT)
		cluster1Job.uses("550.cluster.aa%s.faa" % self.aa_pid, link=Link.OUTPUT, transfer=False)
		cluster1Job.uses("550.cluster.aa%s.mapping" % self.aa_pid, link=Link.OUTPUT, transfer=False)
		
		cluster1Job.profile("globus", "maxwalltime", "10")
		dax.addJob(cluster1Job)
		dax.depends(cluster1Job, geneJob)

		# Blat_prot Job
		blatprotJob = Job("wrapper-blat-prot", node_label="wrapper-blat-prot")
		blatprotJob.addArguments("--input=550.cluster.aa%s.faa" % self.aa_pid)
		blatprotJob.addArguments("--output=650.superblat.sims")

		blatprotJob.uses("550.cluster.aa%s.faa" % self.aa_pid, link=Link.INPUT)
		blatprotJob.uses("650.superblat.sims", link=Link.OUTPUT, transfer=False)
		
		blatprotJob.profile("globus", "maxwalltime", "2880")
                blatprotJob.profile("globus", "hostcount", "24")
                blatprotJob.profile("globus", "count", "24")
		dax.addJob(blatprotJob)
		dax.depends(blatprotJob, cluster1Job)

		# Annotate Sims (Blat Prot) Job
		annotatesims1Job = Job("wrapper-annotate-sims", node_label="wrapper-annotate-sims")
		annotatesims1Job.addArguments("-input=650.superblat.sims")
		annotatesims1Job.addArguments("-out_prefix=650")
		annotatesims1Job.addArguments("-aa")
		annotatesims1Job.addArguments("-ach_ver=%s" % self.ach_annotation_ver)
		annotatesims1Job.addArguments("-ann_file=m5nr_v1.bdb")

		annotatesims1Job.uses("650.superblat.sims", link=Link.INPUT)
		annotatesims1Job.uses("650.aa.sims.filter", link=Link.OUTPUT, transfer=False)
		annotatesims1Job.uses("650.aa.expand.protein", link=Link.OUTPUT, transfer=False)
		annotatesims1Job.uses("650.aa.expand.lca", link=Link.OUTPUT, transfer=False)
		annotatesims1Job.uses("650.aa.expand.ontology", link=Link.OUTPUT, transfer=False)
		
		annotatesims1Job.profile("globus", "maxwalltime", "720")
		dax.addJob(annotatesims1Job)
		dax.depends(annotatesims1Job, blatprotJob)

		# Search RNA Job
		searchJob = Job("wrapper-search-rna", node_label="wrapper-search-rna")
		searchJob.addArguments("-input=100.preprocess.passed.fna")
		searchJob.addArguments("-output=425.search.rna.fna")
		searchJob.addArguments("-rna_nr=%s" % self.m5rna_clust)
		searchJob.addArguments("-size=100")
		searchJob.addArguments("-proc=8")

		searchJob.uses("100.preprocess.passed.fna", link=Link.INPUT)
		searchJob.uses("425.search.rna.fna", link=Link.OUTPUT, transfer=False)

                searchJob.profile("globus", "maxwalltime", "120")
                searchJob.profile("globus", "hostcount", "8")
                searchJob.profile("globus", "count", "8")
                dax.addJob(searchJob)
		dax.depends(searchJob, preprocessJob)

		# Cluster (Search RNA) Job
		cluster2Job = Job("wrapper-cluster", node_label="wrapper-cluster")
		cluster2Job.addArguments("-input=425.search.rna.fna")
		cluster2Job.addArguments("-out_prefix=440.cluster")
		cluster2Job.addArguments("-rna")
		cluster2Job.addArguments("-pid=%s" % self.rna_pid)
		cluster2Job.addArguments("-memory=20")

		cluster2Job.uses("425.search.rna.fna", link=Link.INPUT)
		cluster2Job.uses("440.cluster.rna%s.fna" % self.rna_pid, link=Link.OUTPUT, transfer=False)
		cluster2Job.uses("440.cluster.rna%s.mapping" % self.rna_pid, link=Link.OUTPUT, transfer=False)

		cluster2Job.profile("globus", "maxwalltime", "30")
		dax.addJob(cluster2Job)
		dax.depends(cluster2Job, searchJob)

		# Blat_rna Job
		blatrnaJob = Job("wrapper-blat-rna", node_label="wrapper-blat-rna")
		blatrnaJob.addArguments("--input=440.cluster.rna%s.fna" % self.rna_pid)
		blatrnaJob.addArguments("-rna_nr=m5rna")
		blatrnaJob.addArguments("--output=450.rna.sims")
		blatrnaJob.addArguments("-assembled=%s" % self.assembled)

		blatrnaJob.uses("440.cluster.rna%s.fna" % self.rna_pid, link=Link.INPUT)
		blatrnaJob.uses("450.rna.sims", link=Link.OUTPUT, transfer=False)
		
		blatrnaJob.profile("globus", "maxwalltime", "20")
		dax.addJob(blatrnaJob)
		dax.depends(blatrnaJob, cluster2Job)

		# Annotate Sims (Blat RNA) Job
		annotatesims2Job = Job("wrapper-annotate-sims", node_label="wrapper-annotate-sims")
		annotatesims2Job.addArguments("-input=450.rna.sims")
		annotatesims2Job.addArguments("-out_prefix=450")
		annotatesims2Job.addArguments("-rna")
		annotatesims2Job.addArguments("-ach_ver=%s" % self.ach_annotation_ver)
		annotatesims2Job.addArguments("-ann_file=m5nr_v1.bdb")

		annotatesims2Job.uses("450.rna.sims", link=Link.INPUT)
		annotatesims2Job.uses("450.rna.sims.filter", link=Link.OUTPUT, transfer=False)
		annotatesims2Job.uses("450.rna.expand.rna", link=Link.OUTPUT, transfer=False)
		annotatesims2Job.uses("450.rna.expand.lca", link=Link.OUTPUT, transfer=False)

		annotatesims2Job.profile("globus", "maxwalltime", "30")
		dax.addJob(annotatesims2Job)
		dax.depends(annotatesims2Job, blatrnaJob)

		# Index Sim Seq Job
		indexJob = Job("wrapper-index", node_label="wrapper-index")
		indexJob.addArguments("-in_seqs=350.genecalling.coding.fna")
		indexJob.addArguments("-in_seqs=425.search.rna.fna")
		indexJob.addArguments("-in_maps=550.cluster.aa%s.mapping" % self.aa_pid)
		indexJob.addArguments("-in_maps=440.cluster.rna%s.mapping" % self.rna_pid)
		indexJob.addArguments("-in_sims=650.aa.sims.filter")
		indexJob.addArguments("-in_sims=450.rna.sims.filter")
		indexJob.addArguments("-output=700.annotation.sims.filter.seq")
		indexJob.addArguments("-ach_ver=%s" % self.ach_annotation_ver)
		indexJob.addArguments("-memory=10")
		indexJob.addArguments("-ann_file=m5nr_v1.bdb")

		indexJob.uses("350.genecalling.coding.fna", link=Link.INPUT)
		indexJob.uses("550.cluster.aa%s.mapping" % self.aa_pid, link=Link.INPUT)
		indexJob.uses("650.aa.sims.filter", link=Link.INPUT)
		indexJob.uses("425.search.rna.fna", link=Link.INPUT)
		indexJob.uses("440.cluster.rna%s.mapping" % self.rna_pid, link=Link.INPUT)
		indexJob.uses("450.rna.sims.filter", link=Link.INPUT)
		indexJob.uses("700.annotation.sims.filter.seq", link=Link.OUTPUT, transfer=False)
		indexJob.uses("700.annotation.sims.filter.seq.index", link=Link.OUTPUT, transfer=False)

		indexJob.profile("globus", "maxwalltime", "120")
		dax.addJob(indexJob)
		dax.depends(indexJob, geneJob)
		dax.depends(indexJob, cluster1Job)
		dax.depends(indexJob, cluster2Job)
		dax.depends(indexJob, searchJob)
		dax.depends(indexJob, annotatesims1Job)
		# indexJob also reads 450.rna.sims.filter, so it must wait for annotatesims2Job
		dax.depends(indexJob, annotatesims2Job)

		# Annotate Summary Job (13)
		summary13Job = Job("wrapper-summary", node_label="wrapper-summary")
		summary13Job.addArguments("-job=1")
		summary13Job.addArguments("-in_expand=650.aa.expand.protein")
		summary13Job.addArguments("-in_expand=450.rna.expand.rna")
		summary13Job.addArguments("-in_maps=550.cluster.aa%s.mapping" % self.aa_pid)
		summary13Job.addArguments("-in_maps=440.cluster.rna%s.mapping" % self.rna_pid)
		summary13Job.addArguments("-in_assemb=075.assembly.coverage")
		summary13Job.addArguments("-in_index=700.annotation.sims.filter.seq.index")
		summary13Job.addArguments("-output=700.annotation.md5.summary")
		summary13Job.addArguments("-nr_ver=%s" % self.ach_annotation_ver)
		summary13Job.addArguments("-type=md5")

		summary13Job.uses("075.assembly.coverage", link=Link.INPUT)
		summary13Job.uses("550.cluster.aa%s.mapping" % self.aa_pid, link=Link.INPUT)
		summary13Job.uses("650.aa.expand.protein", link=Link.INPUT)
		summary13Job.uses("440.cluster.rna%s.mapping" % self.rna_pid, link=Link.INPUT)
		summary13Job.uses("450.rna.expand.rna", link=Link.INPUT)
		summary13Job.uses("700.annotation.sims.filter.seq.index", link=Link.INPUT)
		summary13Job.uses("700.annotation.md5.summary", link=Link.OUTPUT, transfer=True)

		summary13Job.profile("globus", "maxwalltime", "30")
		dax.addJob(summary13Job)
		dax.depends(summary13Job, qcJob)
		dax.depends(summary13Job, cluster1Job)
		dax.depends(summary13Job, cluster2Job)
		dax.depends(summary13Job, indexJob)
		dax.depends(summary13Job, annotatesims1Job)
		dax.depends(summary13Job, annotatesims2Job)

		# Annotate Summary Job (14)
		summary14Job = Job("wrapper-summary", node_label="wrapper-summary")
		summary14Job.addArguments("-job=1")
		summary14Job.addArguments("-in_expand=650.aa.expand.protein")
		summary14Job.addArguments("-in_expand=450.rna.expand.rna")
		summary14Job.addArguments("-in_maps=550.cluster.aa%s.mapping" % self.aa_pid)
		summary14Job.addArguments("-in_maps=440.cluster.rna%s.mapping" % self.rna_pid)
		summary14Job.addArguments("-in_assemb=075.assembly.coverage")
		summary14Job.addArguments("-output=700.annotation.function.summary")
		summary14Job.addArguments("-nr_ver=%s" % self.ach_annotation_ver)
		summary14Job.addArguments("-type=function")

		summary14Job.uses("075.assembly.coverage", link=Link.INPUT)
		summary14Job.uses("550.cluster.aa%s.mapping" % self.aa_pid, link=Link.INPUT)
		summary14Job.uses("650.aa.expand.protein", link=Link.INPUT)
		summary14Job.uses("440.cluster.rna%s.mapping" % self.rna_pid, link=Link.INPUT)
		summary14Job.uses("450.rna.expand.rna", link=Link.INPUT)
		summary14Job.uses("700.annotation.function.summary", link=Link.OUTPUT, transfer=True)

		summary14Job.profile("globus", "maxwalltime", "30")
		dax.addJob(summary14Job)
		dax.depends(summary14Job, qcJob)
		dax.depends(summary14Job, cluster1Job)
		dax.depends(summary14Job, cluster2Job)
		dax.depends(summary14Job, annotatesims1Job)
		dax.depends(summary14Job, annotatesims2Job)

		# Annotate Summary Job (15)
		summary15Job = Job("wrapper-summary", node_label="wrapper-summary")
		summary15Job.addArguments("-job=1")
		summary15Job.addArguments("-in_expand=650.aa.expand.protein")
		summary15Job.addArguments("-in_expand=450.rna.expand.rna")
		summary15Job.addArguments("-in_maps=550.cluster.aa%s.mapping" % self.aa_pid)
		summary15Job.addArguments("-in_maps=440.cluster.rna%s.mapping" % self.rna_pid)
		summary15Job.addArguments("-in_assemb=075.assembly.coverage")
		summary15Job.addArguments("-output=700.annotation.organism.summary")
		summary15Job.addArguments("-nr_ver=%s" % self.ach_annotation_ver)
		summary15Job.addArguments("-type=organism")

		summary15Job.uses("075.assembly.coverage", link=Link.INPUT)
		summary15Job.uses("550.cluster.aa%s.mapping" % self.aa_pid, link=Link.INPUT)
		summary15Job.uses("650.aa.expand.protein", link=Link.INPUT)
		summary15Job.uses("440.cluster.rna%s.mapping" % self.rna_pid, link=Link.INPUT)
		summary15Job.uses("450.rna.expand.rna", link=Link.INPUT)
		summary15Job.uses("700.annotation.organism.summary", link=Link.OUTPUT, transfer=True)

		summary15Job.profile("globus", "maxwalltime", "30")
		dax.addJob(summary15Job)
		dax.depends(summary15Job, qcJob)
		dax.depends(summary15Job, cluster1Job)
		dax.depends(summary15Job, cluster2Job)
		dax.depends(summary15Job, annotatesims1Job)
		dax.depends(summary15Job, annotatesims2Job)

		# Annotate Summary Job (16)
		summary16Job = Job("wrapper-summary", node_label="wrapper-summary")
		summary16Job.addArguments("-job=1")
		summary16Job.addArguments("-in_expand=650.aa.expand.lca")
		summary16Job.addArguments("-in_expand=450.rna.expand.lca")
		summary16Job.addArguments("-in_maps=550.cluster.aa%s.mapping" % self.aa_pid)
		summary16Job.addArguments("-in_maps=440.cluster.rna%s.mapping" % self.rna_pid)
		summary16Job.addArguments("-in_assemb=075.assembly.coverage")
		summary16Job.addArguments("-output=700.annotation.lca.summary")
		summary16Job.addArguments("-nr_ver=%s" % self.ach_annotation_ver)
		summary16Job.addArguments("-type=lca")

		summary16Job.uses("075.assembly.coverage", link=Link.INPUT)
		summary16Job.uses("550.cluster.aa%s.mapping" % self.aa_pid, link=Link.INPUT)
		summary16Job.uses("650.aa.expand.lca", link=Link.INPUT)
		summary16Job.uses("440.cluster.rna%s.mapping" % self.rna_pid, link=Link.INPUT)
		summary16Job.uses("450.rna.expand.lca", link=Link.INPUT)
		summary16Job.uses("700.annotation.lca.summary", link=Link.OUTPUT, transfer=True)

		summary16Job.profile("globus", "maxwalltime", "30")
		dax.addJob(summary16Job)
		dax.depends(summary16Job, qcJob)
		dax.depends(summary16Job, cluster1Job)
		dax.depends(summary16Job, cluster2Job)
		dax.depends(summary16Job, annotatesims1Job)
		dax.depends(summary16Job, annotatesims2Job)

		# Annotate Summary Job (17)
		summary17Job = Job("wrapper-summary", node_label="wrapper-summary")
		summary17Job.addArguments("-job=1")
		summary17Job.addArguments("-in_expand=650.aa.expand.ontology")
		summary17Job.addArguments("-in_maps=550.cluster.aa%s.mapping" % self.aa_pid)
		summary17Job.addArguments("-in_assemb=075.assembly.coverage")
		summary17Job.addArguments("-output=700.annotation.ontology.summary")
		summary17Job.addArguments("-nr_ver=%s" % self.ach_annotation_ver)
		summary17Job.addArguments("-type=ontology")

		summary17Job.uses("075.assembly.coverage", link=Link.INPUT)
		summary17Job.uses("550.cluster.aa%s.mapping" % self.aa_pid, link=Link.INPUT)
		summary17Job.uses("650.aa.expand.ontology", link=Link.INPUT)
		summary17Job.uses("700.annotation.ontology.summary", link=Link.OUTPUT, transfer=True)

		summary17Job.profile("globus", "maxwalltime", "30")
		dax.addJob(summary17Job)
		dax.depends(summary17Job, qcJob)
		dax.depends(summary17Job, cluster1Job)
		dax.depends(summary17Job, annotatesims1Job)

		# Annotate Summary Job (18)
		summary18Job = Job("wrapper-summary", node_label="wrapper-summary")
		summary18Job.addArguments("-job=1")
		summary18Job.addArguments("-in_expand=650.aa.expand.protein")
		summary18Job.addArguments("-in_expand=450.rna.expand.rna")
		summary18Job.addArguments("-in_maps=550.cluster.aa%s.mapping" % self.aa_pid)
		summary18Job.addArguments("-in_maps=440.cluster.rna%s.mapping" % self.rna_pid)
		summary18Job.addArguments("-in_assemb=075.assembly.coverage")
		summary18Job.addArguments("-output=700.annotation.source.stats")
		summary18Job.addArguments("-nr_ver=%s" % self.ach_annotation_ver)
		summary18Job.addArguments("-type=source")

		summary18Job.uses("075.assembly.coverage", link=Link.INPUT)
		summary18Job.uses("550.cluster.aa%s.mapping" % self.aa_pid, link=Link.INPUT)
		summary18Job.uses("650.aa.expand.protein", link=Link.INPUT)
		summary18Job.uses("440.cluster.rna%s.mapping" % self.rna_pid, link=Link.INPUT)
		summary18Job.uses("450.rna.expand.rna", link=Link.INPUT)
		summary18Job.uses("700.annotation.source.stats", link=Link.OUTPUT, transfer=True)

		summary18Job.profile("globus", "maxwalltime", "30")
		dax.addJob(summary18Job)
		dax.depends(summary18Job, qcJob)
		dax.depends(summary18Job, cluster1Job)
		dax.depends(summary18Job, cluster2Job)
		dax.depends(summary18Job, annotatesims1Job)
		dax.depends(summary18Job, annotatesims2Job)

	
		# Write the DAX file
		dax.writeXMLFile(self.daxfile)

		# Generate the replica catalog
		self.generate_replica_catalog()
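The method above builds against the Pegasus 4.x DAX3 Python API and repeats one pattern per job: construct the Job, declare its files with uses(), attach Globus profiles, add it to the ADAG, and wire explicit edges with depends(). A minimal runnable sketch of that pattern (the imports are the standard Pegasus.DAX3 ones; the "demo" transformation and file names are placeholders, not from the source):

from datetime import datetime
from Pegasus.DAX3 import ADAG, File, Job, Link

dax = ADAG("demo-%s" % datetime.utcnow().strftime('%Y%m%dT%H%M%SZ'))

# Declare an input file and the job that consumes it.
reads = File("reads.fna")
qc = Job("wrapper-qc", node_label="wrapper-qc")
qc.addArguments("-input", "reads.fna")
qc.uses(reads, link=Link.INPUT)
qc.uses("qc.stats", link=Link.OUTPUT, transfer=False)
qc.profile("globus", "maxwalltime", "60")  # walltime in minutes (GRAM convention)
dax.addJob(qc)

# DAX3 does not infer edges from file usage; declare them explicitly.
summary = Job("wrapper-summary", node_label="wrapper-summary")
summary.uses("qc.stats", link=Link.INPUT)
summary.uses("qc.summary", link=Link.OUTPUT, transfer=True)
dax.addJob(summary)
dax.depends(summary, qc)

dax.writeXMLFile("demo.dax")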
Example #2
    def run_python_on_parameters(
        self,
        job_name: Locator,
        python_module: Any,
        parameters: Union[Parameters, Dict[str, Any]],
        *,
        depends_on,
        resource_request: Optional[ResourceRequest] = None,
        override_conda_config: Optional[CondaConfiguration] = None,
        category: Optional[str] = None,
    ) -> DependencyNode:
        """
        Schedule a job to run the given *python_module* on the given *parameters*.

        If this job requires other jobs to be executed first,
        include them in *depends_on*.

        This method returns a `DependencyNode` which can be used in *depends_on*
        for future jobs.
        """
        job_dir = self.directory_for(job_name)
        ckpt_name = job_name / "___ckpt"
        checkpoint_path = job_dir / "___ckpt"

        depends_on = _canonicalize_depends_on(depends_on)
        if isinstance(python_module, str):
            fully_qualified_module_name = python_module
        else:
            fully_qualified_module_name = fully_qualified_name(python_module)

        # allow users to specify the parameters as a dict for convenience
        if not isinstance(parameters, Parameters):
            parameters = Parameters.from_mapping(parameters)

        # If we've already scheduled this identical job,
        # then don't schedule it again.
        params_sink = CharSink.to_string()
        YAMLParametersWriter().write(parameters, params_sink)
        signature = (fully_qualified_module_name,
                     params_sink.last_string_written)
        if signature in self._signature_to_job:
            logging.info("Job %s recognized as a duplicate", job_name)
            return self._signature_to_job[signature]

        script_path = job_dir / "___run.sh"
        stdout_path = parameters.string(
            "logfile", default=str((job_dir / "___stdout.log").absolute()))
        self._conda_script_generator.write_shell_script_to(
            entry_point_name=fully_qualified_module_name,
            parameters=parameters,
            working_directory=job_dir,
            script_path=script_path,
            params_path=job_dir / "____params.params",
            stdout_file=stdout_path,
            ckpt_path=checkpoint_path,
            override_conda_config=override_conda_config,
        )
        script_executable = Executable(
            namespace=self._namespace,
            name=str(job_name).replace("/", "_"),
            version="4.0",
            os="linux",
            arch="x86_64",
        )
        script_executable.addPFN(
            path_to_pfn(script_path, site=self._default_site))
        if not self._job_graph.hasExecutable(script_executable):
            self._job_graph.addExecutable(script_executable)
        job = Job(script_executable)
        self._job_graph.addJob(job)
        for parent_dependency in depends_on:
            if parent_dependency.job:
                self._job_graph.depends(job, parent_dependency.job)
            for out_file in parent_dependency.output_files:
                job.uses(out_file, link=Link.INPUT)

        if resource_request is not None:
            resource_request = self.default_resource_request.unify(
                resource_request)
        else:
            resource_request = self.default_resource_request

        if category:
            job.profile(Namespace.DAGMAN, "category", category)
        resource_request.apply_to_job(job,
                                      job_name=self._job_name_for(job_name))

        # Handle Output Files
        # This is currently only handled as the checkpoint file
        # See: https://github.com/isi-vista/vista-pegasus-wrapper/issues/25
        checkpoint_pegasus_file = path_to_pegasus_file(checkpoint_path,
                                                       site=self._default_site,
                                                       name=f"{ckpt_name}")

        if checkpoint_pegasus_file not in self._added_files:
            self._job_graph.addFile(checkpoint_pegasus_file)
            self._added_files.add(checkpoint_pegasus_file)

        # If the checkpoint file already exists, we want to add it to the replica catalog
        # so that we don't run the job corresponding to the checkpoint file again
        if checkpoint_path.exists():
            with self._replica_catalog.open("a+") as handle:
                handle.write(
                    f"{ckpt_name} file://{checkpoint_path} site={self._default_site}\n"
                )

        job.uses(checkpoint_pegasus_file, link=Link.OUTPUT, transfer=True)

        dependency_node = DependencyNode.from_job(
            job, output_files=[checkpoint_pegasus_file])
        self._signature_to_job[signature] = dependency_node

        logging.info("Scheduled Python job %s", job_name)
        return dependency_node
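A detail worth noting in run_python_on_parameters is the duplicate-job guard: jobs are memoized under a (fully qualified module name, serialized parameters) signature, so scheduling an identical job a second time returns the existing DependencyNode instead of a new job. A self-contained sketch of that technique, with json standing in for the YAMLParametersWriter used by the source:

import json
import logging
from typing import Any, Dict, Tuple

_signature_to_job: Dict[Tuple[str, str], str] = {}

def schedule(module_name: str, parameters: Dict[str, Any]) -> str:
    # Serialize deterministically so equal parameter sets compare equal.
    signature = (module_name, json.dumps(parameters, sort_keys=True))
    if signature in _signature_to_job:
        logging.info("Job for %s recognized as a duplicate", module_name)
        return _signature_to_job[signature]
    job_id = "job-%d" % len(_signature_to_job)  # stand-in for the real Job object
    _signature_to_job[signature] = job_id
    return job_id

assert schedule("my.module", {"a": 1}) == schedule("my.module", {"a": 1})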
Example #3
    def generate_dax(self):
        "Generate a workflow (DAX, config files, and replica catalog)"
        ts = datetime.utcnow().strftime('%Y%m%dT%H%M%SZ')
        dax = ADAG("refinement-%s" % ts)

        # These are all the global input files for the workflow
        coordinates = File(self.coordinates)
        parameters = File(self.parameters)
        extended_system = File(self.extended_system)
        topfile = File(self.topfile)
        sassena_db = File(self.sassena_db)
        incoherent_db = File(self.incoherent_db)
        coherent_db = File(self.coherent_db)

        # This job untars the sassena db and makes it available to the other
        # jobs in the workflow
        untarjob = Job("tar", node_label="untar")

        if self.is_synthetic_workflow:
            untarjob.addArguments("-p", "-xzvf", sassena_db.name)
            untarjob.addArguments("-a", "tar")

            for output_file in [ "incoherent_db", "coherent_db" ]:
                untarjob.addArguments(self.keg_params.output_file("tar", output_file, eval(output_file).name))

            self.keg_params.add_keg_params(untarjob)
        else:
            untarjob.addArguments("-xzvf", sassena_db)

        untarjob.uses(sassena_db, link=Link.INPUT)
        untarjob.uses(incoherent_db, link=Link.OUTPUT, transfer=False)
        untarjob.uses(coherent_db, link=Link.OUTPUT, transfer=False)

        untarjob.profile("globus", "jobtype", "single")
        untarjob.profile("globus", "maxwalltime", "1")
        untarjob.profile("globus", "count", "1")

        dax.addJob(untarjob)

        # For each charge that was listed in the config file
        for charge in self.charges:

            structure = "Q%s.psf" % charge

            # Equilibrate files
            eq_conf = File("equilibrate_%s.conf" % charge)
            eq_coord = File("equilibrate_%s.restart.coord" % charge)
            eq_xsc = File("equilibrate_%s.restart.xsc" % charge)
            eq_vel = File("equilibrate_%s.restart.vel" % charge)

            # Production files
            prod_conf = File("production_%s.conf" % charge)
            prod_dcd = File("production_%s.dcd" % charge)

            # Ptraj files
            ptraj_conf = File("ptraj_%s.conf" % charge)
            ptraj_fit = File("ptraj_%s.fit" % charge)
            ptraj_dcd = File("ptraj_%s.dcd" % charge)

            # Sassena incoherent files
            incoherent_conf = File("sassenaInc_%s.xml" % charge)
            fqt_incoherent = File("fqt_inc_%s.hd5" % charge)

            # Sassena coherent files
            coherent_conf = File("sassenaCoh_%s.xml" % charge)
            fqt_coherent = File("fqt_coh_%s.hd5" % charge)

            # Generate psf and configuration files for this charge pipeline
            self.generate_psf(charge)
            self.generate_eq_conf(charge, structure)
            self.generate_prod_conf(charge, structure)
            self.generate_ptraj_conf(charge)
            self.generate_incoherent_conf(charge)
            self.generate_coherent_conf(charge)

            # Equilibrate job
            eqjob = Job("namd", node_label="namd_eq_%s" % charge)
            if self.is_synthetic_workflow:
                eqjob.addArguments("-p", eq_conf)
                eqjob.addArguments("-a", "namd_eq_%s" % charge)
                eqjob.addArguments("-i", eq_conf.name, structure, coordinates.name,
                    parameters.name, extended_system.name)

                task_label = "namd-eq"

                for output_file in [ "eq_coord", "eq_xsc", "eq_vel" ]:
                    eqjob.addArguments(self.keg_params.output_file(task_label, output_file, eval(output_file).name))

                self.keg_params.add_keg_params(eqjob, task_label)
            else:
                eqjob.addArguments(eq_conf)

            eqjob.uses(eq_conf, link=Link.INPUT)
            eqjob.uses(structure, link=Link.INPUT)
            eqjob.uses(coordinates, link=Link.INPUT)
            eqjob.uses(parameters, link=Link.INPUT)
            eqjob.uses(extended_system, link=Link.INPUT)
            eqjob.uses(eq_coord, link=Link.OUTPUT, transfer=False)
            eqjob.uses(eq_xsc, link=Link.OUTPUT, transfer=False)
            eqjob.uses(eq_vel, link=Link.OUTPUT, transfer=False)
            if self.is_synthetic_workflow:
                eqjob.profile("globus", "jobtype", "mpi")
                eqjob.profile("globus", "maxwalltime", "1")
                eqjob.profile("globus", "count", "8")
            else:
                eqjob.profile("globus", "jobtype", "mpi")
                eqjob.profile("globus", "maxwalltime", self.getconf("equilibrate_maxwalltime"))
                eqjob.profile("globus", "count", self.getconf("equilibrate_cores"))
            dax.addJob(eqjob)

            # Production job
            prodjob = Job("namd", node_label="namd_prod_%s" % charge)

            if self.is_synthetic_workflow:
                prodjob.addArguments("-p", prod_conf)
                prodjob.addArguments("-a", "namd_prod_%s" % charge)
                prodjob.addArguments("-i", prod_conf.name, structure, coordinates.name,
                    parameters.name, eq_coord.name, eq_xsc.name, eq_vel.name)

                task_label = "namd-prod"
                prodjob.addArguments(self.keg_params.output_file(task_label, "prod_dcd", prod_dcd.name))
                self.keg_params.add_keg_params(prodjob, task_label)
            else:
                prodjob.addArguments(prod_conf)

            prodjob.uses(prod_conf, link=Link.INPUT)
            prodjob.uses(structure, link=Link.INPUT)
            prodjob.uses(coordinates, link=Link.INPUT)
            prodjob.uses(parameters, link=Link.INPUT)
            prodjob.uses(eq_coord, link=Link.INPUT)
            prodjob.uses(eq_xsc, link=Link.INPUT)
            prodjob.uses(eq_vel, link=Link.INPUT)
            prodjob.uses(prod_dcd, link=Link.OUTPUT, transfer=True)

            if self.is_synthetic_workflow:
                prodjob.profile("globus", "jobtype", "mpi")
                prodjob.profile("globus", "maxwalltime", "6")
                prodjob.profile("globus", "count", "8")
            else:
                prodjob.profile("globus", "jobtype", "mpi")
                prodjob.profile("globus", "maxwalltime", self.getconf("production_maxwalltime"))
                prodjob.profile("globus", "count", self.getconf("production_cores"))

            dax.addJob(prodjob)
            dax.depends(prodjob, eqjob)

            # ptraj job
            ptrajjob = Job(namespace="amber", name="ptraj", node_label="amber_ptraj_%s" % charge)

            if self.is_synthetic_workflow:
                ptrajjob.addArguments("-p", topfile)
                ptrajjob.addArguments("-a", "amber_ptraj_%s" % charge)
                ptrajjob.addArguments("-i", topfile.name, ptraj_conf.name, prod_dcd.name)

                task_label = "amber-ptraj"

                for output_file in [ "ptraj_fit", "ptraj_dcd" ]:
                    ptrajjob.addArguments(self.keg_params.output_file(task_label, output_file, eval(output_file).name))

                self.keg_params.add_keg_params(ptrajjob, task_label)

            else:
                ptrajjob.addArguments(topfile)
                ptrajjob.setStdin(ptraj_conf)

            ptrajjob.uses(topfile, link=Link.INPUT)
            ptrajjob.uses(ptraj_conf, link=Link.INPUT)
            ptrajjob.uses(prod_dcd, link=Link.INPUT)
            ptrajjob.uses(ptraj_fit, link=Link.OUTPUT, transfer=True)
            ptrajjob.uses(ptraj_dcd, link=Link.OUTPUT, transfer=True)
            ptrajjob.profile("globus", "jobtype", "single")
            ptrajjob.profile("globus", "maxwalltime", self.getconf("ptraj_maxwalltime"))
            ptrajjob.profile("globus", "count", self.getconf("ptraj_cores"))
            dax.addJob(ptrajjob)
            dax.depends(ptrajjob, prodjob)

            # sassena incoherent job
            incojob = Job("sassena", node_label="sassena_inc_%s" % charge)
            if self.is_synthetic_workflow:
                incojob.addArguments("-p", "--config", incoherent_conf)
                incojob.addArguments("-a", "sassena_inc_%s" % charge)
                incojob.addArguments("-i", incoherent_conf.name, ptraj_dcd.name, incoherent_db.name, coordinates.name)

                task_label = "sassena-inc"

                incojob.addArguments(self.keg_params.output_file(task_label, "fqt_incoherent", fqt_incoherent.name))

                self.keg_params.add_keg_params(incojob, task_label)
            else:
                incojob.addArguments("--config", incoherent_conf)

            incojob.uses(incoherent_conf, link=Link.INPUT)
            incojob.uses(ptraj_dcd, link=Link.INPUT)
            incojob.uses(incoherent_db, link=Link.INPUT)
            incojob.uses(coordinates, link=Link.INPUT)
            incojob.uses(fqt_incoherent, link=Link.OUTPUT, transfer=True)

            if self.is_synthetic_workflow:
                incojob.profile("globus", "jobtype", "mpi")
                incojob.profile("globus", "maxwalltime", "6")
                incojob.profile("globus", "count", "8")
            else:
                incojob.profile("globus", "jobtype", "mpi")
                incojob.profile("globus", "maxwalltime", self.getconf("sassena_maxwalltime"))
                incojob.profile("globus", "count", self.getconf("sassena_cores"))

            dax.addJob(incojob)
            dax.depends(incojob, ptrajjob)
            dax.depends(incojob, untarjob)

            # sassena coherent job
            cojob = Job("sassena", node_label="sassena_coh_%s" % charge)
            if self.is_synthetic_workflow:
                cojob.addArguments("-p", "--config", coherent_conf)
                cojob.addArguments("-a", "sassena_coh_%s" % charge)
                cojob.addArguments("-i", coherent_conf.name, ptraj_dcd.name, coherent_db.name, coordinates.name)

                task_label = "sassena-coh"

                cojob.addArguments(self.keg_params.output_file(task_label, "fqt_coherent", fqt_coherent.name))

                self.keg_params.add_keg_params(cojob, task_label)

            else:
                cojob.addArguments("--config", coherent_conf)

            cojob.uses(coherent_conf, link=Link.INPUT)
            cojob.uses(ptraj_dcd, link=Link.INPUT)
            cojob.uses(coherent_db, link=Link.INPUT)
            cojob.uses(coordinates, link=Link.INPUT)
            cojob.uses(fqt_coherent, link=Link.OUTPUT, transfer=True)

            if self.is_synthetic_workflow:
                cojob.profile("globus", "jobtype", "mpi")
                cojob.profile("globus", "maxwalltime", "6")
                cojob.profile("globus", "count", "8")
            else:
                cojob.profile("globus", "jobtype", "mpi")
                cojob.profile("globus", "maxwalltime", self.getconf("sassena_maxwalltime"))
                cojob.profile("globus", "count", self.getconf("sassena_cores"))

            dax.addJob(cojob)
            dax.depends(cojob, prodjob)
            dax.depends(cojob, untarjob)

        # Write the DAX file
        dax.writeXMLFile(self.daxfile)
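Structurally, this example fans a single untar job out into one independent pipeline per charge: every per-charge job is created inside the loop, and only explicit depends() edges couple each chain to its own upstream jobs and to the shared parent. A reduced sketch of that shape (transformation and file names are placeholders; in the source only the sassena jobs depend on the untar job):

from Pegasus.DAX3 import ADAG, File, Job, Link

dax = ADAG("fanout-demo")

# One shared setup job...
untar = Job("tar", node_label="untar")
untar.uses(File("db.tar.gz"), link=Link.INPUT)
untar.uses("db.inc", link=Link.OUTPUT, transfer=False)
dax.addJob(untar)

# ...fanned out into an independent chain per charge.
for charge in ["0.1", "0.2", "0.3"]:
    prod = Job("namd", node_label="namd_prod_%s" % charge)
    prod.uses("production_%s.dcd" % charge, link=Link.OUTPUT, transfer=True)
    dax.addJob(prod)

    sassena = Job("sassena", node_label="sassena_%s" % charge)
    sassena.uses("production_%s.dcd" % charge, link=Link.INPUT)
    sassena.uses("db.inc", link=Link.INPUT)
    dax.addJob(sassena)

    # Edges tie each chain to its own producer and to the shared untar job.
    dax.depends(sassena, prod)
    dax.depends(sassena, untar)

dax.writeXMLFile("fanout-demo.dax")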