コード例 #1
0
    def step_sample_initiation_bysample(self):
        """Validate per-sample input and prepare the blast slots.

        For every sample listed in ``self.sample_data["samples"]`` this checks
        that the sample holds a fasta file of the type named by the
        ``-dbtype`` redirected parameter, then makes sure the nested
        ``["blast"]["blastdb"]`` dicts exist for the sample. Existing content
        of these dicts is left untouched.

        Raises:
            AssertionExcept: if ``-dbtype`` is missing from
                ``self.params["redir_params"]``, or if a sample has no fasta
                file of that type.
        """
        for sample in self.sample_data["samples"]:
            # The parameter lookup is kept apart from the file check: when
            # both lived in one ``try`` a missing '-dbtype' reached the error
            # handler with ``dbtype`` still unbound.
            try:
                dbtype = self.params["redir_params"]["-dbtype"]
            except KeyError:
                raise AssertionExcept(
                    "No -dbtype was specified in the redirected parameters\n")

            # Make sure a file exists in the sample equivalent to dbtype:
            try:
                self.sample_data[sample]["fasta"][dbtype]
            except KeyError:
                raise AssertionExcept(
                    "No file exists in sample for specified -dbtype (%s)\n" %
                    dbtype, sample)

            # Initialize blast and blastdb slots for sample:
            sample_slots = self.sample_data[sample]
            if "blast" not in sample_slots:
                sample_slots["blast"] = dict()
            if "blastdb" not in sample_slots["blast"]:
                sample_slots["blast"]["blastdb"] = dict()
コード例 #2
0
    def step_sample_initiation(self):
        """Check the project-level slots this step relies on.

        Called once ``sample_data`` has been set, which is the first point at
        which the output of earlier steps can be tested.

        A missing ``"fasta.nucl"`` slot is fatal; a missing ``"otu_table"``
        slot only produces a warning.

        Raises:
            AssertionExcept: if ``self.sample_data`` has no ``"fasta.nucl"``
                entry.
        """
        project_slots = self.sample_data

        # The fasta file is mandatory.
        if "fasta.nucl" not in project_slots:
            raise AssertionExcept("fasta file does not exist.\n")

        # The OTU table is optional, but its absence is worth reporting.
        if "otu_table" not in project_slots:
            self.write_warning("otu table does not exist.\n")
コード例 #3
0
    def build_scripts(self):
        """Build the single project-level script producing a phylogenetic tree.

        The tree file is named after the basename of the project's
        ``"fasta.nucl"`` file, with a trailing ``.fas``/``.fasta``/``.fna``/
        ``.fa`` extension replaced by ``.tre``; the log file is that name plus
        ``.log``. The command writes into the directory returned by
        ``self.local_start`` and the tree's location under ``self.base_dir``
        is stored in ``self.sample_data["phylotree"]``.

        Raises:
            AssertionExcept: if the project has no ``"fasta.aligned"`` slot.
        """
        # Name of specific script:
        self.spec_script_name = "_".join(
            [self.step, self.name, self.sample_data["Title"]])

        self.script = ""

        # This line should be left before every new script. It sees to local issues.
        # Use the dir it returns as the base_dir for this step.
        use_dir = self.local_start(self.base_dir)

        # NOTE(review): the check is on "fasta.aligned" while the file that is
        # actually passed to the program is "fasta.nucl" - confirm that this
        # is the intended pair of slots.
        if "fasta.aligned" not in self.sample_data:
            raise AssertionExcept(
                "You are trying to run 'make_phylogeny' on an unaligned fasta file!\n"
            )

        fasta_file = self.sample_data["fasta.nucl"]
        outfile = os.path.basename(fasta_file)
        # Raw string: "\." in a plain string literal is an invalid escape
        # sequence and triggers a warning on current Python versions.
        outfile = re.sub(r"\.(fas|fasta|fna|fa)$", "", outfile) + ".tre"
        logfile = ".".join([outfile, "log"])

        # Gets the "env", "script_path" and "redir_params" part of the script,
        # which is always the same:
        self.script += self.get_script_const()

        self.script += "-i %s \\\n\t" % fasta_file
        self.script += "-o %s \\\n\t" % (use_dir + outfile)
        self.script += "-l %s \n\n" % (use_dir + logfile)

        # Move all files from temporary local dir to permanent base_dir
        self.local_finish(use_dir, self.base_dir)

        # Store location of the phylogenetic tree:
        self.sample_data["phylotree"] = self.base_dir + outfile

        self.create_low_level_script()
コード例 #4
0
    def step_sample_initiation(self):
        """Check that the variants file selected by ``input`` is available.

        ``self.params["input"]`` selects the slot to look for: ``"vcf"`` and
        ``"bcf"`` select the slots of the same name, any other value selects
        ``"gzVCF"``. With ``self.params["scope"] == "sample"`` the slot must
        exist in every sample; with any other scope it must exist at the
        project level of ``self.sample_data``.

        Raises:
            AssertionExcept: if the required slot is missing. For sample scope
                the offending sample is passed along with the message.
        """
        input_type = self.params["input"]
        if input_type == "vcf":
            slot, label = "vcf", "VCF"
        elif input_type == "bcf":
            slot, label = "bcf", "BCF"
        else:
            slot, label = "gzVCF", "gzVCF"

        if self.params["scope"] == "sample":
            for sample in self.sample_data["samples"]:
                try:
                    self.sample_data[sample][slot]
                except KeyError:
                    raise AssertionExcept(
                        "Sample does not have a %s variants file." % label,
                        sample)
        else:  # Scope == project
            # No sample is involved here: the original code passed an unbound
            # ``sample`` name to the exception, which failed before the real
            # error could be reported.
            try:
                self.sample_data[slot]
            except KeyError:
                raise AssertionExcept(
                    "Project does not have a %s variants file." % label)
コード例 #5
0
    def step_sample_initiation(self):
        """Settle which mapping file is stored in ``sample_data["qiime.mapping"]``.

        A mapping file given in the redirected parameters (``--mapping_fp``,
        or ``-m`` when ``--mapping_fp`` is absent) takes precedence over one
        already present in ``self.sample_data``; replacing an existing one is
        reported with a warning. When the parameters give none, one must
        already exist in ``self.sample_data``.

        Raises:
            AssertionExcept: if no mapping file is available from either
                source.
        """
        redir_params = self.params["redir_params"]
        project_slots = self.sample_data

        # The long flag is tried first, so it wins when both are passed.
        for flag in ("--mapping_fp", "-m"):
            if flag in redir_params:
                if "qiime.mapping" in project_slots:
                    self.write_warning(
                        "Overriding existing mapping file. Make sure this is OK")
                project_slots["qiime.mapping"] = redir_params[flag]
                return

        # Nothing was passed: fall back on a mapping file set by earlier steps.
        if "qiime.mapping" not in project_slots:
            raise AssertionExcept(
                "No mapping file exists nor was it passed with -m")
コード例 #6
0
    def step_sample_initiation(self):
        """Prepare each sample's mapping slot and set its reference genome.

        First, every sample gets an empty ``["fastq"]["mapping"]`` dict; a
        warning is written for samples that already have one.

        Then exactly one of ``scope`` (in ``self.params``) or ``-x`` (in the
        redirected parameters) must be given:

        * With ``scope``, the reference of every sample is taken from the
          ``["bowtie2"]["fasta"]`` slot of the project (``"project"``) or of
          the sample itself (``"sample"``). ``ref_genome`` must not be passed.
        * With ``-x``, the reference is set to ``self.params["ref_genome"]``
          when passed; otherwise a warning is written and no reference is
          stored.

        Raises:
            AssertionExcept: if both or neither of ``scope`` and ``-x`` are
                given, if ``scope`` has an unknown value, if the bowtie2 slot
                required by ``scope`` is missing, or if ``ref_genome`` is
                passed together with ``scope``.
        """
        # Initializing a "mapping" dict for each sample:
        for sample in self.sample_data["samples"]:
            fastq_slot = self.sample_data[sample]["fastq"]
            if "mapping" in fastq_slot:
                # The sample is passed as a separate argument, so the message
                # must not carry an unfilled "%s" placeholder.
                self.write_warning(
                    "mapping dict exists for sample. Double mapping steps?\n",
                    sample)
            else:
                fastq_slot["mapping"] = {}

        # Require either 'scope' or '-x':
        if "scope" in self.params:
            # If scope is defined, '-x' must not be passed as well.
            if "-x" in self.params["redir_params"]:
                raise AssertionExcept("Both 'scope' and '-x' specified!\n")

            try:
                # Loop over samples to set the reference genome:
                for sample in self.sample_data["samples"]:
                    mapping = self.sample_data[sample]["fastq"]["mapping"]
                    if self.params["scope"] == "project":
                        # Set project wide reference:
                        mapping["reference"] = \
                            self.sample_data["bowtie2"]["fasta"]
                    elif self.params["scope"] == "sample":
                        # Set per-sample reference:
                        mapping["reference"] = \
                            self.sample_data[sample]["bowtie2"]["fasta"]
                    else:
                        raise AssertionExcept(
                            "Scope must be either 'sample' or 'project'")

            except KeyError:
                # The bowtie2 slot was not found at the level 'scope' points to.
                raise AssertionExcept(
                    "There is a mismatch between 'scope' and the existing bowtie2 index\n",
                    sample)

            if "ref_genome" in self.params:
                raise AssertionExcept(
                    "ref_genome was passed, and 'scope' was defined. Ignoring ref_genome\n"
                )
        else:
            # If scope is not defined, require '-x'
            if "-x" not in self.params["redir_params"]:
                raise AssertionExcept("Neither 'scope' nor '-x' specified.\n")
            # Storing reference genome for use by downstream steps:
            if "ref_genome" in self.params:
                for sample in self.sample_data["samples"]:
                    mapping = self.sample_data[sample]["fastq"]["mapping"]
                    # An existing reference is overridden by ref_genome, with
                    # a warning.
                    if "reference" in mapping:
                        self.write_warning(
                            "ref_genome was passed, but a reference already exists. Setting reference to 'ref_genome'\n"
                        )

                    mapping["reference"] = self.params["ref_genome"]
            else:
                self.write_warning(
                    "No reference given. It is highly recommended to give one!\n"
                )
コード例 #7
0
    def build_scripts(self):
        """Build one script per sample running the requested samtools stages.

        A stage is enabled by the presence of its key in ``self.params``:
        "view", "filter_by_tag", "sort", "index", "flagstat", "stats" and
        "idxstats", plus the clean-up switches "del_unfiltered",
        "del_unsorted" and "del_sam". A non-empty value of "view", "sort",
        "index" or "stats" is added to the command line as extra arguments.

        The input file of a sample is read from
        ``sample_data[sample]["fastq"]["mapping"][self.file2use]`` and the
        products of each stage are stored back in the same ``mapping`` dict
        ("bam", "sam", "unfiltered_bam", "unsorted_bam", "index", "flagstat",
        "stats", "idxstats").

        When the "view" arguments do not contain a ``-b`` flag, the output is
        SAM: the script for that sample ends after the view command and a
        warning is written.

        Each iteration defines ``self.spec_script_name`` and ``self.script``.

        Raises:
            AssertionExcept: if "sort" is requested without "view" and the
                sample has neither a "bam" nor a "sam" mapping slot.
        """
        for sample in self.sample_data["samples"]:

            # Make a dir for the current sample:
            sample_dir = self.make_folder_for_sample(sample)

            # Name of specific script:
            self.spec_script_name = "_".join([self.step, self.name, sample])
            self.script = ""

            # This line should be left before every new script. It sees to local issues.
            # Use the dir it returns as the base_dir for this step.
            use_dir = self.local_start(sample_dir)

            mapping = self.sample_data[sample]["fastq"]["mapping"]

            input_file = mapping[self.file2use]
            bam_name = os.path.basename(input_file) + ".bam"
            # Used only when 'view' does not produce a BAM:
            output_sam_name = os.path.basename(input_file) + ".sam"

            if "filter_by_tag" in self.params:
                filtered_name = bam_name + ".filt.bam"
                sort_name = filtered_name + ".srt.bam"
            else:
                sort_name = bam_name + ".srt.bam"

            if "view" in self.params:
                self.script += "###########\n# Running samtools view:\n#----------------\n"
                self.script += "%s view \\\n\t" % self.get_script_env_path()
                if self.params["view"]:
                    self.script += "%s \\\n\t" % self.params["view"]

                # 'view' may be empty (None); searching None raises TypeError,
                # so an empty string is searched instead, which means "no -b
                # flag was passed".
                tobam = re.search(r"\-\w*b", self.params["view"] or "")
                if tobam:
                    self.script += "-o %s \\\n\t %s\n\n" % (use_dir + bam_name, input_file)
                    mapping["bam"] = sample_dir + bam_name
                else:
                    self.script += "-o %s \\\n\t %s\n\n" % (use_dir + output_sam_name, input_file)
                    mapping["sam"] = sample_dir + output_sam_name
                    self.stamp_file(mapping["sam"])

                    self.write_warning("Output from samtools view is SAM. Not proceeding further.\nTo produce a BAM, make sure to include the -b flag in the samtools view parameters.\n")
                    # With SAM output, the remaining commands (which require
                    # BAM input) can't be run. Close this sample's script:
                    # Move all files from temporary local dir to permanent base_dir
                    self.local_finish(use_dir, sample_dir)
                    self.create_low_level_script()
                    continue

            # The following can be merged into the main 'view' section
            if "filter_by_tag" in self.params:

                self.script += "###########\n# Filtering BAM\n#----------------\n"
                self.script += "\n\n"
                self.script += "%s view \\\n\t" % self.get_script_env_path()
                self.script += "-h \\\n\t"
                self.script += "%s | \\\n\t" % mapping["bam"]
                self.script += "awk '$0 ~\"(^@)|(%s)\"' | \\\n\t" % self.params["filter_by_tag"]
                self.script += "%s view \\\n\t" % self.get_script_env_path()
                self.script += "-bh \\\n\t"
                self.script += "-o %s \\\n\t" % (use_dir + filtered_name)
                self.script += "- \n\n"

                # If the user requires that the unfiltered bam be removed:
                if "del_unfiltered" in self.params:
                    self.script += "###########\n# Removing unfiltered BAM\n#----------------\n"
                    self.script += "\n\nrm -rf %s\n\n" % (use_dir + bam_name)

                # Storing filtered and unfiltered bams:
                mapping["unfiltered_bam"] = sample_dir + bam_name
                mapping["bam"] = sample_dir + filtered_name

                # The following is so that sort will work on the filtered file without playing around with the sort code:
                bam_name = filtered_name

            if "sort" in self.params:
                # This permits running only sort and index, in case a bam file was produced in a different step.
                if "view" in self.params:
                    bam_name = use_dir + bam_name
                else:
                    if "bam" in mapping:
                        bam_name = mapping["bam"]
                    elif "sam" in mapping:
                        bam_name = mapping["sam"]
                        self.write_warning("Can't find BAM but found SAM for sample. Using it instead of a BAM.\n", sample)
                    else:
                        raise AssertionExcept("Can't run sort without BAM file. Either include 'view' or use other BAM creating steps.\n", sample)
                self.script += "###########\n# Sorting BAM\n#----------------\n"
                self.script += "%s sort \\\n\t" % self.get_script_env_path()
                if self.params["sort"]:
                    self.script += "%s \\\n\t" % self.params["sort"]
                self.script += "-o %s \\\n\t" % (use_dir + sort_name)
                self.script += "%s\n\n" % (bam_name)
                # Storing sorted bam in 'bam' slot and unsorted bam in unsorted_bam slot
                mapping["unsorted_bam"] = sample_dir + os.path.basename(bam_name)
                mapping["bam"] = sample_dir + sort_name

                # If the user requires that the unsorted bam be removed:
                if "del_unsorted" in self.params:
                    self.script += "###########\n# Removing unsorted BAM\n#----------------\n"
                    self.script += "\n\nrm -rf %s\n\n" % (bam_name)

                bam_name = sort_name  # Use sorted bam from now on below

            if "index" in self.params:
                self.script += "###########\n# Indexing BAM\n#----------------\n"
                self.script += "%s index \\\n\t" % self.get_script_env_path()
                if self.params["index"]:
                    self.script += "%s \\\n\t" % self.params["index"]
                self.script += "%s\n\n" % (use_dir + bam_name)
                # The index slot is named after the file that is indexed
                # above. It used to be always derived from the sorted name,
                # which is wrong when no sort is run.
                mapping["index"] = sample_dir + bam_name + ".bai"

            if "flagstat" in self.params:
                self.script += "###########\n# Calculating BAM statistics:\n#----------------\n"
                self.script += "%s flagstat \\\n\t" % self.get_script_env_path()
                self.script += "%s \\\n\t" % (use_dir + bam_name)
                self.script += "> %s.flagstat \n\n" % (use_dir + bam_name)
                mapping["flagstat"] = "%s%s.flagstat" % (sample_dir, bam_name)

            if "stats" in self.params:
                self.script += "###########\n# Calculating BAM statistics:\n#----------------\n"
                self.script += "%s stats \\\n\t" % self.get_script_env_path()
                if self.params["stats"]:  # Adding parameters the user might pass
                    self.script += "%s \\\n\t" % self.params["stats"]
                self.script += "%s \\\n\t" % (use_dir + bam_name)
                self.script += "> %s.stats \n\n" % (use_dir + bam_name)
                mapping["stats"] = "%s%s.stats" % (sample_dir, bam_name)

            if "idxstats" in self.params:
                self.script += "###########\n# Calculating index statistics (idxstats):\n#----------------\n"
                self.script += "%s idxstats \\\n\t" % self.get_script_env_path()
                # idxstats has no user defined parameters...
                self.script += "%s \\\n\t" % (use_dir + bam_name)
                self.script += "> %s.idxstat.tab \n\n" % (use_dir + bam_name)
                # Only the idxstats slot is set here: this section does not
                # create a ".stats" file, so the 'stats' slot is left alone.
                mapping["idxstats"] = "%s%s.idxstat.tab" % (sample_dir, bam_name)

            if "del_sam" in self.params and "sam" in mapping:
                self.script += "###########\n# Removing SAM\n#----------------\n\n"
                self.script += "rm -rf %s\n\n" % mapping["sam"]

            # Move all files from temporary local dir to permanent base_dir
            self.local_finish(use_dir, sample_dir)

            self.create_low_level_script()

                    
コード例 #8
0
    def build_scripts(self):
        """Build the assembly script(s): one for the project or one per sample.

        With ``self.params["scope"] == "project"`` a single script is built
        which receives the reads of all samples and writes
        ``<Title><file_tag>`` under ``self.base_dir``. With any other scope,
        one script is built per sample, writing ``<sample><file_tag>`` in the
        sample's own directory.

        Paired-end samples are passed with ``-p <params["p"]> -q -i F R``,
        single-end samples with ``-p no -q S``. For samples holding both kinds
        of reads a warning is written and only the paired-end files are used.

        The output file is stored in the ``"fasta.nucl"`` and
        ``"<step>.contigs"`` slots, and ``self.get_step_step()`` in the
        ``"assembler"`` slot, at project or sample level according to scope.

        Each script defines ``self.spec_script_name`` and ``self.script``.

        Raises:
            AssertionExcept: if paired-end reads are to be used and
                ``self.params`` has no ``"p"`` entry.
        """
        mix_warning = "CLC assembler not defined for PE-SE mixes. Using PE file only..."
        missing_p_error = "With paired end reads, you must specify a 'p' parameter containing information to pass with '-p' to clc_assembler. See the clc manual."

        if self.params["scope"] == "project":

            # Name of specific script:
            self.spec_script_name = "_".join(
                [self.step, self.name, self.sample_data["Title"]])
            self.script = ""

            output_file = os.sep.join(
                [self.base_dir, self.sample_data["Title"] + self.file_tag])
            # This line should be left before every new script. It sees to local issues.
            # Use the dir it returns as the base_dir for this step.
            use_dir = self.local_start(self.base_dir)

            self.script += self.get_script_const()
            self.script += "-o %s \\\n\t" % output_file

            for sample in self.sample_data["samples"]:
                sample_type = self.sample_data[sample]["type"]

                if "PE" in sample_type and "SE" in sample_type:
                    # Reported through write_warning, like the other steps,
                    # instead of the Python-2-only "print >>" statement.
                    self.write_warning(mix_warning, sample)
                if "PE" in sample_type:
                    try:
                        self.script += "-p %s \\\n\t-q \\\n\t" % self.params[
                            "p"]
                    except KeyError:
                        raise AssertionExcept(missing_p_error)
                    self.script += "-i %s %s \\\n\t" % (
                        self.sample_data[sample]["fastq.F"],
                        self.sample_data[sample]["fastq.R"])
                elif "SE" in sample_type:
                    self.script += "-p no \\\n\t-q \\\n\t%s \\\n\t" % self.sample_data[
                        sample]["fastq.S"]
                # Samples which are neither PE nor SE add nothing.

            # Remove trailing '\\\n\t' from last iteration and add some newlines for clarity
            self.script = self.script.rstrip("\\\n\t") + "\n\n"

            # Store results to fasta and assembly slots:
            self.sample_data["fasta.nucl"] = output_file
            self.sample_data[self.step + ".contigs"] = output_file
            self.sample_data["assembler"] = self.get_step_step()

            self.stamp_file(self.sample_data[self.step + ".contigs"])

            # Wrapping up function. Leave these lines at the end of every iteration:
            # Sees to copying local files to final destination (and other stuff)
            self.local_finish(use_dir, self.base_dir)

            self.create_low_level_script()

        else:

            for sample in self.sample_data["samples"]:

                # Name of specific script:
                self.spec_script_name = "_".join(
                    [self.step, self.name, sample])
                self.script = ""

                # Make a dir for the current sample:
                sample_dir = self.make_folder_for_sample(sample)
                output_file = os.sep.join([sample_dir, sample + self.file_tag])
                # This line should be left before every new script. It sees to local issues.
                # Use the dir it returns as the base_dir for this step.
                use_dir = self.local_start(sample_dir)

                self.script += self.get_script_const()
                self.script += "-o %s \\\n\t" % output_file

                sample_type = self.sample_data[sample]["type"]

                if "mixed" in sample_type:
                    self.write_warning(mix_warning, sample)
                if "PE" in sample_type or "mixed" in sample_type:
                    # Same check as in project scope: a missing 'p' is
                    # reported as a proper error rather than a bare KeyError.
                    try:
                        self.script += "-p %s \\\n\t-q \\\n\t" % self.params[
                            "p"]
                    except KeyError:
                        raise AssertionExcept(missing_p_error, sample)
                    self.script += "-i %s %s\n\n" % (
                        self.sample_data[sample]["fastq.F"],
                        self.sample_data[sample]["fastq.R"])
                elif "SE" in sample_type:
                    self.script += "-p no \\\n\t-q \\\n\t%s\n\n" % self.sample_data[
                        sample]["fastq.S"]
                # Samples which are neither PE, SE nor mixed add nothing.

                # Store results to fasta and assembly slots:
                self.sample_data[sample]["fasta.nucl"] = output_file
                self.sample_data[sample][self.step + ".contigs"] = output_file
                self.sample_data[sample]["assembler"] = self.get_step_step()

                self.stamp_file(self.sample_data[sample][self.step +
                                                         ".contigs"])

                # Wrapping up function. Leave these lines at the end of every iteration:
                # Sees to copying local files to final destination (and other stuff)
                self.local_finish(use_dir, sample_dir)

                self.create_low_level_script()
コード例 #9
0
    def build_scripts(self):

        # Each iteration must define the following class variables:
        # spec_script_name
        # script
        for sample in self.sample_data[
                "samples"]:  # Getting list of samples out of samples_hash
            # General comment: If there is a parallel routine for each direction (forward, reverse), add this loop
            # if  in self.sample_data[sample].keys():

            # Loop over all **existing** Forward, Reverse and Single slots:
            # The filter returns a list of keys in sample_data that are in the list ["Forward","Reverse","Single"]
            for direction in filter(
                    lambda x: x in ["Forward", "Reverse", "Single"],
                    self.sample_data[sample].keys()):
                self.script = ""
                direction_tag = direction[0]  # Get first letter in direction
                # Name of specific script:
                self.spec_script_name = "_".join(
                    [self.step, self.name, sample, direction_tag])

                # This line should be left before every new script. It sees to local issues.
                # Use the dir it returns as the base_dir for this step.
                use_dir = self.local_start(self.base_dir)

                # Get all unique extensions of files in direction:
                extensions = list(
                    set([
                        os.path.splitext(fn)[1]
                        for fn in self.sample_data[sample][direction]
                    ]))

                # Find file extension of first input file and remove extra period at the begining of extension (note the [1:] at the end.):
                extension = os.path.splitext(
                    self.sample_data[sample][direction][0])[1][1:]
                # Remove zip extension:
                if "." + extension in ZIPPED_EXTENSIONS:
                    # Get last extension before the '.gz', and remove the leading period (note the [1:] at the end.)
                    extension = os.path.splitext(
                        os.path.splitext(
                            self.sample_data[sample][direction][0])[0])[1][1:]
                if "." + extension not in KNOWN_FILE_EXTENSIONS:
                    raise AssertionExcept(
                        "One of the files has a really weird extension (%s). Make sure this is not a mistake, or update KNOWN_FILE_EXTENSIONS or ZIPPED_EXTENSIONS in global_def.py\n"
                        % extension, sample)

                fq_fn = ".".join(
                    [sample, direction_tag, self.file_tag, extension]
                )  #The filename containing the end result. Used both in script and to set reads in $sample_params

                self.script += self.params["script_path"] + " \\\n\t"
                # The following line concatenates all the files in the direction separated by a " "
                self.script += " ".join(self.sample_data[sample][direction])
                self.script += " \\\n\t"
                if "pipe" in self.params:
                    self.script += "| {pipe} \\\n\t".format(
                        pipe=self.params["pipe"])
                self.script += "> %s%s \n\n" % (use_dir, fq_fn)

                # Move all files from temporary local dir to permanent base_dir
                self.local_finish(
                    use_dir, self.base_dir
                )  # Sees to copying local files to final destination (and other stuff)

                # Store file in active file for sample:
                self.sample_data[sample]["fastq." +
                                         direction_tag] = self.base_dir + fq_fn

                self.stamp_file(self.sample_data[sample]["fastq." +
                                                         direction_tag])

                self.create_low_level_script()

            # Merging files in "fasta" dict in sample_data (genomes etc.)
            # Loop over all **existing** fasta slots:
            # The filter returns a list of keys in sample_data that are in the keys of dict "fasta_types_dict"
            for direction in filter(lambda x: x in fasta_types_dict.keys(),
                                    self.sample_data[sample].keys()):
                self.script = ""
                direction_tag = fasta_types_dict[direction]

                # Name of specific script:
                self.spec_script_name = "_".join(
                    [self.step, self.name, sample, direction_tag])

                # This line should be left before every new script. It sees to local issues.
                # Use the dir it returns as the base_dir for this step.
                use_dir = self.local_start(self.base_dir)

                # Get all unique extensions of files in direction:
                extensions = list(
                    set([
                        os.path.splitext(fn)[1]
                        for fn in self.sample_data[sample][direction]
                    ]))

                # Find file extension of first input file and remove extra period at the begining of extension (note the [1:] at the end.):
                extension = os.path.splitext(
                    self.sample_data[sample][direction][0])[1][1:]
                # Remove zip extension:
                if "." + extension in ZIPPED_EXTENSIONS:
                    # Get last extension before the '.gz', and remove the leading period (note the [1:] at the end.)
                    extension = os.path.splitext(
                        os.path.splitext(
                            self.sample_data[sample][direction][0])[0])[1][1:]
                if "." + extension not in KNOWN_FILE_EXTENSIONS:
                    raise AssertionExcept(
                        "One of the files in sample has a really weird extension (%s). \n\tMake sure this is not a mistake, or update KNOWN_FILE_EXTENSIONS\n"
                        % extension, sample)

                fq_fn = ".".join(
                    [sample, direction_tag, self.file_tag, extension]
                )  #The filename containing the end result. Used both in script and to set reads in $sample_params

                # You have to add "use existing" functionality
                self.script += self.params["script_path"] + " \\\n\t"
                # The following line concatenates all the files in the direction separated by a " "
                self.script += " ".join(self.sample_data[sample][direction])
                self.script += " \\\n\t"
                if "pipe" in self.params:
                    self.script += "| {pipe} \\\n\t".format(
                        pipe=self.params["pipe"])
                self.script += "> %s%s \n\n" % (use_dir, fq_fn)

                # # Store file in active file for sample:
                self.sample_data[sample][direction_tag] = self.base_dir + fq_fn

                self.stamp_file(self.sample_data[sample][direction_tag])

                # Move all files from temporary local dir to permanent base_dir
                self.local_finish(
                    use_dir, self.base_dir
                )  # Sees to copying local files to final destination (and other stuff)

                self.create_low_level_script()

            for direction in filter(lambda x: x in sam_bam_dict.keys(),
                                    self.sample_data[sample].keys()):
                # Do not attempt merging the single reference permitted:
                if direction == "REFERENCE":
                    continue

                self.script = ""
                direction_tag = sam_bam_dict[direction]

                # Name of specific script:
                self.spec_script_name = "_".join(
                    [self.step, self.name, sample, direction_tag])

                # This line should be left before every new script. It sees to local issues.
                # Use the dir it returns as the base_dir for this step.
                use_dir = self.local_start(self.base_dir)

                # Get all unique extensions of files in direction:
                extensions = list(
                    set([
                        os.path.splitext(fn)[1]
                        for fn in self.sample_data[sample][direction]
                    ]))

                # Find file extension of first input file and remove extra period at the begining of extension (note the [1:] at the end.):
                extension = os.path.splitext(
                    self.sample_data[sample][direction][0])[1][1:]
                # Remove zip extension:
                if "." + extension in ZIPPED_EXTENSIONS:
                    # Get last extension before the '.gz', and remove the leading period (note the [1:] at the end.)
                    extension = os.path.splitext(
                        os.path.splitext(
                            self.sample_data[sample][direction][0])[0])[1][1:]
                if "." + extension not in KNOWN_FILE_EXTENSIONS:
                    raise AssertionExcept(
                        "One of the files in sample has a really weird extension (%s). \n\tMake sure this is not a mistake, or update KNOWN_FILE_EXTENSIONS\n"
                        % extension, sample)

                fq_fn = ".".join(
                    [sample, direction_tag, self.file_tag, extension]
                )  #The filename containing the end result. Used both in script and to set reads in $sample_params

                # You have to add "use existing" functionality
                self.script += self.params["script_path"] + " \\\n\t"
                # The following line concatenates all the files in the direction separated by a " "
                self.script += " ".join(self.sample_data[sample][direction])
                self.script += " \\\n\t"
                if "pipe" in self.params:
                    self.script += "| {pipe} \\\n\t".format(
                        pipe=self.params["pipe"])
                self.script += " > %s%s \n\n" % (use_dir, fq_fn)

                # # Store file in active file for sample:

                self.sample_data[sample][direction_tag] = self.base_dir + fq_fn
                self.sample_data[sample]["reference"] = self.sample_data[
                    sample]["REFERENCE"]

                self.stamp_file(self.sample_data[sample][direction_tag])

                # Move all files from temporary local dir to permanent base_dir
                self.local_finish(
                    use_dir, self.base_dir
                )  # Sees to copying local files to final destination (and other stuff)

                self.create_low_level_script()
コード例 #10
0
    def step_sample_initiation(self):
        """ A place to do initiation stages following setting of sample_data

            Processes the optional 'copy_File_Types' parameter section and then
            calls the project-scope or sample-scope initiation routine.

            Every entry of 'copy_File_Types' must hold a 'source' and a 'target'
            sub-section. From each of them a 'File_Type' and a 'scope' are read
            through get_File_Type_data(), which is handed None and "sample"
            respectively as its third argument (presumably the fallback value
            when the key is absent -- confirm against its definition).
            The slot named by the source File_Type is then copied to the slot
            named by the target File_Type, at sample and/or project level
            according to the two scopes.

            Raises:
                AssertionExcept: if 'source'/'target' is missing, if either
                    File_Type is None, or if the source File_Type does not exist
                    at the requested level.
        """
        if len(get_File_Type_data(self.params, ["copy_File_Types"])) > 0:
            copy_params = self.params["copy_File_Types"]
            for transfer in copy_params:
                dif = set(["source",
                           "target"]).difference(copy_params[transfer].keys())
                if dif:
                    raise AssertionExcept(
                        "The following argument/s are missing in the copy_File_Types section: %s"
                        % list(dif))

                scope_in = get_File_Type_data(
                    copy_params, [transfer, "source", "scope"], "sample")
                scope_out = get_File_Type_data(
                    copy_params, [transfer, "target", "scope"], "sample")
                File_Type_in = get_File_Type_data(
                    copy_params, [transfer, "source", "File_Type"], None)
                File_Type_out = get_File_Type_data(
                    copy_params, [transfer, "target", "File_Type"], None)

                # Both File_Types are required. Each one is tested explicitly:
                # "(a and b) != None" lets a falsy-but-not-None source paired
                # with a None target slip through.
                if File_Type_in is None:
                    raise AssertionExcept(
                        "The following argument/s are missing or empty in the copy_File_Types section: %s"
                        % "source File_Type")
                if File_Type_out is None:
                    raise AssertionExcept(
                        "The following argument/s are missing or empty in the copy_File_Types section: %s"
                        % "target File_Type")

                if "sample" not in [scope_in, scope_out]:
                    # Project level to project level copy
                    if File_Type_in not in self.sample_data:
                        raise AssertionExcept(
                            "The File_Type %s is not found in the PROJECT level \n\t File_Types available are : %s"
                            % (File_Type_in, self.sample_data.keys()))
                    self.sample_data[File_Type_out] = self.sample_data[
                        File_Type_in]
                    continue

                # At least one side is at sample level: go over all samples
                for sample in self.sample_data["samples"]:
                    if scope_in == "sample":
                        if File_Type_in not in self.sample_data[sample]:
                            raise AssertionExcept(
                                "The File_Type %s is not found in the SAMPLE level \n\t File_Types available are : %s"
                                % (File_Type_in,
                                   self.sample_data[sample].keys()))
                        if scope_out == "sample":
                            self.sample_data[sample][
                                File_Type_out] = self.sample_data[sample][
                                    File_Type_in]
                        else:
                            # Sample to project: every sample overwrites the
                            # same project slot, so the last sample wins.
                            self.sample_data[
                                File_Type_out] = self.sample_data[sample][
                                    File_Type_in]
                    else:
                        # Project to sample
                        if File_Type_in not in self.sample_data:
                            raise AssertionExcept(
                                "The File_Type %s is not found in the PROJECT level \n\t File_Types available are : %s"
                                % (File_Type_in, self.sample_data.keys()))
                        self.sample_data[sample][
                            File_Type_out] = self.sample_data[File_Type_in]

        # Without an explicit project scope, initiation is done by sample
        if "scope" in self.params and "project" in self.params["scope"]:
            self.step_sample_initiation_byproject()
        else:
            self.step_sample_initiation_bysample()
コード例 #11
0
    def step_specific_init(self):
        """ Validate the step parameters and derive output settings from the redirected arguments.

            Sets:
                self.shell: always "bash".
                self.output_type: first word of --outSAMtype ("SAM", "BAM" or
                    "None"); "SAM" when --outSAMtype is not redirected.
                self.bam_types: remaining words of --outSAMtype. Set only when
                    the output type is "BAM".
                self.wig_type: first word of --outWigType; "None" when it is
                    not redirected.
                self.params["outSAMattrRGline"]: copy of --outSAMattrRGline
                    with every 'ID:<value>' tag removed.

            Also fills in --runDirPerm with 'All_RWX' when absent and deletes the
            redirected arguments which, per the emitted warning, are set
            automatically.

            Raises:
                AssertionExcept: on an unsupported --outSAMtype or --outWigType
                    value, or when BAM is requested without a sorting type.
        """
        self.shell = "bash"  # Can be set to "bash" by inheriting instances
        # self.file_tag = "Bowtie_mapper"

        # if "--genomeDir" not in self.params["redir_params"]:
        # raise AssertionExcept("No --genomeDir specified. You must specify a STAR index of the genome.")

        if "ref_genome" not in self.params.keys():
            self.write_warning(
                "No reference given with 'ref_genome' (path to fasta file). It is highly recommended to give one!\n"
            )

        # Same dict object as self.params["redir_params"]; changes are made in place
        redirects = self.params["redir_params"]

        if "--runDirPerm" not in redirects:
            redirects["--runDirPerm"] = "All_RWX"
            self.write_warning("No --runDirPerm specified. Using 'All_RWX'")

        if "--outSAMtype" in redirects:
            outSAMtype = re.split(r"\s+", redirects["--outSAMtype"])
            if outSAMtype[0] not in ["SAM", "BAM", "None"]:
                raise AssertionExcept(
                    "Bad value for --outSAMtype: Has to be 'BAM', 'SAM' or 'None'"
                )
            self.output_type = outSAMtype[0]
            if self.output_type == "BAM":
                self.bam_types = outSAMtype[1:]
                if "Unsorted" not in self.bam_types and "SortedByCoordinate" not in self.bam_types:
                    raise AssertionExcept(
                        "When --outSAMtype is BAM, you must supply a type: 'Unsorted', 'SortedByCoordinate' or both."
                    )
        else:
            self.output_type = "SAM"

        if "--outSAMattrRGline" in redirects:
            rg_id_tag = r"ID:\S+"
            # search(), not match(): the substitution below strips the tag
            # wherever it appears in the line, so warn in all of those cases.
            if re.search(rg_id_tag, redirects["--outSAMattrRGline"]):
                self.write_warning(
                    "Removing 'ID:' from --outSAMattrRGline line!")
            self.params["outSAMattrRGline"] = re.sub(
                rg_id_tag, "", redirects["--outSAMattrRGline"])

        if "--outWigType" in redirects:
            outWigType = re.split(r"\s+", redirects["--outWigType"])
            if outWigType[0] not in ['None', 'bedGraph', 'wiggle']:
                raise AssertionExcept(
                    "Bad value for --outWigType: Has to be 'None', 'bedGraph' or 'wiggle'"
                )
            self.wig_type = outWigType[0]
            if self.wig_type == "wiggle":
                # See in build_scripts below. STAR produces 4 different wig files. Storing one in wig slot and others in wig* slots.
                self.write_warning(
                    "Saving UniqueMultiple wig from strand 1 as main 'WIG' file. If you want something else, you have to move it to the right slot..."
                )
            elif self.wig_type == "bedGraph":
                self.write_warning(
                    "Saving UniqueMultiple bedGraph from strand 1 as main 'bdg' file. If you want something else, you have to move it to the right slot..."
                )
        else:
            self.wig_type = "None"

        if "--quantMode" in redirects and redirects[
                "--quantMode"] == "GeneCounts":
            self.write_warning(
                "--quantMode GeneCounts is not supported yet. The script will run but the output will not be stored in the file type index"
            )

        # Drop redirected arguments that the user is not supposed to pass
        for redir2remove in [
                "--readFilesCommand", "--readFilesIn", "--outFileNamePrefix",
                "--outTmpDir", "--outStd"
        ]:
            if redir2remove in redirects:
                del redirects[redir2remove]
                self.write_warning(
                    "You are not supposed to specify %s in redirects. We set it automatically"
                    % redir2remove)
コード例 #12
0
    def build_scripts(self):
        """ This is the actual script building function
            Most, if not all, editing should be done here 
            HOWEVER, DON'T FORGET TO CHANGE THE CLASS NAME AND THE FILENAME!
        """

        if self.params["scope"] == "project":

            # Not defined yet
            raise AssertionExcept("project wide scope is not defined yet\n")

        else:

            # Each iteration must define the following class variables:
            # spec_script_name
            # script
            for sample in self.sample_data[
                    "samples"]:  # Getting list of samples out of samples_hash

                # Name of specific script:
                self.spec_script_name = "_".join(
                    [self.step, self.name, sample])
                self.script = ""

                # Make a dir for the current sample:
                sample_dir = self.make_folder_for_sample(sample)

                # This line should be left before every new script. It sees to local issues.
                # Use the dir it returns as the base_dir for this step.
                use_dir = self.local_start(sample_dir)

                self.script += self.get_script_const()
                self.script += "-o %s \\\n\t" % sample_dir

                if "PE" in self.sample_data[sample]["type"]:
                    self.script += "--pe1-1 %s \\\n\t" % self.sample_data[
                        sample]["fastq"]["readsF"]
                    self.script += "--pe1-2 %s \n\n" % self.sample_data[
                        sample]["fastq"]["readsR"]
                elif "SE" in self.sample_data[sample]["type"]:
                    self.script += "--s1 %s \n\n" % self.sample_data[sample][
                        "fastq"]["readsS"]
                elif "PE" in self.sample_data[sample][
                        "type"] and "SE" in self.sample_data[sample][
                            "type"]:  # Mixed!!
                    self.script += "--pe1-1 %s \\\n\t" % self.sample_data[
                        sample]["fastq"]["readsF"]
                    self.script += "--pe1-2 %s \\\n\t" % self.sample_data[
                        sample]["fastq"]["readsR"]
                    self.script += "--s1 %s \n\n" % self.sample_data[sample][
                        "fastq"]["readsS"]
                else:
                    raise AssertionExcept(
                        "Strange type configuration for sample\n", sample)

                # For prokka compliance, you can request a truncation of the contig names
                # e.g. ">NODE_82_length_18610_cov_38.4999_ID_165" will be changed to ">NODE_82_length_18610"
                if "truncate_names" in self.params.keys():
                    self.script += """
# Truncating contig names for prokka compliance
cat %(contigs)s  | cut -f 1-2 -d '_' > %(shortnames)s
mv -f %(shortnames)s %(contigs)s \n\n""" % {
                        "contigs": sample_dir + "contigs.fasta",
                        "shortnames": sample_dir + "contigs.shortIDs.fasta"
                    }

                # Store results to fasta and assembly slots:
                self.sample_data[sample]["fasta"][
                    "nucl"] = sample_dir + "contigs.fasta"
                self.sample_data[sample]["assembly"]["spades_assembl"][
                    "contigs"] = sample_dir + "contigs.fasta"
                self.sample_data[sample]["assembly"]["spades_assembl"][
                    "scaffolds"] = sample_dir + "scaffolds.fasta"

                self.stamp_file(self.sample_data[sample]["assembly"]
                                ["spades_assembl"]["scaffolds"])
                self.stamp_file(self.sample_data[sample]["assembly"]
                                ["spades_assembl"]["contigs"])

                # Wrapping up function. Leave these lines at the end of every iteration:
                self.local_finish(
                    use_dir, sample_dir
                )  # Sees to copying local files to final destination (and other stuff)

                self.create_low_level_script()
コード例 #13
0
    def build_scripts(self):
        """ Build, per sample, a script that optionally joins paired reads and links the reads files into the links dir.

            For single-end samples, or when the 'join' parameter is "none", the
            script only creates soft links. Otherwise a join command is built
            from get_script_const() with the forward and reverse files, the
            sample's "fastq.J"/"fastq.F"/"fastq.R" slots are pointed at the
            fastq-join output names and, when 'join' is "join", links to the
            requested files are added.

            The 'join' and 'unjoined' parameters are compared case-insensitively
            and are read only for samples which are not single-end.

            Side effects: creates <links_dir><sample> on disk if it does not
            exist and sets the project-level "qiime.prep_links_dir" slot.

            Raises:
                RuntimeError: if both 'join' and 'unjoined' select nothing.
                AssertionExcept: for the SeqPrep algorithm and the "join_cat"
                    mode (both not defined yet) or an unknown 'join_algo'.
        """
        # Each iteration MUST DEFINE the following class variables:
        # spec_script_name
        # script
        for sample in self.sample_data[
                "samples"]:  # Getting list of samples out of samples_hash

            # Name of specific script:
            self.spec_script_name = "_".join([self.step, self.name, sample])

            # Init script itself
            self.script = ""

            # Make a dir for the current sample:
            sample_dir = self.make_folder_for_sample(sample)
            # This line should be left before every new script. It sees to local issues.
            # Use the dir it returns as the base_dir for this step.
            use_dir = self.local_start(sample_dir)

            # Make dir in links_for_demult folder:
            if not os.path.isdir(self.links_dir + sample):
                os.makedirs(self.links_dir + sample)

            # Define a set of directions to use for the sample
            directions_to_use = set()
            is_single_end = self.sample_data[sample]["type"] == "SE"
            if is_single_end:
                join_mode = None  # 'join' is irrelevant for single-end samples
                directions_to_use |= {"fastq.S"}
            else:  # PE or mixed
                # Normalised once, so that all later tests agree on the case
                join_mode = self.params["join"].lower()
                unjoined_mode = self.params["unjoined"].lower(
                )  # Options: forward, reverse, both or none
                if join_mode == "join":
                    directions_to_use |= {"fastq.J"}
                # Assumption: in case of mixed PE and SE, if you want the
                # forward/reverse then you want the single sequences too.
                if unjoined_mode == "forward":
                    directions_to_use |= {"fastq.F", "fastq.S"}
                elif unjoined_mode == "reverse":
                    directions_to_use |= {"fastq.R", "fastq.S"}
                elif unjoined_mode == "both":
                    directions_to_use |= {"fastq.F", "fastq.R", "fastq.S"}
                elif not directions_to_use:  # =="none" and nothing joined
                    raise RuntimeError(
                        "You can't pass 'none' to both 'join' and 'unjoined' parameters in step %s"
                        % self.name)

            # If no join is required then only make links:
            join_required = not (is_single_end or join_mode == "none")

            if join_required:
                # If join required, then there are 2 steps: run join and link results in links4demult
                ############# 1
                # Add joining code to the script
                self.script += self.get_script_const(
                )  # Gets the "env", "script_path" and "redir_params" part of the script which is always the same...
                self.script += "-f " + self.sample_data[sample][
                    "fastq.F"] + " \\\n\t"
                self.script += "-r " + self.sample_data[sample][
                    "fastq.R"] + " \\\n\t"
                self.script += "-o " + use_dir + "\n\n"

                ############# 2
                # Pointing to resulting files in sample structure:
                if self.params["join_algo"] == "fastq-join":
                    self.sample_data[sample][
                        "fastq.J"] = sample_dir + "fastqjoin.join.fastq"
                    self.sample_data[sample][
                        "fastq.F"] = sample_dir + "fastqjoin.un1.fastq"
                    self.sample_data[sample][
                        "fastq.R"] = sample_dir + "fastqjoin.un2.fastq"
                elif self.params["join_algo"] == "SeqPrep":
                    # Define the file names for SeqPrep. See qiime documentation
                    raise AssertionExcept("SeqPrep is not yet defined...\n")
                else:
                    raise AssertionExcept(
                        "You must define a join_algo. Either fastq-join or SeqPrep...\n"
                    )

                ############# 3
                # Leave space to define concatenation of R and F files:
                if join_mode == "join_cat":
                    # Define qiime files as readsS (+ joined + catted)
                    raise AssertionExcept("join_cat not defined yet!!!\n")

            ############# 4
            # Putting links to final files
            if not join_required or join_mode == "join":
                # directions contains existing files that appear in the set to the right of the "&"
                directions = set(
                    self.sample_data[sample].keys()) & directions_to_use
                for direction in directions:
                    # direction[6] is the letter following "fastq." (S, F, R or J)
                    link_name = self.links_dir + sample + os.sep + ".".join(
                        [sample, direction[6], "fastq"])
                    if os.path.exists(link_name):
                        self.write_warning(
                            "Link %s exists. Will overwrite when script is executed!!!!\n"
                            % link_name)
                    self.script += "ln -sf %s %s" % (
                        self.sample_data[sample][direction],
                        link_name) + "\n\n"

            self.sample_data["qiime.prep_links_dir"] = self.links_dir

            # Move all files from temporary local dir to permanent base_dir
            self.local_finish(
                use_dir, sample_dir
            )  # Sees to copying local files to final destination (and other stuff)

            # self.stamp_dir_files(sample_dir)

            self.create_low_level_script()
コード例 #14
0
def get_global_Sample_data(self, category):
    """ Return the project-level slot reached by following a path of keys.

        Args:
            self: object holding the 'sample_data' nested dictionary.
            category: sequence of keys; each key indexes into the value found
                by the previous one, starting at self.sample_data. An empty
                sequence returns self.sample_data itself.

        Returns:
            The value stored at the end of the key path.

        Raises:
            AssertionExcept: if a key on the path does not exist, or if an
                intermediate value cannot be indexed by the next key.
    """
    slot = self.sample_data
    try:
        # Plain indexing, so that a missing key really raises:
        # dict.get would silently return None instead.
        for key in category:
            slot = slot[key]
    except (KeyError, TypeError):
        raise AssertionExcept("The Slot %s is not found in sample data" %
                              str(category).replace(",", ""))
    return slot
コード例 #15
0
    def build_scripts(self):
        """ This is the actual script building function
            Most, if not all, editing should be done here 
            HOWEVER, DON'T FORGET TO CHANGE THE CLASS NAME AND THE FILENAME!
        """
        GFF_dir = get_global_Sample_data(self, ["GFF_dir"])
        sample = 'Pan_Genome'
        # Name of specific script:
        self.spec_script_name = "_".join([self.step, self.name, sample])
        self.script = ""

        # Make a dir for the RESULTS:
        sample_dir = self.make_folder_for_sample(sample)

        # This line should be left before every new script. It sees to local issues.
        # Use the dir it returns as the base_dir for this step.
        #use_dir = self.local_start(sample_dir)

        # Define output filename
        output_filename = "".join([sample_dir, sample])
        #Roary main command
        self.script += self.get_script_const()
        self.script += " -f %s \\\n\t" % output_filename
        self.script += " %s*.gff \n\n" % GFF_dir

        # Adding the results data
        set_global_Sample_data(self, ["pan_genome_results_dir"],
                               output_filename)
        set_global_Sample_data(
            self, ["presence_absence_matrix"],
            os.path.join(output_filename, "gene_presence_absence.csv"))
        set_global_Sample_data(
            self, ["pan_genome_clustered_proteins"],
            os.path.join(output_filename, "clustered_proteins"))

        # Creating the plots
        if "plot" in self.params.keys():
            if "Roary_matrix_plot.py" in os.listdir(self.module_location):
                self.script += "env %s  \\\n" % self.params["env"]
                self.script += "python  %s \\\n\t" % os.path.join(
                    self.module_location, "Roary_matrix_plot.py")
                if type(self.params["plot"]) == dict:
                    if "format" in self.params["plot"].keys():
                        self.script += " --format %s \\\n\t" % self.params[
                            "plot"]["format"]
                    if "virulence_resistance_tag" in self.params.keys():
                        if self.params["virulence_resistance_tag"] == "VFDB":
                            self.script += " --tag %s \\\n\t" % "Virulence_Resistance.fasta:Virulence"
                        else:
                            self.script += " --tag %s \\\n\t" % self.params[
                                "virulence_resistance_tag"]
                    if "Clustering_method" in self.params["plot"].keys():
                        self.script += " -C %s \\\n\t" % self.params["plot"][
                            "Clustering_method"]
                self.script += " -O %s \\\n\t" % get_global_Sample_data(
                    self, ["pan_genome_results_dir"])
                self.script += " -P %s \n\n" % get_global_Sample_data(
                    self, ["presence_absence_matrix"])
            else:
                raise AssertionExcept(
                    "The file %s is not found in the Roary module directory" %
                    "Roary_matrix_plot.py")

        # Pan-genome wide association studies using scoary
        scoary_traits_file = ''
        gene_presence_absence_file_loc = get_global_Sample_data(
            self, ["presence_absence_matrix"])
        if "scoary" in self.params.keys():
            if type(self.params["scoary"]) == dict:
                if self.params["scoary"]["script_path"] != None:
                    # if the traits file is provided use it
                    if "traits_file" in self.params["scoary"].keys():
                        scoary_traits_file = self.params["scoary"][
                            "traits_file"]
                        # Creating the result dir
                        GWAS_dir = self.make_folder_for_sample("GWAS")
                    # if the a metadata file is provided use it to create traits file
                    elif "metadata_file" in self.params["scoary"].keys():
                        if "Traits_Parser.py" in os.listdir(
                                self.module_location):
                            if "traits_to_pars" in self.params["scoary"].keys(
                            ):
                                # Creating the result dir
                                GWAS_dir = self.make_folder_for_sample("GWAS")
                                self.script += "env %s  \\\n" % self.params[
                                    "env"]
                                self.script += "python %s \\\n\t" % os.path.join(
                                    self.module_location, "Traits_Parser.py")
                                self.script += " -M %s \\\n\t" % self.params[
                                    "scoary"]["metadata_file"]
                                self.script += " -O %s \\\n\t" % GWAS_dir
                                # This option will create new gene presence absence file with correct samples names and the are shared with the traits file
                                self.script += " -P %s \\\n\t" % gene_presence_absence_file_loc
                                if "metadata_samples_ID_field" in self.params[
                                        "scoary"].keys():
                                    self.script += " --S_MetaData %s \\\n\t" % self.params[
                                        "scoary"]["metadata_samples_ID_field"]
                                self.script += " --Fields_val %s \n\n" % self.params[
                                    "scoary"]["traits_to_pars"]
                                scoary_traits_file = os.path.join(
                                    GWAS_dir, 'Traits_file.csv')
                                # The new gene presence absence file is in the GWAS dir and it is the input for scoary
                                gene_presence_absence_file_loc = os.path.join(
                                    GWAS_dir, "gene_presence_absence.csv")
                        else:
                            raise AssertionExcept(
                                "The file %s is not found in the Roary module directory"
                                % "Traits_Parser.py")
                    if len(scoary_traits_file) > 0:
                        self.script += "env %s  \\\n" % self.params["env"]
                        self.script += "%s \\\n\t" % self.params["scoary"][
                            "script_path"]
                        self.script += " -o %s \\\n\t" % GWAS_dir
                        self.script += " -g %s \\\n\t" % gene_presence_absence_file_loc
                        self.script += " -t %s \\\n\t" % scoary_traits_file
                        if ("use_cluster_tree" in self.params["scoary"].keys()
                            ) & ("plot" in self.params.keys()):
                            self.script += " -n %s \\\n\t" % os.path.join(
                                get_global_Sample_data(
                                    self, ["pan_genome_results_dir"]),
                                "pangenome_matrix.newick")
                        else:
                            self.script += " -u  \\\n\t"
                        if "Bonferroni_cutoff" in self.params["scoary"].keys():
                            self.script += " -c B -p %s \\\n\t" % self.params[
                                "scoary"]["Bonferroni_cutoff"]
                        elif "BH_cutoff" in self.params["scoary"].keys():
                            self.script += " -c BH -p %s \\\n\t" % self.params[
                                "scoary"]["BH_cutoff"]
                        if "permutations" in self.params["scoary"].keys():
                            self.script += " -e %s  \n\n" % self.params[
                                "scoary"]["permutations"]
                        # Adding the results data
                        set_global_Sample_data(self, ["GWAS_results_dir"],
                                               GWAS_dir)
        self.script += " \n\n"

        if "Bi-cluster" in self.params.keys():
            if "Biclustering.R" in os.listdir(self.module_location):
                gene_presence_absence_file_loc = get_global_Sample_data(
                    self, ["presence_absence_matrix"])
                # Make a dir for the results file:
                bicluster_results_dir = self.make_folder_for_sample(
                    "Bicluster")
                #Running the bicluster script
                self.script += "env %s  \\\n" % self.params["env"]
                self.script += "Rscript  %s \\\n\t" % os.path.join(
                    self.module_location, "Biclustering.R")
                temp_self_script = ""
                if type(self.params["Bi-cluster"]) == dict:
                    for par in self.params["Bi-cluster"].keys():
                        if par == "--Roary_Results":
                            self.write_warning(
                                "The '--Roary_Results' parameter in the Roary Bi-clustering analysis is ignored"
                            )
                        elif par == "-o":
                            self.write_warning(
                                "The '-o' parameter in the Roary Bi-clustering analysis is ignored"
                            )
                        elif par == "--Annotation":
                            if self.params["Bi-cluster"][par] == "VFDB":
                                if "VFDB_unified_VF_category_clustered.tsv" in os.listdir(
                                        self.module_location):
                                    temp_self_script +="%s  %%s \\\n\t" % par \
                                                                        % os.path.join(self.module_location,"VFDB_unified_VF_category_clustered.tsv")
                                else:
                                    raise AssertionExcept(
                                        "The file %s is not found in the Roary module directory"
                                        %
                                        "VFDB_unified_VF_category_clustered.tsv"
                                    )
                            else:
                                temp_self_script +="%s  %%s \\\n\t" % par \
                                                               % self.params["Bi-cluster"][par]
                        elif len(par) > 0:
                            if self.params["Bi-cluster"][par] != None:
                                temp_self_script +="%s  %%s \\\n\t" % par \
                                                               % self.params["Bi-cluster"][par]
                            else:
                                temp_self_script += "%s  \\\n\t" % par

                self.script += "--Roary_Results %s  \\\n\t" % gene_presence_absence_file_loc
                self.script += temp_self_script
                self.script += "-o %s  \\\n\t" % bicluster_results_dir
                self.script += " \n\n"
                set_global_Sample_data(self, ["Bicluster_results_dir"],
                                       bicluster_results_dir)
                set_global_Sample_data(
                    self, ["Bicluster_clusters"],
                    os.path.join(bicluster_results_dir, "Bicluster_clusters"))

                # Run Gecko gene clusters analysis based on the Bi-clustering analysis
                if "Gecko" in self.params.keys():
                    if type(self.params["Gecko"]) == dict:
                        if "script_path" in self.params["Gecko"].keys():
                            if self.params["Gecko"]["script_path"] != None:
                                if "GFF2Gecko3.py" in os.listdir(
                                        self.module_location):
                                    Bicluster_clusters = get_global_Sample_data(
                                        self, ["Bicluster_clusters"])
                                    gene_presence_absence_file_loc = get_global_Sample_data(
                                        self, ["presence_absence_matrix"])
                                    # Make a dir for the results file:
                                    Gecko_results_dir = self.make_folder_for_sample(
                                        "Gecko")
                                    #Running the GFF2Gecko3 script
                                    self.script += "env %s  \\\n" % self.params[
                                        "env"]
                                    self.script += "python  %s \\\n\t" % os.path.join(
                                        self.module_location, "GFF2Gecko3.py")
                                    if "-p" in self.params["redir_params"]:
                                        self.script += "-P  %s \\\n\t" % self.params[
                                            "redir_params"]["-p"]
                                    self.script += "-D  %s \\\n\t" % GFF_dir
                                    self.script += "-C  %s \\\n\t" % gene_presence_absence_file_loc
                                    self.script += "-B  %s \\\n\t" % Bicluster_clusters
                                    self.script += "-o  %s \n\n" % os.path.join(
                                        Gecko_results_dir, "Gecko.cog")

                                    self.script += "env %s  \\\n" % self.params[
                                        "env"]
                                    temp_self_script = ""
                                    Gecko_pars = list()
                                    for par in self.params["Gecko"].keys():
                                        Gecko_pars.append(par)
                                        if par == "-in":
                                            self.write_warning(
                                                "The '-in' parameter in the Roary Gecko analysis is ignored"
                                            )
                                        elif par == "-out":
                                            self.write_warning(
                                                "The '-out' parameter in the Roary Gecko analysis is ignored"
                                            )
                                        elif len(par) > 0:
                                            if self.params["Gecko"][
                                                    par] != None:
                                                temp_self_script +="%s  %%s \\\n\t" % par \
                                                                               % self.params["Gecko"][par]
                                            else:
                                                temp_self_script += "%s  \\\n\t" % par

                                    self.script += "%s  \\\n\t" % self.params[
                                        "Gecko"]["script_path"]

                                    if "-r" not in Gecko_pars:
                                        temp_self_script += "-r Reference_clusters \\\n\t"
                                    if "-s" not in Gecko_pars:
                                        temp_self_script += "-s 2 \\\n\t"
                                    if "-d" not in Gecko_pars:
                                        temp_self_script += "-d 7 \\\n\t"
                                    if "-q" not in Gecko_pars:
                                        temp_self_script += "-q 2 \\\n\t"
                                    if "-rO" not in Gecko_pars:
                                        temp_self_script += "-rO zippedPdfs showFiltered %s \\\n\t" % os.path.join(
                                            Gecko_results_dir, "Clusters.zip")
                                    else:
                                        temp_self_script +="-rO %s %%s \\\n\t" %  self.params["Gecko"]["-rO"]\
                                                                               %  os.path.join(Gecko_results_dir ,"Clusters" )
                                    self.script += "-in %s  \\\n\t" % os.path.join(
                                        Gecko_results_dir, "Gecko.cog")
                                    self.script += "-out %s  \\\n\t" % os.path.join(
                                        Gecko_results_dir, "Gecko.gck")
                                    self.script += temp_self_script
                                    self.script += " \n\n"
                                    set_global_Sample_data(
                                        self, ["Gecko_results_dir"],
                                        Gecko_results_dir)
                                else:
                                    raise AssertionExcept(
                                        "The file %s is not found in the Roary module directory"
                                        % "GFF2Gecko3.py")
                            else:
                                raise AssertionExcept(
                                    "No %s running command found" % "Gecko")
            else:
                raise AssertionExcept(
                    "The file %s is not found in the Roary module directory" %
                    "Biclustering.R")

        for sample in self.sample_data[
                "samples"]:  # Getting list of samples out of samples_hash
            # Store Roary result location:
            set_Sample_data(
                self, sample, ["pan_genome_results_dir"],
                get_global_Sample_data(self, ["pan_genome_results_dir"]))

        # Wrapping up function. Leave these lines at the end of every iteration:
        #self.local_finish(use_dir,sample_dir)       # Sees to copying local files to final destination (and other stuff)

        self.create_low_level_script()
コード例 #16
0
    def step_sample_initiation(self):
        """Check per-sample inputs and record the reference for every sample.

        For each sample a warning is written if a "sam" slot already exists,
        and the slots required by the configured 'mod' ("samse" or "sampe")
        are looked up. Afterwards exactly one of 'scope' / 'ref_index' must
        be present in the parameters:

        * with 'scope', each sample's "reference" slot is copied from the
          "bwa_fasta" slot at project or sample level;
        * with 'ref_index', the optional 'ref_genome' parameter is stored as
          each sample's "reference" (a warning is written if none is given).

        Raises:
            AssertionExcept: on missing slots, when both or neither of
                'scope'/'ref_index' are given, on an invalid scope value, or
                when 'ref_genome' is passed together with 'scope'.
        """
        # Slots that must exist in the sample for each supported mode:
        single_end_slots = ("saiS", "fastq.S")
        paired_end_slots = ("saiF", "saiR", "fastq.F", "fastq.R")

        for sample in self.sample_data["samples"]:
            sample_info = self.sample_data[sample]

            if "sam" in sample_info:
                self.write_warning(
                    "SAM file exists for sample. Double mapping steps?\n",
                    sample)

            mod = self.params["mod"]
            if mod == "samse":
                try:
                    for slot in single_end_slots:
                        sample_info[slot]
                except KeyError:
                    raise AssertionExcept(
                        "'samse' requires sai and single-end fatsq files for the sample. Make sure you have a bwa aln step before this step and 'Single' files in the sample file.",
                        sample)
            if mod == "sampe":
                try:
                    for slot in paired_end_slots:
                        sample_info[slot]
                except KeyError:
                    raise AssertionExcept(
                        "'sampe' requires sai and paired-end fatsq files for the sample. Make sure you have a bwa aln step before this step and 'Forward' and 'Reverse' files in the sample file.",
                        sample)

        # No 'scope': an external index ('ref_index') is mandatory.
        if "scope" not in self.params:
            if "ref_index" not in self.params:
                raise AssertionExcept(
                    "Neither 'scope' nor 'ref_index' specified.\n")

            if "ref_genome" not in self.params.keys():
                self.write_warning(
                    "No reference given. It is highly recommended to give one!\n"
                )
                return

            # Store the reference genome for downstream steps. An existing
            # reference is overwritten (after a warning), not kept.
            for sample in self.sample_data["samples"]:
                if "reference" in self.sample_data[sample]:
                    self.write_warning(
                        "ref_genome was passed, but a reference already exists. Setting reference to 'ref_genome'\n"
                    )
                self.sample_data[sample]["reference"] = self.params[
                    "ref_genome"]
            return

        # 'scope' is defined: it must not be combined with 'ref_index'.
        if "ref_index" in self.params:
            raise AssertionExcept("Both 'scope' and 'ref_index' specified!\n")

        try:
            for sample in self.sample_data["samples"]:
                scope = self.params["scope"]
                if scope == "project":
                    # Project-wide reference:
                    fasta_owner = self.sample_data
                elif scope == "sample":
                    # Per-sample reference:
                    fasta_owner = self.sample_data[sample]
                else:
                    raise AssertionExcept(
                        "Scope must be either 'sample' or 'project'")
                self.sample_data[sample]["reference"] = fasta_owner[
                    "bwa_fasta"]
        except KeyError:
            # The "bwa_fasta" slot is missing at the requested level.
            raise AssertionExcept(
                "There is a mismatch between 'scope' and the existing bwa index\n",
                sample)

        if "ref_genome" in self.params.keys():
            raise AssertionExcept(
                "ref_genome was passed, and 'scope' was defined. Ignoring ref_genome\n"
            )
コード例 #17
0
    def step_sample_initiation(self):
        """Validate the reference configuration and set each sample's "reference".

        Exactly one of the 'scope' parameter or the '--genomeDir' redirected
        parameter must be given:

        * with 'scope', each sample's "reference" slot is copied from the
          "STAR_fasta" slot at project level (scope "project") or at sample
          level (scope "sample");
        * with '--genomeDir', the optional 'ref_genome' parameter is stored
          as each sample's "reference"; a warning is written if it is absent.

        Raises:
            AssertionExcept: when both or neither of 'scope'/'--genomeDir'
                are given, when the "STAR_fasta" slot is missing at the
                requested scope, on an invalid scope value, or when
                'ref_genome' is passed together with 'scope'.
        """

        ##########################################

        # Require either 'scope' or '--genomeDir':
        if "scope" in self.params:
            # If scope is defined, '--genomeDir' must not be passed as well.
            if "--genomeDir" in self.params["redir_params"]:
                raise AssertionExcept(
                    "Both 'scope' and '--genomeDir' specified!\n")

            # Loop over samples to set the reference genome:
            for sample in self.sample_data["samples"]:
                if self.params["scope"] == "project":
                    # Set project wide reference:
                    try:
                        self.sample_data[sample][
                            "reference"] = self.sample_data["STAR_fasta"]
                    except KeyError:
                        # Only a missing slot means "no reference"; any other
                        # error is a real bug and must propagate unchanged.
                        raise AssertionExcept(
                            "No reference exists at 'project' scope. Do you have a STAR_builder step defined?"
                        )
                elif self.params["scope"] == "sample":
                    # Set per-sample reference:
                    try:
                        self.sample_data[sample][
                            "reference"] = self.sample_data[sample][
                                "STAR_fasta"]
                    except KeyError:
                        # Only a missing slot means "no reference"; any other
                        # error is a real bug and must propagate unchanged.
                        raise AssertionExcept(
                            "No reference exists at 'sample' scope. Do you have a STAR_builder step defined?",
                            sample)
                else:
                    raise AssertionExcept(
                        "Scope must be either 'sample' or 'project'")

            if "ref_genome" in self.params.keys():
                raise AssertionExcept(
                    "ref_genome was passed, and 'scope' was defined. Resolve!\n"
                )
        else:
            # If scope is not defined, require '--genomeDir'
            if not "--genomeDir" in self.params["redir_params"]:
                raise AssertionExcept(
                    "Neither 'scope' nor '--genomeDir' specified.\n")
            # Storing reference genome for use by downstream steps:
            if "ref_genome" in self.params.keys():
                for sample in self.sample_data["samples"]:
                    # If a reference already exists it is overwritten by
                    # ref_genome; the warning records that this happened.
                    if "reference" in self.sample_data[sample]:
                        self.write_warning(
                            "ref_genome was passed, but a reference already exists. Setting reference to 'ref_genome'\n"
                        )

                    self.sample_data[sample]["reference"] = self.params[
                        "ref_genome"]
            else:
                self.write_warning(
                    "No reference given. It is highly recommended to give one!\n"
                )
コード例 #18
0
    def step_sample_initiation_byproject(self):
        """Check the configured input/output File_Types at project level.

        Inputs: every entry under the 'inputs' parameter must name a
        File_Type. If the entry's scope is "project" the File_Type must be a
        key of ``self.sample_data``; otherwise it must be a key of every
        sample's dictionary. A warning is written for inputs flagged 'del'.

        Outputs: for every entry under 'outputs' that names a File_Type, a
        placeholder (``None``) is created in ``self.sample_data`` when the
        File_Type does not exist yet; a warning is written when it exists
        with content; an existing ``None`` placeholder means the same output
        File_Type was declared twice. Outputs may not be flagged 'del'.

        NOTE(review): the File_Type lookup is done once per entry and reused;
        this assumes get_File_Type_data is a side-effect-free lookup - confirm.

        Raises:
            AssertionExcept: on a missing or unknown input File_Type, on an
                output File_Type declared more than once, or on an output
                flagged 'del'.
        """
        if len(get_File_Type_data(self.params, ["inputs"])) > 0:
            # Test if the input File_Types exist
            for inputs in self.params["inputs"].keys():
                file_type = get_File_Type_data(self.params["inputs"],
                                               [inputs, "File_Type"], None)
                # The user must specify a File_Type for the input argument
                if file_type is None:
                    raise AssertionExcept(
                        "You mast specify a File_Type argument in the input parameter: %s "
                        % (inputs, ))

                # All values are interpolated in a single formatting pass so
                # that a '%' inside a File_Type or sample name is never
                # re-interpreted as a format directive.
                if get_File_Type_data(self.params["inputs"],
                                      [inputs, "scope"]) == "project":
                    if file_type not in self.sample_data.keys():
                        raise AssertionExcept(
                            "The File_Type %s is not found in the PROJECT level \n\t File_Types available are : %s"
                            % (file_type, self.sample_data.keys()))
                else:
                    for sample in self.sample_data["samples"]:
                        if file_type not in self.sample_data[sample].keys():
                            raise AssertionExcept(
                                "The File_Type %s is not found in the SAMPLE level [in sample name %s] \n\t File_Types available are : %s"
                                % (file_type, sample,
                                   self.sample_data[sample].keys()))

                if "del" in self.params["inputs"][inputs].keys():
                    self.write_warning(
                        "!!! The file/directory in the input File_Type %s will be DELETED at the end of this step!!! "
                        % (file_type, ))

        if len(get_File_Type_data(self.params, ["outputs"])) > 0:
            # Test the output File_Types
            for outputs in self.params["outputs"].keys():
                file_type = get_File_Type_data(self.params["outputs"],
                                               [outputs, "File_Type"], None)
                # Outputs without a File_Type are skipped here
                if file_type is not None:
                    if file_type in self.sample_data.keys():
                        # A None value is the placeholder written below for a
                        # previous output entry with the same File_Type.
                        if self.sample_data[file_type] is None:
                            raise AssertionExcept(
                                "The output File_Type %s in the PROJECT level was defined more the once !!! "
                                % (file_type, ))
                        else:
                            self.write_warning(
                                "The output File_Type %s already exists in the PROJECT level, it's content will be override !!! "
                                % (file_type, ))
                    else:
                        # The File_Type does not exist yet: create an empty one
                        self.sample_data[file_type] = None

                if "del" in self.params["outputs"][outputs].keys():
                    raise AssertionExcept(
                        "Output File_Types cannot be deleted")
コード例 #19
0
    def build_scripts(self):
        """Build one mapping script per sample and register its output files.

        For every sample this sets ``self.spec_script_name`` and
        ``self.script``, fills the '--outSAMattrRGline' (and, when 'scope' is
        defined, '--genomeDir') redirected parameters, appends the read files
        and output prefix to the script, records the expected output files
        (SAM/BAM, bedGraph/wiggle tracks, transcriptome BAM) in the sample's
        slots, and finally calls ``self.create_low_level_script()``.

        Raises:
            AssertionExcept: when the sample has neither "fastq.F" nor
                "fastq.S".
        """

        # Each iteration must define the following class variables:
        # self.spec_script_name
        # self.script

        for sample in self.sample_data["samples"]:
            sample_info = self.sample_data[sample]
            redir = self.params["redir_params"]

            # Make a dir for the current sample:
            sample_dir = self.make_folder_for_sample(sample)

            # Name of specific script:
            self.spec_script_name = "_".join((self.step, self.name, sample))
            self.script = ""

            # This line should be left before every new script. It sees to local issues.
            # Use the dir it returns as the base_dir for this step.
            use_dir = self.local_start(sample_dir)

            # Prefix of the output file names, and its full path in the
            # permanent sample dir:
            output_prefix = sample + "_STAR_map"
            out_base = "%s%s" % (sample_dir, output_prefix)

            # The read-group line always starts with the sample ID; a
            # user-supplied 'outSAMattrRGline' is appended after it.
            rg_line = "ID:{ID}".format(ID=sample)
            if "outSAMattrRGline" in self.params:
                rg_line = "{head} {rest}".format(
                    head=rg_line, rest=self.params["outSAMattrRGline"])
            redir["--outSAMattrRGline"] = rg_line

            # If using an internal index, take it from the matching level:
            if "scope" in self.params:
                if self.params["scope"] == "sample":
                    index_owner = sample_info
                else:
                    index_owner = self.sample_data
                redir["--genomeDir"] = index_owner["STAR_index"]

            # Get constant part of script:
            self.script += self.get_script_const()

            # Paired-end files take precedence over a single-end file:
            if "fastq.F" in sample_info:
                reads = "%s %s" % (sample_info["fastq.F"],
                                   sample_info["fastq.R"])
            elif "fastq.S" in sample_info:
                reads = sample_info["fastq.S"]
            else:
                raise AssertionExcept("No fastq files exist for sample!!\n",
                                      sample)
            self.script += "--readFilesIn %s \\\n\t" % reads

            self.script += "--outFileNamePrefix %s%s. \n\n" % (use_dir,
                                                               output_prefix)

            # Alignment output:
            output_type = self.output_type
            if output_type == "SAM":
                sample_info["sam"] = out_base + ".Aligned.out.sam"
                self.stamp_file(sample_info["sam"])
            elif output_type == "BAM":
                if "Unsorted" in self.bam_types:
                    unsorted_bam = out_base + ".Aligned.out.bam"
                    sample_info["bam"] = unsorted_bam
                    sample_info["bam_unsorted"] = unsorted_bam
                    self.stamp_file(sample_info["bam_unsorted"])
                if "SortedByCoordinate" in self.bam_types:
                    # The sorted file, when produced, is the main "bam" slot
                    sample_info["bam"] = (
                        out_base + ".Aligned.sortedByCoord.out.bam")
                self.stamp_file(sample_info["bam"])

            # Signal tracks: slot-name prefix and file extension per type
            wig_type = self.wig_type
            if wig_type == "bedGraph":
                track_slot, track_ext = "bdg", "bg"
            elif wig_type == "wiggle":
                track_slot, track_ext = "wig", "wig"
            else:
                track_slot = None

            if track_slot is not None:
                # Strand 2 is registered (first) only when '--outWigStrand'
                # is absent or set to "Stranded"; strand 1 always is.
                strands = ["1"]
                if ("--outWigStrand" not in redir
                        or redir["--outWigStrand"] == "Stranded"):
                    strands.insert(0, "2")
                for strand in strands:
                    multi_slot = "%s%s_UniqueMultiple" % (track_slot, strand)
                    unique_slot = "%s%s_Unique" % (track_slot, strand)
                    sample_info[multi_slot] = (
                        "%s.Signal.UniqueMultiple.str%s.out.%s" %
                        (out_base, strand, track_ext))
                    sample_info[unique_slot] = (
                        "%s.Signal.Unique.str%s.out.%s" %
                        (out_base, strand, track_ext))
                    self.stamp_file(sample_info[multi_slot])
                    self.stamp_file(sample_info[unique_slot])
                # The general track slot points at strand 1, UniqueMultiple
                sample_info[track_slot] = sample_info[track_slot +
                                                      "1_UniqueMultiple"]

            # Only the TranscriptomeSAM quantification mode registers an
            # output file; GeneCounts is not supported yet.
            if redir.get("--quantMode") == "TranscriptomeSAM":
                sample_info["bam_transcriptome"] = (
                    out_base + ".Aligned.toTranscriptome.out.bam")

            # Storing name of mapper. might be useful:
            sample_info["mapper"] = self.get_step_step()

            # Storing reference genome for use by downstream steps:
            if "ref_genome" in self.params:
                sample_info["reference"] = self.params["ref_genome"]

            # Move all files from temporary local dir to permanent base_dir
            # (sees to copying local files to final destination and other stuff)
            self.local_finish(use_dir, self.base_dir)

            self.create_low_level_script()
コード例 #20
0
    def build_scripts(self):
        """ Assemble the freebayes command line(s) and emit the low-level script(s).

            With scope "project" a single script is built that receives the BAM
            file of every sample; with any other scope one script is built per
            sample. The output is written as vcf when the "output_type"
            parameter equals "vcf" and as gvcf otherwise. The final file path
            is stored in sample_data (project-wide or under the sample),
            tagged with source "freebayes", and stamped.

            Raises AssertionExcept in project scope when the samples do not
            all share one reference file.
        """

        # Every script built here has to set self.spec_script_name and self.script.

        if self.params["scope"] != "project":
            # ---- One script per sample ----
            for sample in self.sample_data["samples"]:
                # Each sample writes into its own folder:
                sample_dir = self.make_folder_for_sample(sample)

                self.set_spec_script_name(sample=True)
                self.script = ""

                # local_start() returns the directory the script should write to:
                use_dir = self.local_start(sample_dir)

                ref_file = self.sample_data[sample]["reference"]

                # Constant part, then reference and the sample's BAM file:
                self.script += self.get_script_const()
                self.script += "-f %s \\\n\t" % ref_file
                self.script += "-b %s \\\n\t" % self.sample_data[sample]["bam"]

                # Anything that is not "vcf" is treated as "gvcf":
                out_ext = ("vcf"
                           if self.params["output_type"] == "vcf" else "gvcf")
                self.script += "--%s %s%s_%s.%s \n\n" % (
                    out_ext, use_dir, sample, self.get_step_name(), out_ext)

                # The script writes into use_dir; the registered path is in sample_dir.
                self.sample_data[sample][out_ext] = "%s%s_%s.%s" % (
                    sample_dir, sample, self.get_step_name(), out_ext)
                self.sample_data[sample][out_ext + ".source"] = "freebayes"
                self.stamp_file(self.sample_data[sample][out_ext])

                # Copy results from the local dir to the sample dir (and other stuff):
                self.local_finish(use_dir, sample_dir)

                self.create_low_level_script()
            return

        # ---- One script for the whole project ----
        self.set_spec_script_name()
        self.script = ""

        # local_start() returns the directory the script should write to:
        use_dir = self.local_start(self.base_dir)

        # All samples are expected to share a single reference file:
        references = {
            self.sample_data[sample]["reference"]
            for sample in self.sample_data["samples"]
        }
        if len(references) > 1:
            raise AssertionExcept(
                "There is more than one reference file for the samples. Weird!!!"
            )
        ref_file = list(references)[0]

        # Constant part, then reference and one -b argument per sample:
        self.script += self.get_script_const()
        self.script += "-f %s \\\n\t" % ref_file
        self.script += "".join(
            "-b %s \\\n\t" % self.sample_data[sample]["bam"]
            for sample in self.sample_data["samples"])

        # Anything that is not "vcf" is treated as "gvcf":
        out_ext = "vcf" if self.params["output_type"] == "vcf" else "gvcf"
        title = self.sample_data["Title"]
        self.script += "--%s %s%s_%s.%s \n\n" % (
            out_ext, use_dir, title, self.get_step_name(), out_ext)

        # The script writes into use_dir; the registered path is in base_dir.
        self.sample_data[out_ext] = "%s%s_%s.%s" % (
            self.base_dir, title, self.get_step_name(), out_ext)
        self.sample_data[out_ext + ".source"] = "freebayes"
        self.stamp_file(self.sample_data[out_ext])

        # Copy results from the local dir to base_dir (and other stuff):
        self.local_finish(use_dir, self.base_dir)

        self.create_low_level_script()
コード例 #21
0
    def build_scripts(self):
        """ Build the script(s) that parse BLAST results, per project or per sample.

            With scope "project" one script is built from the project-level
            "blast.nucl" / "blast.prot" slot; otherwise one script is built for
            every sample from that sample's slots. When both types are
            present, the 'fasta2use' parameter must select one of "nucl" or
            "prot". When the 'extract_fasta' parameter is present, the fasta
            file of the same type is passed with --fasta2extract.

            The path of the parsed output is stored in the "blast.parsed" and
            "blast.parsed.<type>" slots and stamped.

            Raises:
                AssertionExcept: if no BLAST results are defined, if both types
                    exist and 'fasta2use' does not select one, or if
                    'extract_fasta' is requested but no fasta file of the
                    matching type exists.
        """

        if self.params["scope"] == "project":
            # Name of specific script:
            self.spec_script_name = "_".join(
                [self.step, self.name, self.sample_data["Title"]])
            self.script = ""

            # This line should be left before every new script. It sees to local issues.
            # Use the dir it returns as the base_dir for this step.
            use_dir = self.local_start(self.base_dir)

            # Define output filename
            output_filename = "".join(
                [self.sample_data["Title"], self.file_tag])

            # Decide which BLAST result type to parse. If both nucl and prot
            # results exist, the user has to choose through 'fasta2use'.
            if "blast.nucl" in self.sample_data and "blast.prot" in self.sample_data:
                if "fasta2use" in self.params and self.params[
                        "fasta2use"] in ("nucl", "prot"):
                    fasta2use = self.params["fasta2use"]
                else:
                    raise AssertionExcept(
                        "Project has both 'nucl' and 'prot' blast results. Select one by specifying the 'fasta2use' parameter."
                    )
            elif "blast.nucl" in self.sample_data:
                fasta2use = "nucl"
            elif "blast.prot" in self.sample_data:
                fasta2use = "prot"
            else:
                raise AssertionExcept("No BLAST Results defined\n")

            self.script += self.get_script_const()
            self.script += "--blast %s \\\n\t" % self.sample_data["blast." +
                                                                  fasta2use]

            # FASTA Extraction
            if "extract_fasta" in self.params:
                # Only the slot lookup can raise KeyError, so only it is guarded.
                try:
                    fasta_file = self.sample_data["fasta." + fasta2use]
                except KeyError:
                    raise AssertionExcept(
                        "In order to extract the fasta sequences, you need to have a project wide fasta file defined with the same type as the blast type."
                    )
                self.script += "--fasta2extract %s \\\n\t" % fasta_file

            self.script += "--output %s\n\n" % os.sep.join(
                [use_dir, output_filename])

            # Store BLAST result file:
            self.sample_data["blast.parsed"] = "".join(
                [self.base_dir, output_filename])
            self.sample_data["blast.parsed." +
                             fasta2use] = self.sample_data["blast.parsed"]
            self.stamp_file(self.sample_data["blast.parsed"])

            # Wrapping up function. Leave these lines at the end of every iteration:
            self.local_finish(
                use_dir, self.base_dir
            )  # Sees to copying local files to final destination (and other stuff)

            self.create_low_level_script()

        else:  # self.params["scope"]=="sample":
            for sample in self.sample_data[
                    "samples"]:  # Getting list of samples out of samples_hash

                # Name of specific script:
                self.spec_script_name = "_".join(
                    [self.step, self.name, sample])
                self.script = ""

                # Make a dir for the current sample:
                sample_dir = self.make_folder_for_sample(sample)

                # This line should be left before every new script. It sees to local issues.
                # Use the dir it returns as the base_dir for this step.
                use_dir = self.local_start(sample_dir)

                # Full path of the output file inside the working directory
                output_filename = "".join([use_dir, sample, self.file_tag])

                # Decide which BLAST result type to parse. If both nucl and
                # prot results exist, the user has to choose through 'fasta2use'.
                if "blast.nucl" in self.sample_data[
                        sample] and "blast.prot" in self.sample_data[sample]:
                    if "fasta2use" in self.params and self.params[
                            "fasta2use"] in ("nucl", "prot"):
                        fasta2use = self.params["fasta2use"]
                    else:
                        raise AssertionExcept(
                            "Sample has both 'nucl' and 'prot' blast results. Select one by specifying the 'fasta2use' parameter.",
                            sample)
                elif "blast.nucl" in self.sample_data[sample]:
                    fasta2use = "nucl"
                elif "blast.prot" in self.sample_data[sample]:
                    fasta2use = "prot"
                else:
                    # Name the offending sample, like the other errors in this branch
                    raise AssertionExcept("No BLAST Results defined\n",
                                          sample)

                # Define the actual script:
                self.script += self.get_script_const()
                self.script += "--blast %s \\\n\t" % self.sample_data[sample][
                    "blast." + fasta2use]

                # FASTA Extraction
                if "extract_fasta" in self.params:
                    # Only the slot lookup can raise KeyError, so only it is guarded.
                    try:
                        fasta_file = self.sample_data[sample]["fasta." +
                                                              fasta2use]
                    except KeyError:
                        raise AssertionExcept(
                            "In order to extract the fasta sequences, you need to have a fasta file defined with the same type as the blast type.",
                            sample)
                    self.script += "--fasta2extract %s \\\n\t" % fasta_file

                self.script += "--output %s\n\n" % output_filename

                # Store BLAST result file:
                self.sample_data[sample]["blast.parsed"] = "".join(
                    [sample_dir, sample, self.file_tag])
                self.sample_data[sample][
                    "blast.parsed." +
                    fasta2use] = self.sample_data[sample]["blast.parsed"]
                self.stamp_file(self.sample_data[sample]["blast.parsed"])

                # Wrapping up function. Leave these lines at the end of every iteration:
                self.local_finish(
                    use_dir, sample_dir
                )  # Sees to copying local files to final destination (and other stuff)

                self.create_low_level_script()
コード例 #22
0
 def step_specific_init(self):
     """ Set the shell and output file tag, and reject a user-supplied output directory.

         Raises AssertionExcept when "--output_dir" or "-o" appears among the
         redirected parameters.
     """
     self.shell = "csh"      # Inheriting instances may switch this to "bash"
     self.file_tag = "trim_galore.fq"

     redirected = self.params["redir_params"]
     if any(flag in redirected for flag in ("--output_dir", "-o")):
         raise AssertionExcept("You should not give output directory\n")
コード例 #23
0
    def build_scripts(self):
        """ Build the cd-hit / cd-hit-est clustering script(s), per project or per sample.

            The input is the "fasta.nucl" slot when self.type is "nucl" and the
            "fasta.prot" slot otherwise. With scope "project" one script is
            built from the project-level slot; otherwise one script is built
            for every sample. The output file keeps the basename of the input
            file. Its final path replaces the "fasta.<type>" slot, is copied
            to "cd_hit.<type>", and is stamped.

            Raises:
                AssertionExcept: if the required fasta slot does not exist at
                    the requested scope.
        """

        # Each iteration must define the following class variables:
        # self.spec_script_name
        # self.script

        # Input slot and the hint shown when it is missing. Any type other
        # than "nucl" is handled as "prot".
        if self.type == "nucl":
            seq_type = "nucl"
            hint = "Did you mean cd-hit instead of cd-hit-est?"
        else:
            seq_type = "prot"
            hint = "Did you mean cd-hit-est instead of cd-hit?"
        input_slot = "fasta." + seq_type

        if self.params["scope"] == "project":

            # Name of specific script:
            self.spec_script_name = "_".join(
                [self.step, self.name, self.sample_data["Title"]])
            self.script = ""

            # This line should be left before every new script. It sees to local issues.
            # Use the dir it returns as the base_dir for this step.
            use_dir = self.local_start(self.base_dir)

            # Only a missing slot is reported as a configuration error;
            # any other exception is left to propagate unchanged.
            try:
                input_file = self.sample_data[input_slot]
            except KeyError:
                raise AssertionExcept(
                    "`%s` fasta file does not exist at project scope. %s" %
                    (seq_type, hint))

            output_prefix = os.path.basename(input_file)

            # Get constant part of script:
            self.script += self.get_script_const()
            self.script += "-i {infn} \\\n\t".format(infn=input_file)
            self.script += "-o {outdir}{ossep}{outfn} \n\n".format(
                outdir=use_dir, ossep=os.sep, outfn=output_prefix)

            # The script writes into use_dir; the registered path is in base_dir.
            self.sample_data["fasta." +
                             self.type] = "{outdir}{ossep}{outfn}".format(
                                 outdir=self.base_dir,
                                 ossep=os.sep,
                                 outfn=output_prefix)
            self.sample_data["cd_hit." +
                             self.type] = self.sample_data["fasta." +
                                                           self.type]
            self.stamp_file(self.sample_data["fasta." + self.type])

            # Move all files from temporary local dir to permanent base_dir
            self.local_finish(
                use_dir, self.base_dir
            )  # Sees to copying local files to final destination (and other stuff)

            self.create_low_level_script()

        else:
            for sample in self.sample_data[
                    "samples"]:  # Getting list of samples out of samples_hash

                # Make a dir for the current sample:
                sample_dir = self.make_folder_for_sample(sample)

                # Name of specific script:
                self.spec_script_name = "_".join(
                    [self.step, self.name, sample])
                self.script = ""

                # This line should be left before every new script. It sees to local issues.
                # Use the dir it returns as the base_dir for this step.
                use_dir = self.local_start(sample_dir)

                # Report a missing slot against the sample it is missing from.
                try:
                    input_file = self.sample_data[sample][input_slot]
                except KeyError:
                    raise AssertionExcept(
                        "`%s` fasta file does not exist for sample. %s" %
                        (seq_type, hint), sample)

                output_prefix = os.path.basename(input_file)

                # Get constant part of script:
                self.script += self.get_script_const()
                self.script += "-i {infn} \\\n\t".format(infn=input_file)
                self.script += "-o {outdir}{ossep}{outfn} \n\n".format(
                    outdir=use_dir, ossep=os.sep, outfn=output_prefix)

                # The script writes into use_dir; the registered path is in sample_dir.
                self.sample_data[sample][
                    "fasta." + self.type] = "{outdir}{ossep}{outfn}".format(
                        outdir=sample_dir, ossep=os.sep, outfn=output_prefix)
                self.sample_data[sample]["cd_hit." +
                                         self.type] = self.sample_data[sample][
                                             "fasta." + self.type]
                self.stamp_file(self.sample_data[sample]["fasta." + self.type])

                # Move all files from temporary local dir to permanent base_dir
                self.local_finish(
                    use_dir, sample_dir
                )  # Sees to copying local files to final destination (and other stuff)

                self.create_low_level_script()
コード例 #24
0
    def build_scripts(self):
        """ Build one Prokka annotation script per sample.

            Each script runs on the sample's "fasta.nucl" file and writes into
            the sample's directory, using the sample name as locus tag, strain
            and prefix. When the 'generate_GFF_dir' parameter is present, a
            "GFF" folder is created and every script also copies its .gff file
            there; the folder is stored in sample_data["GFF_dir"].

            If the constant part of the script contains "--proteins VFDB", it
            is replaced by the path of "Virulence_Resistance.fasta" in the
            module directory.

            After the call the sample's "GFF", "fasta.nucl" and "fasta.prot"
            slots point at the expected .gff, .ffn and .faa output files.

            Raises:
                AssertionExcept: if "--proteins VFDB" is requested and
                    "Virulence_Resistance.fasta" is not in the module directory.
        """

        # Each iteration must define the following class variables:
        # spec_script_name
        # script
        collect_gff = "generate_GFF_dir" in self.params
        if collect_gff:
            # Make a dir for the GFF files (the per-sample folder helper is
            # reused with the fixed name "GFF"):
            GFF_dir = self.make_folder_for_sample("GFF")
            self.sample_data["GFF_dir"] = GFF_dir

        for sample in self.sample_data[
                "samples"]:  # Getting list of samples out of samples_hash
            # Name of specific script:
            self.spec_script_name = "_".join([self.step, self.name, sample])
            self.script = ""

            # Make a dir for the current sample:
            sample_dir = self.make_folder_for_sample(sample)

            # This line should be left before every new script. It sees to local issues.
            # Use the dir it returns as the base_dir for this step.
            use_dir = self.local_start(sample_dir)

            self.script += self.get_script_const()
            if "--proteins VFDB" in self.script:
                vfdb_name = "Virulence_Resistance.fasta"
                if vfdb_name in os.listdir(self.module_location):
                    self.script = self.script.replace(
                        "--proteins VFDB", "--proteins %s" %
                        os.path.join(self.module_location, vfdb_name))
                else:
                    raise AssertionExcept(
                        "The file %s is not found in the Prokka module directory"
                        % vfdb_name)

            gff_file = os.path.join(sample_dir, sample + ".gff")

            self.script += "--outdir %s \\\n\t" % use_dir
            self.script += "--locustag %s \\\n\t" % sample
            self.script += "--strain %s \\\n\t" % sample
            self.script += "--prefix %s \\\n\t" % sample
            self.script += "%s \n\n" % self.sample_data[sample]["fasta.nucl"]
            if collect_gff:
                # Format both paths in one pass, so a '%' inside a path is
                # never interpreted as a format specifier.
                self.script += "cp %s  %s \n\n" % (gff_file, GFF_dir)

            # Store Prokka result files:
            self.sample_data[sample]["GFF"] = gff_file
            self.sample_data[sample]["fasta.nucl"] = os.path.join(
                sample_dir, sample + ".ffn")
            self.sample_data[sample]["fasta.prot"] = os.path.join(
                sample_dir, sample + ".faa")
            # Wrapping up function. Leave these lines at the end of every iteration:
            self.local_finish(
                use_dir, sample_dir
            )  # Sees to copying local files to final destination (and other stuff)

            self.create_low_level_script()
コード例 #25
0
    def step_sample_initiation(self):
        """ Check that a nucleotide fasta exists for the scope QUAST will run in.

            When the 'scope' parameter is given it must be "project" or
            "sample"; the matching ["fasta"]["nucl"] slot(s) must exist, and an
            empty "assembly" slot is created where none exists. When 'scope'
            is not given it is guessed: "project" if a project-wide
            ["fasta"]["nucl"] exists, otherwise "sample" provided every sample
            has one. The guessed value is written back to self.params.

            Raises AssertionExcept when a required fasta is missing or when
            'scope' has an unsupported value.
        """

        if "scope" not in self.params:
            # ---- No scope given: guess it from the available data ----
            self.write_warning("'scope' not passed. Will try guessing...")

            try:
                self.sample_data["fasta"]["nucl"]
            except KeyError:
                # No project-wide assembly, so every sample must carry its own:
                for sample in self.sample_data["samples"]:
                    try:
                        self.sample_data[sample]["fasta"]["nucl"]
                    except KeyError:
                        raise AssertionExcept(
                            "You are trying to run QUAST with no assembly.\n",
                            sample)

                self.params["scope"] = "sample"
                return

            self.write_warning("There is a project-wide assembly. Using it.\n")
            # Make sure an assembly slot exists:
            self.sample_data.setdefault("assembly", {})
            self.params["scope"] = "project"
            return

        # ---- Scope was given explicitly ----
        requested = self.params["scope"]

        if requested == "project":
            try:
                self.sample_data["fasta"]["nucl"]
            except KeyError:
                raise AssertionExcept("No project wide assembly!")

            # The slot can be absent when the fasta was loaded as input
            # rather than produced by an assembly step:
            self.sample_data.setdefault("assembly", {})

            if "compare_mode" in self.params:
                self.write_warning("Ignoring 'compare_mode' in project scope")

        elif requested == "sample":
            for sample in self.sample_data["samples"]:
                try:
                    self.sample_data[sample]["fasta"]["nucl"]
                except KeyError:
                    raise AssertionExcept(
                        "You are trying to run QUAST with no assembly.\n",
                        sample)

                # The slot can be absent when the fasta was loaded as input
                # rather than produced by an assembly step:
                self.sample_data[sample].setdefault("assembly", {})

        else:
            raise AssertionExcept(
                "'scope' must be either 'project' or 'sample'")
コード例 #26
0
    def step_specific_init(self):
        """ Set the default shell and require the 'genome' parameter.

            Raises AssertionExcept when 'genome' is missing from self.params.
        """
        # Inheriting instances may switch this to "bash"
        self.shell = "csh"

        genome_given = "genome" in self.params
        if not genome_given:
            raise AssertionExcept("You must pass a 'genome' parameter!")
コード例 #27
0
    def build_scripts(self):
        """ Build one MACS2 peak-calling script per sample listed in "Controls".

            sample_data["Controls"] maps each treatment sample to its control
            sample. For every treatment sample the script passes the sample's
            BAM with -t and, unless the 'nocontrol' parameter is present, the
            control's BAM with -c. Paths of the expected output files
            (peak bed, peak xls, summit bed) are stored under the sample and
            stamped; the peak bed becomes the sample's active "bed".

            Optional additions:
              * If "--bdg" or "-B" is among the redirected parameters, the
                bedgraph paths are stored as well.
              * If 'bedToBigBed_path' is passed (requires 'chrom.sizes'), the
                script also converts the bed file to bigBed.
              * If 'getfasta' is passed and the sample has a "reference", the
                script also extracts the peak sequences to a fasta file. With
                no reference only a warning is issued and no fasta is
                registered.

            Raises:
                AssertionExcept: if 'bedToBigBed_path' is passed without
                    'chrom.sizes'.
        """

        # Each iteration must define the following class variables:
        # self.spec_script_name
        # self.script
        for sample in self.sample_data["Controls"]:

            # Make a dir for the current sample:
            sample_dir = self.make_folder_for_sample(sample)

            # Name of specific script:
            self.spec_script_name = "_".join([self.step, self.name, sample])
            self.script = ""

            # Name of control sample:
            control = self.sample_data["Controls"][sample]

            # This line should be left before every new script. It sees to local issues.
            # Use the dir it returns as the base_dir for this step.
            use_dir = self.local_start(sample_dir)

            # Prefix (not a full path) passed to --name for the output files
            output_filename = "%s.%s" % (sample, self.file_tag)

            self.script += self.get_script_const()

            # Add lines for sample mapping files:
            self.script += "-t %s \\\n\t" % self.sample_data[sample]["bam"]
            if "nocontrol" not in self.params:
                self.script += "-c %s \\\n\t" % self.sample_data[control]["bam"]

            # Output prefix and directory. --name must end with a line
            # continuation, otherwise --outdir is cut off from the command.
            self.script += "--name %s \\\n\t" % output_filename
            self.script += "--outdir %s \n\n" % use_dir

            # Storing the output files in sample_data
            sample_slots = self.sample_data[sample]
            sample_slots["macs2_prefix"] = "".join([sample_dir, output_filename])
            sample_slots["peak_bed"] = "".join(
                [sample_dir, output_filename, "_peaks.bed"])
            sample_slots["peak_xls"] = "".join(
                [sample_dir, output_filename, "_peaks.xls"])
            sample_slots["summit_bed"] = "".join(
                [sample_dir, output_filename, "_summits.bed"])

            # Set active bed to peak_bed. Maybe let user decide?
            sample_slots["bed"] = sample_slots["peak_bed"]

            self.stamp_file(sample_slots["peak_bed"])
            self.stamp_file(sample_slots["peak_xls"])
            self.stamp_file(sample_slots["summit_bed"])

            # Storing bedgraph files if they should exist:
            redirected = self.params["redir_params"]
            if "--bdg" in redirected or "-B" in redirected:
                sample_slots["control_lambda"] = "".join(
                    [sample_dir, output_filename, "_control_lambda.bdg"])
                sample_slots["treat_pileup"] = "".join(
                    [sample_dir, output_filename, "_treat_pileup.bdg"])
                # Saving the treatment pileup bdg file as the main mapping bdg
                # (saving in mapping because it is a derivation of the mapping data)
                sample_slots["bdg"] = "".join(
                    [sample_dir, output_filename, "_treat_pileup.bdg"])
                # Saving the control pileup bdg file as the main control bdg
                self.sample_data[control]["bdg"] = "".join(
                    [sample_dir, output_filename, "_control_lambda.bdg"])
                # Stamping all bdg files
                self.stamp_file(sample_slots["control_lambda"])
                self.stamp_file(sample_slots["treat_pileup"])

            ##############################
            # Add conversion of peak bed to bigbed
            if "bedToBigBed_path" in self.params:
                if "chrom.sizes" not in self.params:
                    raise AssertionExcept("If bedToBigBed_path is passed, you also must sepcify a 'chrom.sizes' path")
                out_bed_filename = "%s.cut.bed" % sample_slots["bed"]
                out_bb_filename = "%s.cut.bb" % sample_slots["bed"]

                # Probably better to use the following perl one-liner:
                # perl -e 'while($line=<>){$line=~s/((?:\S*\s*){3})\.\d*(\s.*)/$1$2/; print $line}' bed_file
                # This will retain only the int part in column 4 while leaving the rest intact.
                self.script += (
                    "\n\n#### Convert bed to bigbed for UCSC browser\n\n"
                    "if [ -e %(in_bed)s ]\n"
                    "then\n"
                    "    # First, removing final column which has a float instead of an integer:\n"
                    "    cat \\\n"
                    "        %(in_bed)s \\\n"
                    "        | cut -f 1-4 \\\n"
                    "        > %(out_bed)s\n"
                    "\n"
                    "    # Convert to bb\n"
                    "    %(exec_path)s \\\n"
                    "        %(out_bed)s \\\n"
                    "        %(chrom_sizes)s \\\n"
                    "        %(out_bb)s\n"
                    "fi\n\n"
                ) % {"in_bed": sample_slots["bed"],
                     "exec_path": self.params["bedToBigBed_path"],
                     "chrom_sizes": self.params["chrom.sizes"],
                     "out_bb": out_bb_filename,
                     "out_bed": out_bed_filename}

                sample_slots["bb"] = out_bb_filename
                # Stamping bb file
                self.stamp_file(sample_slots["bb"])

            ##############################
            # Add extraction of peak fasta sequences
            if "getfasta" in self.params:
                try:
                    ref_fasta = sample_slots["reference"]
                except KeyError:
                    self.write_warning("In %s: No reference exists, but you asked for a fasta file for the peaks. \n\tIn order to get the file you have to set a reference genome in the mapping step (Bowtie in particular)\n" % sample)
                else:
                    # The executable line needs a continuation so that the
                    # -fi / -bed arguments belong to the same command.
                    self.script += (
                        "\n# Extract peaks from BED file to fasta file:\n\n"
                        "if [ -e %(peaks)s ]\n"
                        "then\n"
                        "    %(exec_path)s \\\n"
                        "        -fi %(ref_fasta)s \\\n"
                        "        -bed %(bed_file)s  > %(bed_file)s.fasta\n"
                        "fi\n\n"
                    ) % {"peaks": sample_slots["peak_bed"],
                         "exec_path": self.params["getfasta"],
                         "ref_fasta": ref_fasta,
                         "bed_file": sample_slots["bed"]}

                    # Register the fasta only when the script really creates it
                    sample_slots["peak_fasta"] = "%s.fasta" % sample_slots["bed"]
                    sample_slots["fasta.nucl"] = sample_slots["peak_fasta"]
                    # Stamping fasta file
                    self.stamp_file(sample_slots["peak_fasta"])

            # Wrapping up function. Leave these lines at the end of every iteration:
            self.local_finish(use_dir, sample_dir)       # Sees to copying local files to final destination (and other stuff)

            self.create_low_level_script()
コード例 #28
0
    def build_scripts(self):
        """ Build the megahit assembly script(s) for this step.

            If self.params["scope"] is "project", a single script is built in
            which the reads of all samples are passed to one megahit run, as
            comma-separated lists for -1/-2 (paired-end) and -r (single-end).
            For any other scope, one script is built per sample.

            For each script, the expected contigs file
            (<out_prefix>.contigs.fa in the step's or sample's directory) is
            stored in the "fasta.nucl" and "<get_step_step()>_contigs" slots
            and the "assembler" slot is set to get_step_step(), at project or
            sample level according to the scope.

            Raises:
                AssertionExcept: if the "type" of a sample contains neither
                    "PE" nor "SE".
        """

        if self.params["scope"] == "project":

            # Name of specific script:
            self.spec_script_name = "_".join(
                [self.step, self.name, self.sample_data["Title"]])
            self.script = ""

            # This line should be left before every new script. It sees to local issues.
            # Use the dir it returns as the base_dir for this step.
            use_dir = self.local_start(self.base_dir)

            # Megahit requires that the output dir not exist! Removing:
            self.script += "rm -rf %s\n\n" % use_dir

            out_prefix = self.sample_data["Title"] + self.file_tag

            self.script += self.get_script_const()

            # Collect forward, reverse and single read files of all samples:
            f_reads = []
            r_reads = []
            s_reads = []

            for sample in self.sample_data["samples"]:
                sample_type = self.sample_data[sample]["type"]
                if "PE" not in sample_type and "SE" not in sample_type:
                    raise AssertionExcept(
                        "Strange type configuration for sample\n", sample)
                # A sample may hold both paired and single reads, so these
                # two tests are deliberately independent:
                if "PE" in sample_type:
                    f_reads.append(self.sample_data[sample]["fastq.F"])
                    r_reads.append(self.sample_data[sample]["fastq.R"])
                if "SE" in sample_type:
                    s_reads.append(self.sample_data[sample]["fastq.S"])

            # Files of one list are separated by a comma and a line
            # continuation, so that the script stays readable:
            list_sep = ",\\\n\t\t"

            # Interlaced reads are not treated here. Maybe one day...
            if f_reads:
                self.script += "-1 " + list_sep.join(f_reads) + " \\\n\t"
                self.script += "-2 " + list_sep.join(r_reads) + " \\\n\t"
            if s_reads:
                self.script += "-r " + list_sep.join(s_reads) + " \\\n\t"

            self.script += "--out-dir %s \\\n\t" % use_dir
            self.script += "--out-prefix %s \n\n" % out_prefix

            # Store results to fasta and assembly slots:
            contigs_slot = self.get_step_step() + "_contigs"
            self.sample_data[
                "fasta.nucl"] = self.base_dir + out_prefix + ".contigs.fa"
            self.sample_data[contigs_slot] = self.sample_data["fasta.nucl"]
            self.sample_data["assembler"] = self.get_step_step()

            self.stamp_file(self.sample_data[contigs_slot])

            # Wrapping up function. Leave these lines at the end of every iteration:
            # Sees to copying local files to final destination (and other stuff)
            self.local_finish(use_dir, self.base_dir)

            self.create_low_level_script()

        else:  # self.params["scope"] == "sample"

            # Each iteration must define the following class variables:
            # spec_script_name
            # script
            for sample in self.sample_data["samples"]:

                # Name of specific script:
                self.spec_script_name = "_".join(
                    [self.step, self.name, sample])
                self.script = ""

                # Make a dir for the current sample:
                sample_dir = self.make_folder_for_sample(sample)

                # This line should be left before every new script. It sees to local issues.
                # Use the dir it returns as the base_dir for this step.
                use_dir = self.local_start(sample_dir)

                # Megahit requires that the output dir not exist! Removing:
                self.script += "rm -rf %s\n\n" % use_dir

                out_prefix = sample + self.file_tag

                # Collect the read arguments. The tests are independent so
                # that a mixed sample (both PE and SE) gets -1, -2 and -r:
                sample_type = self.sample_data[sample]["type"]
                read_args = []
                if "PE" in sample_type:
                    read_args.append(
                        "-1 %s" % self.sample_data[sample]["fastq.F"])
                    read_args.append(
                        "-2 %s" % self.sample_data[sample]["fastq.R"])
                if "SE" in sample_type:
                    read_args.append(
                        "-r %s" % self.sample_data[sample]["fastq.S"])
                if not read_args:
                    raise AssertionExcept(
                        "Strange type configuration for sample\n", sample)

                self.script += self.get_script_const()
                # The output dir is use_dir: it is the dir removed above and
                # the dir local_finish() is given as the source below.
                self.script += "--out-dir %s \\\n\t" % use_dir
                self.script += "--out-prefix %s \\\n\t" % out_prefix
                self.script += " \\\n\t".join(read_args) + " \n\n"

                # Store results to fasta and assembly slots:
                contigs_slot = self.get_step_step() + "_contigs"
                self.sample_data[sample][
                    "fasta.nucl"] = sample_dir + out_prefix + ".contigs.fa"
                self.sample_data[sample][contigs_slot] = self.sample_data[
                    sample]["fasta.nucl"]
                self.sample_data[sample]["assembler"] = self.get_step_step()

                self.stamp_file(self.sample_data[sample][contigs_slot])

                # Wrapping up function. Leave these lines at the end of every iteration:
                # Sees to copying local files to final destination (and other stuff)
                self.local_finish(use_dir, sample_dir)

                self.create_low_level_script()
コード例 #29
0
    def build_scripts(self):
        """ Build the script(s) that pipe an mpileup command into varscan.

            Each script is composed of self.params["mpileup_path"], the
            reference ("-f") and the BAM file(s), piped into
            self.params["script_path"] with the redirected parameters and a
            "--vcf-sample-list" file, with the output redirected to a file.

            If self.params["scope"] is "project", one script is built for the
            BAM files of all samples, which must share a single reference.
            For any other scope, one script is built per sample.

            The sample list file (one sample name per line) is written by this
            method itself, while the scripts are being built, into the step's
            base dir (project scope) or the sample's dir (sample scope).

            The output file gets the suffix "vcf" and is stored in the "vcf"
            slot when "--output-vcf" is in the redirected parameters.
            Otherwise the suffix is "unknown" and the file is stored in the
            "variants" slot. In sample scope, the matching "vcf.source" or
            "variants.source" slot is set to "varscan".

            Raises:
                AssertionExcept: in project scope, if the samples do not all
                    have the same reference.
        """

        # The output suffix depends only on the redirected parameters, so it
        # is the same for all the scripts:
        if "--output-vcf" in self.params["redir_params"]:
            output_suffix = "vcf"
        else:
            output_suffix = "unknown"

        if self.params["scope"] == "project":

            # Name of specific script:
            self.spec_script_name = "_".join(
                [self.step, self.name, self.sample_data["Title"]])
            self.script = ""

            # This line should be left before every new script. It sees to local issues.
            # Use the dir it returns as the base_dir for this step.
            use_dir = self.local_start(self.base_dir)

            ### Getting location of reference
            # Collect the reference fasta of all samples into a set, removing duplicates
            reference_fasta = {
                self.sample_data[sample]["reference"]
                for sample in self.sample_data["samples"]
            }
            # If there is more than one reference_fasta, exit. This is really really weird and should not happen
            if len(reference_fasta) > 1:
                raise AssertionExcept(
                    "There is more than one reference file for different samples. Weird!!!"
                )
            # Convert set into list and return first, and only, element:
            reference_fasta = list(reference_fasta)[0]

            ### Create file with list of sample names, one per line
            sample_list_file = self.base_dir + "sample_list.txt"
            with open(sample_list_file, "w") as smp_lst:
                for sample in self.sample_data["samples"]:
                    smp_lst.write("%s\n" % sample)

            out_name = ".".join([self.sample_data["Title"], output_suffix])

            # mpileup part of script:
            self.script += "%s \\\n\t" % self.params["mpileup_path"]
            # Reference file:
            self.script += "-f %s \\\n\t" % reference_fasta
            # BAM files:
            for sample in self.sample_data["samples"]:
                self.script += "%s \\\n\t" % self.sample_data[sample]["bam"]
            # Remove the trailing line continuation before adding the pipe:
            self.script = self.script.rstrip("\\\n\t")
            self.script += " | \\\n"

            # varscan part of script:
            self.script += "%s \\\n\t" % self.params["script_path"]
            self.script += self.get_redir_parameters_script()
            self.script += "--vcf-sample-list %s \\\n\t" % sample_list_file
            self.script += "> %s\n\n" % (use_dir + out_name)

            # Store the final location of the output in the matching slot:
            if output_suffix == "vcf":
                self.sample_data["vcf"] = self.base_dir + out_name
                self.stamp_file(self.sample_data["vcf"])
            else:
                self.sample_data["variants"] = self.base_dir + out_name
                self.stamp_file(self.sample_data["variants"])

            # Move all files from temporary local dir to permanent base_dir
            # Sees to copying local files to final destination (and other stuff)
            self.local_finish(use_dir, self.base_dir)

            self.create_low_level_script()

        else:
            for sample in self.sample_data["samples"]:

                # Make a dir for the current sample:
                sample_dir = self.make_folder_for_sample(sample)

                # Name of specific script:
                self.spec_script_name = "_".join(
                    [self.step, self.name, sample])
                self.script = ""

                # This line should be left before every new script. It sees to local issues.
                # Use the dir it returns as the base_dir for this step.
                use_dir = self.local_start(sample_dir)

                reference_fasta = self.sample_data[sample]["reference"]

                ### Create file with the sample name
                # Written to sample_dir, the dir created above, and not to
                # use_dir: the file is written now, while building the
                # script, as is done with base_dir in the project scope.
                sample_list_file = sample_dir + "sample_list.txt"
                with open(sample_list_file, "w") as smp_lst:
                    smp_lst.write("%s\n" % sample)

                out_name = "%s_%s.%s" % (sample, self.get_step_name(),
                                         output_suffix)

                # mpileup part of script:
                self.script += "%s \\\n\t" % self.params["mpileup_path"]
                # Reference file:
                self.script += "-f %s \\\n\t" % reference_fasta
                # BAM file:
                self.script += "%s \\\n\t" % self.sample_data[sample]["bam"]
                # Remove the trailing line continuation before adding the pipe:
                self.script = self.script.rstrip("\\\n\t")
                self.script += " | \\\n"

                # varscan part of script:
                self.script += "%s \\\n\t" % self.params["script_path"]
                self.script += self.get_redir_parameters_script()
                self.script += "--vcf-sample-list %s \\\n\t" % sample_list_file
                self.script += "> %s\n\n" % (use_dir + out_name)

                # Store the final location of the output (in sample_dir, the
                # destination passed to local_finish() below):
                if output_suffix == "vcf":
                    self.sample_data[sample]["vcf"] = sample_dir + out_name
                    self.sample_data[sample]["vcf.source"] = "varscan"
                    self.stamp_file(self.sample_data[sample]["vcf"])
                else:
                    self.sample_data[sample][
                        "variants"] = sample_dir + out_name
                    self.sample_data[sample]["variants.source"] = "varscan"
                    self.stamp_file(self.sample_data[sample]["variants"])

                # Move all files from temporary local dir to permanent sample_dir
                # Sees to copying local files to final destination (and other stuff)
                self.local_finish(use_dir, sample_dir)

                self.create_low_level_script()