def step_sample_initiation_bysample(self):
    """Validate per-sample fasta input and initialise the blast slots.

    For every sample in ``self.sample_data["samples"]`` this checks that
    ``self.sample_data[sample]["fasta"]`` has an entry for the type given by
    the ``-dbtype`` redirected parameter, then makes sure the ``blast`` dict
    and its ``blastdb`` sub-dict exist for that sample.

    Raises:
        AssertionExcept: if ``-dbtype`` is absent from
            ``self.params["redir_params"]``, or if a sample has no fasta
            entry for the requested type.
    """
    for sample in self.sample_data["samples"]:
        # Look up -dbtype on its own. Previously this shared a try block with
        # the fasta lookup, so a missing -dbtype reached the handler with
        # 'dbtype' unbound and crashed while formatting the error message.
        try:
            dbtype = self.params["redir_params"]["-dbtype"]
        except KeyError:
            raise AssertionExcept(
                "No -dbtype was passed in the redirected parameters\n",
                sample)

        # Make sure a file exists in the sample equivalent to dbtype:
        try:
            self.sample_data[sample]["fasta"][dbtype]
        except KeyError:
            raise AssertionExcept(
                "No file exists in sample for specified -dbtype (%s)\n" %
                dbtype, sample)

        # Initialize blast and blastdb slots for sample:
        sample_slots = self.sample_data[sample]
        if "blast" not in sample_slots:
            sample_slots["blast"] = dict()
        if "blastdb" not in sample_slots["blast"]:
            sample_slots["blast"]["blastdb"] = dict()
def step_sample_initiation(self):
    """Check the project-level inputs after sample_data has been set.

    A nucleotide fasta file (``fasta.nucl`` slot) is mandatory; an OTU table
    (``otu_table`` slot) is optional and only triggers a warning when absent.

    Raises:
        AssertionExcept: if ``self.sample_data`` has no ``fasta.nucl`` entry.
    """
    project_slots = self.sample_data

    # Hard requirement: without a fasta file there is nothing to work on.
    if "fasta.nucl" not in project_slots:
        raise AssertionExcept("fasta file does not exist.\n")

    # Soft requirement: report a missing otu table but carry on.
    if "otu_table" not in project_slots:
        self.write_warning("otu table does not exist.\n")
def build_scripts(self):
    """Build the single project-level phylogeny script.

    The script reads the ``fasta.nucl`` project slot and writes a ``.tre``
    tree file plus a ``.tre.log`` log file. Both are named after the input
    fasta with its ``.fas``/``.fasta``/``.fna``/``.fa`` extension removed.
    The final tree location (under ``self.base_dir``) is stored in the
    ``phylotree`` project slot.

    Raises:
        AssertionExcept: if ``self.sample_data`` has no ``fasta.aligned``
            entry.
    """
    # Name of specific script:
    self.spec_script_name = "_".join(
        [self.step, self.name, self.sample_data["Title"]])
    self.script = ""

    # This line should be left before every new script. It sees to local issues.
    # Use the dir it returns as the base_dir for this step.
    use_dir = self.local_start(self.base_dir)

    if "fasta.aligned" not in self.sample_data:
        raise AssertionExcept(
            "You are trying to run 'make_phylogeny' on an unaligned fasta file!\n"
        )

    fasta_file = self.sample_data["fasta.nucl"]
    # Raw string: "\." in a plain literal is an invalid escape sequence.
    outfile = re.sub(r"\.(fas|fasta|fna|fa)$", "",
                     os.path.basename(fasta_file)) + ".tre"
    logfile = ".".join([outfile, "log"])

    # Assemble the command: constant part first, then input, output and log.
    self.script += self.get_script_const()
    self.script += "-i %s \\\n\t" % fasta_file
    self.script += "-o %s \\\n\t" % "".join([use_dir, outfile])
    self.script += "-l %s \n\n" % "".join([use_dir, logfile])

    # Move all files from temporary local dir to permanent base_dir
    self.local_finish(use_dir, self.base_dir)

    # Store location of the phylogenetic tree:
    self.sample_data["phylotree"] = self.base_dir + outfile

    self.create_low_level_script()
def step_sample_initiation(self):
    """Check that the variants file selected by ``input`` exists.

    ``self.params["input"]`` chooses the slot: ``"vcf"`` -> ``vcf``,
    ``"bcf"`` -> ``bcf``, anything else -> ``gzVCF``. With
    ``self.params["scope"] == "sample"`` the slot must exist in every
    sample; otherwise it must exist at the project level of
    ``self.sample_data``.

    Raises:
        AssertionExcept: if the required slot is missing.
    """
    input_type = self.params["input"]
    if input_type == "vcf":
        slot, label = "vcf", "VCF"
    elif input_type == "bcf":
        slot, label = "bcf", "BCF"
    else:
        slot, label = "gzVCF", "gzVCF"

    if self.params["scope"] == "sample":
        for sample in self.sample_data["samples"]:
            try:
                self.sample_data[sample][slot]
            except KeyError:
                raise AssertionExcept(
                    "Sample does not have a %s variants file." % label,
                    sample)
    else:  # Scope == project
        try:
            self.sample_data[slot]
        except KeyError:
            # No sample is involved here; the old code passed the never-bound
            # loop variable 'sample' and failed before it could report this.
            raise AssertionExcept(
                "Project does not have a %s variants file." % label)
def step_sample_initiation(self):
    """Settle which mapping file the step uses.

    A mapping file passed in the redirected parameters (``--mapping_fp``, or
    ``-m`` when the long form is absent) is stored in the ``qiime.mapping``
    project slot, with a warning if that slot was already populated. When
    neither parameter is passed, the slot must already exist.

    Raises:
        AssertionExcept: if no mapping file was passed and ``qiime.mapping``
            is not in ``self.sample_data``.
    """
    redirects = self.params["redir_params"]
    long_form_given = "--mapping_fp" in redirects

    # Nothing passed by the user: rely on what is already in sample_data.
    if not (long_form_given or "-m" in redirects):
        if "qiime.mapping" not in self.sample_data:
            raise AssertionExcept(
                "No mapping file exists nor was it passed with -m")
        return

    # A parameter was passed; it overrides any mapping from sample_data.
    if "qiime.mapping" in self.sample_data:
        self.write_warning(
            "Overriding existing mapping file. Make sure this is OK")

    if long_form_given:
        self.sample_data["qiime.mapping"] = redirects["--mapping_fp"]
    else:
        self.sample_data["qiime.mapping"] = redirects["-m"]
def step_sample_initiation(self):
    """Prepare each sample's ``mapping`` dict and record its reference.

    1. Ensures ``self.sample_data[sample]["fastq"]["mapping"]`` exists for
       every sample, warning when it is already there.
    2. Exactly one of the ``scope`` parameter and the ``-x`` redirected
       parameter must be given:

       * with ``scope``, the per-sample ``reference`` is taken from the
         ``bowtie2``/``fasta`` slot at project or sample level;
       * with ``-x``, the ``reference`` is set from the optional
         ``ref_genome`` parameter; a warning is issued when it is absent.

    Raises:
        AssertionExcept: if both or neither of ``scope``/``-x`` are given,
            if ``scope`` is not ``'sample'`` or ``'project'``, if the
            bowtie2 slot implied by ``scope`` is missing, or if
            ``ref_genome`` is passed together with ``scope``.
    """
    # Initializing a "mapping" dict for each sample:
    for sample in self.sample_data["samples"]:
        try:
            self.sample_data[sample]["fastq"]["mapping"]
        except KeyError:
            self.sample_data[sample]["fastq"]["mapping"] = {}
        else:
            # Interpolate the sample name; the old call left a literal "%s"
            # in the text because the name was only passed as a second
            # argument.
            self.write_warning(
                "mapping dict exists for sample %s. Double mapping steps?\n"
                % sample, sample)

    # Require either 'scope' or '-x':
    if "scope" in self.params:
        # If scope defined, '-x' must not be passed as well.
        if "-x" in self.params["redir_params"]:
            raise AssertionExcept("Both 'scope' and '-x' specified!\n")

        try:
            # Loop over samples to set the reference genome:
            for sample in self.sample_data["samples"]:
                mapping = self.sample_data[sample]["fastq"]["mapping"]
                if self.params["scope"] == "project":
                    # Set project wide reference:
                    mapping["reference"] = self.sample_data["bowtie2"][
                        "fasta"]
                elif self.params["scope"] == "sample":
                    # Set per-sample reference:
                    mapping["reference"] = self.sample_data[sample][
                        "bowtie2"]["fasta"]
                else:
                    raise AssertionExcept(
                        "Scope must be either 'sample' or 'project'")
        except KeyError:
            raise AssertionExcept(
                "There is a mismatch between 'scope' and the existing bowtie2 index\n",
                sample)

        if "ref_genome" in self.params:
            raise AssertionExcept(
                "ref_genome was passed, and 'scope' was defined. Ignoring ref_genome\n"
            )
    else:
        # If scope is not defined, require '-x'
        if "-x" not in self.params["redir_params"]:
            raise AssertionExcept("Neither 'scope' nor '-x' specified.\n")

        # Storing reference genome for use by downstream steps:
        if "ref_genome" in self.params:
            for sample in self.sample_data["samples"]:
                mapping = self.sample_data[sample]["fastq"]["mapping"]
                # An existing reference is overwritten, with a warning.
                if "reference" in mapping:
                    self.write_warning(
                        "ref_genome was passed, but a reference already exists. Setting reference to 'ref_genome'\n"
                    )
                mapping["reference"] = self.params["ref_genome"]
        else:
            self.write_warning(
                "No reference given. It is highly recommended to give one!\n"
            )
def build_scripts(self):
    """Build one samtools script per sample.

    For every sample a script is assembled in ``self.script`` from the
    sections whose keys are present in ``self.params``: ``view``,
    ``filter_by_tag`` (optionally ``del_unfiltered``), ``sort`` (optionally
    ``del_unsorted``), ``index``, ``flagstat``, ``stats``, ``idxstats`` and
    ``del_sam``. The input file is read from the ``self.file2use`` slot of
    the sample's ``fastq``/``mapping`` dict, and result locations (under the
    sample directory) are written back into that dict.

    If ``view`` is requested without a ``-b`` style flag the output is SAM:
    the ``sam`` slot is set, a warning is issued and the remaining sections
    are skipped for that sample.

    Raises:
        AssertionExcept: if ``sort`` is requested without ``view`` and the
            sample has neither a ``bam`` nor a ``sam`` mapping slot.
    """
    # Each iteration must define the following class variables:
    # self.spec_script_name
    # self.script
    for sample in self.sample_data["samples"]:
        # Make a dir for the current sample:
        sample_dir = self.make_folder_for_sample(sample)

        # Name of specific script:
        self.spec_script_name = "_".join([self.step, self.name, sample])
        self.script = ""

        # This line should be left before every new script. It sees to local issues.
        # Use the dir it returns as the base_dir for this step.
        use_dir = self.local_start(sample_dir)

        mapping = self.sample_data[sample]["fastq"]["mapping"]
        input_file = mapping[self.file2use]
        bam_name = os.path.basename(input_file) + ".bam"
        output_sam_name = os.path.basename(input_file) + ".sam"
        if "filter_by_tag" in self.params:
            filtered_name = bam_name + ".filt.bam"
            sort_name = filtered_name + ".srt.bam"
        else:
            sort_name = bam_name + ".srt.bam"

        if "view" in self.params:
            self.script += "###########\n# Running samtools view:\n#----------------\n"
            self.script += "%s view \\\n\t" % self.get_script_env_path()
            if self.params["view"]:
                self.script += "%s \\\n\t" % self.params["view"]
            # An empty 'view' value means no flags at all, hence SAM output.
            # Searching None would raise TypeError, so fall back to "".
            tobam = re.search(r"\-\w*b", self.params["view"] or "")
            if tobam:
                self.script += "-o %s \\\n\t %s\n\n" % (use_dir + bam_name,
                                                         input_file)
                mapping["bam"] = sample_dir + bam_name
            else:
                self.script += "-o %s \\\n\t %s\n\n" % (
                    use_dir + output_sam_name, input_file)
                mapping["sam"] = sample_dir + output_sam_name
                self.stamp_file(mapping["sam"])
                self.write_warning(
                    "Output from samtools view is SAM. Not proceeding further.\nTo produce a BAM, make sure to include the -b flag in the samtools view parameters.\n"
                )
                # If sam output, can't proceed with rest of commands which
                # require bam input. Close this script and move on.
                self.local_finish(use_dir, sample_dir)
                self.create_low_level_script()
                continue

        if "filter_by_tag" in self.params:
            self.script += "###########\n# Filtering BAM\n#----------------\n"
            self.script += "\n\n"
            self.script += "%s view \\\n\t" % self.get_script_env_path()
            self.script += "-h \\\n\t"
            self.script += "%s | \\\n\t" % mapping["bam"]
            self.script += "awk '$0 ~\"(^@)|(%s)\"' | \\\n\t" % self.params[
                "filter_by_tag"]
            self.script += "%s view \\\n\t" % self.get_script_env_path()
            self.script += "-bh \\\n\t"
            self.script += "-o %s \\\n\t" % (use_dir + filtered_name)
            self.script += "- \n\n"

            # If user requires that the unfiltered bam be removed:
            if "del_unfiltered" in self.params:
                self.script += "###########\n# Removing unfiltered BAM\n#----------------\n"
                self.script += "\n\nrm -rf %s\n\n" % (use_dir + bam_name)

            # Storing filtered and unfiltered bams:
            mapping["unfiltered_bam"] = sample_dir + bam_name
            mapping["bam"] = sample_dir + filtered_name
            # From here on, work on the filtered file:
            bam_name = filtered_name

        if "sort" in self.params:
            # This permits running only sort and index, in case a bam file
            # was produced in a different step.
            if "view" in self.params:
                bam_name = use_dir + bam_name
            else:
                if "bam" in mapping:
                    bam_name = mapping["bam"]
                elif "sam" in mapping:
                    bam_name = mapping["sam"]
                    self.write_warning(
                        "Can't find BAM but found SAM for sample. Using it instead of a BAM.\n",
                        sample)
                else:
                    raise AssertionExcept(
                        "Can't run sort without BAM file. Either include 'view' or use other BAM creating steps.\n",
                        sample)

            self.script += "###########\n# Sorting BAM\n#----------------\n"
            self.script += "%s sort \\\n\t" % self.get_script_env_path()
            if self.params["sort"]:
                self.script += "%s \\\n\t" % self.params["sort"]
            self.script += "-o %s \\\n\t" % (use_dir + sort_name)
            self.script += "%s\n\n" % (bam_name)

            # Storing sorted bam in 'bam' slot and unsorted bam in unsorted_bam slot
            mapping["unsorted_bam"] = sample_dir + os.path.basename(bam_name)
            mapping["bam"] = sample_dir + sort_name

            # If user requires that the unsorted bam be removed:
            if "del_unsorted" in self.params:
                self.script += "###########\n# Removing unsorted BAM\n#----------------\n"
                self.script += "\n\nrm -rf %s\n\n" % (bam_name)

            bam_name = sort_name  # Use sorted bam from now on below

        if "index" in self.params:
            self.script += "###########\n# Indexing BAM\n#----------------\n"
            self.script += "%s index \\\n\t" % self.get_script_env_path()
            if self.params["index"]:
                self.script += "%s \\\n\t" % self.params["index"]
            self.script += "%s\n\n" % (use_dir + bam_name)
            # Name the index after the file actually passed to 'index'. With
            # 'sort' this equals the sorted name; without it, the old code
            # still recorded the sorted name, which was never produced.
            mapping["index"] = sample_dir + bam_name + ".bai"

        if "flagstat" in self.params:
            self.script += "###########\n# Calculating BAM statistics:\n#----------------\n"
            self.script += "%s flagstat \\\n\t" % self.get_script_env_path()
            self.script += "%s \\\n\t" % (use_dir + bam_name)
            self.script += "> %s.flagstat \n\n" % (use_dir + bam_name)
            mapping["flagstat"] = "%s%s.flagstat" % (sample_dir, bam_name)

        if "stats" in self.params:
            self.script += "###########\n# Calculating BAM statistics:\n#----------------\n"
            self.script += "%s stats \\\n\t" % self.get_script_env_path()
            if self.params["stats"]:  # Adding parameters the user might pass
                self.script += "%s \\\n\t" % self.params["stats"]
            self.script += "%s \\\n\t" % (use_dir + bam_name)
            self.script += "> %s.stats \n\n" % (use_dir + bam_name)
            mapping["stats"] = "%s%s.stats" % (sample_dir, bam_name)

        if "idxstats" in self.params:
            self.script += "###########\n# Calculating index statistics (idxstats):\n#----------------\n"
            self.script += "%s idxstats \\\n\t" % self.get_script_env_path()
            # idxstats has no user defined parameters...
            self.script += "%s \\\n\t" % (use_dir + bam_name)
            self.script += "> %s.idxstat.tab \n\n" % (use_dir + bam_name)
            # Only the idxstats slot is set here; the 'stats' slot belongs to
            # the 'stats' section, which is the one that creates that file.
            mapping["idxstats"] = "%s%s.idxstat.tab" % (sample_dir, bam_name)

        if "del_sam" in self.params and "sam" in mapping:
            self.script += "###########\n# Removing SAM\n#----------------\n\n"
            self.script += "rm -rf %s\n\n" % mapping["sam"]

        # Move all files from temporary local dir to permanent base_dir
        self.local_finish(use_dir, sample_dir)

        self.create_low_level_script()
def build_scripts(self):
    """Build the assembler script(s).

    With ``self.params["scope"] == "project"`` a single script is built that
    receives the reads of all samples and writes one output file under
    ``self.base_dir``; results are stored in project-level slots. With any
    other scope one script per sample is built and results are stored in the
    sample's slots.

    In both cases the output file is stored in the ``fasta.nucl`` and
    ``<step>.contigs`` slots and the value of ``self.get_step_step()`` in the
    ``assembler`` slot.

    Raises:
        AssertionExcept: if paired-end reads are used and no ``p`` parameter
            was supplied.
    """
    mix_message = "CLC assembler not defined for PE-SE mixes. Using PE file only..."
    missing_p_message = "With paired end reads, you must specify a 'p' parameter containing information to pass with '-p' to clc_assembler. See the clc manual."

    if self.params["scope"] == "project":
        # Name of specific script:
        self.spec_script_name = "_".join(
            [self.step, self.name, self.sample_data["Title"]])
        self.script = ""

        output_file = os.sep.join(
            [self.base_dir, self.sample_data["Title"] + self.file_tag])

        # This line should be left before every new script. It sees to local issues.
        # Use the dir it returns as the base_dir for this step.
        use_dir = self.local_start(self.base_dir)

        self.script += self.get_script_const()
        self.script += "-o %s \\\n\t" % output_file

        for sample in self.sample_data["samples"]:
            sample_slots = self.sample_data[sample]
            sample_type = sample_slots["type"]
            if "PE" in sample_type and "SE" in sample_type:
                # sys.stdout.write works on Python 2 and 3 alike, unlike the
                # "print >> sys.stdout" statement used before.
                sys.stdout.write(mix_message + "\n")
            if "PE" in sample_type:
                try:
                    p_value = self.params["p"]
                except KeyError:
                    raise AssertionExcept(missing_p_message)
                self.script += "-p %s \\\n\t-q \\\n\t" % p_value
                self.script += "-i %s %s \\\n\t" % (sample_slots["fastq.F"],
                                                    sample_slots["fastq.R"])
            elif "SE" in sample_type:
                self.script += "-p no \\\n\t-q \\\n\t%s \\\n\t" % sample_slots[
                    "fastq.S"]
            # Samples of any other type contribute nothing to the command.

        # Remove trailing '\\\n\t' from last iteration and add some newlines for clarity
        self.script = self.script.rstrip("\\\n\t") + "\n\n"

        # Store results to fasta and assembly slots:
        self.sample_data["fasta.nucl"] = output_file
        self.sample_data[self.step + ".contigs"] = output_file
        self.sample_data["assembler"] = self.get_step_step()
        self.stamp_file(self.sample_data[self.step + ".contigs"])

        # Wrapping up function. Leave these lines at the end of every iteration:
        self.local_finish(use_dir, self.base_dir)
        self.create_low_level_script()
    else:
        # Each iteration must define the following class variables:
        # spec_script_name
        # script
        for sample in self.sample_data["samples"]:
            sample_slots = self.sample_data[sample]

            # Name of specific script:
            self.spec_script_name = "_".join([self.step, self.name, sample])
            self.script = ""

            # Make a dir for the current sample:
            sample_dir = self.make_folder_for_sample(sample)
            output_file = os.sep.join([sample_dir, sample + self.file_tag])

            # This line should be left before every new script. It sees to local issues.
            # Use the dir it returns as the base_dir for this step.
            use_dir = self.local_start(sample_dir)

            self.script += self.get_script_const()
            self.script += "-o %s \\\n\t" % output_file

            sample_type = sample_slots["type"]
            if "mixed" in sample_type:
                sys.stdout.write(mix_message + "\n")

            if "PE" in sample_type or "mixed" in sample_type:
                # Same guard as in project scope, so a missing 'p' gives the
                # explanatory error rather than a bare KeyError.
                try:
                    p_value = self.params["p"]
                except KeyError:
                    raise AssertionExcept(missing_p_message, sample)
                self.script += "-p %s \\\n\t-q \\\n\t" % p_value
                self.script += "-i %s %s\n\n" % (sample_slots["fastq.F"],
                                                 sample_slots["fastq.R"])
            elif "SE" in sample_type:
                self.script += "-p no \\\n\t-q \\\n\t%s\n\n" % sample_slots[
                    "fastq.S"]
            # Samples of any other type get no input arguments.

            # Store results to fasta and assembly slots:
            sample_slots["fasta.nucl"] = output_file
            sample_slots[self.step + ".contigs"] = output_file
            sample_slots["assembler"] = self.get_step_step()
            self.stamp_file(sample_slots[self.step + ".contigs"])

            # Wrapping up function. Leave these lines at the end of every iteration:
            self.local_finish(use_dir, sample_dir)
            self.create_low_level_script()
def build_scripts(self):
    """Build one merge script per populated input slot of every sample.

    Three groups of slots are handled in turn for each sample:

    * ``Forward`` / ``Reverse`` / ``Single``: the merged file is stored in
      ``fastq.<first letter of the slot name>``;
    * the keys of ``fasta_types_dict``: stored in the slot named by the
      dict's value;
    * the keys of ``sam_bam_dict`` (except ``REFERENCE``): stored in the slot
      named by the dict's value; the sample's ``REFERENCE`` entry, when
      present, is copied to ``reference``.

    Each script runs ``self.params["script_path"]`` on all files of the slot,
    optionally pipes the result through ``self.params["pipe"]`` and redirects
    it to a file named ``<sample>.<tag>.<file_tag>.<extension>``. The
    extension is taken from the slot's first file, ignoring an outer
    extension listed in ``ZIPPED_EXTENSIONS``.

    Raises:
        AssertionExcept: if the detected extension is not in
            ``KNOWN_FILE_EXTENSIONS``.
    """

    def start_merge_script(sample, slot, tag, bad_ext_msg, redirect):
        """Fill self.script with the merge command; return (use_dir, fq_fn)."""
        self.script = ""

        # Name of specific script:
        self.spec_script_name = "_".join(
            [self.step, self.name, sample, tag])

        # This line should be left before every new script. It sees to local issues.
        # Use the dir it returns as the base_dir for this step.
        use_dir = self.local_start(self.base_dir)

        files = self.sample_data[sample][slot]

        # Extension of the first input file, without the leading period:
        extension = os.path.splitext(files[0])[1][1:]
        # Remove zip extension: use the extension that precedes it instead.
        if "." + extension in ZIPPED_EXTENSIONS:
            extension = os.path.splitext(os.path.splitext(files[0])[0])[1][1:]
        if "." + extension not in KNOWN_FILE_EXTENSIONS:
            raise AssertionExcept(bad_ext_msg % extension, sample)

        # The filename containing the end result. Used both in the script
        # and to set the slot in sample_data.
        fq_fn = ".".join([sample, tag, self.file_tag, extension])

        self.script += self.params["script_path"] + " \\\n\t"
        # All the files in the slot, separated by a " "
        self.script += " ".join(files)
        self.script += " \\\n\t"
        if "pipe" in self.params:
            self.script += "| {pipe} \\\n\t".format(pipe=self.params["pipe"])
        self.script += "%s %s%s \n\n" % (redirect, use_dir, fq_fn)

        return use_dir, fq_fn

    fastq_ext_msg = "One of the files has a really weird extension (%s). Make sure this is not a mistake, or update KNOWN_FILE_EXTENSIONS or ZIPPED_EXTENSIONS in global_def.py\n"
    other_ext_msg = "One of the files in sample has a really weird extension (%s). \n\tMake sure this is not a mistake, or update KNOWN_FILE_EXTENSIONS\n"

    # Each iteration must define the following class variables:
    # spec_script_name
    # script
    for sample in self.sample_data["samples"]:
        sample_slots = self.sample_data[sample]

        # The key lists below are materialised before looping because the
        # loop bodies add keys to the sample dict; iterating a lazy filter
        # over the live key view fails on Python 3 with "dictionary changed
        # size during iteration".

        # Loop over all **existing** Forward, Reverse and Single slots:
        read_slots = [
            key for key in sample_slots
            if key in ["Forward", "Reverse", "Single"]
        ]
        for direction in read_slots:
            direction_tag = direction[0]  # Get first letter in direction
            use_dir, fq_fn = start_merge_script(sample, direction,
                                                direction_tag, fastq_ext_msg,
                                                ">")

            # Move all files from temporary local dir to permanent base_dir
            self.local_finish(use_dir, self.base_dir)

            # Store file in active file for sample:
            sample_slots["fastq." + direction_tag] = self.base_dir + fq_fn
            self.stamp_file(sample_slots["fastq." + direction_tag])

            self.create_low_level_script()

        # Merging files in fasta slots (genomes etc.)
        # Loop over all **existing** fasta slots:
        fasta_slots = [key for key in sample_slots if key in fasta_types_dict]
        for direction in fasta_slots:
            direction_tag = fasta_types_dict[direction]
            use_dir, fq_fn = start_merge_script(sample, direction,
                                                direction_tag, other_ext_msg,
                                                ">")

            # Store file in active file for sample:
            sample_slots[direction_tag] = self.base_dir + fq_fn
            self.stamp_file(sample_slots[direction_tag])

            # Move all files from temporary local dir to permanent base_dir
            self.local_finish(use_dir, self.base_dir)

            self.create_low_level_script()

        # Merging SAM/BAM slots:
        sam_bam_slots = [key for key in sample_slots if key in sam_bam_dict]
        for direction in sam_bam_slots:
            # Do not attempt merging the single reference permitted:
            if direction == "REFERENCE":
                continue

            direction_tag = sam_bam_dict[direction]
            use_dir, fq_fn = start_merge_script(sample, direction,
                                                direction_tag, other_ext_msg,
                                                " >")

            # Store file in active file for sample:
            sample_slots[direction_tag] = self.base_dir + fq_fn
            # Copy the reference only when one was supplied; a sample without
            # a REFERENCE entry used to fail here with a bare KeyError.
            if "REFERENCE" in sample_slots:
                sample_slots["reference"] = sample_slots["REFERENCE"]
            self.stamp_file(sample_slots[direction_tag])

            # Move all files from temporary local dir to permanent base_dir
            self.local_finish(use_dir, self.base_dir)

            self.create_low_level_script()
def step_sample_initiation(self):
    """Apply the ``copy_File_Types`` transfers, then run the scope initiation.

    Each entry of ``self.params["copy_File_Types"]`` must have a ``source``
    and a ``target`` section, each naming a ``File_Type`` and optionally a
    ``scope`` (the code passes ``"sample"`` to ``get_File_Type_data`` as the
    fallback). The source slot is copied to the target slot at the
    requested levels of ``self.sample_data``. When the source is at sample
    level and the target at project level, every sample overwrites the
    project slot in turn.

    Afterwards ``step_sample_initiation_byproject`` is called when
    ``self.params["scope"]`` contains ``"project"``; otherwise
    ``step_sample_initiation_bysample`` is called.

    Raises:
        AssertionExcept: if a transfer lacks ``source``/``target``, if either
            ``File_Type`` is missing or empty, or if the source slot does
            not exist at the requested level.
    """
    sample_missing_msg = "The File_Type %s is not found in the SAMPLE level \n\t File_Types available are : %s"
    project_missing_msg = "The File_Type %s is not found in the PROJECT level \n\t File_Types available are : %s"

    if len(get_File_Type_data(self.params, ["copy_File_Types"])) > 0:
        transfers = self.params["copy_File_Types"]
        for transfer in transfers:
            dif = set(["source", "target"]).difference(
                transfers[transfer].keys())
            if dif:
                raise AssertionExcept(
                    "The following argument/s are missing in the copy_File_Types section: %s"
                    % list(dif))

            scope_in = get_File_Type_data(
                transfers, [transfer, "source", "scope"], "sample")
            scope_out = get_File_Type_data(
                transfers, [transfer, "target", "scope"], "sample")
            type_in = get_File_Type_data(
                transfers, [transfer, "source", "File_Type"], None)
            type_out = get_File_Type_data(
                transfers, [transfer, "target", "File_Type"], None)

            # Reject missing *and* empty values, as the messages promise. The
            # old "(a and b) != None" test let an empty string through.
            if not type_in:
                raise AssertionExcept(
                    "The following argument/s are missing or empty in the copy_File_Types section: %s"
                    % "source File_Type")
            if not type_out:
                raise AssertionExcept(
                    "The following argument/s are missing or empty in the copy_File_Types section: %s"
                    % "target File_Type")

            if "sample" in [scope_in, scope_out]:
                for sample in self.sample_data["samples"]:
                    sample_slots = self.sample_data[sample]
                    if scope_in == "sample":
                        if type_in not in sample_slots:
                            raise AssertionExcept(
                                sample_missing_msg %
                                (type_in, list(sample_slots.keys())))
                        if scope_out == "sample":
                            sample_slots[type_out] = sample_slots[type_in]
                        else:
                            self.sample_data[type_out] = sample_slots[type_in]
                    else:
                        # Project-level source copied into every sample.
                        if type_in not in self.sample_data:
                            raise AssertionExcept(
                                project_missing_msg %
                                (type_in, list(self.sample_data.keys())))
                        sample_slots[type_out] = self.sample_data[type_in]
            else:
                # Project to project.
                if type_in not in self.sample_data:
                    raise AssertionExcept(
                        project_missing_msg %
                        (type_in, list(self.sample_data.keys())))
                self.sample_data[type_out] = self.sample_data[type_in]

    if "scope" in self.params and "project" in self.params["scope"]:
        self.step_sample_initiation_byproject()
    else:
        self.step_sample_initiation_bysample()
def step_specific_init(self):
    """Validate and normalise the parameters of the step.

    * Sets ``self.shell`` to ``"bash"``.
    * Warns when no ``ref_genome`` parameter is given.
    * Defaults the ``--runDirPerm`` redirect to ``All_RWX``.
    * Parses ``--outSAMtype`` into ``self.output_type`` (default ``"SAM"``)
      and, for BAM, ``self.bam_types``.
    * When ``--outSAMattrRGline`` starts with an ``ID:`` field, stores the
      line without it in ``self.params["outSAMattrRGline"]``.
    * Parses ``--outWigType`` into ``self.wig_type`` (default ``"None"``).
    * Warns about ``--quantMode GeneCounts``.
    * Removes redirects that the code says are set automatically.

    Raises:
        AssertionExcept: on an invalid ``--outSAMtype`` or ``--outWigType``
            value, or when BAM output is requested without a sort type.
    """
    self.shell = "bash"  # Can be set to "bash" by inheriting instances

    redirects = self.params["redir_params"]

    if "ref_genome" not in self.params:
        self.write_warning(
            "No reference given with 'ref_genome' (path to fasta file). It is highly recommended to give one!\n"
        )

    if "--runDirPerm" not in redirects:
        redirects["--runDirPerm"] = "All_RWX"
        self.write_warning("No --runDirPerm specified. Using 'All_RWX'")

    if "--outSAMtype" in redirects:
        # str.split() splits on runs of whitespace like the former
        # re.split("\s+", ...) but ignores leading/trailing blanks, which
        # used to yield an empty first field and a spurious "Bad value".
        sam_fields = redirects["--outSAMtype"].split()
        sam_type = sam_fields[0] if sam_fields else ""
        if sam_type not in ["SAM", "BAM", "None"]:
            raise AssertionExcept(
                "Bad value for --outSAMtype: Has to be 'BAM', 'SAM' or 'None'"
            )
        self.output_type = sam_type
        if self.output_type == "BAM":
            self.bam_types = sam_fields[1:]
            if "Unsorted" not in self.bam_types and "SortedByCoordinate" not in self.bam_types:
                raise AssertionExcept(
                    "When --outSAMtype is BAM, you must supply a type: 'Unsorted', 'SortedByCoordinate' or both."
                )
    else:
        self.output_type = "SAM"

    if "--outSAMattrRGline" in redirects:
        # Raw strings: "\:" and "\S" are invalid escapes in a plain literal.
        if re.match(r"ID\:\S+", redirects["--outSAMattrRGline"]):
            self.write_warning("Removing 'ID:' from --outSAMattrRGline line!")
            self.params["outSAMattrRGline"] = re.sub(
                r"ID\:\S+", "", redirects["--outSAMattrRGline"])

    if "--outWigType" in redirects:
        wig_fields = redirects["--outWigType"].split()
        wig_type = wig_fields[0] if wig_fields else ""
        if wig_type not in ['None', 'bedGraph', 'wiggle']:
            raise AssertionExcept(
                "Bad value for --outWigType: Has to be 'None', 'bedGraph' or 'wiggle'"
            )
        self.wig_type = wig_type
        if self.wig_type == "wiggle":
            self.write_warning(
                "Saving UniqueMultiple wig from strand 1 as main 'WIG' file. If you want something else, you have to move it to the right slot..."
            )
        elif self.wig_type == "bedGraph":
            self.write_warning(
                "Saving UniqueMultiple bedGraph from strand 1 as main 'bdg' file. If you want something else, you have to move it to the right slot..."
            )
    else:
        self.wig_type = "None"

    if "--quantMode" in redirects and redirects["--quantMode"] == "GeneCounts":
        self.write_warning(
            "--quantMode GeneCounts is not supported yet. The script will run but the output will not be stored in the file type index"
        )

    # Drop redirects the user is not supposed to pass.
    for redir2remove in [
            "--readFilesCommand", "--readFilesIn", "--outFileNamePrefix",
            "--outTmpDir", "--outStd"
    ]:
        if redir2remove in redirects:
            del redirects[redir2remove]
            self.write_warning(
                "You are not supposed to specify %s in redirects. We set it automatically"
                % redir2remove)
def build_scripts(self):
    """Build one assembly script per sample and register the expected outputs.

    For every sample the script consists of ``get_script_const()``, the
    ``-o`` output directory and the read arguments (``--pe1-1``/``--pe1-2``
    for paired-end reads, ``--s1`` for single reads, all three for mixed
    samples). When ``truncate_names`` is in ``self.params`` a shell snippet
    shortening the contig names is appended.

    The sample's ``fasta.nucl`` slot and the ``contigs`` / ``scaffolds``
    slots under ``assembly.spades_assembl`` are set to files in the sample
    directory and stamped.

    Raises:
        AssertionExcept: for ``scope == "project"`` (not implemented) or when
            the sample type contains neither ``PE`` nor ``SE``.
    """
    if self.params["scope"] == "project":
        # Not defined yet
        raise AssertionExcept("project wide scope is not defined yet\n")

    # Each iteration must define the following class variables:
    # spec_script_name
    # script
    for sample in self.sample_data["samples"]:
        # Name of specific script:
        self.spec_script_name = "_".join([self.step, self.name, sample])
        self.script = ""

        # Make a dir for the current sample:
        sample_dir = self.make_folder_for_sample(sample)

        # This line should be left before every new script. It sees to local issues.
        use_dir = self.local_start(sample_dir)

        self.script += self.get_script_const()
        # NOTE(review): output goes to sample_dir, not to the directory
        # returned by local_start(); confirm this is intended.
        self.script += "-o %s \\\n\t" % sample_dir

        sample_type = self.sample_data[sample]["type"]
        has_paired = "PE" in sample_type
        has_single = "SE" in sample_type
        # The mixed case must be tested first: it also satisfies the plain
        # PE and SE tests, which previously made it unreachable.
        if has_paired and has_single:
            self.script += "--pe1-1 %s \\\n\t" % self.sample_data[sample]["fastq"]["readsF"]
            self.script += "--pe1-2 %s \\\n\t" % self.sample_data[sample]["fastq"]["readsR"]
            self.script += "--s1 %s \n\n" % self.sample_data[sample]["fastq"]["readsS"]
        elif has_paired:
            self.script += "--pe1-1 %s \\\n\t" % self.sample_data[sample]["fastq"]["readsF"]
            self.script += "--pe1-2 %s \n\n" % self.sample_data[sample]["fastq"]["readsR"]
        elif has_single:
            self.script += "--s1 %s \n\n" % self.sample_data[sample]["fastq"]["readsS"]
        else:
            raise AssertionExcept("Strange type configuration for sample\n",
                                  sample)

        contigs = sample_dir + "contigs.fasta"
        scaffolds = sample_dir + "scaffolds.fasta"

        # For prokka compliance, you can request a truncation of the contig names.
        # NOTE(review): the original comment gave ">NODE_82_length_18610" as the
        # result for ">NODE_82_length_18610_cov_38.4999_ID_165", but
        # `cut -f 1-2 -d '_'` keeps only the first two '_'-separated fields
        # (">NODE_82"); confirm which is intended.
        if "truncate_names" in self.params:
            self.script += """
# Truncating contig names for prokka compliance
cat %(contigs)s | cut -f 1-2 -d '_' > %(shortnames)s
mv -f %(shortnames)s %(contigs)s \n\n""" % {
                "contigs": contigs,
                "shortnames": sample_dir + "contigs.shortIDs.fasta"
            }

        # Store results to fasta and assembly slots. setdefault() creates the
        # nested slots when an earlier step has not initialised them.
        sample_slots = self.sample_data[sample]
        sample_slots.setdefault("fasta", {})["nucl"] = contigs
        assembly = sample_slots.setdefault("assembly", {}).setdefault(
            "spades_assembl", {})
        assembly["contigs"] = contigs
        assembly["scaffolds"] = scaffolds

        self.stamp_file(assembly["scaffolds"])
        self.stamp_file(assembly["contigs"])

        # Wrapping up function. Leave these lines at the end of every iteration:
        self.local_finish(use_dir, sample_dir)  # Sees to copying local files to final destination (and other stuff)

        self.create_low_level_script()
def build_scripts(self):
    """Build one script per sample that joins paired reads and/or links reads.

    Single-end samples, and samples for which ``join`` is ``none``, only get
    ``ln -sf`` commands that link their existing read files into
    ``self.links_dir/<sample>/``. Otherwise the join command
    (``get_script_const()`` plus ``-f``/``-r``/``-o``) is written, the
    sample's ``fastq.J``/``fastq.F``/``fastq.R`` slots are pointed at the
    join outputs and, for ``join == "join"``, those files are linked.

    ``join`` and ``unjoined`` are compared case-insensitively throughout.
    The links directory for each sample is created while building scripts.

    Raises:
        RuntimeError: when both ``join`` and ``unjoined`` select nothing.
        AssertionExcept: for the unimplemented ``SeqPrep`` / ``join_cat``
            options or an unknown ``join_algo``.
    """

    def link_commands(sample, wanted):
        """Return ``ln -sf`` commands for the wanted slots that exist in the sample."""
        commands = ""
        # Only slots present in the sample are linked; sorted so that the
        # generated script does not depend on set iteration order.
        for direction in sorted(set(self.sample_data[sample]) & wanted):
            # direction[6] is the letter following "fastq." (S, F, R or J).
            link_name = "".join([
                self.links_dir + sample + os.sep,
                ".".join([sample, direction[6], "fastq"])
            ])
            if os.path.exists(link_name):
                self.write_warning(
                    "Link %s exists. Will overwrite when script is executed!!!!\n"
                    % link_name)
            commands += "ln -sf %s %s\n\n" % (
                self.sample_data[sample][direction], link_name)
        return commands

    # Each iteration MUST DEFINE the following class variables:
    # spec_script_name
    # script
    for sample in self.sample_data["samples"]:
        # Name of specific script:
        self.spec_script_name = "_".join([self.step, self.name, sample])
        # Init script itself
        self.script = ""

        # Make a dir for the current sample:
        sample_dir = self.make_folder_for_sample(sample)

        # This line should be left before every new script. It sees to local issues.
        # Use the dir it returns as the base_dir for this step.
        use_dir = self.local_start(sample_dir)

        # Make dir in links_for_demult folder:
        if not os.path.isdir(self.links_dir + sample):
            os.makedirs(self.links_dir + sample)

        is_single_end = self.sample_data[sample]["type"] == "SE"

        # Define a set of directions to use for the sample.
        # 'join'/'unjoined' are only read for non-SE samples, as before.
        directions_to_use = set()
        join_mode = None
        if is_single_end:
            directions_to_use |= {"fastq.S"}
        else:  # PE or mixed
            join_mode = self.params["join"].lower()
            unjoined_mode = self.params["unjoined"].lower()  # forward, reverse, both or none
            if join_mode == "join":
                directions_to_use |= {"fastq.J"}
            # Assumption: in case of mixed PE and SE, if you want the
            # forward/reverse reads then you want the single sequences too.
            if unjoined_mode == "forward":
                directions_to_use |= {"fastq.F", "fastq.S"}
            elif unjoined_mode == "reverse":
                directions_to_use |= {"fastq.R", "fastq.S"}
            elif unjoined_mode == "both":
                directions_to_use |= {"fastq.F", "fastq.R", "fastq.S"}
            elif not directions_to_use:  # == "none" and nothing joined either
                raise RuntimeError(
                    "You can't pass 'none' to both 'join' and 'unjoined' parameters in step %s"
                    % self.name)

        if is_single_end or join_mode == "none":
            # If no join is required then only make links:
            self.script += link_commands(sample, directions_to_use)
        else:
            # If join required, then there are 2 steps: run join and link
            # results in the links directory.
            ############# 1
            # Gets the "env", "script_path" and "redir_params" part of the script
            self.script += self.get_script_const()
            self.script += "-f " + self.sample_data[sample]["fastq.F"] + " \\\n\t"
            self.script += "-r " + self.sample_data[sample]["fastq.R"] + " \\\n\t"
            self.script += "-o " + use_dir + "\n\n"

            ############# 2
            # Pointing to resulting files in sample structure:
            if self.params["join_algo"] == "fastq-join":
                self.sample_data[sample]["fastq.J"] = sample_dir + "fastqjoin.join.fastq"
                self.sample_data[sample]["fastq.F"] = sample_dir + "fastqjoin.un1.fastq"
                self.sample_data[sample]["fastq.R"] = sample_dir + "fastqjoin.un2.fastq"
            elif self.params["join_algo"] == "SeqPrep":
                # Define the file names for SeqPrep. See qiime documentation
                raise AssertionExcept("SeqPrep is not yet defined...\n")
            else:
                raise AssertionExcept(
                    "You must define a join_algo. Either fastq-join or SeqPrep...\n"
                )

            ############# 3
            # Leave space to define concatenation of R and F files:
            if join_mode == "join_cat":
                # Define qiime files as readsS (+ joined + catted)
                raise AssertionExcept("join_cat not defined yet!!!\n")

            ############# 4
            # Putting links to final files
            if join_mode == "join":
                self.script += link_commands(sample, directions_to_use)

        self.sample_data["qiime.prep_links_dir"] = self.links_dir

        # Move all files from temporary local dir to permanent base_dir
        self.local_finish(use_dir, sample_dir)  # Sees to copying local files to final destination (and other stuff)

        self.create_low_level_script()
def get_global_Sample_data(self, category):
    """Return the value found by following ``category`` through ``self.sample_data``.

    Args:
        category: sequence of keys; each key indexes the dict reached by the
            previous one, starting at ``self.sample_data``. An empty sequence
            returns ``self.sample_data`` itself.

    Returns:
        The stored value (which may legitimately be ``None``).

    Raises:
        AssertionExcept: when a key along the path is missing or the path
            runs through a value that is not a dict. (The previous
            ``reduce(dict.get, ...)`` form could never raise ``KeyError``, so
            missing slots came back as ``None`` or as a raw ``TypeError``.)
    """
    slot = self.sample_data
    for key in category:
        if not isinstance(slot, dict) or key not in slot:
            raise AssertionExcept("The Slot %s is not found in sample data" %
                                  str(category).replace(",", ""))
        slot = slot[key]
    return slot
def build_scripts(self):
    """Build the single project-level Roary script and its optional analyses.

    The script always contains the Roary command (``get_script_const()`` with
    ``-f <results dir>`` and ``<GFF_dir>*.gff``). Depending on ``self.params``
    it is followed by:

    * ``plot``       - ``Roary_matrix_plot.py`` from ``self.module_location``.
    * ``scoary``     - optional ``Traits_Parser.py`` run, then scoary itself.
    * ``Bi-cluster`` - ``Biclustering.R`` and, nested under it, ``Gecko``
      (``GFF2Gecko3.py`` followed by the Gecko command).

    Result locations are stored with ``set_global_Sample_data`` and the Roary
    results directory is also stored for every sample.

    Changes from the previous version:

    * the generic Gecko option loop no longer emits the ``script_path`` key as
      a command-line option, nor ``-rO`` a second time (``-rO`` is written
      once, with the output location appended);
    * a ``scoary`` section without ``script_path`` is skipped, like one whose
      ``script_path`` is ``None``, instead of raising ``KeyError``;
    * ``isinstance`` / ``is not None`` / ``and`` replace ``type() ==`` /
      ``!= None`` / ``&``.

    NOTE(review): the block nesting was reconstructed from a
    whitespace-collapsed copy of this function. In particular confirm that
    the ``virulence_resistance_tag`` test sits inside the ``plot`` dict test,
    and that the "No Gecko running command found" error belongs to the
    ``script_path`` None test.

    Raises:
        AssertionExcept: when a required helper script is missing from
            ``self.module_location`` or the Gecko ``script_path`` is ``None``.
    """

    def module_has(filename):
        """Return True when ``filename`` is listed in the module directory."""
        return filename in os.listdir(self.module_location)

    GFF_dir = get_global_Sample_data(self, ["GFF_dir"])
    sample = 'Pan_Genome'

    # Name of specific script:
    self.spec_script_name = "_".join([self.step, self.name, sample])
    self.script = ""

    # Make a dir for the RESULTS:
    sample_dir = self.make_folder_for_sample(sample)
    # local_start()/local_finish() are not used in this step; the script
    # writes directly under sample_dir.

    # Define output filename
    output_filename = "".join([sample_dir, sample])

    # Roary main command
    self.script += self.get_script_const()
    self.script += " -f %s \\\n\t" % output_filename
    self.script += " %s*.gff \n\n" % GFF_dir

    # Adding the results data
    set_global_Sample_data(self, ["pan_genome_results_dir"], output_filename)
    set_global_Sample_data(
        self, ["presence_absence_matrix"],
        os.path.join(output_filename, "gene_presence_absence.csv"))
    set_global_Sample_data(
        self, ["pan_genome_clustered_proteins"],
        os.path.join(output_filename, "clustered_proteins"))

    # Creating the plots
    if "plot" in self.params:
        if not module_has("Roary_matrix_plot.py"):
            raise AssertionExcept(
                "The file %s is not found in the Roary module directory" %
                "Roary_matrix_plot.py")
        self.script += "env %s \\\n" % self.params["env"]
        self.script += "python %s \\\n\t" % os.path.join(
            self.module_location, "Roary_matrix_plot.py")
        plot_params = self.params["plot"]
        if isinstance(plot_params, dict):
            if "format" in plot_params:
                self.script += " --format %s \\\n\t" % plot_params["format"]
            if "virulence_resistance_tag" in self.params:
                if self.params["virulence_resistance_tag"] == "VFDB":
                    self.script += " --tag %s \\\n\t" % "Virulence_Resistance.fasta:Virulence"
                else:
                    self.script += " --tag %s \\\n\t" % self.params[
                        "virulence_resistance_tag"]
            if "Clustering_method" in plot_params:
                self.script += " -C %s \\\n\t" % plot_params["Clustering_method"]
        self.script += " -O %s \\\n\t" % get_global_Sample_data(
            self, ["pan_genome_results_dir"])
        self.script += " -P %s \n\n" % get_global_Sample_data(
            self, ["presence_absence_matrix"])

    # Pan-genome wide association studies using scoary
    scoary_traits_file = ''
    gene_presence_absence_file_loc = get_global_Sample_data(
        self, ["presence_absence_matrix"])
    scoary_params = self.params.get("scoary")
    if isinstance(scoary_params, dict) and scoary_params.get("script_path") is not None:
        if "traits_file" in scoary_params:
            # if the traits file is provided use it
            scoary_traits_file = scoary_params["traits_file"]
            # Creating the result dir
            GWAS_dir = self.make_folder_for_sample("GWAS")
        elif "metadata_file" in scoary_params:
            # if a metadata file is provided use it to create the traits file
            if not module_has("Traits_Parser.py"):
                raise AssertionExcept(
                    "The file %s is not found in the Roary module directory" %
                    "Traits_Parser.py")
            if "traits_to_pars" in scoary_params:
                # Creating the result dir
                GWAS_dir = self.make_folder_for_sample("GWAS")
                self.script += "env %s \\\n" % self.params["env"]
                self.script += "python %s \\\n\t" % os.path.join(
                    self.module_location, "Traits_Parser.py")
                self.script += " -M %s \\\n\t" % scoary_params["metadata_file"]
                self.script += " -O %s \\\n\t" % GWAS_dir
                # This option will create a new gene presence absence file with
                # correct sample names that are shared with the traits file
                self.script += " -P %s \\\n\t" % gene_presence_absence_file_loc
                if "metadata_samples_ID_field" in scoary_params:
                    self.script += " --S_MetaData %s \\\n\t" % scoary_params[
                        "metadata_samples_ID_field"]
                self.script += " --Fields_val %s \n\n" % scoary_params[
                    "traits_to_pars"]
                scoary_traits_file = os.path.join(GWAS_dir, 'Traits_file.csv')
                # The new gene presence absence file is in the GWAS dir and
                # it is the input for scoary
                gene_presence_absence_file_loc = os.path.join(
                    GWAS_dir, "gene_presence_absence.csv")

        # A traits file name is only set on the paths that also set GWAS_dir.
        if len(scoary_traits_file) > 0:
            self.script += "env %s \\\n" % self.params["env"]
            self.script += "%s \\\n\t" % scoary_params["script_path"]
            self.script += " -o %s \\\n\t" % GWAS_dir
            self.script += " -g %s \\\n\t" % gene_presence_absence_file_loc
            self.script += " -t %s \\\n\t" % scoary_traits_file
            if "use_cluster_tree" in scoary_params and "plot" in self.params:
                self.script += " -n %s \\\n\t" % os.path.join(
                    get_global_Sample_data(self, ["pan_genome_results_dir"]),
                    "pangenome_matrix.newick")
            else:
                self.script += " -u \\\n\t"
            if "Bonferroni_cutoff" in scoary_params:
                self.script += " -c B -p %s \\\n\t" % scoary_params[
                    "Bonferroni_cutoff"]
            elif "BH_cutoff" in scoary_params:
                self.script += " -c BH -p %s \\\n\t" % scoary_params["BH_cutoff"]
            if "permutations" in scoary_params:
                self.script += " -e %s \n\n" % scoary_params["permutations"]
            # Adding the results data
            set_global_Sample_data(self, ["GWAS_results_dir"], GWAS_dir)
            self.script += " \n\n"

    if "Bi-cluster" in self.params:
        if not module_has("Biclustering.R"):
            raise AssertionExcept(
                "The file %s is not found in the Roary module directory" %
                "Biclustering.R")
        gene_presence_absence_file_loc = get_global_Sample_data(
            self, ["presence_absence_matrix"])
        # Make a dir for the results file:
        bicluster_results_dir = self.make_folder_for_sample("Bicluster")
        # Running the bicluster script
        self.script += "env %s \\\n" % self.params["env"]
        self.script += "Rscript %s \\\n\t" % os.path.join(
            self.module_location, "Biclustering.R")
        temp_self_script = ""
        bicluster_params = self.params["Bi-cluster"]
        if isinstance(bicluster_params, dict):
            for par in bicluster_params.keys():
                value = bicluster_params[par]
                if par == "--Roary_Results":
                    self.write_warning(
                        "The '--Roary_Results' parameter in the Roary Bi-clustering analysis is ignored"
                    )
                elif par == "-o":
                    self.write_warning(
                        "The '-o' parameter in the Roary Bi-clustering analysis is ignored"
                    )
                elif par == "--Annotation":
                    if value == "VFDB":
                        # "VFDB" selects the annotation table shipped with the module.
                        if not module_has("VFDB_unified_VF_category_clustered.tsv"):
                            raise AssertionExcept(
                                "The file %s is not found in the Roary module directory"
                                % "VFDB_unified_VF_category_clustered.tsv")
                        temp_self_script += "%s %s \\\n\t" % (
                            par,
                            os.path.join(
                                self.module_location,
                                "VFDB_unified_VF_category_clustered.tsv"))
                    else:
                        temp_self_script += "%s %s \\\n\t" % (par, value)
                elif len(par) > 0:
                    if value is not None:
                        temp_self_script += "%s %s \\\n\t" % (par, value)
                    else:
                        # Value-less options are passed as bare flags.
                        temp_self_script += "%s \\\n\t" % par
        self.script += "--Roary_Results %s \\\n\t" % gene_presence_absence_file_loc
        self.script += temp_self_script
        self.script += "-o %s \\\n\t" % bicluster_results_dir
        self.script += " \n\n"

        set_global_Sample_data(self, ["Bicluster_results_dir"],
                               bicluster_results_dir)
        set_global_Sample_data(
            self, ["Bicluster_clusters"],
            os.path.join(bicluster_results_dir, "Bicluster_clusters"))

        # Run Gecko gene clusters analysis based on the Bi-clustering analysis
        gecko_params = self.params.get("Gecko")
        if isinstance(gecko_params, dict) and "script_path" in gecko_params:
            if gecko_params["script_path"] is None:
                raise AssertionExcept("No %s running command found" % "Gecko")
            if not module_has("GFF2Gecko3.py"):
                raise AssertionExcept(
                    "The file %s is not found in the Roary module directory" %
                    "GFF2Gecko3.py")
            Bicluster_clusters = get_global_Sample_data(
                self, ["Bicluster_clusters"])
            gene_presence_absence_file_loc = get_global_Sample_data(
                self, ["presence_absence_matrix"])
            # Make a dir for the results file:
            Gecko_results_dir = self.make_folder_for_sample("Gecko")
            # Running the GFF2Gecko3 script
            self.script += "env %s \\\n" % self.params["env"]
            self.script += "python %s \\\n\t" % os.path.join(
                self.module_location, "GFF2Gecko3.py")
            if "-p" in self.params["redir_params"]:
                self.script += "-P %s \\\n\t" % self.params["redir_params"]["-p"]
            self.script += "-D %s \\\n\t" % GFF_dir
            self.script += "-C %s \\\n\t" % gene_presence_absence_file_loc
            self.script += "-B %s \\\n\t" % Bicluster_clusters
            self.script += "-o %s \n\n" % os.path.join(Gecko_results_dir,
                                                       "Gecko.cog")
            # Start of the Gecko command itself
            self.script += "env %s \\\n" % self.params["env"]
            temp_self_script = ""
            Gecko_pars = list(gecko_params.keys())
            for par in Gecko_pars:
                value = gecko_params[par]
                if par == "-in":
                    self.write_warning(
                        "The '-in' parameter in the Roary Gecko analysis is ignored"
                    )
                elif par == "-out":
                    self.write_warning(
                        "The '-out' parameter in the Roary Gecko analysis is ignored"
                    )
                elif par in ("script_path", "-rO"):
                    # script_path is the executable, not an option; -rO is
                    # written once below with the output location appended.
                    continue
                elif len(par) > 0:
                    if value is not None:
                        temp_self_script += "%s %s \\\n\t" % (par, value)
                    else:
                        temp_self_script += "%s \\\n\t" % par
            self.script += "%s \\\n\t" % gecko_params["script_path"]
            # Defaults for options the user did not supply
            if "-r" not in Gecko_pars:
                temp_self_script += "-r Reference_clusters \\\n\t"
            if "-s" not in Gecko_pars:
                temp_self_script += "-s 2 \\\n\t"
            if "-d" not in Gecko_pars:
                temp_self_script += "-d 7 \\\n\t"
            if "-q" not in Gecko_pars:
                temp_self_script += "-q 2 \\\n\t"
            if "-rO" not in Gecko_pars:
                temp_self_script += "-rO zippedPdfs showFiltered %s \\\n\t" % os.path.join(
                    Gecko_results_dir, "Clusters.zip")
            else:
                temp_self_script += "-rO %s %s \\\n\t" % (
                    gecko_params["-rO"],
                    os.path.join(Gecko_results_dir, "Clusters"))
            self.script += "-in %s \\\n\t" % os.path.join(
                Gecko_results_dir, "Gecko.cog")
            self.script += "-out %s \\\n\t" % os.path.join(
                Gecko_results_dir, "Gecko.gck")
            self.script += temp_self_script
            self.script += " \n\n"
            set_global_Sample_data(self, ["Gecko_results_dir"],
                                   Gecko_results_dir)

    for sample in self.sample_data["samples"]:
        # Store Roary result location:
        set_Sample_data(
            self, sample, ["pan_genome_results_dir"],
            get_global_Sample_data(self, ["pan_genome_results_dir"]))

    # Wrapping up function.
    self.create_low_level_script()
def step_sample_initiation(self):
    """Check per-sample inputs and set each sample's ``reference`` slot.

    For every sample a warning is written when a ``sam`` slot already exists,
    and the slots required by ``self.params["mod"]`` are checked (``samse``:
    ``saiS`` and ``fastq.S``; ``sampe``: ``saiF``, ``saiR``, ``fastq.F`` and
    ``fastq.R``).

    Exactly one of ``scope`` and ``ref_index`` must be in ``self.params``:

    * with ``scope``, ``reference`` is copied from the project-level or
      sample-level ``bwa_fasta`` slot; ``ref_genome`` must not be given;
    * with ``ref_index``, ``reference`` is set to ``ref_genome`` when given,
      otherwise a warning is written.

    The whole configuration is validated before any sample is modified.

    Raises:
        AssertionExcept: on missing sample slots, conflicting or missing
            parameters, an invalid ``scope`` or a missing ``bwa_fasta`` slot.
    """
    for sample in self.sample_data["samples"]:
        sample_slots = self.sample_data[sample]
        if "sam" in sample_slots:
            self.write_warning(
                "SAM file exists for sample. Double mapping steps?\n", sample)

        if self.params["mod"] == "samse":
            if any(slot not in sample_slots for slot in ("saiS", "fastq.S")):
                raise AssertionExcept(
                    "'samse' requires sai and single-end fatsq files for the sample. Make sure you have a bwa aln step before this step and 'Single' files in the sample file.",
                    sample)
        if self.params["mod"] == "sampe":
            if any(slot not in sample_slots
                   for slot in ("saiF", "saiR", "fastq.F", "fastq.R")):
                raise AssertionExcept(
                    "'sampe' requires sai and paired-end fatsq files for the sample. Make sure you have a bwa aln step before this step and 'Forward' and 'Reverse' files in the sample file.",
                    sample)

    # Require either 'scope' or 'ref_index':
    if "scope" in self.params:
        if "ref_index" in self.params:
            raise AssertionExcept("Both 'scope' and 'ref_index' specified!\n")
        if "ref_genome" in self.params:
            # The step aborts here, so the message no longer claims that
            # ref_genome is merely ignored.
            raise AssertionExcept(
                "ref_genome was passed, and 'scope' was defined. Resolve!\n")
        scope = self.params["scope"]
        # Checked once up front so an invalid scope is reported even when
        # there are no samples.
        if scope not in ("project", "sample"):
            raise AssertionExcept("Scope must be either 'sample' or 'project'")

        # Loop over samples to set the reference genome:
        for sample in self.sample_data["samples"]:
            try:
                if scope == "project":
                    # Set project wide reference:
                    self.sample_data[sample]["reference"] = self.sample_data["bwa_fasta"]
                else:
                    # Set per-sample reference:
                    self.sample_data[sample]["reference"] = self.sample_data[sample]["bwa_fasta"]
            except KeyError:
                raise AssertionExcept(
                    "There is a mismatch between 'scope' and the existing bwa index\n",
                    sample)
    else:
        # If scope is not defined, require 'ref_index'
        if "ref_index" not in self.params:
            raise AssertionExcept(
                "Neither 'scope' nor 'ref_index' specified.\n")
        # Storing reference genome for use by downstream steps:
        if "ref_genome" in self.params:
            for sample in self.sample_data["samples"]:
                # An existing reference is overwritten, with a warning
                if "reference" in self.sample_data[sample]:
                    self.write_warning(
                        "ref_genome was passed, but a reference already exists. Setting reference to 'ref_genome'\n"
                    )
                self.sample_data[sample]["reference"] = self.params["ref_genome"]
        else:
            self.write_warning(
                "No reference given. It is highly recommended to give one!\n")
def step_sample_initiation(self):
    """Resolve the reference genome for every sample of this STAR step.

    Exactly one of the ``scope`` parameter and the ``--genomeDir`` redirect
    must be given:

    * with ``scope``, each sample's ``reference`` slot is copied from the
      project-level or sample-level ``STAR_fasta`` slot; ``ref_genome`` must
      not be given;
    * with ``--genomeDir``, ``reference`` is set to ``ref_genome`` when given
      (overwriting an existing reference with a warning), otherwise a warning
      is written.

    The configuration is validated before any sample is modified, and only
    ``KeyError`` (a missing slot) is translated into ``AssertionExcept``;
    the previous bare ``except:`` clauses caught every exception.

    Raises:
        AssertionExcept: on conflicting or missing parameters, an invalid
            ``scope`` or a missing ``STAR_fasta`` slot.
    """
    redir_params = self.params["redir_params"]

    # Require either 'scope' or '--genomeDir':
    if "scope" in self.params:
        if "--genomeDir" in redir_params:
            raise AssertionExcept(
                "Both 'scope' and '--genomeDir' specified!\n")
        if "ref_genome" in self.params:
            raise AssertionExcept(
                "ref_genome was passed, and 'scope' was defined. Resolve!\n")
        scope = self.params["scope"]
        # Checked once up front so an invalid scope is reported even when
        # there are no samples.
        if scope not in ("project", "sample"):
            raise AssertionExcept("Scope must be either 'sample' or 'project'")

        # Loop over samples to set the reference genome:
        for sample in self.sample_data["samples"]:
            if scope == "project":
                # Set project wide reference:
                try:
                    self.sample_data[sample]["reference"] = self.sample_data["STAR_fasta"]
                except KeyError:
                    raise AssertionExcept(
                        "No reference exists at 'project' scope. Do you have a STAR_builder step defined?"
                    )
            else:
                # Set per-sample reference:
                try:
                    self.sample_data[sample]["reference"] = self.sample_data[sample]["STAR_fasta"]
                except KeyError:
                    raise AssertionExcept(
                        "No reference exists at 'sample' scope. Do you have a STAR_builder step defined?",
                        sample)
    else:
        # If scope is not defined, require '--genomeDir'
        if "--genomeDir" not in redir_params:
            raise AssertionExcept(
                "Neither 'scope' nor '--genomeDir' specified.\n")
        # Storing reference genome for use by downstream steps:
        if "ref_genome" in self.params:
            for sample in self.sample_data["samples"]:
                # An existing reference is overwritten, with a warning
                if "reference" in self.sample_data[sample]:
                    self.write_warning(
                        "ref_genome was passed, but a reference already exists. Setting reference to 'ref_genome'\n"
                    )
                self.sample_data[sample]["reference"] = self.params["ref_genome"]
        else:
            self.write_warning(
                "No reference given. It is highly recommended to give one!\n")
def step_sample_initiation_byproject(self):
    """Check the ``inputs`` / ``outputs`` File_Type declarations at project level.

    Inputs: every entry must name a ``File_Type``. For entries whose
    ``scope`` is ``"project"`` that File_Type must be a key of
    ``self.sample_data``; otherwise it must be a key of every sample. A
    warning is written for inputs carrying a ``del`` key.

    Outputs: a named File_Type that is not yet in ``self.sample_data`` is
    registered with the placeholder value ``None``. Finding that placeholder
    again means two outputs declared the same File_Type, which is an error;
    finding a real value only produces an overwrite warning. Outputs may not
    carry a ``del`` key.

    Messages are built with a single ``%`` operation on a tuple; the previous
    chained ``%`` operations failed when a File_Type or sample name contained
    a ``%`` character.

    NOTE(review): the ``del`` test for outputs is placed at loop level,
    mirroring the inputs section; the nesting was reconstructed from a
    whitespace-collapsed copy - confirm.

    Raises:
        AssertionExcept: on any of the violations described above.
    """
    if len(get_File_Type_data(self.params, ["inputs"])) > 0:
        # Test if the input File_Types exist
        for inputs in self.params["inputs"].keys():
            # Looked up once; presumably get_File_Type_data is a pure lookup
            # (verify against its definition).
            file_type = get_File_Type_data(self.params["inputs"],
                                           [inputs, "File_Type"], None)
            if file_type is None:
                # The user did not specify a File_Type for the input argument
                raise AssertionExcept(
                    "You mast specify a File_Type argument in the input parameter: %s "
                    % inputs)

            # Test if the File_Type for the input argument exists
            if get_File_Type_data(self.params["inputs"],
                                  [inputs, "scope"]) == "project":
                if file_type not in self.sample_data.keys():
                    raise AssertionExcept(
                        "The File_Type %s is not found in the PROJECT level \n\t File_Types available are : %s"
                        % (file_type, list(self.sample_data.keys())))
            else:
                for sample in self.sample_data["samples"]:
                    if file_type not in self.sample_data[sample].keys():
                        raise AssertionExcept(
                            "The File_Type %s is not found in the SAMPLE level [in sample name %s] \n\t File_Types available are : %s"
                            % (file_type, sample,
                               list(self.sample_data[sample].keys())))

            if "del" in self.params["inputs"][inputs].keys():
                self.write_warning(
                    "!!! The file/directory in the input File_Type %s will be DELETED at the end of this step!!! "
                    % file_type)

    if len(get_File_Type_data(self.params, ["outputs"])) > 0:
        # Test the output File_Types
        for outputs in self.params["outputs"].keys():
            file_type = get_File_Type_data(self.params["outputs"],
                                           [outputs, "File_Type"], None)
            if file_type is not None:
                if file_type in self.sample_data.keys():
                    if self.sample_data[file_type] is None:
                        # The placeholder below was already set by another
                        # output argument of this step.
                        raise AssertionExcept(
                            "The output File_Type %s in the PROJECT level was defined more the once !!! "
                            % file_type)
                    else:
                        self.write_warning(
                            "The output File_Type %s already exists in the PROJECT level, it's content will be override !!! "
                            % file_type)
                else:
                    # If the File_Type does not exist, generate an empty File_Type
                    self.sample_data[file_type] = None

            if "del" in self.params["outputs"][outputs].keys():
                raise AssertionExcept("Output File_Types cannot be deleted")
def build_scripts(self):
    """Create one STAR mapping script per sample and register its outputs.

    For each sample the script is ``get_script_const()`` followed by
    ``--readFilesIn`` (paired files when a ``fastq.F`` slot exists, otherwise
    the ``fastq.S`` file) and ``--outFileNamePrefix``. Before the constant
    part is rendered, the ``--outSAMattrRGline`` redirect is set to carry the
    sample name as ``ID:`` and, when ``scope`` is defined, ``--genomeDir`` is
    set to the matching ``STAR_index`` slot.

    Expected output files are stored in the sample's slots according to
    ``self.output_type``, ``self.bam_types``, ``self.wig_type`` and the
    ``--quantMode`` redirect.

    Raises:
        AssertionExcept: when a sample has neither ``fastq.F`` nor ``fastq.S``.
    """
    # Each iteration must define the following class variables:
    # self.spec_script_name
    # self.script
    for sample in self.sample_data["samples"]:
        # Make a dir for the current sample:
        sample_dir = self.make_folder_for_sample(sample)

        # Name of specific script:
        self.spec_script_name = "_".join([self.step, self.name, sample])
        self.script = ""

        # This line should be left before every new script. It sees to local issues.
        use_dir = self.local_start(sample_dir)

        # Prefix for output files:
        output_prefix = sample + "_STAR_map"
        name_parts = (sample_dir, output_prefix)

        redirects = self.params["redir_params"]

        # Adding sample ID to ID: attribute:
        rg_line = "ID:{ID}".format(ID=sample)
        if "outSAMattrRGline" in self.params:
            rg_line = "ID:{ID} {rest}".format(
                ID=sample, rest=self.params["outSAMattrRGline"])
        redirects["--outSAMattrRGline"] = rg_line

        slots = self.sample_data[sample]

        # If using internal index, define it here:
        if "scope" in self.params:
            index_holder = slots if self.params["scope"] == "sample" else self.sample_data
            redirects["--genomeDir"] = index_holder["STAR_index"]

        # Get constant part of script:
        self.script += self.get_script_const()

        if "fastq.F" in slots:
            reads_arg = "--readFilesIn %s %s \\\n\t" % (slots["fastq.F"],
                                                        slots["fastq.R"])
        elif "fastq.S" in slots:
            reads_arg = "--readFilesIn %s \\\n\t" % slots["fastq.S"]
        else:
            raise AssertionExcept("No fastq files exist for sample!!\n",
                                  sample)
        self.script += reads_arg
        self.script += "--outFileNamePrefix %s%s. \n\n" % (use_dir,
                                                           output_prefix)

        def register(slot, template):
            """Store the output path for ``slot`` and stamp the file."""
            slots[slot] = template % name_parts
            self.stamp_file(slots[slot])

        # Alignment output ("None" stores nothing)
        if self.output_type == "SAM":
            register("sam", "%s%s.Aligned.out.sam")
        elif self.output_type == "BAM":
            if "Unsorted" in self.bam_types:
                slots["bam"] = "%s%s.Aligned.out.bam" % name_parts
                register("bam_unsorted", "%s%s.Aligned.out.bam")
            if "SortedByCoordinate" in self.bam_types:
                # When both types are requested the sorted file wins the "bam" slot
                register("bam", "%s%s.Aligned.sortedByCoord.out.bam")

        # Signal tracks: strand 2 files are registered only for stranded
        # output (the default when --outWigStrand is not given).
        if self.wig_type in ("bedGraph", "wiggle"):
            stranded = ("--outWigStrand" not in redirects
                        or redirects["--outWigStrand"] == "Stranded")
            if self.wig_type == "bedGraph":
                if stranded:
                    register("bdg2_UniqueMultiple",
                             "%s%s.Signal.UniqueMultiple.str2.out.bg")
                    register("bdg2_Unique", "%s%s.Signal.Unique.str2.out.bg")
                register("bdg1_UniqueMultiple",
                         "%s%s.Signal.UniqueMultiple.str1.out.bg")
                register("bdg1_Unique", "%s%s.Signal.Unique.str1.out.bg")
                slots["bdg"] = slots["bdg1_UniqueMultiple"]
            else:
                if stranded:
                    register("wig2_UniqueMultiple",
                             "%s%s.Signal.UniqueMultiple.str2.out.wig")
                    register("wig2_Unique", "%s%s.Signal.Unique.str2.out.wig")
                register("wig1_UniqueMultiple",
                         "%s%s.Signal.UniqueMultiple.str1.out.wig")
                register("wig1_Unique", "%s%s.Signal.Unique.str1.out.wig")
                slots["wig"] = slots["wig1_UniqueMultiple"]

        # "GeneCounts" output is not stored (not supported yet)
        if redirects.get("--quantMode") == "TranscriptomeSAM":
            slots["bam_transcriptome"] = "%s%s.Aligned.toTranscriptome.out.bam" % name_parts

        # Storing name of mapper. might be useful:
        slots["mapper"] = self.get_step_step()

        # Storing reference genome for use by downstream steps:
        if "ref_genome" in self.params:
            slots["reference"] = self.params["ref_genome"]

        # Move all files from temporary local dir to permanent base_dir
        self.local_finish(use_dir, self.base_dir)  # Sees to copying local files to final destination (and other stuff)

        self.create_low_level_script()
def build_scripts(self):
    """Write the variant-calling script(s) for the configured scope.

    When ``params["scope"]`` is ``"project"`` a single script is built that
    receives the one reference file shared by all samples plus every
    sample's BAM file. For any other scope value one script per sample is
    built from that sample's own reference and BAM file.

    The result is stored under ``vcf`` when ``params["output_type"]`` is
    ``"vcf"`` and under ``gvcf`` otherwise; the matching ``*.source`` slot
    is set to ``"freebayes"``.

    Raises:
        AssertionExcept: in project scope, if the samples refer to more
            than one reference file.
    """
    # Each script must define self.spec_script_name and self.script.

    if self.params["scope"] != "project":
        for sample in self.sample_data["samples"]:
            # Make a dir for the current sample:
            sample_dir = self.make_folder_for_sample(sample)

            # NOTE(review): the literal True is passed here, not the sample
            # name -- confirm against set_spec_script_name().
            self.set_spec_script_name(sample=True)
            self.script = ""

            # local_start() returns the directory the script should write to.
            use_dir = self.local_start(sample_dir)

            slots = self.sample_data[sample]
            reference_fasta = slots["reference"]

            self.script += self.get_script_const()
            self.script += "-f %s \\\n\t" % reference_fasta
            self.script += "-b %s \\\n\t" % slots["bam"]

            if self.params["output_type"] == "vcf":
                result_slot, source_slot = "vcf", "vcf.source"
                self.script += "--vcf %s%s_%s.vcf \n\n" % (
                    use_dir, sample, self.get_step_name())
                slots[result_slot] = "%s%s_%s.vcf" % (
                    sample_dir, sample, self.get_step_name())
            else:  # output_type = "gvcf"
                result_slot, source_slot = "gvcf", "gvcf.source"
                self.script += "--gvcf %s%s_%s.gvcf \n\n" % (
                    use_dir, sample, self.get_step_name())
                slots[result_slot] = "%s%s_%s.gvcf" % (
                    sample_dir, sample, self.get_step_name())
            slots[source_slot] = "freebayes"
            self.stamp_file(slots[result_slot])

            # Move all files from temporary local dir to permanent dir
            self.local_finish(use_dir, sample_dir)
            self.create_low_level_script()
        return

    # ---- project scope: one script over all samples ----
    self.set_spec_script_name()
    self.script = ""
    use_dir = self.local_start(self.base_dir)

    sample_names = self.sample_data["samples"]
    title = self.sample_data["Title"]

    # All samples must share one reference; a set removes duplicates.
    references = {self.sample_data[name]["reference"] for name in sample_names}
    if len(references) > 1:
        raise AssertionExcept(
            "There is more than one reference file for the samples. Weird!!!")
    reference_fasta = list(references)[0]

    self.script += self.get_script_const()
    self.script += "-f %s \\\n\t" % reference_fasta
    self.script += "".join(
        "-b %s \\\n\t" % self.sample_data[name]["bam"]
        for name in sample_names)

    if self.params["output_type"] == "vcf":
        result_slot, source_slot = "vcf", "vcf.source"
        self.script += "--vcf %s%s_%s.vcf \n\n" % (
            use_dir, title, self.get_step_name())
        self.sample_data[result_slot] = "%s%s_%s.vcf" % (
            self.base_dir, title, self.get_step_name())
    else:  # output_type = "gvcf"
        result_slot, source_slot = "gvcf", "gvcf.source"
        self.script += "--gvcf %s%s_%s.gvcf \n\n" % (
            use_dir, title, self.get_step_name())
        self.sample_data[result_slot] = "%s%s_%s.gvcf" % (
            self.base_dir, title, self.get_step_name())
    self.sample_data[source_slot] = "freebayes"
    self.stamp_file(self.sample_data[result_slot])

    # Move all files from temporary local dir to permanent base_dir
    self.local_finish(use_dir, self.base_dir)
    self.create_low_level_script()
def build_scripts(self):
    """Build the BLAST-result parsing script(s), project- or sample-wise.

    For ``params["scope"] == "project"`` one script is written for the
    project-level ``blast.nucl`` / ``blast.prot`` result; for any other
    scope value one script is written per sample. When both result types
    exist, ``params["fasta2use"]`` (``"nucl"`` or ``"prot"``) selects one.
    If ``extract_fasta`` is in ``params``, the fasta file of the same type
    is passed to the script via ``--fasta2extract``.

    The parsed output path is stored in the ``blast.parsed`` and
    ``blast.parsed.<type>`` slots.

    Raises:
        AssertionExcept: if no BLAST result exists, if both types exist
            and ``fasta2use`` does not pick one, or if ``extract_fasta``
            is requested but the matching fasta slot is missing.
    """
    if self.params["scope"] == "project":
        # Name of specific script:
        self.spec_script_name = "_".join(
            [self.step, self.name, self.sample_data["Title"]])
        self.script = ""

        # This line should be left before every new script. It sees to local issues.
        # Use the dir it returns as the base_dir for this step.
        use_dir = self.local_start(self.base_dir)

        # Define output filename
        output_filename = "".join([self.sample_data["Title"], self.file_tag])

        # Decide which BLAST result type to parse:
        if ("blast.nucl" in self.sample_data
                and "blast.prot" in self.sample_data):
            if ("fasta2use" in self.params
                    and self.params["fasta2use"] in ("nucl", "prot")):
                fasta2use = self.params["fasta2use"]
            else:
                raise AssertionExcept(
                    "Project has both 'nucl' and 'prot' blast results. Select one by specifying the 'fasta2use' parameter."
                )
        elif "blast.nucl" in self.sample_data:
            fasta2use = "nucl"
        elif "blast.prot" in self.sample_data:
            fasta2use = "prot"
        else:
            raise AssertionExcept("No BLAST Results defined\n")

        self.script += self.get_script_const()
        self.script += "--blast %s \\\n\t" % self.sample_data["blast." + fasta2use]

        # FASTA Extraction
        if "extract_fasta" in self.params:
            try:
                self.script += "--fasta2extract %s \\\n\t" % self.sample_data[
                    "fasta." + fasta2use]
            except KeyError:  # was `keyError`, which raised NameError instead
                raise AssertionExcept(
                    "In order to extract the fasta sequences, you need to have a project wide fasta file defined with the same type as the blast type."
                )

        self.script += "--output %s\n\n" % os.sep.join(
            [use_dir, output_filename])

        # Store BLAST result file:
        self.sample_data["blast.parsed"] = "".join(
            [self.base_dir, output_filename])
        self.sample_data["blast.parsed." + fasta2use] = \
            self.sample_data["blast.parsed"]
        self.stamp_file(self.sample_data["blast.parsed"])

        # Wrapping up function. Leave these lines at the end of every iteration:
        self.local_finish(use_dir, self.base_dir)
        self.create_low_level_script()

    else:  # self.params["scope"] == "sample"
        for sample in self.sample_data["samples"]:
            # Name of specific script:
            self.spec_script_name = "_".join([self.step, self.name, sample])
            self.script = ""

            # Make a dir for the current sample:
            sample_dir = self.make_folder_for_sample(sample)

            # This line should be left before every new script. It sees to local issues.
            # Use the dir it returns as the base_dir for this step.
            use_dir = self.local_start(sample_dir)

            # Define output filename
            output_filename = "".join([use_dir, sample, self.file_tag])

            sample_slots = self.sample_data[sample]

            # Decide which BLAST result type to parse:
            if "blast.nucl" in sample_slots and "blast.prot" in sample_slots:
                if ("fasta2use" in self.params
                        and self.params["fasta2use"] in ("nucl", "prot")):
                    fasta2use = self.params["fasta2use"]
                else:
                    raise AssertionExcept(
                        "Sample has both 'nucl' and 'prot' blast results. Select one by specifying the 'fasta2use' parameter.",
                        sample)
            elif "blast.nucl" in sample_slots:
                fasta2use = "nucl"
            elif "blast.prot" in sample_slots:
                fasta2use = "prot"
            else:
                # Name the offending sample, as the other errors here do.
                raise AssertionExcept("No BLAST Results defined\n", sample)

            # Define the actual script:
            self.script += self.get_script_const()
            self.script += "--blast %s \\\n\t" % sample_slots["blast." + fasta2use]

            # FASTA Extraction
            if "extract_fasta" in self.params:
                try:
                    self.script += "--fasta2extract %s \\\n\t" % sample_slots[
                        "fasta." + fasta2use]
                except KeyError:  # was `keyError`, which raised NameError instead
                    raise AssertionExcept(
                        "In order to extract the fasta sequences, you need to have a fasta file defined with the same type as the blast type.",
                        sample)

            self.script += "--output %s\n\n" % output_filename

            # Store BLAST result file:
            sample_slots["blast.parsed"] = "".join(
                [sample_dir, sample, self.file_tag])
            sample_slots["blast.parsed." + fasta2use] = \
                sample_slots["blast.parsed"]
            self.stamp_file(sample_slots["blast.parsed"])

            # Wrapping up function. Leave these lines at the end of every iteration:
            self.local_finish(use_dir, sample_dir)
            self.create_low_level_script()
def step_specific_init(self):
    """Set step defaults and reject a user-supplied output directory.

    Sets ``self.shell`` to ``"csh"`` and ``self.file_tag`` to
    ``"trim_galore.fq"``.

    Raises:
        AssertionExcept: if ``--output_dir`` or ``-o`` appears in
            ``params["redir_params"]``.
    """
    self.shell = "csh"  # Can be set to "bash" by inheriting instances
    self.file_tag = "trim_galore.fq"

    redirected = self.params["redir_params"]
    forbidden_flags = ("--output_dir", "-o")
    if any(flag in redirected for flag in forbidden_flags):
        raise AssertionExcept("You should not give output directory\n")
def build_scripts(self):
    """Build the clustering script(s) for the project or for each sample.

    The input is the ``fasta.nucl`` slot when ``self.type`` is ``"nucl"``
    and the ``fasta.prot`` slot otherwise. The output file keeps the input's
    base name; it replaces the ``fasta.<type>`` slot and is also stored
    under ``cd_hit.<type>``.

    With ``params["scope"] == "project"`` one project-level script is
    written; for any other scope value one script is written per sample.

    Raises:
        AssertionExcept: if the required fasta slot does not exist (the
            sample name is attached in sample scope).
    """
    # Each iteration must define the following class variables:
    # self.spec_script_name
    # self.script

    if self.params["scope"] == "project":
        # Name of specific script:
        self.spec_script_name = "_".join(
            [self.step, self.name, self.sample_data["Title"]])
        self.script = ""

        # This line should be left before every new script. It sees to local issues.
        # Use the dir it returns as the base_dir for this step.
        use_dir = self.local_start(self.base_dir)

        # Only a missing slot is an expected failure; a bare `except`
        # would also hide unrelated errors.
        if self.type == "nucl":
            try:
                input_file = self.sample_data["fasta.nucl"]
            except KeyError:
                raise AssertionExcept(
                    "`nucl` fasta file does not exist at project scope. Did you mean cd-hit instead of cd-hit-est?"
                )
        else:  # == "prot"
            try:
                input_file = self.sample_data["fasta.prot"]
            except KeyError:
                raise AssertionExcept(
                    "`prot` fasta file does not exist at project scope. Did you mean cd-hit-est instead of cd-hit?"
                )

        output_prefix = os.path.basename(input_file)

        # Get constant part of script:
        self.script += self.get_script_const()
        self.script += "-i {infn} \\\n\t".format(infn=input_file)
        self.script += "-o {outdir}{ossep}{outfn} \n\n".format(
            outdir=use_dir, ossep=os.sep, outfn=output_prefix)

        self.sample_data["fasta." + self.type] = \
            "{outdir}{ossep}{outfn}".format(
                outdir=self.base_dir, ossep=os.sep, outfn=output_prefix)
        self.sample_data["cd_hit." + self.type] = \
            self.sample_data["fasta." + self.type]
        self.stamp_file(self.sample_data["fasta." + self.type])

        # Move all files from temporary local dir to permanent base_dir
        self.local_finish(use_dir, self.base_dir)
        self.create_low_level_script()

    else:
        for sample in self.sample_data["samples"]:
            # Make a dir for the current sample:
            sample_dir = self.make_folder_for_sample(sample)

            # Name of specific script:
            self.spec_script_name = "_".join([self.step, self.name, sample])
            self.script = ""

            # This line should be left before every new script. It sees to local issues.
            # Use the dir it returns as the base_dir for this step.
            use_dir = self.local_start(sample_dir)

            # The messages previously said "project scope" here and did
            # not name the sample.
            if self.type == "nucl":
                try:
                    input_file = self.sample_data[sample]["fasta.nucl"]
                except KeyError:
                    raise AssertionExcept(
                        "`nucl` fasta file does not exist at sample scope. Did you mean cd-hit instead of cd-hit-est?",
                        sample)
            else:  # == "prot"
                try:
                    input_file = self.sample_data[sample]["fasta.prot"]
                except KeyError:
                    raise AssertionExcept(
                        "`prot` fasta file does not exist at sample scope. Did you mean cd-hit-est instead of cd-hit?",
                        sample)

            output_prefix = os.path.basename(input_file)

            # Get constant part of script:
            self.script += self.get_script_const()
            self.script += "-i {infn} \\\n\t".format(infn=input_file)
            self.script += "-o {outdir}{ossep}{outfn} \n\n".format(
                outdir=use_dir, ossep=os.sep, outfn=output_prefix)

            self.sample_data[sample]["fasta." + self.type] = \
                "{outdir}{ossep}{outfn}".format(
                    outdir=sample_dir, ossep=os.sep, outfn=output_prefix)
            self.sample_data[sample]["cd_hit." + self.type] = \
                self.sample_data[sample]["fasta." + self.type]
            self.stamp_file(self.sample_data[sample]["fasta." + self.type])

            # Move all files from temporary local dir to permanent base_dir
            self.local_finish(use_dir, sample_dir)
            self.create_low_level_script()
def build_scripts(self):
    """Build one annotation script per sample.

    For each sample the script is given ``--outdir``, ``--locustag``,
    ``--strain`` and ``--prefix`` (all but ``--outdir`` set to the sample
    name) followed by the sample's ``fasta.nucl`` file. A literal
    ``--proteins VFDB`` in the constant part of the script is replaced by
    the path of ``Virulence_Resistance.fasta`` inside
    ``self.module_location``.

    If ``generate_GFF_dir`` is in ``params``, a ``GFF`` folder is created,
    stored in ``sample_data["GFF_dir"]``, and each script copies its GFF
    file there.

    The sample's ``GFF``, ``fasta.nucl`` and ``fasta.prot`` slots are set
    to the ``.gff``, ``.ffn`` and ``.faa`` files in the sample directory.

    Raises:
        AssertionExcept: if ``--proteins VFDB`` is requested but
            ``Virulence_Resistance.fasta`` is not found.
    """
    # Each iteration must define the following class variables:
    # spec_script_name
    # script

    collect_gff = "generate_GFF_dir" in self.params
    if collect_gff:
        # Make a dir for the GFF files:
        GFF_dir = self.make_folder_for_sample("GFF")
        self.sample_data["GFF_dir"] = GFF_dir

    for sample in self.sample_data["samples"]:
        # Name of specific script:
        self.spec_script_name = "_".join([self.step, self.name, sample])
        self.script = ""

        # Make a dir for the current sample:
        sample_dir = self.make_folder_for_sample(sample)

        # This line should be left before every new script. It sees to local issues.
        # Use the dir it returns as the base_dir for this step.
        use_dir = self.local_start(sample_dir)

        self.script += self.get_script_const()

        if "--proteins VFDB" in self.script:
            vfdb_fasta = os.path.join(
                self.module_location, "Virulence_Resistance.fasta")
            if not os.path.isfile(vfdb_fasta):
                raise AssertionExcept(
                    "The file %s is not found in the Prokka module directory"
                    % "Virulence_Resistance.fasta")
            self.script = self.script.replace(
                "--proteins VFDB", "--proteins %s" % vfdb_fasta)

        self.script += "--outdir %s \\\n\t" % use_dir
        self.script += "--locustag %s \\\n\t" % sample
        self.script += "--strain %s \\\n\t" % sample
        self.script += "--prefix %s \\\n\t" % sample
        self.script += "%s \n\n" % self.sample_data[sample]["fasta.nucl"]

        gff_file = os.path.join(sample_dir, sample + ".gff")
        if collect_gff:
            # Format both values in one step: the previous chained
            # "... %%s" % a % b re-interpreted any '%' inside the path.
            self.script += "cp %s %s \n\n" % (gff_file, GFF_dir)

        # Store Prokka result files:
        self.sample_data[sample]["GFF"] = gff_file
        self.sample_data[sample]["fasta.nucl"] = os.path.join(
            sample_dir, sample + ".ffn")
        self.sample_data[sample]["fasta.prot"] = os.path.join(
            sample_dir, sample + ".faa")

        # Wrapping up function. Leave these lines at the end of every iteration:
        self.local_finish(use_dir, sample_dir)
        self.create_low_level_script()
def step_sample_initiation(self):
    """Validate that the fasta input exists and settle the step's scope.

    * ``scope == "project"``: requires ``sample_data["fasta"]["nucl"]``,
      creates an empty project ``assembly`` slot if missing, and warns
      that ``compare_mode`` is ignored when it was passed.
    * ``scope == "sample"``: requires ``["fasta"]["nucl"]`` in every
      sample and creates an empty ``assembly`` slot per sample if missing.
    * no ``scope``: warns, then sets ``params["scope"]`` to ``"project"``
      when a project-wide fasta exists, otherwise to ``"sample"`` after
      checking every sample.

    Raises:
        AssertionExcept: if the required fasta slot is missing or
            ``scope`` has any other value.
    """

    def slot_exists(root, *path):
        """Return True if indexing ``root`` along ``path`` raises no KeyError."""
        node = root
        try:
            for key in path:
                node = node[key]
        except KeyError:
            return False
        return True

    def ensure_assembly_slot(owner):
        """Create an empty 'assembly' slot in ``owner`` unless one exists."""
        # The slot can be absent when the fasta was loaded as input
        # rather than produced by an assembler.
        if not slot_exists(owner, "assembly"):
            owner["assembly"] = {}

    def require_sample_fasta(sample_name):
        """Raise unless the sample has a ["fasta"]["nucl"] slot."""
        if not slot_exists(self.sample_data, sample_name, "fasta", "nucl"):
            raise AssertionExcept(
                "You are trying to run QUAST with no assembly.\n",
                sample_name)

    if "scope" not in self.params:
        self.write_warning("'scope' not passed. Will try guessing...")

        # Is there a mega-assembly?
        if slot_exists(self.sample_data, "fasta", "nucl"):
            self.write_warning(
                "There is a project-wide assembly. Using it.\n")
            ensure_assembly_slot(self.sample_data)
            self.params["scope"] = "project"
        else:
            # No. Check that all samples have assemblies:
            for sample in self.sample_data["samples"]:
                require_sample_fasta(sample)
            self.params["scope"] = "sample"
        return

    requested_scope = self.params["scope"]

    if requested_scope == "project":
        if not slot_exists(self.sample_data, "fasta", "nucl"):
            raise AssertionExcept("No project wide assembly!")
        ensure_assembly_slot(self.sample_data)
        if "compare_mode" in self.params:
            self.write_warning("Ignoring 'compare_mode' in project scope")

    elif requested_scope == "sample":
        for sample in self.sample_data["samples"]:
            require_sample_fasta(sample)
            ensure_assembly_slot(self.sample_data[sample])

    else:
        raise AssertionExcept(
            "'scope' must be either 'project' or 'sample'")
def step_specific_init(self):
    """Set the default shell and insist on a ``genome`` parameter.

    Raises:
        AssertionExcept: if ``genome`` is not a key of ``self.params``.
    """
    self.shell = "csh"  # Can be set to "bash" by inheriting instances

    genome_given = "genome" in self.params
    if genome_given:
        return
    raise AssertionExcept("You must pass a 'genome' parameter!")
def build_scripts(self):
    """Build one peak-calling script per sample listed in ``Controls``.

    ``sample_data["Controls"]`` maps each treatment sample to its control
    sample. The treatment BAM is passed with ``-t``; the control BAM is
    passed with ``-c`` unless ``nocontrol`` is in ``params``.

    Slots set per sample: ``macs2_prefix``, ``peak_bed``, ``peak_xls``,
    ``summit_bed`` and ``bed`` (an alias of ``peak_bed``). When ``--bdg`` or
    ``-B`` is in ``redir_params``, the bedgraph slots ``control_lambda``,
    ``treat_pileup`` and ``bdg`` are set too, and the control sample's
    ``bdg`` is set to the control-lambda file.

    Optional additions to the script:

    * ``bedToBigBed_path`` (needs ``chrom.sizes``): convert the peak bed
      file to bigBed; stored in the ``bb`` slot.
    * ``getfasta``: extract peak sequences from the sample's ``reference``;
      stored in ``peak_fasta`` and ``fasta.nucl``. Without a reference
      only a warning is issued.

    Raises:
        AssertionExcept: if ``bedToBigBed_path`` is passed without
            ``chrom.sizes``.
    """
    # Each iteration must define the following class variables:
    # self.spec_script_name
    # self.script

    for sample in self.sample_data["Controls"]:
        # Make a dir for the current sample:
        sample_dir = self.make_folder_for_sample(sample)

        # Name of specific script:
        self.spec_script_name = "_".join([self.step, self.name, sample])
        self.script = ""

        # Name of control sample:
        control = self.sample_data["Controls"][sample]

        # This line should be left before every new script. It sees to local issues.
        # Use the dir it returns as the base_dir for this step.
        use_dir = self.local_start(sample_dir)

        # Prefix of all output file names
        output_filename = "%s.%s" % (sample, self.file_tag)

        self.script += self.get_script_const()

        # Add lines for sample mapping files:
        self.script += "-t %s \\\n\t" % self.sample_data[sample]["bam"]
        if "nocontrol" not in self.params:
            self.script += "-c %s \\\n\t" % self.sample_data[control]["bam"]

        # --name must end with a line continuation: terminating the
        # command here left --outdir on a shell line of its own.
        self.script += "--name %s \\\n\t" % output_filename
        self.script += "--outdir %s \n\n" % use_dir

        # Storing the output files in sample_data
        sample_slots = self.sample_data[sample]
        out_prefix = "".join([sample_dir, output_filename])
        sample_slots["macs2_prefix"] = out_prefix
        sample_slots["peak_bed"] = out_prefix + "_peaks.bed"
        sample_slots["peak_xls"] = out_prefix + "_peaks.xls"
        sample_slots["summit_bed"] = out_prefix + "_summits.bed"
        # Set active bed to peak_bed. Maybe let user decide?
        sample_slots["bed"] = sample_slots["peak_bed"]

        self.stamp_file(sample_slots["peak_bed"])
        self.stamp_file(sample_slots["peak_xls"])
        self.stamp_file(sample_slots["summit_bed"])

        # Storing bedgraph files if they should exist:
        if ("--bdg" in self.params["redir_params"]
                or "-B" in self.params["redir_params"]):
            sample_slots["control_lambda"] = out_prefix + "_control_lambda.bdg"
            sample_slots["treat_pileup"] = out_prefix + "_treat_pileup.bdg"
            # Saving the treatment pileup bdg file as the main mapping bdg
            # (it is a derivation of the mapping data)
            sample_slots["bdg"] = out_prefix + "_treat_pileup.bdg"
            # Saving the control lambda bdg file as the main control bdg
            self.sample_data[control]["bdg"] = \
                out_prefix + "_control_lambda.bdg"

            # Stamping all bdg files
            self.stamp_file(sample_slots["control_lambda"])
            self.stamp_file(sample_slots["treat_pileup"])

        ##############################
        # Add conversion of peak bed to bigbed
        if "bedToBigBed_path" in self.params:
            if "chrom.sizes" not in self.params:
                raise AssertionExcept(
                    "If bedToBigBed_path is passed, you also must specify a 'chrom.sizes' path")

            out_bed_filename = "%s.cut.bed" % sample_slots["bed"]
            out_bb_filename = "%s.cut.bb" % sample_slots["bed"]

            # Probably better to use the following perl one-liner:
            # perl -e 'while($line=<>){$line=~s/((?:\S*\s*){3})\.\d*(\s.*)/$1$2/; print $line}' bed_file
            # This will retain only the int part in column 4 while leaving the rest intact.
            self.script += """
#### Convert bed to bigbed for UCSC browser

if [ -e %(in_bed)s ]
then
    # First, removing final column which has a float instead of an integer:
    cat \\
        %(in_bed)s \\
        | cut -f 1-4 \\
        > %(out_bed)s
    # Convert to bb
    %(exec_path)s \\
        %(out_bed)s \\
        %(chrom_sizes)s \\
        %(out_bb)s
fi

""" % {"in_bed": sample_slots["bed"],
       "exec_path": self.params["bedToBigBed_path"],
       "chrom_sizes": self.params["chrom.sizes"],
       "out_bb": out_bb_filename,
       "out_bed": out_bed_filename}

            sample_slots["bb"] = out_bb_filename
            # Stamping bb file
            self.stamp_file(sample_slots["bb"])

        ##############################
        # Add extraction of peak fasta sequences
        if "getfasta" in self.params:
            if "reference" not in sample_slots:
                # The sample name was never substituted into this message.
                self.write_warning(
                    "In %s: No reference exists, but you asked for a fasta file for the peaks. \n\tIn order to get the file you have to set a reference genome in the mapping step (Bowtie in particular)\n"
                    % sample)
            else:
                self.script += """
# Extract peaks from BED file to fasta file:
if [ -e %(peaks)s ]
then
    %(exec_path)s -fi %(ref_fasta)s \\
        -bed %(bed_file)s > %(bed_file)s.fasta
fi

""" % {"peaks": sample_slots["peak_bed"],
       "exec_path": self.params["getfasta"],
       "ref_fasta": sample_slots["reference"],
       "bed_file": sample_slots["bed"]}

                sample_slots["peak_fasta"] = "%s.fasta" % sample_slots["bed"]
                sample_slots["fasta.nucl"] = sample_slots["peak_fasta"]
                # Stamping fasta file
                self.stamp_file(sample_slots["peak_fasta"])

        # Wrapping up function. Leave these lines at the end of every iteration:
        self.local_finish(use_dir, sample_dir)
        self.create_low_level_script()
def build_scripts(self):
    """Build the assembly script(s): one for the project or one per sample.

    With ``params["scope"] == "project"`` the reads of all samples are
    passed together as comma-separated lists (``-1``/``-2`` for paired-end
    samples, ``-r`` for single-end samples). For any other scope value one
    script per sample is written with that sample's reads.

    Every script first removes the output directory, since the assembler
    requires that it does not exist. The resulting contigs file
    (``<prefix>.contigs.fa``) is stored in the ``fasta.nucl`` and
    ``<step>_contigs`` slots, and the step name in the ``assembler`` slot.

    Raises:
        AssertionExcept: if a sample's ``type`` contains neither ``PE``
            nor ``SE``.
    """
    if self.params["scope"] == "project":
        # Name of specific script:
        self.spec_script_name = "_".join(
            [self.step, self.name, self.sample_data["Title"]])
        self.script = ""

        # This line should be left before every new script. It sees to local issues.
        # Use the dir it returns as the base_dir for this step.
        use_dir = self.local_start(self.base_dir)

        # Megahit requires that the output dir not exist! Removing:
        self.script += "rm -rf %s\n\n" % use_dir

        out_prefix = self.sample_data["Title"] + self.file_tag

        self.script += self.get_script_const()

        forward_reads = []
        reverse_reads = []
        single_reads = []
        for sample in self.sample_data["samples"]:
            sample_slots = self.sample_data[sample]
            has_pe = "PE" in sample_slots["type"]
            has_se = "SE" in sample_slots["type"]
            if not has_pe and not has_se:
                raise AssertionExcept(
                    "Strange type configuration for sample\n", sample)
            if has_pe:
                forward_reads.append(sample_slots["fastq.F"])
                reverse_reads.append(sample_slots["fastq.R"])
            if has_se:
                single_reads.append(sample_slots["fastq.S"])
            # Interlaced reads are not treated here. Maybe one day...

        # Joining avoids stripping a character set off both ends of the list.
        list_sep = ",\\\n\t\t"
        if forward_reads:
            self.script += "-1 " + list_sep.join(forward_reads) + " \\\n\t"
            self.script += "-2 " + list_sep.join(reverse_reads) + " \\\n\t"
        if single_reads:
            self.script += "-r " + list_sep.join(single_reads) + " \\\n\t"

        self.script += "--out-dir %s \\\n\t" % use_dir
        self.script += "--out-prefix %s \n\n" % out_prefix

        # Store results to fasta and assembly slots:
        contigs_slot = self.get_step_step() + "_contigs"
        self.sample_data["fasta.nucl"] = \
            self.base_dir + out_prefix + ".contigs.fa"
        self.sample_data[contigs_slot] = self.sample_data["fasta.nucl"]
        self.sample_data["assembler"] = self.get_step_step()

        self.stamp_file(self.sample_data[contigs_slot])

        # Wrapping up function. Leave these lines at the end of every iteration:
        self.local_finish(use_dir, self.base_dir)
        self.create_low_level_script()

    else:  # self.params["scope"] == "sample"
        # Each iteration must define the following class variables:
        # spec_script_name
        # script
        for sample in self.sample_data["samples"]:
            # Name of specific script:
            self.spec_script_name = "_".join([self.step, self.name, sample])
            self.script = ""

            # Make a dir for the current sample:
            sample_dir = self.make_folder_for_sample(sample)

            # This line should be left before every new script. It sees to local issues.
            # Use the dir it returns as the base_dir for this step.
            use_dir = self.local_start(sample_dir)

            # Megahit requires that the output dir not exist! Removing:
            self.script += "rm -rf %s\n\n" % use_dir

            out_prefix = sample + self.file_tag

            self.script += self.get_script_const()

            # Write into use_dir -- the directory removed above and the one
            # local_finish() copies from -- as the project branch does.
            self.script += "--out-dir %s \\\n\t" % use_dir
            self.script += "--out-prefix %s \\\n\t" % out_prefix

            # Collect read arguments independently, so that a mixed PE+SE
            # sample gets all three files. Previously the mixed case sat
            # behind `if "PE"` and could never be reached.
            sample_slots = self.sample_data[sample]
            read_args = []
            if "PE" in sample_slots["type"]:
                read_args.append("-1 %s" % sample_slots["fastq.F"])
                read_args.append("-2 %s" % sample_slots["fastq.R"])
            if "SE" in sample_slots["type"]:
                read_args.append("-r %s" % sample_slots["fastq.S"])
            if not read_args:
                raise AssertionExcept(
                    "Strange type configuration for sample\n", sample)
            self.script += " \\\n\t".join(read_args) + " \n\n"

            # Store results to fasta and assembly slots:
            contigs_slot = self.get_step_step() + "_contigs"
            sample_slots["fasta.nucl"] = \
                sample_dir + out_prefix + ".contigs.fa"
            sample_slots[contigs_slot] = sample_slots["fasta.nucl"]
            sample_slots["assembler"] = self.get_step_step()

            self.stamp_file(sample_slots[contigs_slot])

            # Wrapping up function. Leave these lines at the end of every iteration:
            self.local_finish(use_dir, sample_dir)
            self.create_low_level_script()
def build_scripts(self):
    """Build the ``mpileup | varscan`` shell script(s) for this step.

    When ``self.params["scope"]`` is ``"project"``, a single script is
    created: the BAM files of all samples are piled up against their common
    reference and the pileup is piped into the program at
    ``self.params["script_path"]``.  For any other scope, one such script is
    created per sample, inside that sample's own folder.

    For every script the method:

    * writes ``sample_list.txt`` (one sample name per line) into the
      permanent output directory and passes it via ``--vcf-sample-list``;
    * redirects the output into the directory returned by
      ``self.local_start`` and hands that directory, together with the
      permanent one, to ``self.local_finish``;
    * records the *permanent* output path in the ``vcf`` slot when
      ``--output-vcf`` is among the redirected parameters, otherwise in the
      ``variants`` slot (with an ``unknown`` file suffix), and stamps it.

    Raises:
        AssertionExcept: in project scope, when there are no samples or
            when the samples do not all share one reference file.
    """

    def pipeline(reference, bam_files, sample_list, out_file):
        """Return the 'mpileup ... | varscan ... > out_file' command text."""
        mpileup_args = [self.params["mpileup_path"], "-f %s" % reference]
        mpileup_args.extend(bam_files)
        mpileup = "".join("%s \\\n\t" % arg for arg in mpileup_args)
        # Drop the line continuation after the last BAM file so that the
        # pipe symbol can follow on the same line.
        mpileup = mpileup.rstrip("\\\n\t")
        return "".join([
            mpileup,
            " | \\\n",
            "%s \\\n\t" % self.params["script_path"],
            self.get_redir_parameters_script(),
            "--vcf-sample-list %s \\\n\t" % sample_list,
            "> %s\n\n" % out_file,
        ])

    # The output format is known only if the user passed --output-vcf.
    # It decides both the file suffix and the slot that stores the result.
    if "--output-vcf" in self.params["redir_params"]:
        output_suffix = "vcf"
        slot = "vcf"
    else:
        output_suffix = "unknown"
        slot = "variants"

    if self.params["scope"] == "project":
        # Name of specific script:
        self.spec_script_name = "_".join(
            [self.step, self.name, self.sample_data["Title"]])
        self.script = ""

        # This line should be left before every new script. It sees to
        # local issues. Use the dir it returns as the base_dir for this step.
        use_dir = self.local_start(self.base_dir)

        samples = self.sample_data["samples"]

        # All samples must share exactly one reference fasta file.
        references = {self.sample_data[sample]["reference"]
                      for sample in samples}
        if not references:
            raise AssertionExcept(
                "No samples found. Cannot create an mpileup command.\n")
        if len(references) > 1:
            raise AssertionExcept(
                "There is more than one reference file for different samples. Weird!!!"
            )
        reference_fasta = references.pop()

        # Create file with list of sample names, one per line
        sample_list = self.base_dir + "sample_list.txt"
        with open(sample_list, "w") as smp_lst:
            smp_lst.writelines("%s\n" % sample for sample in samples)

        out_name = ".".join([self.sample_data["Title"], output_suffix])
        self.script += pipeline(
            reference_fasta,
            [self.sample_data[sample]["bam"] for sample in samples],
            sample_list,
            use_dir + out_name)

        # Register the final (permanent) location of the results:
        self.sample_data[slot] = self.base_dir + out_name
        self.stamp_file(self.sample_data[slot])

        # Move all files from temporary local dir to permanent base_dir
        self.local_finish(use_dir, self.base_dir)
        self.create_low_level_script()

    else:
        for sample in self.sample_data["samples"]:
            # Make a dir for the current sample:
            sample_dir = self.make_folder_for_sample(sample)

            # Name of specific script:
            self.spec_script_name = "_".join(
                [self.step, self.name, sample])
            self.script = ""

            # This line should be left before every new script. It sees to
            # local issues. Use the dir it returns as the base_dir for
            # this step.
            use_dir = self.local_start(sample_dir)

            reference_fasta = self.sample_data[sample]["reference"]

            # The sample list is written now, at script-building time, so it
            # goes into the permanent sample dir rather than into use_dir.
            sample_list = sample_dir + "sample_list.txt"
            with open(sample_list, "w") as smp_lst:
                smp_lst.write("%s\n" % sample)

            out_name = "%s_%s.%s" % (
                sample, self.get_step_name(), output_suffix)
            self.script += pipeline(
                reference_fasta,
                [self.sample_data[sample]["bam"]],
                sample_list,
                use_dir + out_name)

            # Register the permanent location: local_finish() moves the
            # results from use_dir into sample_dir.
            self.sample_data[sample][slot] = sample_dir + out_name
            self.sample_data[sample][slot + ".source"] = "varscan"
            self.stamp_file(self.sample_data[sample][slot])

            # Move all files from temporary local dir to the sample's dir
            self.local_finish(use_dir, sample_dir)
            self.create_low_level_script()