def exec_mapping_stage(self):
    """Run the mapping stage followed by the optional post-processing.

    The reads are mapped, the resulting alignment file is located and,
    when the corresponding programs are available, the alignment is
    processed with SAMtools and its coverage computed with BEDTools.
    Progress messages are emitted only when ``self.quiet`` is false.

    :returns: this instance, with ``self.aln_file`` set.
    """
    verbose = not self.quiet

    def announce(template):
        # Progress messages are suppressed altogether in quiet mode.
        if verbose:
            wedr_report(template % self._out_pref)

    wedr_check_path(self.index + '.*')
    self.exec_mapping()

    # TopHat writes a BAM file inside its output directory, any other
    # mapper leaves a SAM file named after the output prefix.
    if self.mapper == "tophat":
        self.aln_file = path.join(self._mapper_out, "accepted_hits.bam")
    else:
        self.aln_file = self._mapper_out + ".sam"
    wedr_check_path(self.aln_file)

    if wedr_check_program("samtools"):
        announce("[%s] Processing aligments with SAMtools.")
        self.exec_samtools()
        announce("[%s] SAMtools - Done!.")

    if wedr_check_program("bedtools"):
        announce("[%s] Calculating mapping coverage with BEDTools.")
        self.exec_bedtools()
        announce("[%s] BEDTools - Done!")

    return self
def exec_deseq(self):
    """Run the differential expression stage through ``diffExprStage.R``.

    The R script is called with the parsed DESeq options, the count
    table, the experimental conditions and the output file
    ``diffexpr.txt`` located in the output directory.

    :raises WedringError: (code 141) when Rscript finishes with a status
        other than 0 or -1.
    """
    if not self._qt:
        wedr_report("Calculating differential expression with DESeq.")

    outfile = path.join(self._o, "diffexpr.txt")
    errfile = path.join(self._ld, "diffexpr.log")

    cmd_fields = (wedr_which("diffExprStage.R"),
                  self.wedring_diffexpr_parse_options(),
                  self._cnt_table,
                  self._cnd,
                  outfile)
    de = BioSoft("Rscript --vanilla %s %s %s %s %s" % cmd_fields,
                 errfile=errfile)

    if not self._qt:
        wedr_report("Command line:\n %s" % de.command)

    de.run()
    if de.return_code not in (0, -1):
        message = ("%s exitted with status %d. See log file '%s' for more "
                   "details." % (de.program_name, de.return_code,
                                 de.errfile))
        raise WedringError(141, message)

    wedr_clean(de.errfile)

    # TODO Add verification of the DESeq's output with wedr_check_path()
    # \_ table (OK), graphics
    wedr_check_path(outfile)

    if not self._qt:
        wedr_report("DESeq - Done!")
def run(self):
    """Build the Bowtie BW index by running *bowtie-build*.

    The command line is assembled by ``parse_bb_cmd_line()``; the output
    and log directories are prepared before the program is started and
    its output is stored in ``<out_pref>_build.log`` inside the log
    directory.

    :raises WedringError: (code 141) when the program finishes with a
        status other than 0 or -1.
    """
    if not self.quiet:
        wedr_report("[%s] Building Bowtie BW index '%s'."
                    % (self._out_pref, self.index))

    self.parse_bb_cmd_line()

    if not self.quiet:
        wedr_report("[%s] Command line:\n %s"
                    % (self._out_pref, self._bb_cmd))

    for directory in (self.out_dir, self.log_dir):
        wedr_prepare_directory(directory)

    build_log = path.join(self.log_dir, self._out_pref + "_build.log")
    bb = BioSoft(command=self._bb_cmd, outfile=build_log)
    bb.run()

    if bb.return_code not in (0, -1):
        message = ("[%s] %s exitted with status %d. See log file '%s' for "
                   "more details." % (self._out_pref, bb.program_name,
                                      bb.return_code, bb.outfile))
        raise WedringError(141, message)

    wedr_clean(build_log)

    if not self.quiet:
        wedr_report("[%s] BW index build - Done!." % self._out_pref)
def exec_bedtools(self):
    """Compute the mapping coverage with ``bedtools coverage``.

    The coverage table is written next to the alignment file, under the
    alignment file's name with its extension replaced by ``.cov``.  Once
    the output has been checked, ``self.cov_file`` is set to that path.

    :raises WedringError: (code 141) when the program finishes with a
        status other than 0 or -1.
    """
    # Replace only the final extension.  ``str.replace(".bam", ".cov")``
    # would also rewrite ".bam" inside directory names and, for an
    # alignment file that does not end in ".bam", would return the name
    # unchanged, so the coverage output would overwrite the alignment.
    cov_out = path.splitext(self.aln_file)[0] + ".cov"
    errfile = path.join(self.log_dir, self._out_pref + "_coverage.log")

    bt = BioSoft(command="bedtools coverage -s -abam %s -b %s"
                         % (self.aln_file, self.annot_file),
                 outfile=cov_out,
                 errfile=errfile)

    if not self.quiet:
        wedr_report("[%s] Command line:\n %s"
                    % (self._out_pref, bt.command))

    bt.run()

    if bt.return_code not in (0, -1):
        raise WedringError(141, "[%s] %s exitted with status %d. See log "
                                "file '%s' for more details."
                                % (self._out_pref, bt.program_name,
                                   bt.return_code, bt.errfile))

    wedr_clean(bt.errfile)

    if wedr_check_path(cov_out):
        self.cov_file = cov_out
def exec_mapping(self):
    """Map the reads against the reference genome.

    The command line is assembled by ``parse_mapper_cmd_line()`` and the
    mapper's error output goes to ``<out_pref>_mapping.log`` inside the
    log directory.

    :raises WedringError: (code 141) when the mapper finishes with a
        status other than 0 or -1.
    """
    if not self.quiet:
        wedr_report("[%s] Mapping reads against reference genome."
                    % self._out_pref)

    self.parse_mapper_cmd_line()

    if not self.quiet:
        wedr_report("[%s] Command line:\n %s"
                    % (self._out_pref, self._mapper_cmd))

    mapping_log = path.join(self.log_dir, self._out_pref + "_mapping.log")
    mp = BioSoft(command=self._mapper_cmd, errfile=mapping_log)
    mp.run()

    if mp.return_code not in (0, -1):
        message = ("[%s] %s exitted with status %d. See log file '%s' for "
                   "more details." % (self._out_pref, mp.program_name,
                                      mp.return_code, mp.errfile))
        raise WedringError(141, message)

    wedr_clean(mp.errfile)

    if not self.quiet:
        wedr_report("[%s] Mapping - Done!." % self._out_pref)
def wedring_parse_args(self, arguments):
    """Parse the Wedring arguments and adjust the dependent settings.

    Besides storing every option on the instance, this method selects
    the pipeline stage to execute (``--just-*`` switches), appends the
    ``-p/--path`` directory to the ``PATH`` environment variable,
    validates the annotation file when one was given and pads the
    quality/label lists with ``None`` so they match the number of
    libraries.

    :param arguments: List of arguments, usually command line ones.  The
        caller's list is not modified.
    :type arguments: list
    :raises WedringError: when the arguments are missing, unsupported or
        incomplete for the selected stage, or when the annotation file
        cannot be validated.
    """
    try:
        if len(arguments) < 1:
            raise WedringError(133, "Insufficient number of arguments.")
    except TypeError:
        raise WedringError(133, "Arguments were not provided.")

    # Work on a private copy: the switches handled by hand below are
    # removed from the list before it is handed to getopt.
    arguments = list(arguments)

    bin_path = None  # directory to append to the system PATH

    group_all = "--group-all" in arguments
    if group_all:
        arguments.remove("--group-all")

    def adjust_param(params):
        # With --group-all the whole comma separated value is kept as a
        # single entry; otherwise it is split into one entry per item.
        if group_all:
            return [params]
        return [param for param in params.split(',') if param != '']

    # Adjusting the value of the Wedring barrier to define which pipeline
    # stage will be executed.  When several switches are given the last
    # one in this table wins.
    for switch, barrier in (("--just-indexbuild", JUST_INDEX),
                            ("--just-map", JUST_MAP),
                            ("--just-counttable", JUST_TABLE),
                            ("--just-de", JUST_DE)):
        if switch in arguments:
            self._wb = barrier
            arguments.remove(switch)

    try:
        opts, leftover = getopt(arguments,
                                "n:o:x:m:r:i:l:1:2:q:a:g:t:c:d:p:",
                                ["num-threads=", "out-dir=", "index-dir=",
                                 "quiet", "mapper=", "ref-sequence=",
                                 "bw-index=", "lib-file=", "pair-mate-1=",
                                 "pair-mate-2=", "quals=", "q1=", "q2=",
                                 "annot-file=", "coverage-files=",
                                 "config-file=", "count-table=",
                                 "map-label=", "index-label=",
                                 "conditions=", "path="])
    except GetoptError as err:
        raise WedringError(136, "%s." % str(err).capitalize())

    if leftover != []:
        raise WedringError(132, "Argument list not supported: %s."
                                % " ".join(leftover))

    # Options whose value is stored as given.
    scalar_opts = {"-m": "_m", "--mapper": "_m",
                   "-r": "_r", "--ref-sequence": "_r",
                   "-i": "_i", "--bw-index": "_i",
                   "-a": "_a", "--annot-file": "_a",
                   "-c": "_c", "--config-file": "_c",
                   "-t": "_cnt_table", "--count-table": "_cnt_table",
                   "--index-label": "_il",
                   "-d": "_cnd", "--conditions": "_cnd",
                   "-x": "_id", "--index-dir": "_id",
                   "-o": "_o", "--out-dir": "_o"}
    # Options whose value is a comma separated list (see adjust_param).
    list_opts = {"-l": "_l", "--lib-file": "_l",
                 "-1": "_1", "--pair-mate-1": "_1",
                 "-2": "_2", "--pair-mate-2": "_2",
                 "-q": "_q", "--quals": "_q",
                 "--q1": "_q1",
                 "--q2": "_q2",
                 "--map-label": "_ml"}

    for opt, val in opts:
        if opt in scalar_opts:
            setattr(self, scalar_opts[opt], val)
        elif opt in list_opts:
            setattr(self, list_opts[opt], adjust_param(val))
        elif opt in ("-n", "--num-threads"):
            self._num_threads = int(val)
        elif opt == "--quiet":
            # Exact comparison: a substring test against "--quiet" would
            # also match "-q" and swallow the --quals option.
            self._qt = True
        elif opt in ("-g", "--coverage-files"):
            if ',' not in val:
                raise WedringError(134, "You must provide a list of "
                                        "coverage files.")
            self._cf = [cf for cf in val.split(',') if cf != '']
        elif opt in ("-p", "--path"):
            bin_path = val

    if bin_path is not None:
        current_path = environ.get("PATH")
        if current_path:
            environ["PATH"] = path.pathsep.join((current_path, bin_path))
        else:
            environ["PATH"] = bin_path

    if self._wb != JUST_INDEX:
        self._ld = path.join(self._o, "log")

    if self._a:
        if not self._qt:
            wedr_report("Validating GFF file: '%s'." % self._a)
        try:
            gff_out = write_validated_gff(self._a)
        except GffFormatError as gffferr:
            raise WedringError(134, "[%s] %s" % (type(gffferr).__name__,
                                                 gffferr))
        except EnvironmentError as env_err:
            raise WedringError(env_err.errno, "[%s (%d)] %s%s%s."
                               % (type(env_err).__name__,
                                  env_err.errno,
                                  env_err.strerror,
                                  ': ' if env_err.filename else '',
                                  env_err.filename
                                  if env_err.filename else ''))

        # A string result is the path of a rewritten annotation file,
        # which replaces the one given on the command line.
        if isinstance(gff_out, str):
            if not self._qt:
                wedr_report("Now using validated GFF file: '%s'." % gff_out)
            self._a = gff_out
        if not self._qt:
            wedr_report("Gff validation - Done!")

    if self._wb in (TTL_PIPELINE, JUST_TABLE, JUST_DE):
        if self._cnd is None:
            raise WedringError(135, "You must set the experimental "
                                    "conditions.")

    if self._wb in (TTL_PIPELINE, JUST_MAP):
        # Lists that were not given hold a single None; extend them so
        # they have one entry per library.
        if self._l != [None] and self._q == [None]:
            self._q += [None] * (len(self._l) - 1)
        if self._l != [None] and self._ml == [None]:
            self._ml += [None] * (len(self._l) - 1)
        if (self._1 != [None] and self._2 != [None] and
                self._q1 == [None] and self._q2 == [None]):
            self._q1 += [None] * (len(self._1) - 1)
            self._q2 += [None] * (len(self._2) - 1)
        if self._1 != [None] and self._2 != [None] and self._ml == [None]:
            self._ml += [None] * (len(self._1) - 1)
    elif self._wb == JUST_DE and self._cnt_table is None:
        raise WedringError(135, "You must provide the counting table.")
def run(self):
    """Execute all steps of the **Wedring** pipeline.

    Which steps run depends on ``self._wb``, set from the command line
    switches --just-indexbuild, --just-map, --just-counttable and
    --just-de.
    """
    # This method will execute according to the self._wb value.  The
    # possible values are defined in the globals TTL_PIPELINE,
    # JUST_INDEX, JUST_MAP, JUST_TABLE and JUST_DE, which mean:
    # TTL_PIPELINE -- execute all steps of the pipeline
    # JUST_INDEX   -- just execute the indexing stage
    # JUST_MAP     -- execute the indexing stage (if needed) and the
    #                 mapping stage
    # JUST_TABLE   -- just build the count table
    # JUST_DE      -- just execute the differential expression stage

    def write_tables():
        # Write the genomic features and the count table to the output
        # directory and remember the count table path.
        feats_file = path.join(self._o, "genomic_features.txt")
        tbl_file = path.join(self._o, "count_table.txt")

        if not self._qt:
            wedr_report("Writing genomic features to disk.")
        write_genomic_features_to_file(self._a, feats_file)
        if wedr_check_path(feats_file):
            if not self._qt:
                wedr_report("Writing genomic features - Done!")

        cov_f = [wedrmast.cov_file for wedrmast in self._wedr_list]

        if not self._qt:
            wedr_report("Writing count table to disk.")
        write_count_table_to_file(self._a, cov_f, self._cnd, tbl_file)
        if wedr_check_path(tbl_file):
            self._cnt_table = tbl_file
            if not self._qt:
                wedr_report("Writing count table - Done!")

    if self._wb in (TTL_PIPELINE, JUST_INDEX, JUST_MAP):
        self.wedring_indexbuilder_parse_params()
        if self._indexbldr is not None:
            self._indexbldr.run()
            self._i = self._indexbldr.index

        if self._wb != JUST_INDEX:
            wedr_prepare_directory(self._o)
            wedr_prepare_directory(self._ld)
            self.wedring_mapping_parse_params()

            pool = Pool(self._num_threads)
            try:
                self._wedr_list = pool.map(WedringMast.exec_mapping_stage,
                                           self._wedr_list)
            finally:
                # Always release the worker processes, even when a
                # mapping job failed.
                pool.close()
                pool.join()

            if self._wb != JUST_MAP:
                write_tables()
                self.exec_deseq()

    elif self._wb == JUST_TABLE:
        self.wedring_mapping_parse_params()
        wedr_prepare_directory(self._o)
        write_tables()

    elif self._wb == JUST_DE:
        wedr_prepare_directory(self._o)
        wedr_prepare_directory(self._ld)
        self.exec_deseq()
def exec_samtools(self):
    """Post-process the alignment file with SAMtools.

    For Bowtie alignments the SAM file is converted to BAM (the SAM file
    is removed afterwards) and the BAM file is sorted.  The BAM file is
    then indexed and the presence of the ``.bai`` file is checked.

    :raises WedringError: (code 141) when any SAMtools command finishes
        with a status other than 0 or -1.
    """
    def note(template):
        # Progress messages are skipped in quiet mode.
        if not self.quiet:
            wedr_report(template % self._out_pref)

    def run_step(command, log_suffix):
        # Run one SAMtools command, fail on a bad exit status and clean
        # its log file otherwise.
        errfile = path.join(self.log_dir, self._out_pref + log_suffix)
        st = BioSoft(command=command, errfile=errfile)
        if not self.quiet:
            wedr_report("[%s] Command line:\n %s"
                        % (self._out_pref, st.command))
        st.run()
        if st.return_code not in (0, -1):
            raise WedringError(141, "[%s] %s exitted with status %d. See "
                                    "log file '%s' for more details."
                                    % (self._out_pref, st.program_name,
                                       st.return_code, st.errfile))
        wedr_clean(st.errfile)

    if self.mapper == "bowtie":
        note("[%s] Converting SAM file to BAM file.")
        sam_in_pref = path.splitext(self.aln_file)[0]
        bam_out = sam_in_pref + ".bam"
        run_step("samtools view -bS -o %s %s.sam"
                 % (bam_out, self._mapper_out),
                 "_view.log")

        # The SAM file is no longer needed once the BAM file exists.
        wedr_clean(self.aln_file, force=True)
        self.aln_file = bam_out
        wedr_check_path(self.aln_file)

        note("[%s] Sorting BAM file.")
        run_step("samtools sort %s %s" % (bam_out, sam_in_pref),
                 "_sort.log")
        wedr_check_path(self.aln_file)

    note("[%s] Indexing BAM file.")
    run_step("samtools index %s" % self.aln_file, "_index.log")
    wedr_check_path(self.aln_file + ".bai")
def parse_tophat_options(self):
    """Parser for the tophat section of the configuration file.

    The section named ``self.mapper`` of ``self.cfg_file`` is read and
    every setting that differs from TopHat's default is turned into the
    matching command line option.  Options are emitted in a fixed order.

    :returns: Additional options of *TopHat*'s command line; each option
        is preceded by a single space, and the string is empty when
        nothing differs from the defaults.
    """
    cf_parser = RawConfigParser()
    cf_parser.read(self.cfg_file)
    # NOTE: RawConfigParser lower-cases option names by default, so a
    # "G = true" entry arrives here under the key "g".
    cfg = dict(cf_parser.items(self.mapper))

    def enabled(*keys):
        # True when any of the given boolean settings is "true".
        return any(cfg.get(key) == "true" for key in keys)

    # (configuration key, TopHat default, command line option)
    valued_opts = (
        ("output_dir", "./tophat_out", "-o"),
        ("mate_inner_dist", "0", "-r"),
        ("mate_std_dev", "20", "--mate-std-dev"),
        ("min_anchor_length", "8", "-a"),
        ("splice_mismatches", "0", "-m"),
        ("min_intron_length", "70", "-i"),
        ("max_intron_length", "500000", "-I"),
        ("max_insertion_length", "3", "--max-insertion-length"),
        ("max_deletion_length", "3", "--max-deletion-length"),
        ("num_threads", "1", "-p"),
        ("max_multihits", "20", "-g"),
        ("n", "2", "-n"),
        ("genome_read_mismatches", "2", "--genome-read-mismatches"),
        ("read_mismatches", "2", "--read-mismatches"),
        ("segment_mismatches", "2", "--segment-mismatches"),
        ("segment_length", "25", "--segment-length"),
        ("min_coverage_intron", "50", "--min-coverage-intron"),
        ("max_coverage_intron", "20000", "--max-coverage-intron"),
        ("min_segment_intron", "50", "--min-segment-intron"),
        ("max_segment_intron", "500000", "--max-segment-intron"),
        ("zpacker", "gzip", "-z"),
        ("raw_juncs", "none", "-j"),
        ("transcriptome_index", "none", "--transcriptome-index"),
        ("transcriptome_max_hits", "0", "-x"),
        ("insertions", "none", "--insertions"),
        ("deletions", "none", "--deletions"),
    )
    # (configuration key, command line switch) -- emitted when "true"
    flag_opts = (
        ("bowtie1", "--bowtie1"),
        ("solexa_quals", "--solexa-quals"),
        ("color", "-C"),
        ("report_secondary_hits", "--report-secondary-hits"),
        ("report_discordant_pair_alignments",
         "--report-discordant-pair-alignments"),
        ("no_coverage_search", "--no-coverage-search"),
        ("coverage_search", "--coverage-search"),
        ("microexon_search", "--microexon-search"),
        ("bowtie_n", "--bowtie-n"),
        ("keep_tmp", "--keep-tmp"),
        ("no_novel_juncs", "--no-novel-juncs"),
        ("transcriptome_only", "-T"),
        ("prefilter_multihits", "-M"),
        ("no_novel_indels", "--no-novel-indels"),
    )
    # Only taken into account together with fusion_search.
    fusion_opts = (
        ("fusion_anchor_length", "20", "--fusion-anchor-length"),
        ("fusion_min_dist", "10000000", "--fusion-min-dist"),
        ("fusion_read_mismatches", "2", "--fusion-read-mismatches"),
        ("fusion_multireads", "2", "--fusion-multireads"),
        ("fusion_multipairs", "2", "--fusion-multipairs"),
        ("fusion_ignore_chromosomes", "none",
         "--fusion-ignore-chromosomes"),
    )

    options = []

    for key, switch in flag_opts:
        if enabled(key):
            options.append(switch)

    # Both spellings of the key are accepted.
    if enabled("solexa1.3_quals", "solexa_1.3_quals"):
        options.append("--solexa1.3-quals")

    # --integer-quals is a plain switch and is skipped for colorspace
    # reads.
    if enabled("integer_quals") and not enabled("color"):
        options.append("--integer-quals")

    for key, default, switch in valued_opts:
        val = cfg.get(key, default)
        if val != default:
            options.append("%s %s" % (switch, val))

    library_type = cfg.get("library_type", "fr-unstranded")
    if library_type in ("fr-firststrand", "fr-secondstrand"):
        options.append("--library-type %s" % library_type)

    if enabled("fusion_search"):
        options.append("--fusion-search")
        for key, default, switch in fusion_opts:
            val = cfg.get(key, default)
            if val != default:
                options.append("%s %s" % (switch, val))

    if enabled("G", "g"):
        if self.annot_file is not None:
            options.append("-G %s" % self.annot_file)
        else:
            wedr_report("Ignoring TopHat's option -G/--GTF.")

    return "".join(" %s" % option for option in options)