def bwa_mem(args, param_dict=None):
    """Build the shell command that aligns reads with ``bwa mem`` in docker.

    The directory holding the first R1 fastq is mounted at ``/data`` inside
    the container, and every fastq is addressed by its base name under that
    mount. All R1/R2 files are therefore expected to sit in that directory.

    Input handling covers:
    - multiple fastq files per mate
    - gzipped fastq files
    - fastq files that are not located in the `--rootdir`
    - paired-end fastq files
    following the advice on this page:
    http://sourceforge.net/p/bio-bwa/mailman/message/31053122/

    Args:
        args: parsed command-line namespace. Reads ``R1`` (list of fastq
            paths), ``R2`` (list of fastq paths, or ``None``/empty for
            single-end data), ``p`` (value given to ``bwa mem -t``) and
            ``out_dir``.
        param_dict: extra ``bwa mem`` options, rendered by ``join_params``.

    Returns:
        tuple: ``(cmd, aligned)`` where ``cmd`` is the command string and
        ``aligned`` is the value of ``file_cfg["aligned"](args)``.
    """

    def container_path(fastq):
        """Map a host fastq path to its location under the /data mount."""
        return os.path.join("/data", os.path.split(os.path.abspath(fastq))[1])

    def mate_input(fastqs):
        """Render the fastq list of one mate as a single bwa input token."""
        if len(fastqs) == 1:
            return container_path(fastqs[0])
        # Several files are streamed through (z)cat with the "'<cmd ...'"
        # input form. The reader is chosen from THIS mate's first file
        # (the R2 branch used to look at R1 by mistake).
        reader = "zcat" if fastqs[0].endswith(".gz") else "cat"
        return "'<{} {}'".format(
            reader, " ".join(container_path(it) for it in fastqs))

    # Host directory that gets mounted at /data.
    data_dir = os.path.split(os.path.abspath(args.R1[0]))[0]

    mates = [mate_input(args.R1)]
    if args.R2:  # None or an empty list both mean single-end input
        mates.append(mate_input(args.R2))
    in_fq = " ".join(mates)

    aligned = file_cfg["aligned"](args)
    _out_sam = "> {}".format(aligned)
    bwa_cmd = " ".join(
        ["bwa mem -t {_p} -M".format(_p=args.p),
         join_params(param_dict),
         ref_file_cfg[version_cfg["REF_VERSION"]]["fa"],
         in_fq,
         _out_sam])

    cmd = DOCKER_RUN + \
        r""" -v {_data_d}:/data bwa:{_bwa_v} bash -c "{_bwa_c}" """
    # NOTE(review): _ref_v and _out_d are not used by the template above;
    # presumably DOCKER_RUN (defined elsewhere) carries those placeholders.
    cmd = cmd.format(
        _ref_v=version_cfg["REF_VERSION"],
        _out_d=args.out_dir,
        _bwa_v=_version,
        _data_d=data_dir,
        _bwa_c=bwa_cmd)
    return cmd, aligned
def picard_sort(args, param_dict=None):
    """Build the docker command that sorts the aligned reads by coordinate.

    The command runs Picard ``SortSam`` from the ``picard:<_version>`` image
    with ``SORT_ORDER=coordinate`` and ``TMP_DIR=/out_dir``.

    Args:
        args: parsed command-line namespace. Reads ``out_dir`` and is passed
            to the ``file_cfg`` name builders.
        param_dict: extra ``SortSam`` options, rendered by ``join_params``.

    Returns:
        tuple: ``(cmd, sorted_path)`` where ``sorted_path`` is
        ``file_cfg["sorted"](args)`` joined onto ``args.out_dir``.
    """
    # The template is assembled from plain fragments so that no backslash
    # ends up inside the command string; a "\ " there would reach the shell
    # as an escaped space and glue itself to the next argument.
    template = DOCKER_RUN + (
        " picard:{_v} SortSam {param}"
        " I={aligned} O={sort}"
        " TMP_DIR=/out_dir SORT_ORDER=coordinate"
    )
    # NOTE(review): _ref_v and _out_d are not used by the fragment above;
    # presumably DOCKER_RUN (defined elsewhere) carries those placeholders.
    cmd = template.format(
        _ref_v=version_cfg["REF_VERSION"],
        _out_d=args.out_dir,
        _v=_version,
        aligned=file_cfg["aligned"](args),
        sort=file_cfg["sorted"](args),
        param=join_params(param_dict))
    return cmd, os.path.join(args.out_dir, file_cfg["sorted"](args))
def picard_dedup(args, param_dict=None):
    """Build the docker command that marks duplicate reads.

    The command runs Picard ``MarkDuplicates`` from the ``picard:<_version>``
    image on the sorted file, with ``CREATE_INDEX=true``.

    Args:
        args: parsed command-line namespace. Reads ``out_dir`` and is passed
            to the ``file_cfg`` name builders.
        param_dict: extra ``MarkDuplicates`` options, rendered by
            ``join_params``.

    Returns:
        tuple: ``(cmd, [dedup, metrics])`` where the list holds the values
        of ``file_cfg["dedup"](args)`` and ``file_cfg["matrics"](args)``.
    """
    dedup_name = file_cfg["dedup"](args)
    # "matrics" (sic) is the key used by file_cfg; it must stay as spelled.
    metrics_name = file_cfg["matrics"](args)

    # The template is assembled from plain fragments so that no backslash
    # ends up inside the command string; a "\ " there would reach the shell
    # as an escaped space and glue itself to the next argument.
    template = DOCKER_RUN + (
        " picard:{_v} MarkDuplicates {param}"
        " I={sort} O={dedup}"
        " METRICS_FILE={matrics} CREATE_INDEX=true"
    )
    # NOTE(review): _ref_v and _out_d are not used by the fragment above;
    # presumably DOCKER_RUN (defined elsewhere) carries those placeholders.
    cmd = template.format(
        _v=_version,
        _ref_v=version_cfg["REF_VERSION"],
        _out_d=args.out_dir,
        sort=file_cfg["sorted"](args),
        param=join_params(param_dict),
        dedup=dedup_name,
        matrics=metrics_name)
    return cmd, [dedup_name, metrics_name]
def gatk_haplotypecaller(args, param_dict=None):
    """Build the docker command that runs GATK HaplotypeCaller in GVCF mode.

    The HaplotypeCaller is capable of calling SNPs and indels simultaneously
    via local de-novo assembly of haplotypes in an active region. In other
    words, whenever the program encounters a region showing signs of
    variation, it discards the existing mapping information and completely
    reassembles the reads in that region.

    Args:
        args: parsed command-line namespace. Reads ``out_dir`` and ``p``
            (value given to ``-nct``) and is passed to the ``file_cfg``
            name builders.
        param_dict: extra HaplotypeCaller options, rendered by
            ``join_params``.

    Returns:
        tuple: ``(cmd, gvcf)`` where ``gvcf`` is the value of
        ``file_cfg["gvcf"](args)``.
    """
    ref_files = ref_file_cfg[version_cfg["REF_VERSION"]]

    # The leading space keeps the image name separate from whatever
    # DOCKER_RUN ends with. The fragments contain no backslash: a "\ "
    # inside the string would reach the shell as an escaped space.
    template = DOCKER_RUN + (
        " gatk:{_v} -T HaplotypeCaller {param} -nct {_p} -R {_R}"
        " -I {bqsr} --emitRefConfidence GVCF"
        " --dbsnp {_dbsnp_vcf} -o {gvcf}"
    )
    # NOTE(review): _ref_v and _out_d are not used by the fragment above;
    # presumably DOCKER_RUN (defined elsewhere) carries those placeholders.
    cmd = template.format(
        _ref_v=version_cfg["REF_VERSION"],
        _out_d=args.out_dir,
        param=join_params(param_dict),
        _p=args.p,
        _v=_version,
        _dbsnp_vcf=ref_files["dbsnp"],
        _R=ref_files["fa"],
        bqsr=file_cfg["bqsr"](args),
        gvcf=file_cfg["gvcf"](args))
    return cmd, file_cfg["gvcf"](args)
def gatk_printread(args, param_dict=None):
    """Build the docker command that runs GATK PrintReads with a BQSR table.

    PrintReads is a generic utility tool for manipulating sequencing data in
    SAM/BAM format. It can dynamically merge the contents of multiple input
    BAM files, resulting in merged output sorted in coordinate order.

    Here it reads the de-duplicated file, applies the recalibration table
    through ``-BQSR`` and writes the recalibrated file.

    Args:
        args: parsed command-line namespace. Reads ``out_dir`` and ``p``
            (value given to ``-nct``) and is passed to the ``file_cfg``
            name builders.
        param_dict: extra PrintReads options, rendered by ``join_params``.

    Returns:
        tuple: ``(cmd, bqsr)`` where ``bqsr`` is the value of
        ``file_cfg["bqsr"](args)``.
    """
    # The template is assembled from plain fragments so that no backslash
    # ends up inside the command string; a "\ " there would reach the shell
    # as an escaped space and glue itself to the next argument.
    template = DOCKER_RUN + (
        " gatk:{_v} -T PrintReads {param} -nct {_p}"
        " -R {_R} -I {dedup} -BQSR {table} -o {bqsr}"
    )
    # NOTE(review): _ref_v and _out_d are not used by the fragment above;
    # presumably DOCKER_RUN (defined elsewhere) carries those placeholders.
    cmd = template.format(
        _ref_v=version_cfg["REF_VERSION"],
        _out_d=args.out_dir,
        param=join_params(param_dict),
        _p=args.p,
        _v=_version,
        _R=ref_file_cfg[version_cfg["REF_VERSION"]]["fa"],
        dedup=file_cfg["dedup"](args),
        table=file_cfg["table"](args),
        bqsr=file_cfg["bqsr"](args))
    return cmd, file_cfg["bqsr"](args)
def bwa_index(args, param_dict=None):
    """Build the shell command that indexes the reference with ``bwa index``.

    Two docker invocations are chained with ``&&``:
    1. ``docker create`` a container named after the reference version from
       the ``reference:<REF_VERSION>`` image, exposing ``/ref`` as a volume;
    2. ``docker run`` the ``bwa:<_version>`` image with the volumes of that
       container and ``/ref`` as working directory, running ``bwa index``
       on the reference fasta.

    Args:
        args: unused; kept so the signature matches the other command
            builders in this file.
        param_dict: extra ``bwa index`` options, rendered by ``join_params``.

    Returns:
        tuple: ``(cmd, None)``; there is no output file name to report.
    """
    ref_version = version_cfg["REF_VERSION"]

    # Both templates are single-line strings with plain spaces: a "\ "
    # inside the string would reach the shell as an escaped space and glue
    # itself to the next argument.
    create_cmd = (
        "docker create -v /ref --name {_ref_v} reference:{_ref_v}"
    ).format(_ref_v=ref_version)

    index_cmd = (
        "docker run --rm --volumes-from {_ref_v} -w /ref"
        " bwa:{_bwa_v} bwa index {param} {in_f} "
    ).format(
        in_f=ref_file_cfg[ref_version]["fa"],
        param=join_params(param_dict),
        _ref_v=ref_version,
        _bwa_v=_version)

    return " && ".join([create_cmd, index_cmd]), None
def gatk_bqsr(args, param_dict=None):
    """Build the docker command that runs GATK BaseRecalibrator.

    Base quality score recalibration (BQSR) is a process in which we apply
    machine learning to model these errors empirically and adjust the
    quality scores accordingly. This allows us to get more accurate base
    qualities, which in turn improves the accuracy of our variant calls.
    The base recalibration process involves two key steps: first the program
    builds a model of covariation based on the data and a set of known
    variants (which you can bootstrap if there is none available for your
    organism), then it adjusts the base quality scores in the data based on
    the model.

    This builder covers the first step: it reads the de-duplicated file,
    uses the configured dbSNP file as ``-knownSites`` and writes the
    recalibration table.

    Args:
        args: parsed command-line namespace. Reads ``out_dir`` and ``p``
            (value given to ``-nct``) and is passed to the ``file_cfg``
            name builders.
        param_dict: extra BaseRecalibrator options, rendered by
            ``join_params``.

    Returns:
        tuple: ``(cmd, table)`` where ``table`` is the value of
        ``file_cfg["table"](args)``.
    """
    ref_files = ref_file_cfg[version_cfg["REF_VERSION"]]

    # The template is assembled from plain fragments so that no backslash
    # ends up inside the command string; a "\ " there would reach the shell
    # as an escaped space and glue itself to the next argument.
    template = DOCKER_RUN + (
        " gatk:{_v} -T BaseRecalibrator {param} -nct {_p}"
        " -R {_R} -I {dedup} -knownSites {_dbsnp_vcf} -o {table}"
    )
    # NOTE(review): _ref_v and _out_d are not used by the fragment above;
    # presumably DOCKER_RUN (defined elsewhere) carries those placeholders.
    cmd = template.format(
        _ref_v=version_cfg["REF_VERSION"],
        _out_d=args.out_dir,
        param=join_params(param_dict),
        _p=args.p,
        _v=_version,
        _R=ref_files["fa"],
        _dbsnp_vcf=ref_files["dbsnp"],
        dedup=file_cfg["dedup"](args),
        table=file_cfg["table"](args))
    return cmd, file_cfg["table"](args)