def main():
    """Run the SuperTranscript variant-calling pipeline (STAR + Picard + GATK4).

    Aligns reads against a SuperTranscripts fasta with STAR, cleans the
    alignments with Picard-Tools, then calls and filters variants with GATK4.
    Requires "samtools", "java" and "STAR" in PATH, and the environment
    variables $PICARD_HOME and $GATK_HOME.  Exits via SystemExit on missing
    env vars; raises nothing else directly (tool failures surface through
    Pipeliner).
    """
    FORMAT = "%(asctime)-15s %(levelname)s %(module)s.%(name)s.%(funcName)s at %(lineno)d :\n\t%(message)s\n"
    global logger
    logger = logging.getLogger()
    logging.basicConfig(filename='variant_calling.log', format=FORMAT,
                        filemode='w', level=logging.DEBUG)
    # add a new Handler to print all INFO and above messages to stdout
    ch = logging.StreamHandler(sys.stdout)
    ch.setLevel(logging.INFO)
    logger.addHandler(ch)

    parser = argparse.ArgumentParser(
        description=str(
            "This script requires you have the following dependencies:\n" +
            "Samtools: \"samtools\" in your path\n" +
            "Java: \"java\" in your path\n" +
            "Picard-Tools: env var \"$PICARD_HOME\" with the path to Picard-Tools's bin\n" +
            "STAR: \"STAR\" in your path\n" +
            "GATK: env var \"$GATK_HOME\" with the path to GATK's bin\n"),
        epilog="",
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--st_fa', '--supertranscript_fasta', dest="st_fa",
                        type=str, required=True,
                        help="Path to the SuperTranscripts fasta file.")
    parser.add_argument('--st_gtf', '--supertranscript_gtf', dest="st_gtf",
                        type=str, required=True,
                        help="Path to the SuperTranscript gtf file.")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-p', '--paired', dest="paired_reads", type=str,
                       nargs=2, help="Pair of paired ends read files.")
    group.add_argument('-s', '--single', dest="single_reads", type=str,
                       help="Single reads file.")
    parser.add_argument('-o', '--output', dest="out_path", type=str,
                        required=True,
                        help="Path to the folder where to generate the output.")
    parser.add_argument('-l', '--sjdbOverhang', dest="sjdbOverhang",
                        default=150, type=int,
                        help="Size of the reads (used for STAR --sjdbOverhang). default=150")
    parser.add_argument('-t', '--threads', dest="nthreads", type=str,
                        default="4",
                        help="Number of threads to use for tools that are multithreaded.")
    parser.add_argument('-m', '--maxram', dest="maxram", type=str,
                        default="50000000000",
                        help="Maximum amount of RAM allowed for STAR's genome generation step (only change if you get an error from STAR complaining about this value).")
    args = parser.parse_args()

    PICARD_HOME = os.getenv("PICARD_HOME")
    if not PICARD_HOME:
        exit("Error, missing path to Picard-Tools in $PICARD_HOME.")
    GATK_HOME = os.getenv("GATK_HOME")
    if not GATK_HOME:
        # bugfix: message previously said "$GATK" although the variable
        # actually checked is $GATK_HOME.
        exit("Error, missing path to GATK in $GATK_HOME.")

    # get real paths before changing working directory in case they are relative paths
    if args.paired_reads:
        reads_paths = [os.path.realpath(f) for f in args.paired_reads]
    else:
        reads_paths = [os.path.realpath(args.single_reads)]
    st_fa_path = os.path.realpath(args.st_fa)
    st_gtf_path = os.path.realpath(args.st_gtf)

    # check if output directory exists, if not create
    real_path = os.path.realpath(args.out_path)
    if not os.path.isdir(real_path):
        os.makedirs(real_path)

    # move to output folder; all relative outputs below land here
    os.chdir(real_path)

    checkpoint_dir = os.path.abspath(
        os.path.basename(st_fa_path)) + ".gatk_chkpts"
    pipeliner = Pipeliner.Pipeliner(checkpoint_dir)

    # generate supertranscript index
    logger.info("Generating SuperTranscript index.")
    pipeliner.add_commands([
        Pipeliner.Command("samtools faidx {}".format(st_fa_path),
                          "samtools_faidx_st.ok")
    ])
    pipeliner.run()

    # generate supertranscript Picard dictionary
    logger.info("Generating Picard dictionary.")
    # swap the fasta suffix for ".dict" (raw string avoids ambiguous escapes)
    dict_file = re.sub(r"\.[^\.]+$", ".dict", st_fa_path)
    if os.path.isfile(dict_file):
        # dictionary already exists: just record the checkpoint
        open(checkpoint_dir + "/picard_dict_st.ok", 'a').close()
    else:
        pipeliner.add_commands([
            Pipeliner.Command(
                "java -jar " + PICARD_HOME + "/picard.jar" +
                " CreateSequenceDictionary R=" + st_fa_path + " O=" +
                dict_file + " VALIDATION_STRINGENCY=LENIENT ",
                "picard_dict_st.ok")
        ])
        pipeliner.run()

    # generate genome folder for STAR's first pass
    logger.info("Generating genome folder for STAR")
    star_genome_generate_cmd = str(
        "STAR --runThreadN " + args.nthreads + " --outSAMmapqUnique 60" +
        " --runMode genomeGenerate" + " --genomeDir star_genome_idx " +
        " --genomeFastaFiles {} ".format(st_fa_path) +
        " --sjdbGTFfile {} ".format(st_gtf_path) +
        " --sjdbOverhang {} ".format(args.sjdbOverhang) +
        " --limitGenomeGenerateRAM {}".format(args.maxram))
    pipeliner.add_commands([
        Pipeliner.Command("mkdir star_genome_idx", "mkdir_star_genome_idx.ok"),
        Pipeliner.Command(star_genome_generate_cmd, "star_genome_generate.ok")
    ])
    pipeliner.run()

    # run STAR's alignment
    logger.info("Running STAR alignment.")
    cmd = str("STAR --runThreadN " + args.nthreads +
              " --genomeDir star_genome_idx " + " --runMode alignReads " +
              " --twopassMode Basic " + " --alignSJDBoverhangMin 10 " +
              " --outSAMmapqUnique 60" +
              " --outSAMtype BAM SortedByCoordinate " +
              " --limitBAMsortRAM {} ".format(args.maxram) +
              " --readFilesIn " + " ".join(reads_paths))
    if re.search(r"\.gz$", reads_paths[0]):
        cmd += " --readFilesCommand 'gunzip -c' "
    pipeliner.add_commands([Pipeliner.Command(cmd, "star_aln.ok")])
    pipeliner.run()

    # clean and convert sam file with Picard-Tools
    logger.info("Cleaning and Converting sam file with Picard-Tools.")
    pipeliner.add_commands([
        Pipeliner.Command(
            "java -jar " + PICARD_HOME + "/picard.jar " +
            " AddOrReplaceReadGroups " +
            "I=Aligned.sortedByCoord.out.bam " +
            "O=rg_added_sorted.bam " + " VALIDATION_STRINGENCY=SILENT " +
            "SO=coordinate RGID=id RGLB=library RGPL=platform RGPU=machine RGSM=sample",
            "add_read_groups.ok"),
        Pipeliner.Command(
            "java -jar " + PICARD_HOME + "/picard.jar " +
            " MarkDuplicates " + "I=rg_added_sorted.bam O=dedupped.bam " +
            "CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT M=output.metrics",
            "mark_dups.ok"),
        # validation errors are tolerated; clean_bam.pl filters offenders next
        Pipeliner.Command("java -jar " + PICARD_HOME +
                          "/picard.jar ValidateSamFile " +
                          "I=dedupped.bam " + "IGNORE_WARNINGS=true " +
                          "MAX_OUTPUT=100000 " + "IGNORE=MATE_NOT_FOUND " +
                          "O=dedupped.bam.validation",
                          "bam_validate.ok", ignore_error=True),
        Pipeliner.Command(
            UTILDIR +
            "/clean_bam.pl dedupped.bam dedupped.bam.validation dedupped.valid.bam",
            "make_valid_dedupped_bam.ok"),
        # the option -U ALLOW_N_CIGAR_READS is not required in GATK 4
        # the option -RMQF 255 -RMQT 60 : use --outSAMmapqUnique 60 in STAR : https://software.broadinstitute.org/gatk/blog?id=4285
        # Cufflinks requires a MAPQ score of 255 so that will need to be changed if cufflinks is used for downstream analysis
        # ReassignOneMappingQuality read filter reassigns all good alignments to the default value of 60.
        Pipeliner.Command(
            "java -jar " + GATK_HOME + "/gatk-package-4.0.1.2-local.jar " +
            " SplitNCigarReads -R " + st_fa_path +
            " -I dedupped.valid.bam -O splitNCigar.bam " +
            " --read-validation-stringency LENIENT", "splitNCigarReads.ok")
    ])
    pipeliner.run()

    # do the actual variant calling
    logger.info("Variant Calling using Haplotype Caller.")
    pipeliner.add_commands([
        Pipeliner.Command(
            "java -jar " + GATK_HOME + "/gatk-package-4.0.1.2-local.jar " +
            "HaplotypeCaller -R " + st_fa_path +
            " -I ./splitNCigar.bam --dont-use-soft-clipped-bases true -stand-call-conf 20 -O output.vcf",
            "haplotypecaller.ok")
    ])
    pipeliner.run()

    # do some basic filtering
    logger.info("Doing some basic filtering of vcf.")
    pipeliner.add_commands([
        Pipeliner.Command(
            "java -jar " + GATK_HOME + "/gatk-package-4.0.1.2-local.jar " +
            " VariantFiltration -R " + st_fa_path +
            " -V output.vcf -window 35 -cluster 3 " +
            "--filter-name FS -filter \"FS > 30.0\" " +
            "--filter-name QD -filter \"QD < 2.0\" -O filtered_output.vcf",
            "variant_filt.ok")
    ])
    pipeliner.run()

    logger.info("Done!")
def main():
    """Run the SuperTranscript variant-calling pipeline (STAR + Picard + GATK4).

    Differs from earlier revisions by accepting a Trinity samples file
    (-S/--samples_file), locating the GATK4 jar via glob, scaling STAR's
    --genomeSAindexNbases, and exposing read-group id/sample on the CLI.
    Requires "samtools", "java" and "STAR" in PATH, and the environment
    variables $PICARD_HOME and $GATK_HOME.
    """
    FORMAT = "%(asctime)-15s %(levelname)s %(module)s.%(name)s.%(funcName)s at %(lineno)d :\n\t%(message)s\n"
    global logger
    logger = logging.getLogger()
    logging.basicConfig(
        filename="variant_calling.log", format=FORMAT, filemode="w", level=logging.DEBUG
    )
    # add a new Handler to print all INFO and above messages to stdout
    ch = logging.StreamHandler(sys.stdout)
    ch.setLevel(logging.INFO)
    logger.addHandler(ch)

    parser = argparse.ArgumentParser(
        description=str(
            "This script requires you have the following dependencies:\n"
            + 'Samtools: "samtools" in your path\n'
            + 'Java: "java" in your path\n'
            + 'Picard-Tools: env var "$PICARD_HOME" with the path to Picard-Tools\'s bin\n'
            + 'STAR: "STAR" in your path\n'
            + 'GATK: env var "$GATK_HOME" with the path to GATK\'s bin\n'
        ),
        epilog="",
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument(
        "--st_fa",
        "--supertranscript_fasta",
        dest="st_fa",
        type=str,
        required=True,
        help="Path to the SuperTranscripts fasta file.",
    )
    parser.add_argument(
        "--st_gtf",
        "--supertranscript_gtf",
        dest="st_gtf",
        type=str,
        required=True,
        help="Path to the SuperTranscript gtf file.",
    )
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        "-p",
        "--paired",
        dest="paired_reads",
        type=str,
        nargs=2,
        help="Pair of paired ends read files.",
    )
    group.add_argument(
        "-s", "--single", dest="single_reads", type=str, help="Single reads file."
    )
    group.add_argument(
        "-S",
        "--samples_file",
        dest="samples_file",
        type=str,
        help="Trinity samples file (fmt: condition_name replicate_name /path/to/reads_1.fq /path/to/reads_2.fq (tab-delimited, single sample per line))",
    )
    parser.add_argument(
        "-o",
        "--output",
        dest="out_path",
        type=str,
        required=True,
        help="Path to the folder where to generate the output.",
    )
    parser.add_argument(
        "-l",
        "--sjdbOverhang",
        dest="sjdbOverhang",
        default=150,
        type=int,
        help="Size of the reads (used for STAR --sjdbOverhang). default=150",
    )
    parser.add_argument(
        "-t",
        "--threads",
        dest="nthreads",
        type=str,
        default="4",
        help="Number of threads to use for tools that are multithreaded.",
    )
    parser.add_argument(
        "-m",
        "--maxram",
        dest="maxram",
        type=str,
        default="50000000000",
        help="Maximum amount of RAM allowed for STAR's genome generation step (only change if you get an error from STAR complaining about this value).",
    )
    parser.add_argument("--rg_id", default="id", help="bam RGID field value")
    parser.add_argument("--rg_sample", default="sample", help="bam RGSM value")
    parser.add_argument(
        "--STAR_genomeGenerate_opts",
        type=str,
        default="",
        help="options to pass through to STAR's genomeGenerate function",
    )
    args = parser.parse_args()

    PICARD_HOME = os.getenv("PICARD_HOME")
    if not PICARD_HOME:
        exit("Error, missing path to Picard-Tools in $PICARD_HOME.")
    GATK_HOME = os.getenv("GATK_HOME")
    if not GATK_HOME:
        # bugfix: message previously said "$GATK" although the variable
        # actually checked is $GATK_HOME.
        exit("Error, missing path to GATK in $GATK_HOME.")

    # identify gatk4_jar file (must be exactly one match)
    gatk4_jar = glob.glob(os.path.join(GATK_HOME, "gatk-package-4.*-local.jar"))
    if len(gatk4_jar) != 1:
        raise RuntimeError(
            "Error, cannot locate single gatk-package-4.*-local.jar in {}".format(
                GATK_HOME
            )
        )
    gatk_path = os.path.abspath(gatk4_jar[0])

    # get real paths before changing working directory in case they are relative paths
    if args.paired_reads:
        reads_paths = [os.path.realpath(f) for f in args.paired_reads]
    elif args.single_reads:
        reads_paths = [os.path.realpath(args.single_reads)]
    elif args.samples_file:
        # gather all left/right fastqs from the tab-delimited samples file;
        # STAR takes comma-separated lists per mate.
        left_fq_list = list()
        right_fq_list = list()
        with open(args.samples_file) as fh:
            for line in fh:
                line = line.rstrip()
                if not re.match(r"\w", line):
                    continue  # skip blank / malformed lines
                fields = line.split("\t")
                left_fq = fields[2]
                left_fq_list.append(os.path.realpath(left_fq))
                if len(fields) > 3:
                    right_fq = fields[3]
                    right_fq_list.append(os.path.realpath(right_fq))
        reads_paths = [",".join(left_fq_list)]
        if right_fq_list:
            reads_paths.append(",".join(right_fq_list))
    else:
        raise RuntimeError("no reads specified")  # should never get here

    st_fa_path = os.path.realpath(args.st_fa)
    st_gtf_path = os.path.realpath(args.st_gtf)

    # check if output directory exists, if not create
    real_path = os.path.realpath(args.out_path)
    if not os.path.isdir(real_path):
        os.makedirs(real_path)

    # move to output folder; all relative outputs below land here
    os.chdir(real_path)

    checkpoint_dir = os.path.abspath(os.path.basename(st_fa_path)) + ".gatk_chkpts"
    pipeliner = Pipeliner.Pipeliner(checkpoint_dir)

    # generate supertranscript index
    logger.info("Generating SuperTranscript index.")
    pipeliner.add_commands(
        [
            Pipeliner.Command(
                "samtools faidx {}".format(st_fa_path), "samtools_faidx_st.ok"
            )
        ]
    )
    pipeliner.run()

    # generate supertranscript Picard dictionary
    logger.info("Generating Picard dictionary.")
    # swap the fasta suffix for ".dict" (raw string avoids ambiguous escapes)
    dict_file = re.sub(r"\.[^\.]+$", ".dict", st_fa_path)
    if os.path.isfile(dict_file):
        # dictionary already exists: just record the checkpoint
        open(checkpoint_dir + "/picard_dict_st.ok", "a").close()
    else:
        pipeliner.add_commands(
            [
                Pipeliner.Command(
                    "java -jar "
                    + PICARD_HOME
                    + "/picard.jar"
                    + " CreateSequenceDictionary R="
                    + st_fa_path
                    + " O="
                    + dict_file
                    + " VALIDATION_STRINGENCY=LENIENT ",
                    "picard_dict_st.ok",
                )
            ]
        )
        pipeliner.run()

    # generate genome folder for STAR's first pass
    logger.info("Generating genome folder for STAR")

    # from Alex D.: scale down the --genomeSAindexNbases parameter for small
    # genomes.
    # NOTE(review): STAR's manual recommends min(14, log2(GenomeLength)/2 - 1);
    # this computes log2(fasta FILE size)/2 without the "- 1" — confirm intended.
    genomeSAindexNbases = int(math.log(os.path.getsize(st_fa_path)) / math.log(2) / 2)

    star_genome_generate_cmd = str(
        "STAR --runThreadN "
        + args.nthreads
        + " --runMode genomeGenerate"
        + " --genomeDir star_genome_idx "
        + " --genomeFastaFiles {} ".format(st_fa_path)
        + " --genomeSAindexNbases {} ".format(genomeSAindexNbases)
        + " --sjdbGTFfile {} ".format(st_gtf_path)
        + " --sjdbOverhang {} ".format(args.sjdbOverhang)
        + " --limitGenomeGenerateRAM {}".format(args.maxram)
        + " {} ".format(args.STAR_genomeGenerate_opts)
    )
    pipeliner.add_commands(
        [
            Pipeliner.Command("mkdir star_genome_idx", "mkdir_star_genome_idx.ok"),
            Pipeliner.Command(star_genome_generate_cmd, "star_genome_generate.ok"),
        ]
    )
    pipeliner.run()

    # run STAR's alignment
    logger.info("Running STAR alignment.")
    cmd = str(
        "STAR --runThreadN "
        + args.nthreads
        + " --genomeDir star_genome_idx "
        + " --runMode alignReads "
        + " --twopassMode Basic "
        + " --alignSJDBoverhangMin 10 "
        + " --outSAMtype BAM SortedByCoordinate "
        + " --limitBAMsortRAM {} ".format(args.maxram)
        + " --readFilesIn "
        + " ".join(reads_paths)
    )
    if re.search(r"\.gz$", reads_paths[0]):
        cmd += " --readFilesCommand 'gunzip -c' "
    pipeliner.add_commands([Pipeliner.Command(cmd, "star_aln.ok")])
    pipeliner.run()

    ##
    ## GATK settings based on best practices:
    ## https://software.broadinstitute.org/gatk/documentation/article.php?id=3891
    ##

    # clean and convert sam file with Picard-Tools
    logger.info("Cleaning and Converting sam file with Picard-Tools.")
    pipeliner.add_commands(
        [
            Pipeliner.Command(
                "java -jar "
                + PICARD_HOME
                + "/picard.jar "
                + " AddOrReplaceReadGroups "
                + "I=Aligned.sortedByCoord.out.bam "
                + "O=rg_added_sorted.bam "
                + " VALIDATION_STRINGENCY=SILENT "
                + "SO=coordinate RGID={} RGLB=library RGPL=platform RGPU=machine RGSM={}".format(
                    args.rg_id, args.rg_sample
                ),
                "add_read_groups.ok",
            ),
            Pipeliner.Command(
                "java -jar "
                + PICARD_HOME
                + "/picard.jar "
                + " MarkDuplicates "
                + "I=rg_added_sorted.bam O=dedupped.bam "
                + "CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT M=output.metrics",
                "mark_dups.ok",
            ),
            # validation errors are tolerated; clean_bam.pl filters offenders next
            Pipeliner.Command(
                "java -jar "
                + PICARD_HOME
                + "/picard.jar ValidateSamFile "
                + "I=dedupped.bam "
                + "IGNORE_WARNINGS=true "
                + "MAX_OUTPUT=100000 "
                + "IGNORE=MATE_NOT_FOUND "
                + "O=dedupped.bam.validation",
                "bam_validate.ok",
                ignore_error=True,
            ),
            Pipeliner.Command(
                UTILDIR
                + "/clean_bam.pl dedupped.bam dedupped.bam.validation dedupped.valid.bam",
                "make_valid_dedupped_bam.ok",
            ),
            Pipeliner.Command(
                "java -jar "
                + gatk_path
                + " SplitNCigarReads -R "
                + st_fa_path
                + " -I dedupped.valid.bam -O splitNCigar.bam "
                + " --read-validation-stringency LENIENT",
                "splitNCigarReads.ok",
            ),
        ]
    )
    pipeliner.run()

    # do the actual variant calling
    logger.info("Variant Calling using Haplotype Caller.")
    pipeliner.add_commands(
        [
            Pipeliner.Command(
                "java -jar "
                + gatk_path
                + " HaplotypeCaller -R "
                + st_fa_path
                + " -I ./splitNCigar.bam -dont-use-soft-clipped-bases -stand-call-conf 20.0 -O output.vcf",
                "haplotypecaller.ok",
            )
        ]
    )
    pipeliner.run()

    # do some basic filtering
    logger.info("Doing some basic filtering of vcf.")
    pipeliner.add_commands(
        [
            Pipeliner.Command(
                "java -jar "
                + gatk_path
                + " VariantFiltration -R "
                + st_fa_path
                + " -V output.vcf -window 35 -cluster 3 "
                + '--filter-name FS -filter "FS > 30.0" '
                + '--filter-name QD -filter "QD < 2.0" -O filtered_output.vcf',
                "variant_filt.ok",
            )
        ]
    )
    pipeliner.run()

    logger.info("Done!")
def main():
    """Run the SuperTranscript variant-calling pipeline (STAR + Picard + GATK3).

    GATK3 variant of the pipeline: uses GenomeAnalysisTK.jar with -T tool
    selection, the ReassignOneMappingQuality read filter (MAPQ 255 -> 60),
    and -U ALLOW_N_CIGAR_READS for RNA-seq alignments.  Requires "samtools",
    "java" and "STAR" in PATH, and the environment variables $PICARD_HOME
    and $GATK_HOME.
    """
    FORMAT = "%(asctime)-15s %(levelname)s %(module)s.%(name)s.%(funcName)s at %(lineno)d :\n\t%(message)s\n"
    global logger
    logger = logging.getLogger()
    logging.basicConfig(filename='variant_calling.log', format=FORMAT,
                        filemode='w', level=logging.DEBUG)
    # add a new Handler to print all INFO and above messages to stdout
    ch = logging.StreamHandler(sys.stdout)
    ch.setLevel(logging.INFO)
    logger.addHandler(ch)

    parser = argparse.ArgumentParser(
        description=str(
            "This script requires you have the following dependencies:\n" +
            "Samtools: \"samtools\" in your path\n" +
            "Java: \"java\" in your path\n" +
            "Picard-Tools: env var \"$PICARD_HOME\" with the path to Picard-Tools's bin\n" +
            "STAR: \"STAR\" in your path\n" +
            "GATK: env var \"$GATK_HOME\" with the path to GATK's bin\n"),
        epilog="",
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--st_fa', '--supertranscript_fasta', dest="st_fa",
                        type=str, required=True,
                        help="Path to the SuperTranscripts fasta file.")
    parser.add_argument('--st_gtf', '--supertranscript_gtf', dest="st_gtf",
                        type=str, required=True,
                        help="Path to the SuperTranscript gtf file.")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-p', '--paired', dest="paired_reads", type=str,
                       nargs=2, help="Pair of paired ends read files.")
    group.add_argument('-s', '--single', dest="single_reads", type=str,
                       help="Single reads file.")
    group.add_argument(
        "-S", "--samples_file", dest="samples_file", type=str,
        help="Trinity samples file (fmt: condition_name replicate_name /path/to/reads_1.fq /path/to/reads_2.fq (tab-delimited, single sample per line))")
    parser.add_argument('-o', '--output', dest="out_path", type=str,
                        required=True,
                        help="Path to the folder where to generate the output.")
    parser.add_argument('-l', '--sjdbOverhang', dest="sjdbOverhang",
                        default=150, type=int,
                        help="Size of the reads (used for STAR --sjdbOverhang). default=150")
    parser.add_argument('-t', '--threads', dest="nthreads", type=str,
                        default="4",
                        help="Number of threads to use for tools that are multithreaded.")
    parser.add_argument('-m', '--maxram', dest="maxram", type=str,
                        default="50000000000",
                        help="Maximum amount of RAM allowed for STAR's genome generation step (only change if you get an error from STAR complaining about this value).")
    parser.add_argument(
        "--STAR_genomeGenerate_opts", type=str, default="",
        help="options to pass through to STAR's genomeGenerate function")
    args = parser.parse_args()

    PICARD_HOME = os.getenv("PICARD_HOME")
    if not PICARD_HOME:
        exit("Error, missing path to Picard-Tools in $PICARD_HOME.")
    GATK_HOME = os.getenv("GATK_HOME")
    if not GATK_HOME:
        # bugfix: message previously said "$GATK" although the variable
        # actually checked is $GATK_HOME.
        exit("Error, missing path to GATK in $GATK_HOME.")

    # get real paths before changing working directory in case they are relative paths
    if args.paired_reads:
        reads_paths = [os.path.realpath(f) for f in args.paired_reads]
    elif args.single_reads:
        reads_paths = [os.path.realpath(args.single_reads)]
    elif args.samples_file:
        # gather all left/right fastqs from the tab-delimited samples file;
        # STAR takes comma-separated lists per mate.
        left_fq_list = list()
        right_fq_list = list()
        with open(args.samples_file) as fh:
            for line in fh:
                line = line.rstrip()
                if not re.match(r"\w", line):
                    continue  # skip blank / malformed lines
                fields = line.split("\t")
                left_fq = fields[2]
                left_fq_list.append(os.path.realpath(left_fq))
                if len(fields) > 3:
                    right_fq = fields[3]
                    right_fq_list.append(os.path.realpath(right_fq))
        reads_paths = [",".join(left_fq_list)]
        if right_fq_list:
            reads_paths.append(",".join(right_fq_list))
    else:
        raise RuntimeError("no reads specified")  # should never get here

    st_fa_path = os.path.realpath(args.st_fa)
    st_gtf_path = os.path.realpath(args.st_gtf)

    # check if output directory exists, if not create
    real_path = os.path.realpath(args.out_path)
    if not os.path.isdir(real_path):
        os.makedirs(real_path)

    # move to output folder; all relative outputs below land here
    os.chdir(real_path)

    checkpoint_dir = os.path.abspath(
        os.path.basename(st_fa_path)) + ".gatk_chkpts"
    pipeliner = Pipeliner.Pipeliner(checkpoint_dir)

    # generate supertranscript index
    logger.info("Generating SuperTranscript index.")
    pipeliner.add_commands([
        Pipeliner.Command("samtools faidx {}".format(st_fa_path),
                          "samtools_faidx_st.ok")
    ])
    pipeliner.run()

    # generate supertranscript Picard dictionary
    logger.info("Generating Picard dictionary.")
    # swap the fasta suffix for ".dict" (raw string avoids ambiguous escapes)
    dict_file = re.sub(r"\.[^\.]+$", ".dict", st_fa_path)
    if os.path.isfile(dict_file):
        # dictionary already exists: just record the checkpoint
        open(checkpoint_dir + "/picard_dict_st.ok", 'a').close()
    else:
        pipeliner.add_commands([
            Pipeliner.Command(
                "java -jar " + PICARD_HOME + "/picard.jar" +
                " CreateSequenceDictionary R=" + st_fa_path + " O=" +
                dict_file + " VALIDATION_STRINGENCY=LENIENT ",
                "picard_dict_st.ok")
        ])
        pipeliner.run()

    # generate genome folder for STAR's first pass
    logger.info("Generating genome folder for STAR")
    star_genome_generate_cmd = str(
        "STAR --runThreadN " + args.nthreads + " --runMode genomeGenerate" +
        " --genomeDir star_genome_idx " +
        " --genomeFastaFiles {} ".format(st_fa_path) +
        " --genomeSAindexNbases 8 " +  # as per A. Dobin
        " --sjdbGTFfile {} ".format(st_gtf_path) +
        " --sjdbOverhang {} ".format(args.sjdbOverhang) +
        " --limitGenomeGenerateRAM {}".format(args.maxram) +
        " {} ".format(args.STAR_genomeGenerate_opts))
    pipeliner.add_commands([
        Pipeliner.Command("mkdir star_genome_idx", "mkdir_star_genome_idx.ok"),
        Pipeliner.Command(star_genome_generate_cmd, "star_genome_generate.ok")
    ])
    pipeliner.run()

    # run STAR's alignment
    logger.info("Running STAR alignment.")
    cmd = str("STAR --runThreadN " + args.nthreads +
              " --genomeDir star_genome_idx " + " --runMode alignReads " +
              " --twopassMode Basic " + " --alignSJDBoverhangMin 10 " +
              " --outSAMtype BAM SortedByCoordinate " +
              " --limitBAMsortRAM {} ".format(args.maxram) +
              " --readFilesIn " + " ".join(reads_paths))
    if re.search(r"\.gz$", reads_paths[0]):
        cmd += " --readFilesCommand 'gunzip -c' "
    pipeliner.add_commands([Pipeliner.Command(cmd, "star_aln.ok")])
    pipeliner.run()

    # clean and convert sam file with Picard-Tools
    logger.info("Cleaning and Converting sam file with Picard-Tools.")
    pipeliner.add_commands([
        Pipeliner.Command(
            "java -jar " + PICARD_HOME + "/picard.jar " +
            " AddOrReplaceReadGroups " +
            "I=Aligned.sortedByCoord.out.bam " +
            "O=rg_added_sorted.bam " + " VALIDATION_STRINGENCY=SILENT " +
            "SO=coordinate RGID=id RGLB=library RGPL=platform RGPU=machine RGSM=sample",
            "add_read_groups.ok"),
        Pipeliner.Command(
            "java -jar " + PICARD_HOME + "/picard.jar " +
            " MarkDuplicates " + "I=rg_added_sorted.bam O=dedupped.bam " +
            "CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT M=output.metrics",
            "mark_dups.ok"),
        # validation errors are tolerated; clean_bam.pl filters offenders next
        Pipeliner.Command("java -jar " + PICARD_HOME +
                          "/picard.jar ValidateSamFile " +
                          "I=dedupped.bam " + "IGNORE_WARNINGS=true " +
                          "MAX_OUTPUT=100000 " + "IGNORE=MATE_NOT_FOUND " +
                          "O=dedupped.bam.validation",
                          "bam_validate.ok", ignore_error=True),
        Pipeliner.Command(
            UTILDIR +
            "/clean_bam.pl dedupped.bam dedupped.bam.validation dedupped.valid.bam",
            "make_valid_dedupped_bam.ok"),
        # ReassignOneMappingQuality rewrites STAR's unique-mapper MAPQ 255
        # to GATK's expected 60; ALLOW_N_CIGAR_READS is needed for RNA-seq.
        Pipeliner.Command(
            "java -jar " + GATK_HOME + "/GenomeAnalysisTK.jar " +
            "-T SplitNCigarReads -R " + st_fa_path +
            " -I dedupped.valid.bam -o splitNCigar.bam " +
            " -rf ReassignOneMappingQuality -RMQF 255 -RMQT 60 -U ALLOW_N_CIGAR_READS --validation_strictness LENIENT",
            "splitNCigarReads.ok")
    ])
    pipeliner.run()

    # do the actual variant calling
    logger.info("Variant Calling using Haplotype Caller.")
    pipeliner.add_commands([
        Pipeliner.Command(
            "java -jar " + GATK_HOME + "/GenomeAnalysisTK.jar " +
            "-T HaplotypeCaller -R " + st_fa_path +
            " -I ./splitNCigar.bam -dontUseSoftClippedBases -stand_call_conf 20.0 -o output.vcf",
            "haplotypecaller.ok")
    ])
    pipeliner.run()

    # do some basic filtering
    logger.info("Doing some basic filtering of vcf.")
    pipeliner.add_commands([
        Pipeliner.Command(
            "java -jar " + GATK_HOME + "/GenomeAnalysisTK.jar " +
            "-T VariantFiltration -R " + st_fa_path +
            " -V output.vcf -window 35 -cluster 3 " +
            "-filterName FS -filter \"FS > 30.0\" " +
            "-filterName QD -filter \"QD < 2.0\" -o filtered_output.vcf",
            "variant_filt.ok")
    ])
    pipeliner.run()

    logger.info("Done!")
def main():
    """Capture gene-to-intron usage stats from STAR splice-junction output.

    Reads a STAR SJ.out.tab (and optionally Chimeric.out.junction) file,
    maps junctions to annotated intron bounds from a CTAT genome lib,
    writes a <prefix>.introns table, annotates/filters cancer introns, and
    optionally builds an IGV-reports HTML visualization.

    Raises RuntimeError when required inputs are missing; exits 0 on success.
    """
    parser = argparse.ArgumentParser(
        description="capture gene to intron usage stats",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # default the genome lib from the environment; may still be None
    ctat_genome_lib = os.environ.get("CTAT_GENOME_LIB", None)

    parser.add_argument(
        "--ctat_genome_lib",
        dest="ctat_genome_lib",
        type=str,
        required=False,
        default=ctat_genome_lib,
        help="ctat genome lib build dir",
    )
    parser.add_argument(
        "--SJ_tab_file",
        dest="SJ_tab_file",
        type=str,
        required=True,
        help="STAR SJ.out.tab file",
    )
    parser.add_argument(
        "--chimJ_file",
        dest="chimJ_file",
        type=str,
        required=False,
        default=None,
        help="STAR Chimeric.out.junction file",
    )
    parser.add_argument(
        "--output_prefix",
        dest="output_prefix",
        type=str,
        required=True,
        help="prefix for all output files",
    )
    parser.add_argument(
        "--min_total_reads",
        dest="min_total_reads",
        type=int,
        required=False,
        default=5,
        help="minimum reads supporting cancer intron",
    )
    parser.add_argument(
        "--vis",
        action="store_true",
        default=False,
        help="Generate igv html ctat splicing visualization (requires --bam_file to be set)",
    )
    parser.add_argument(
        "--bam_file",
        dest="bam_file",
        type=str,
        required=False,
        default=None,
        help="STAR generated BAM file",
    )
    parser.add_argument(
        "--sample_name",
        dest="vis_sample_name",
        type=str,
        required=False,
        default="",
        help="sample name for vis title",
    )

    args = parser.parse_args()

    ctat_genome_lib = args.ctat_genome_lib
    # bugfix: when neither $CTAT_GENOME_LIB nor --ctat_genome_lib is set the
    # value is None, and os.path.exists(None) raises TypeError instead of
    # reaching the friendly error — guard the None case explicitly.
    if not ctat_genome_lib or not os.path.exists(ctat_genome_lib):
        raise RuntimeError("Error, must set --ctat_genome_lib ")

    SJ_tab_file = args.SJ_tab_file
    chimJ_file = args.chimJ_file
    output_prefix = args.output_prefix
    bam_file = args.bam_file
    VIS_flag = args.vis
    min_total_reads = args.min_total_reads
    vis_sample_name = args.vis_sample_name

    if VIS_flag and not bam_file:
        raise RuntimeError("Error, if --vis, must specify --bam_file ")

    if not os.path.exists(SJ_tab_file):
        raise RuntimeError(
            "Error, cannot locate expected splice junction tab file: {} ".format(
                SJ_tab_file
            )
        )

    chckpts_dir = output_prefix + ".chckpts"
    if not os.path.exists(chckpts_dir):
        os.makedirs(chckpts_dir)

    pipeliner = Pipeliner(chckpts_dir)

    introns_output_file = output_prefix + ".introns"
    introns_output_file_chckpt = os.path.join(chckpts_dir, "introns.ok")
    if not os.path.exists(introns_output_file_chckpt):
        # map the observed splice junctions onto annotated intron bounds
        targets_list_file = os.path.join(ctat_genome_lib, "ref_annot.gtf.mini.sortu")
        chr_intron_bounds = ioc.populate_intron_bounds(targets_list_file)
        introns_dict = ioc.map_introns_from_splice_tab(SJ_tab_file, chr_intron_bounds)

        if chimJ_file is not None:
            if not os.path.exists(chimJ_file):
                raise RuntimeError(
                    "Error, cannot locate expected chimeric Junction out file: {} ".format(
                        chimJ_file
                    )
                )
            # must make splice file:
            chimJ_introns_file = (
                output_prefix + "." + os.path.basename(chimJ_file) + ".introns.tmp"
            )
            cmd = str(
                os.path.join(utildir, "STAR_chimeric_junctions_to_introns.pl")
                + " -J {} > {}".format(chimJ_file, chimJ_introns_file)
            )
            subprocess.check_call(cmd, shell=True)
            introns_dict = ioc.supplement_introns_from_chimeric_junctions_file(
                chimJ_introns_file, introns_dict, chr_intron_bounds
            )

        with open(introns_output_file, "wt") as ofh:
            # write header
            ofh.write(
                "\t".join(["intron", "strand", "genes", "uniq_mapped", "multi_mapped"])
                + "\n"
            )
            for intron in introns_dict.values():
                ofh.write(
                    "\t".join(
                        [
                            "{}:{}-{}".format(
                                intron.chromosome, intron.lend, intron.rend
                            ),
                            intron.strand,
                            intron.genes,
                            str(intron.uniq_mapped),
                            str(intron.multi_mapped),
                        ]
                    )
                    + "\n"
                )

        # done, add checkpoint
        subprocess.check_call(
            "touch {}".format(introns_output_file_chckpt), shell=True
        )

    # annotate for cancer introns.
    cancer_introns_file_prelim = output_prefix + ".cancer.introns.prelim"
    cmd = str(
        os.path.join(utildir, "annotate_cancer_introns.pl")
        + " --introns_file {} ".format(introns_output_file)
        + " --ctat_genome_lib {} ".format(ctat_genome_lib)
        + " --intron_col 0 "
        + " > {} ".format(cancer_introns_file_prelim)
    )
    pipeliner.add_commands([Command(cmd, "prelim_introns.ok")])

    # filter for min support
    cancer_introns_file = output_prefix + ".cancer.introns"
    cmd = str(
        os.path.join(utildir, "filter_by_min_total_reads.py")
        + " --cancer_intron_candidates {}".format(cancer_introns_file_prelim)
        + " --min_total_reads {} ".format(min_total_reads)
        + " > {} ".format(cancer_introns_file)
    )
    pipeliner.add_commands([Command(cmd, "introns_filtered.ok")])
    pipeliner.run()

    if VIS_flag:
        # generate the intron/junctions bed needed by igv
        igv_introns_bed_file = introns_output_file + ".for_IGV.bed"
        cmd = str(
            os.path.join(utildir, "make_igv_splice_bed.py")
            + " --all_introns {} ".format(introns_output_file)
            + " --cancer_introns {} ".format(cancer_introns_file)
            + " --genome_lib_dir {} ".format(ctat_genome_lib)
            + " --output_bed {} ".format(igv_introns_bed_file)
        )
        pipeliner.add_commands([Command(cmd, "intron_igv_bed.ok")])
        pipeliner.run()

        igv_tracks_config_file = write_igv_config(
            output_prefix,
            ctat_genome_lib,
            igv_introns_bed_file,
            bam_file,
            os.path.join(utildir, "misc/igv.tracks.json"),
            pipeliner,
        )

        # Create the IGV Reports
        cmd = str(
            "create_report {} ".format(igv_introns_bed_file)
            + " {} ".format(os.path.join(ctat_genome_lib, "ref_genome.fa"))
            + " --type junction "
            + " --output {}.ctat-splicing.igv.html ".format(output_prefix)
            + " --track-config {} ".format(igv_tracks_config_file)
            + " --info-columns gene variant_name uniquely_mapped multi_mapped TCGA GTEx "
            + " --title 'CTAT_Splicing: {}' ".format(vis_sample_name)
        )
        pipeliner.add_commands([Command(cmd, "igv_create_html.ok")])
        pipeliner.run()

    logger.info("done.")

    sys.exit(0)