# Suffixes appended to a pindel output base name when looking for result
# files on disk.
_PINDEL_OUTPUT_SUFFIXES = ("_D", "_SI", "_LI", "_INV", "_TD")

# Suffixes of the result files that feed the somatic filter, in the order in
# which their "ChrID" lines are written.
_SOMATIC_INPUT_SUFFIXES = ("_D", "_SI")


def _build_arg_parser():
    """Return the argparse parser describing every command-line option."""
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-r', dest='inputFastaFile', required=True, help='the reference file')
    parser.add_argument('-R', dest='inputFastaName', default="genome", help='the reference name')
    parser.add_argument('-b', dest='inputBamFiles', default=[], action="append", help='the bam file')
    parser.add_argument('-bi', dest='inputBamFileIndexes', default=[], action="append", help='the bam index file')
    parser.add_argument('-s', dest='insert_sizes', type=int, default=[], action="append", required=False, help='the insert size')
    parser.add_argument('-t', dest='sampleTags', default=[], action="append", help='the sample tag')
    parser.add_argument('-o1', dest='outputRaw', help='the output raw', default=None)
    parser.add_argument('-o2', dest='outputVcfFile', help='the output vcf', default=None)
    parser.add_argument('-o3', dest='outputSomaticVcfFile', help='the output somatic filtered vcf', default=None)
    parser.add_argument('--number_of_threads', dest='number_of_threads', type=int, default=2)
    parser.add_argument('--number_of_procs', dest='procs', type=int, default=1)
    parser.add_argument('--breakdancer', dest='breakdancer')
    parser.add_argument('-x', '--max_range_index', dest='max_range_index', type=int, default=None)
    parser.add_argument('--window_size', dest='window_size', type=int, default=None)
    parser.add_argument('--sequencing_error_rate', dest='sequencing_error_rate', type=float, default=None)
    parser.add_argument('--sensitivity', dest='sensitivity', default=None, type=float)
    parser.add_argument('--report_long_insertions', dest='report_long_insertions', action='store_true', default=False)
    parser.add_argument('--report_duplications', dest='report_duplications', action='store_true', default=False)
    parser.add_argument('--report_inversions', dest='report_inversions', action='store_true', default=False)
    parser.add_argument('--report_breakpoints', dest='report_breakpoints', action='store_true', default=False)
    parser.add_argument('-u', '--maximum_allowed_mismatch_rate', dest='maximum_allowed_mismatch_rate', type=float, default=None)
    parser.add_argument('--report_close_mapped_reads', dest='report_close_mapped_reads', action='store_true', default=False)
    parser.add_argument('--report_only_close_mapped_reads', dest='report_only_close_mapped_reads', action='store_true', default=False)
    parser.add_argument('--report_interchromosomal_events', dest='report_interchromosomal_events', action='store_true', default=False)
    parser.add_argument('--IndelCorrection', dest='IndelCorrection', action='store_true', default=False)
    parser.add_argument('--NormalSamples', dest='NormalSamples', action='store_true', default=False)
    parser.add_argument('-a', '--additional_mismatch', dest='additional_mismatch', type=int, default=None)
    parser.add_argument('-m', '--min_perfect_match_around_BP', dest='min_perfect_match_around_BP', type=int, default=None)
    parser.add_argument('-v', '--min_inversion_size', dest='min_inversion_size', type=int, default=None)
    parser.add_argument('-d', '--min_num_matched_bases', dest='min_num_matched_bases', type=int, default=None)
    parser.add_argument('-B', '--balance_cutoff', dest='balance_cutoff', type=int, default=None)
    parser.add_argument('-A', '--anchor_quality', dest='anchor_quality', type=int, default=None)
    parser.add_argument('-M', '--minimum_support_for_event', dest='minimum_support_for_event', type=int, default=None)
    parser.add_argument('-n', '--NM', dest='NM', type=int, default=None)
    parser.add_argument('--detect_DD', dest='detect_DD', action='store_true', default=False)
    # The integer defaults below used to be written as strings; argparse runs
    # string defaults through ``type``, so the parsed values are unchanged.
    parser.add_argument('--MAX_DD_BREAKPOINT_DISTANCE', dest='MAX_DD_BREAKPOINT_DISTANCE', type=int, default=350)
    parser.add_argument('--MAX_DISTANCE_CLUSTER_READS', dest='MAX_DISTANCE_CLUSTER_READS', type=int, default=100)
    parser.add_argument('--MIN_DD_CLUSTER_SIZE', dest='MIN_DD_CLUSTER_SIZE', type=int, default=3)
    parser.add_argument('--MIN_DD_BREAKPOINT_SUPPORT', dest='MIN_DD_BREAKPOINT_SUPPORT', type=int, default=3)
    parser.add_argument('--MIN_DD_MAP_DISTANCE', dest='MIN_DD_MAP_DISTANCE', type=int, default=8000)
    parser.add_argument('--DD_REPORT_DUPLICATION_READS', dest='DD_REPORT_DUPLICATION_READS', action='store_true', default=False)
    parser.add_argument('--somatic_vaf', type=float, default=0.08)
    parser.add_argument('--somatic_cov', type=int, default=20)
    parser.add_argument('--somatic_hom', type=int, default=6)
    parser.add_argument("-J", "--exclude", dest="exclude", default=None)
    parser.add_argument("-j", "--include", dest="include", default=None)
    parser.add_argument('--min_chrom_size', dest='min_chrom_size', type=int, default=1)
    parser.add_argument('-z', '--input_SV_Calls_for_assembly', dest='input_SV_Calls_for_assembly', action='store_true', default=False)
    parser.add_argument('--workdir', default="./")
    parser.add_argument('--no_clean', action="store_true", default=False)
    # Formerly a path hard-coded into the perl command line.  The default is
    # that same path, so existing invocations build an identical command.
    parser.add_argument('--somatic_filter_script', dest='somatic_filter_script',
                        default="~/bin/somatic_indelfilter.pl",
                        help='path of the somatic indel filter perl script')
    return parser


def _pad_or_exit(values, count, message):
    """Return ``values``, or ``count`` Nones when ``values`` is empty.

    Logs ``message`` and exits with status 1 when ``values`` is non-empty
    but does not hold exactly ``count`` entries.
    """
    if not values:
        return [None] * count
    if len(values) != count:
        logging.error(message)
        sys.exit(1)
    return values


def _existing_pindel_outputs(pindelFileBase):
    """Return the ``pindelFileBase + suffix`` paths that exist on disk."""
    candidates = [pindelFileBase + suffix for suffix in _PINDEL_OUTPUT_SUFFIXES]
    return [path for path in candidates if os.path.exists(path)]


def _copy_lines(sources, handle, needle=None):
    """Append the lines of every file in ``sources`` to ``handle``.

    When ``needle`` is given only lines containing it are written.
    """
    for source in sources:
        with open(source) as ihandle:
            for line in ihandle:
                if needle is None or needle in line:
                    handle.write(line)


def __main__():
    """Command-line entry point: run pindel on the given BAM files.

    Steps, in order:
      1. parse the command line and check that the index / insert-size /
         sample-tag lists are consistent with the list of BAM files
         (exits with status 1 otherwise);
      2. call ``indexBam`` for every BAM and take the insert size from the
         command line or, when absent, from ``getMeanInsertSize``;
      3. write the pindel config via ``config`` and execute pindel, either
         once (``--number_of_procs 1``) or once per sequence name in a
         process pool;
      4. concatenate the result files that exist into ``<workdir>/pindel_all``
         and, depending on ``-o1`` / ``-o2`` / ``-o3``, copy it, convert it
         with pindel2vcf, and run the somatic indel filter script.

    The temporary working directory is removed at the end unless
    ``--no_clean`` was given.
    """
    logging.basicConfig(level=logging.INFO)
    # Small hack: sometimes docker file systems aren't available instantly.
    time.sleep(1)

    args = _build_arg_parser().parse_args()

    inputBamFiles = [os.path.abspath(a) for a in args.inputBamFiles]
    if not inputBamFiles:
        logging.error("Need input files")
        sys.exit(1)
    bam_count = len(inputBamFiles)

    inputBamFileIndexes = _pad_or_exit(
        [os.path.abspath(a) for a in args.inputBamFileIndexes],
        bam_count,
        "Index file count needs to undefined or match input file count")
    insertSizes = _pad_or_exit(
        args.insert_sizes,
        bam_count,
        "Insert Sizes needs to undefined or match input file count")

    sampleTags = args.sampleTags
    if len(sampleTags) != bam_count:
        logging.error("Sample Tags need to match input file count")
        sys.exit(1)

    tempDir = tempfile.mkdtemp(dir=args.workdir, prefix="pindel_work_")
    print(tempDir)
    try:
        meanInsertSizes = []
        # A dict (not a set) so that iteration order stays whatever the
        # interpreter's dict ordering gives, exactly as before.
        seq_hash = {}
        newInputFiles = []
        # make sure the BAMs are indexed and get the mean insert sizes
        per_bam = zip(inputBamFiles, inputBamFileIndexes, insertSizes)
        for i, (inputBamFile, inputBamIndex, insertSize) in enumerate(per_bam):
            inputFastaFile, inputBamFile = indexBam(
                args.workdir, args.inputFastaFile, inputBamFile, i, inputBamIndex)
            newInputFiles.append(inputBamFile)
            if insertSize is None:
                meanInsertSize = getMeanInsertSize(inputBamFile)
            else:
                meanInsertSize = insertSize
            meanInsertSizes.append(meanInsertSize)
            for seq in get_bam_seq(inputBamFile, args.min_chrom_size):
                seq_hash[seq] = True
        configFile = config(newInputFiles, meanInsertSizes, sampleTags, tempDir)

        # run pindel
        pindel_files = []
        if args.procs == 1:
            cmd, pindelFileBase = pindel(inputFastaFile, configFile, args, tempDir)
            execute(cmd)
            pindel_files.extend(_existing_pindel_outputs(pindelFileBase))
        else:
            cmds = []
            runs = []
            for seq_name in seq_hash.keys():
                cmd, pindelFileBase = pindel(
                    inputFastaFile, configFile, args, tempDir, seq_name)
                cmds.append(cmd)
                runs.append(pindelFileBase)
            pool = Pool(args.procs)
            try:
                # chunksize 1: hand out one command at a time.
                pool.map(execute, cmds, 1)
            finally:
                # Release the worker processes; previously they were leaked.
                pool.close()
                pool.join()
            for pindelFileBase in runs:
                pindel_files.extend(_existing_pindel_outputs(pindelFileBase))

        # run pindel2vcf
        pindel_all = os.path.join(args.workdir, "pindel_all")
        with open(pindel_all, "w") as handle:
            _copy_lines(pindel_files, handle)

        if args.outputRaw is not None:
            shutil.copy(pindel_all, args.outputRaw)

        if args.outputVcfFile is not None:
            cmd = pindel2vcf(inputFastaFile, args.inputFastaName,
                             pindel_all, args.outputVcfFile)
            execute(cmd)

        if args.outputSomaticVcfFile is not None:
            pindel_somatic = os.path.join(args.workdir, "pindel_somatic")
            with open(pindel_somatic, "w") as handle:
                # All "_D" files first, then all "_SI" files.
                for suffix in _SOMATIC_INPUT_SUFFIXES:
                    matching = [f for f in pindel_files if f.endswith(suffix)]
                    _copy_lines(matching, handle, needle="ChrID")

            filter_config = os.path.join(args.workdir, "somatic.indel.filter.config")
            settings = [
                ("indel.filter.input", pindel_somatic),
                ("indel.filter.vaf", args.somatic_vaf),
                ("indel.filter.cov", args.somatic_cov),
                ("indel.filter.hom", args.somatic_hom),
                ("indel.filter.pindel2vcf", which("pindel2vcf")),
                ("indel.filter.reference", inputFastaFile),
                ("indel.filter.referencename", args.inputFastaName),
                ("indel.filter.referencedate",
                 datetime.datetime.now().strftime("%Y%m%d")),
                ("indel.filter.output", args.outputSomaticVcfFile),
            ]
            with open(filter_config, "w") as handle:
                for key, value in settings:
                    handle.write("%s = %s\n" % (key, value))

            # NOTE(review): the script path is passed on verbatim; whether a
            # leading "~" gets expanded depends on how execute() runs the
            # command -- confirm.
            execute("%s %s %s" % (which("perl"), args.somatic_filter_script,
                                  filter_config))
    finally:
        if not args.no_clean and os.path.exists(tempDir):
            shutil.rmtree(tempDir)
def __main__():
    """Parse the command line, run pindel and write the requested outputs.

    The BAM list (``-b``) must be non-empty; the index (``-bi``) and insert
    size (``-s``) lists must be empty or as long as the BAM list, and the
    sample tag list (``-t``) must be as long as the BAM list.  Any violation
    is logged and the process exits with status 1.

    pindel is executed once when ``--number_of_procs`` is 1, otherwise once
    per sequence name reported by ``get_bam_seq`` using a process pool.  The
    result files found on disk are concatenated into ``<workdir>/pindel_all``
    which is then copied to ``-o1``, converted via pindel2vcf to ``-o2`` and
    fed (``_D`` and ``_SI`` files only) to the somatic filter script for
    ``-o3``, each only when the respective option was given.

    The temporary directory created under ``--workdir`` is deleted on exit
    unless ``--no_clean`` is set.
    """
    logging.basicConfig(level=logging.INFO)
    # Small hack: sometimes docker file systems aren't available instantly.
    time.sleep(1)

    parser = argparse.ArgumentParser(description='')
    add = parser.add_argument
    add('-r', dest='inputFastaFile', required=True, help='the reference file')
    add('-R', dest='inputFastaName', default="genome", help='the reference name')
    add('-b', dest='inputBamFiles', default=[], action="append", help='the bam file')
    add('-bi', dest='inputBamFileIndexes', default=[], action="append", help='the bam index file')
    add('-s', dest='insert_sizes', type=int, default=[], action="append", required=False, help='the insert size')
    add('-t', dest='sampleTags', default=[], action="append", help='the sample tag')
    add('-o1', dest='outputRaw', help='the output raw', default=None)
    add('-o2', dest='outputVcfFile', help='the output vcf', default=None)
    add('-o3', dest='outputSomaticVcfFile', help='the output somatic filtered vcf', default=None)
    add('--number_of_threads', dest='number_of_threads', type=int, default=2)
    add('--number_of_procs', dest='procs', type=int, default=1)
    add('--breakdancer', dest='breakdancer')
    add('-x', '--max_range_index', dest='max_range_index', type=int, default=None)
    add('--window_size', dest='window_size', type=int, default=None)
    add('--sequencing_error_rate', dest='sequencing_error_rate', type=float, default=None)
    add('--sensitivity', dest='sensitivity', default=None, type=float)
    add('--report_long_insertions', dest='report_long_insertions', action='store_true', default=False)
    add('--report_duplications', dest='report_duplications', action='store_true', default=False)
    add('--report_inversions', dest='report_inversions', action='store_true', default=False)
    add('--report_breakpoints', dest='report_breakpoints', action='store_true', default=False)
    add('-u', '--maximum_allowed_mismatch_rate', dest='maximum_allowed_mismatch_rate', type=float, default=None)
    add('--report_close_mapped_reads', dest='report_close_mapped_reads', action='store_true', default=False)
    add('--report_only_close_mapped_reads', dest='report_only_close_mapped_reads', action='store_true', default=False)
    add('--report_interchromosomal_events', dest='report_interchromosomal_events', action='store_true', default=False)
    add('--IndelCorrection', dest='IndelCorrection', action='store_true', default=False)
    add('--NormalSamples', dest='NormalSamples', action='store_true', default=False)
    add('-a', '--additional_mismatch', dest='additional_mismatch', type=int, default=None)
    add('-m', '--min_perfect_match_around_BP', dest='min_perfect_match_around_BP', type=int, default=None)
    add('-v', '--min_inversion_size', dest='min_inversion_size', type=int, default=None)
    add('-d', '--min_num_matched_bases', dest='min_num_matched_bases', type=int, default=None)
    add('-B', '--balance_cutoff', dest='balance_cutoff', type=int, default=None)
    add('-A', '--anchor_quality', dest='anchor_quality', type=int, default=None)
    add('-M', '--minimum_support_for_event', dest='minimum_support_for_event', type=int, default=None)
    add('-n', '--NM', dest='NM', type=int, default=None)
    add('--detect_DD', dest='detect_DD', action='store_true', default=False)
    # Integer defaults were previously spelled as strings; argparse converts
    # string defaults with ``type``, so the resulting values are the same.
    add('--MAX_DD_BREAKPOINT_DISTANCE', dest='MAX_DD_BREAKPOINT_DISTANCE', type=int, default=350)
    add('--MAX_DISTANCE_CLUSTER_READS', dest='MAX_DISTANCE_CLUSTER_READS', type=int, default=100)
    add('--MIN_DD_CLUSTER_SIZE', dest='MIN_DD_CLUSTER_SIZE', type=int, default=3)
    add('--MIN_DD_BREAKPOINT_SUPPORT', dest='MIN_DD_BREAKPOINT_SUPPORT', type=int, default=3)
    add('--MIN_DD_MAP_DISTANCE', dest='MIN_DD_MAP_DISTANCE', type=int, default=8000)
    add('--DD_REPORT_DUPLICATION_READS', dest='DD_REPORT_DUPLICATION_READS', action='store_true', default=False)
    add('--somatic_vaf', type=float, default=0.08)
    add('--somatic_cov', type=int, default=20)
    add('--somatic_hom', type=int, default=6)
    add("-J", "--exclude", dest="exclude", default=None)
    add("-j", "--include", dest="include", default=None)
    add('--min_chrom_size', dest='min_chrom_size', type=int, default=1)
    add('-z', '--input_SV_Calls_for_assembly', dest='input_SV_Calls_for_assembly', action='store_true', default=False)
    add('--workdir', default="./")
    add('--no_clean', action="store_true", default=False)
    # Replaces a site-specific path that was hard-coded into the perl command
    # line; the default keeps that path so existing invocations are unchanged.
    add('--somatic_filter_script', dest='somatic_filter_script',
        default="/home/exacloud/clinical/RichardsLab/bin/somatic_indelfilter.pl",
        help='path of the somatic indel filter perl script')
    args = parser.parse_args()

    def fail(message):
        """Log ``message`` as an error and terminate with exit status 1."""
        logging.error(message)
        sys.exit(1)

    inputBamFiles = [os.path.abspath(a) for a in args.inputBamFiles]
    if not inputBamFiles:
        fail("Need input files")

    inputBamFileIndexes = [os.path.abspath(a) for a in args.inputBamFileIndexes]
    if not inputBamFileIndexes:
        inputBamFileIndexes = [None] * len(inputBamFiles)
    if len(inputBamFileIndexes) != len(inputBamFiles):
        fail("Index file count needs to undefined or match input file count")

    insertSizes = args.insert_sizes
    if not insertSizes:
        insertSizes = [None] * len(inputBamFiles)
    if len(insertSizes) != len(inputBamFiles):
        fail("Insert Sizes needs to undefined or match input file count")

    sampleTags = args.sampleTags
    if len(sampleTags) != len(inputBamFiles):
        fail("Sample Tags need to match input file count")

    workdir = args.workdir
    output_suffixes = ("_D", "_SI", "_LI", "_INV", "_TD")

    tempDir = tempfile.mkdtemp(dir=workdir, prefix="pindel_work_")
    print(tempDir)
    try:
        meanInsertSizes = []
        seq_hash = {}
        newInputFiles = []
        # make sure the BAMs are indexed and get the mean insert sizes
        for i, inputBamFile in enumerate(inputBamFiles):
            inputFastaFile, inputBamFile = indexBam(
                workdir, args.inputFastaFile, inputBamFile, i,
                inputBamFileIndexes[i])
            newInputFiles.append(inputBamFile)
            insertSize = insertSizes[i]
            if insertSize is None:
                insertSize = getMeanInsertSize(inputBamFile)
            meanInsertSizes.append(insertSize)
            for seq in get_bam_seq(inputBamFile, args.min_chrom_size):
                seq_hash[seq] = True
        configFile = config(newInputFiles, meanInsertSizes, sampleTags, tempDir)

        # run pindel
        if args.procs == 1:
            cmd, pindelFileBase = pindel(inputFastaFile, configFile, args, tempDir)
            execute(cmd)
            runs = [pindelFileBase]
        else:
            cmds = []
            runs = []
            for seq_name in seq_hash.keys():
                cmd, pindelFileBase = pindel(
                    inputFastaFile, configFile, args, tempDir, seq_name)
                cmds.append(cmd)
                runs.append(pindelFileBase)
            pool = Pool(args.procs)
            try:
                # chunksize 1: dispatch one command per task.
                pool.map(execute, cmds, 1)
            finally:
                # Shut the workers down; they used to be left running.
                pool.close()
                pool.join()

        # Keep only the result files that were actually produced.
        pindel_files = []
        for pindelFileBase in runs:
            for suffix in output_suffixes:
                result_path = pindelFileBase + suffix
                if os.path.exists(result_path):
                    pindel_files.append(result_path)

        # run pindel2vcf
        pindel_all = os.path.join(workdir, "pindel_all")
        with open(pindel_all, "w") as handle:
            for result_path in pindel_files:
                with open(result_path) as ihandle:
                    for line in ihandle:
                        handle.write(line)

        if args.outputRaw is not None:
            shutil.copy(pindel_all, args.outputRaw)

        if args.outputVcfFile is not None:
            execute(pindel2vcf(inputFastaFile, args.inputFastaName,
                               pindel_all, args.outputVcfFile))

        if args.outputSomaticVcfFile is not None:
            pindel_somatic = os.path.join(workdir, "pindel_somatic")
            with open(pindel_somatic, "w") as handle:
                # Every "_D" file is processed before any "_SI" file.
                for suffix in ("_D", "_SI"):
                    for result_path in pindel_files:
                        if not result_path.endswith(suffix):
                            continue
                        with open(result_path) as ihandle:
                            for line in ihandle:
                                if "ChrID" in line:
                                    handle.write(line)

            filter_config = os.path.join(workdir, "somatic.indel.filter.config")
            with open(filter_config, "w") as handle:
                handle.write("indel.filter.input = %s\n" % pindel_somatic)
                handle.write("indel.filter.vaf = %s\n" % (args.somatic_vaf))
                handle.write("indel.filter.cov = %s\n" % (args.somatic_cov))
                handle.write("indel.filter.hom = %s\n" % (args.somatic_hom))
                handle.write("indel.filter.pindel2vcf = %s\n" % (which("pindel2vcf")))
                handle.write("indel.filter.reference = %s\n" % (inputFastaFile))
                handle.write("indel.filter.referencename = %s\n" % (args.inputFastaName))
                handle.write("indel.filter.referencedate = %s\n"
                             % (datetime.datetime.now().strftime("%Y%m%d")))
                handle.write("indel.filter.output = %s\n" % (args.outputSomaticVcfFile))

            execute("%s %s %s" % (which("perl"), args.somatic_filter_script,
                                  filter_config))
    finally:
        if not args.no_clean and os.path.exists(tempDir):
            shutil.rmtree(tempDir)