def main():
    """Command-line entry point for mapping paired-end reads to a genome.

    Parses the command-line flags, prepares the output path and the logger,
    then hands the result of find_paired_ends() to run_bowtie2().
    """
    cli = argparse.ArgumentParser(
        description='Given paired-end .fastq/.fastq.gz files, map to a genome.',
        epilog="Written by Nick Semenkovich <*****@*****.**> for the Gordon Lab at "
               "Washington University in St. Louis: https://gordonlab.wustl.edu.",
        usage='%(prog)s [options]',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Required inputs.
    cli.add_argument('--input-path', '-i', dest="input_path", metavar='input_dir/',
                     type=str, required=True, help='Input path.')
    cli.add_argument('--output-path', '-o', dest="output_path", metavar='output_dir/',
                     type=str, required=True, help='Output path.')
    cli.add_argument('--genome', '-g', dest="genome", metavar='genome', type=str,
                     required=True, choices=['mm9', 'mm10', 'hg18', 'hg19'],
                     help='Genome to use for bowtie2')

    # Optional switches.
    # NOTE(review): no_parallel is parsed here but never forwarded to
    # run_bowtie2() below -- confirm whether that is intentional.
    cli.add_argument('--no-parallel', '-np', dest="no_parallel", action='store_true',
                     default=False, help='Disable parallel job spawning.')
    cli.add_argument("--verbose", "-v", dest="verbose", action='store_true', default=False)
    cli.add_argument("--no-log", "-nl", dest="nolog", action='store_true', default=False,
                     help="Do not create a log file.")

    opts = cli.parse_args()

    out_dir = _script_helpers.setup_output_path(opts.output_path)
    _logshim.startLogger(verbose=opts.verbose, noFileLog=opts.nolog, outPath=out_dir)

    read_pairs = find_paired_ends(opts.input_path, verbose=opts.verbose)
    run_bowtie2(read_pairs, opts.genome, out_dir)
def main():
    """Command-line entry point for pooling several .bam files of one sample.

    Parses the command-line flags, prepares the output path and the logger,
    then passes the list of input files to merge_and_rmdup().
    """
    # Parse & interpret command line flags.
    parser = argparse.ArgumentParser(
        # The first literal ends with a space so the two sentences do not run
        # together ("sample.Note:") in the rendered help text.
        description='Pool multiple .bams together for the same sample. '
                    'Note: This is *only* necessary if you sequenced the same sample multiple times.',
        epilog="Written by Nick Semenkovich <*****@*****.**> for the Gordon Lab at "
               "Washington University in St. Louis: https://gordonlab.wustl.edu.",
        usage='%(prog)s [options]',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # One or more files (nargs='+'), not a directory.
    parser.add_argument('--input-files', '-i', dest="input_files", metavar='input_dir/', type=str,
                        help='Input files. (Not just a path!)', required=True, nargs='+')
    parser.add_argument('--output-path', '-o', dest="output_path", metavar='output_dir/', type=str,
                        help='Output path.', required=True)

    parser.add_argument("--verbose", "-v", dest="verbose", default=False, action='store_true')
    parser.add_argument("--no-log", "-nl", dest="nolog", default=False, action='store_true',
                        help="Do not create a log file.")

    args = parser.parse_args()

    output_path = _script_helpers.setup_output_path(args.output_path)

    _logshim.startLogger(verbose=args.verbose, noFileLog=args.nolog, outPath=output_path)

    merge_and_rmdup(args.input_files, output_path)
def main():
    """Command-line entry point for running the MACS peak callers on .bam input.

    Parses the command-line flags, prepares the output path and the logger,
    validates the input files, then (unless skipped via flags) calls
    generate_index(), run_macs14() and run_macs2() in that order.
    """
    # Parse & interpret command line flags.
    parser = argparse.ArgumentParser(
        description='Run a number of standard peak calling algorithms for ATAC-seq data. '
                    'Expects de-duplicated, sorted, merged, ChrM-removed data.',
        epilog="Written by Nick Semenkovich <*****@*****.**> for the Gordon Lab at "
               "Washington University in St. Louis: https://gordonlab.wustl.edu.",
        usage='%(prog)s [options]',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--input-path', '-i', dest="input_path", metavar='input_dir/', type=str,
                        help='Input path (or a specific .bam file).', required=True)
    parser.add_argument('--output-path', '-o', dest="output_path", metavar='output_dir/', type=str,
                        help='Output path.', required=True)
    # MACS documents the genome-size shortcuts hs, mm, ce and dm. 'hs' (human)
    # was missing from the accepted values; 'ms' is not a documented MACS
    # shortcut and is kept only so existing invocations keep parsing.
    # NOTE(review): confirm how run_macs14/run_macs2 treat 'ms'.
    parser.add_argument('--genome', '-g', dest="genome", metavar='genome', type=str,
                        choices=['hs', 'ms', 'mm', 'ce', 'dm'],
                        help='Genome size to pass to MACS.', required=True)
    # TODO: Consider using mm9/mm10, etc. for uniformity?
    parser.add_argument('--no-parallel', '-np', dest="no_parallel", default=False, action='store_true',
                        help='Disable parallel job spawning.')

    parser.add_argument('--skip-bam-indexing', dest="skip_bam_indexing", action='store_true',
                        help='Skip bam indexing (You must have generated indexes independently for peak callers to work!).',
                        required=False)

    parser.add_argument('--skip-macs14', dest="skip_macs14", action='store_true',
                        help='Skip MACS v1.4 peak calling.', required=False)
    parser.add_argument('--skip-macs2', dest="skip_macs2", action='store_true',
                        help='Skip MACS v2 peak calling.', required=False)

    parser.add_argument("--verbose", "-v", dest="verbose", default=False, action='store_true')
    parser.add_argument("--no-log", "-nl", dest="nolog", default=False, action='store_true',
                        help="Do not create a log file.")

    args = parser.parse_args()

    output_path = _script_helpers.setup_output_path(args.output_path)

    log_main = _logshim.startLogger(verbose=args.verbose, noFileLog=args.nolog, outPath=output_path)

    input_files = _script_helpers.validate_input_files(args.input_path)

    # Generate BAM indexes
    if not args.skip_bam_indexing:
        generate_index(input_files, output_path, disable_parallel=args.no_parallel)
    else:
        log_main.warn("Skipping bam index .bai generation as requested.")
        log_main.warn("You must have generated these separately, otherwise peak callers will fail.")

    if not args.skip_macs14:
        # Start with old-school MACS 1.4
        run_macs14(input_files, output_path, args.genome, disable_parallel=args.no_parallel)

    if not args.skip_macs2:
        # Now new MACS 2
        # macs2 callpeak --nomodel -t $BAM -n $OUT --nolambda --keep-dup all --slocal 10000
        run_macs2(input_files, output_path, args.genome, disable_parallel=args.no_parallel)
def main():
    """Command-line entry point for merging per-lane Illumina fastq files.

    Parses the command-line flags, prepares the output path and the logger,
    then passes the result of fastq_map_predict() to fastq_merge().
    """
    # Parse & interpret command line flags.
    parser = argparse.ArgumentParser(
        # The first literal ends with a space so the two sentences do not run
        # together ("pipeline.Merges") in the rendered help text.
        description='Intelligently merge fastq/fastq.gz files from an Illumina pipeline. '
                    'Merges all L*_R*_* .fastq.gz files into one per sample.',
        epilog="Written by Nick Semenkovich <*****@*****.**> for the Gordon Lab at "
               "Washington University in St. Louis: http://gordonlab.wustl.edu.",
        usage='%(prog)s [options]',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--input-path', '-i', dest="input_path", metavar='input_dir/', type=str,
                        help='Input path.', required=True)
    parser.add_argument('--output-path', '-o', dest="output_path", metavar='output_dir/', type=str,
                        help='Output path.', required=True)
    parser.add_argument('--no-parallel', '-np', dest="no_parallel", default=False, action='store_true',
                        help='Disable parallel job spawning.')

    # parser.add_argument('--skip-stats', dest="skip_stats", action='store_true',
    #                    help='Skip statistics generation.', required=False)

    parser.add_argument("--verbose", "-v", dest="verbose", default=False, action='store_true')
    parser.add_argument("--no-log", "-nl", dest="nolog", default=False, action='store_true',
                        help="Do not create a log file.")

    args = parser.parse_args()

    output_path = _script_helpers.setup_output_path(args.output_path)

    _logshim.startLogger(verbose=args.verbose, noFileLog=args.nolog, outPath=output_path)

    # Our goal is to intelligently merge .fastq/.fastq.gz output from an Illumina run
    # The Illumina standard pipeline splits by barcode w/ semi-predictable filenames we can use, e.g.
    # IsoA-M1-CD4_S1_L001_I1_001.fastq.gz # index (discard)
    # IsoA-M1-CD4_S1_L001_R1_001.fastq.gz # end 1, lane 1
    # IsoA-M1-CD4_S1_L001_R2_001.fastq.gz # end 2, lane 1
    # IsoA-M1-CD4_S1_L002_I1_001.fastq.gz # index (discard), lane 2
    # IsoA-M1-CD4_S1_L002_R1_001.fastq.gz # end 1, lane 2
    # ...

    # TODO: Move some lower glob code up so we can test these functions
    merge_strategy = fastq_map_predict(args.input_path, verbose=args.verbose)

    # NOTE(review): this passes the raw args.output_path rather than the value
    # returned by setup_output_path() -- confirm that is intended.
    fastq_merge(merge_strategy, args.output_path, disable_parallel=args.no_parallel)
def main():
    """Command-line entry point for cleaning up mapped .bam files.

    Parses the command-line flags, prepares the output path and the logger,
    warns when the temp directory is low on free space, globs every .bam file
    directly under the input path, then calls large_filter_fixmate_and_sort()
    followed by rmdup_and_blacklist() on that list.
    """
    # Parse & interpret command line flags.
    parser = argparse.ArgumentParser(
        # The first literal ends with a space so the words do not run together
        # ("badregions") in the rendered help text.
        description='Given input .bam files, fix matepairs, remove duplicates, blacklist bad '
                    'regions, and sort the output.',
        epilog="Written by Nick Semenkovich <*****@*****.**> for the Gordon Lab at "
               "Washington University in St. Louis: http://gordonlab.wustl.edu.",
        usage='%(prog)s [options]',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--input-path', '-i', dest="input_path", metavar='input_dir/', type=str,
                        help='Input path.', required=True)
    parser.add_argument('--output-path', '-o', dest="output_path", metavar='output_dir/', type=str,
                        help='Output path.', required=True)
    parser.add_argument('--genome', '-g', dest="genome", metavar='genome', type=str,
                        choices=['mm9', 'mm10', 'hg18', 'hg19'],
                        help='Genome to use for blacklisting.', required=True)
    parser.add_argument('--no-parallel', '-np', dest="no_parallel", default=False, action='store_true',
                        help='Disable parallel job spawning.')

    parser.add_argument("--verbose", "-v", dest="verbose", default=False, action='store_true')
    parser.add_argument("--no-log", "-nl", dest="nolog", default=False, action='store_true',
                        help="Do not create a log file.")

    args = parser.parse_args()

    output_path = _script_helpers.setup_output_path(args.output_path)

    log_main = _logshim.startLogger(verbose=args.verbose, noFileLog=args.nolog, outPath=output_path)

    # Samtools requires a temp directory for sorting /sometimes/.
    # This seems to only matter if it exceeds the in-ram limits set by the MAX_MEM parameter.
    # Sanity check the /tmp directory has a bit of space.
    temppath = tempfile.gettempdir()
    s = os.statvfs(temppath)
    # Free space available to unprivileged users, converted from bytes to MiB.
    if ((s.f_bavail * s.f_frsize) / (1024 * 1024)) < 10000:  # ~10 G, not for any good reason though
        log_main.warn('Temp directory %s doesn\'t have a lot of free space!' % (temppath))

    input_files = glob.glob(args.input_path + "/*.bam")  # Take ALL of the .bams.

    large_filter_fixmate_and_sort(input_files, args.genome, output_path, disable_parallel=args.no_parallel)
    rmdup_and_blacklist(input_files, args.genome, output_path, disable_parallel=args.no_parallel)
def main():
    """Command-line entry point for running the MACS peak callers on .bam input.

    Parses the command-line flags, prepares the output path and the logger,
    validates the input files, then (unless skipped via flags) calls
    generate_index(), run_macs14() and run_macs2() in that order.
    """
    # Parse & interpret command line flags.
    parser = argparse.ArgumentParser(
        description=
        'Run a number of standard peak calling algorithms for ATAC-seq data. '
        'Expects de-duplicated, sorted, merged, ChrM-removed data.',
        epilog=
        "Written by Nick Semenkovich <*****@*****.**> for the Gordon Lab at "
        "Washington University in St. Louis: https://gordonlab.wustl.edu.",
        usage='%(prog)s [options]',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--input-path',
                        '-i',
                        dest="input_path",
                        metavar='input_dir/',
                        type=str,
                        help='Input path (or a specific .bam file).',
                        required=True)
    parser.add_argument('--output-path',
                        '-o',
                        dest="output_path",
                        metavar='output_dir/',
                        type=str,
                        help='Output path.',
                        required=True)
    # MACS documents the genome-size shortcuts hs, mm, ce and dm. 'hs' (human)
    # was missing from the accepted values; 'ms' is not a documented MACS
    # shortcut and is kept only so existing invocations keep parsing.
    # NOTE(review): confirm how run_macs14/run_macs2 treat 'ms'.
    parser.add_argument(
        '--genome',
        '-g',
        dest="genome",
        metavar='genome',
        type=str,
        choices=['hs', 'ms', 'mm', 'ce', 'dm'],
        help='Genome size to pass to MACS.',
        required=True)
    # TODO: Consider using mm9/mm10, etc. for uniformity?
    parser.add_argument('--no-parallel',
                        '-np',
                        dest="no_parallel",
                        default=False,
                        action='store_true',
                        help='Disable parallel job spawning.')

    parser.add_argument(
        '--skip-bam-indexing',
        dest="skip_bam_indexing",
        action='store_true',
        help=
        'Skip bam indexing (You must have generated indexes independently for peak callers to work!).',
        required=False)

    parser.add_argument('--skip-macs14',
                        dest="skip_macs14",
                        action='store_true',
                        help='Skip MACS v1.4 peak calling.',
                        required=False)
    parser.add_argument('--skip-macs2',
                        dest="skip_macs2",
                        action='store_true',
                        help='Skip MACS v2 peak calling.',
                        required=False)

    parser.add_argument("--verbose",
                        "-v",
                        dest="verbose",
                        default=False,
                        action='store_true')
    parser.add_argument("--no-log",
                        "-nl",
                        dest="nolog",
                        default=False,
                        action='store_true',
                        help="Do not create a log file.")

    args = parser.parse_args()

    output_path = _script_helpers.setup_output_path(args.output_path)

    log_main = _logshim.startLogger(verbose=args.verbose,
                                    noFileLog=args.nolog,
                                    outPath=output_path)

    input_files = _script_helpers.validate_input_files(args.input_path)

    # Generate BAM indexes
    if not args.skip_bam_indexing:
        generate_index(input_files,
                       output_path,
                       disable_parallel=args.no_parallel)
    else:
        log_main.warn("Skipping bam index .bai generation as requested.")
        log_main.warn(
            "You must have generated these separately, otherwise peak callers will fail."
        )

    if not args.skip_macs14:
        # Start with old-school MACS 1.4
        run_macs14(input_files,
                   output_path,
                   args.genome,
                   disable_parallel=args.no_parallel)

    if not args.skip_macs2:
        # Now new MACS 2
        # macs2 callpeak --nomodel -t $BAM -n $OUT --nolambda --keep-dup all --slocal 10000
        run_macs2(input_files,
                  output_path,
                  args.genome,
                  disable_parallel=args.no_parallel)
def main():
    """Command-line entry point for converting .h5 count tables to text tables.

    Parses the command-line flags, validates flag combinations, optionally
    builds a pybedtools.BedTool from the configured GFF for the chosen genome,
    prepares the logger (the output path is the input path), then passes the
    result of get_input_files() to parse_h5files().

    Invalid flag combinations and an unreadable annotation file are reported
    through parser.error(), which prints a usage message and exits.
    """
    # Parse & interpret command line flags.
    parser = argparse.ArgumentParser(
        description='Convert hdf5 tables from bamliquidator format to CSV counts tables '
                    'for use in R and elsewhere. (Necessary as rhdf5 doesn\'t support our data structure.)',
        epilog="Written by Nick Semenkovich <*****@*****.**> for the Gordon Lab at "
               "Washington University in St. Louis: http://gordonlab.wustl.edu.",
        usage='%(prog)s [options]',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--input-path', '-i', dest="input_path", metavar='input_dir/', type=str,
                        help='Input path with .h5 files.',
                        required=True)

    parser.add_argument("--overwrite", dest="overwrite", default=False, action='store_true',
                        help='Regenerate and overwrite output .tsv files, even if they already exist.')

    parser.add_argument('--call-genes', dest="call_genes", default=False, action='store_true',
                        help='Instead of a .tsv (with positions as keys), make a .annotated.tsv with nearby genes.')

    parser.add_argument('--normalized', dest="normalized", default=False, action='store_true',
                        help='Store the normalized counts (counts/total reads) instead of the raw read counts.')

    parser.add_argument('--density', dest="density", default=False, action='store_true',
                        help='Store the width-normalized density (counts/total reads/region size) instead of the raw read counts.')

    parser.add_argument('--sizescaled', dest="sizescaled", default=False, action='store_true',
                        help='Store the size scaled counts (counts/feature size) instead of the raw read counts.')

    # Useful for EdgeR/DESeq2, etc. where every locus/position/gene-name must be unique.
    parser.add_argument('--flatten', dest="flatten", default=False, action='store_true',
                        help='Aggregate identical locus IDs and sum their values. '
                             'Think carefully before you sum non-normalized values!')

    genome_choices = sorted(CONFIG['gffs'].keys())
    parser.add_argument('--genome', '-g', dest="genome", metavar='genome', type=str, default=None,
                        choices=genome_choices, help='Genome to use for annotation, one of: %s' % (', '.join(genome_choices)),
                        required=False)

    parser.add_argument("--verbose", "-v", dest="verbose", default=False, action='store_true')
    parser.add_argument("--no-log", "-nl", dest="nolog", default=False, action='store_true',
                        help="Do not create a log file.")

    args = parser.parse_args()

    if args.call_genes and not args.genome:
        parser.error('--genome is required when requesting --call-genes')

    # The three store_true flags are bools, so their sum counts how many were
    # given. Reported via parser.error() rather than assert so the check
    # survives `python -O`.
    if (args.density + args.normalized + args.sizescaled) > 1:
        parser.error('Only one of --density, --normalized and --sizescaled may be given.')

    annotationBedTool = None
    if args.call_genes:
        genome_gff = CONFIG['gffs'][args.genome]
        if not os.access(genome_gff, os.R_OK):
            parser.error('Cannot read the annotation file for genome %s: %s' % (args.genome, genome_gff))
        annotationBedTool = pybedtools.BedTool(genome_gff)

    # Output path is input path. This also checks that the path is writeable.
    output_path = _script_helpers.setup_output_path(args.input_path)

    _logshim.startLogger(verbose=args.verbose, noFileLog=args.nolog, outPath=output_path)

    input_files = get_input_files(args.input_path)

    parse_h5files(input_files,
                  annotationBedTool=annotationBedTool,
                  overwrite=args.overwrite,
                  flatten=args.flatten,
                  density=args.density,
                  normalized=args.normalized,
                  sizescaled=args.sizescaled)
def main():
    """Command-line entry point for mapping paired-end reads to a genome.

    Parses the command-line flags, prepares the output path and the logger,
    then hands the result of find_paired_ends() to run_bowtie2().
    """
    cli = argparse.ArgumentParser(
        description='Given paired-end .fastq/.fastq.gz files, map to a genome.',
        epilog="Written by Nick Semenkovich <*****@*****.**> for the Gordon Lab at "
               "Washington University in St. Louis: https://gordonlab.wustl.edu.",
        usage='%(prog)s [options]',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Required inputs.
    cli.add_argument('--input-path', '-i', dest="input_path", metavar='input_dir/',
                     type=str, required=True, help='Input path.')
    cli.add_argument('--output-path', '-o', dest="output_path", metavar='output_dir/',
                     type=str, required=True, help='Output path.')
    cli.add_argument('--genome', '-g', dest="genome", metavar='genome', type=str,
                     required=True, choices=['mm9', 'mm10', 'hg18', 'hg19'],
                     help='Genome to use for bowtie2')

    # Optional switches.
    # NOTE(review): no_parallel is parsed here but never forwarded to
    # run_bowtie2() below -- confirm whether that is intentional.
    cli.add_argument('--no-parallel', '-np', dest="no_parallel", action='store_true',
                     default=False, help='Disable parallel job spawning.')
    cli.add_argument("--verbose", "-v", dest="verbose", action='store_true', default=False)
    cli.add_argument("--no-log", "-nl", dest="nolog", action='store_true', default=False,
                     help="Do not create a log file.")

    opts = cli.parse_args()

    out_dir = _script_helpers.setup_output_path(opts.output_path)
    _logshim.startLogger(verbose=opts.verbose, noFileLog=opts.nolog, outPath=out_dir)

    read_pairs = find_paired_ends(opts.input_path, verbose=opts.verbose)
    run_bowtie2(read_pairs, opts.genome, out_dir)