def main():
    # Parse & interpret command line flags.
    parser = argparse.ArgumentParser(description='Given paired-end .fastq/.fastq.gz files, map to a genome.',
                                     epilog="Written by Nick Semenkovich <*****@*****.**> for the Gordon Lab at "
                                            "Washington University in St. Louis: https://gordonlab.wustl.edu.",
                                     usage='%(prog)s [options]',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--input-path', '-i', dest="input_path", metavar='input_dir/', type=str,
                        help='Input path.', required=True)
    parser.add_argument('--output-path', '-o', dest="output_path", metavar='output_dir/', type=str,
                        help='Output path.', required=True)
    parser.add_argument('--genome', '-g', dest="genome", metavar='genome', type=str,
                        choices=['mm9', 'mm10', 'hg18', 'hg19'], help='Genome to use for bowtie2.', required=True)
    parser.add_argument('--no-parallel', '-np', dest="no_parallel", default=False, action='store_true',
                        help='Disable parallel job spawning.')


    parser.add_argument("--verbose", "-v", dest="verbose", default=False, action='store_true')

    parser.add_argument("--no-log", "-nl", dest="nolog", default=False, action='store_true',
                        help="Do not create a log file.")

    args = parser.parse_args()

    output_path = _script_helpers.setup_output_path(args.output_path)

    _logshim.startLogger(verbose=args.verbose, noFileLog=args.nolog, outPath=output_path)


    paired_end_mapping = find_paired_ends(args.input_path, verbose=args.verbose)

    run_bowtie2(paired_end_mapping, args.genome, output_path)
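# find_paired_ends and run_bowtie2 aren't shown above, so as a rough, hedged illustration of the
# mapping step only: a paired-end bowtie2 call might look like the sketch below. The helper name,
# index-prefix argument, and thread count are assumptions, not the script's actual implementation.
import os
import subprocess

def run_bowtie2_sketch(r1_fastq, r2_fastq, genome_index, output_path, threads=4):
    """Illustrative only: map one paired-end sample with bowtie2, writing SAM output."""
    out_sam = os.path.join(output_path, os.path.basename(r1_fastq) + '.sam')
    subprocess.check_call(['bowtie2',
                           '-p', str(threads),  # alignment threads
                           '-x', genome_index,  # prebuilt index prefix, e.g. for mm9/mm10/hg18/hg19
                           '-1', r1_fastq,      # end 1
                           '-2', r2_fastq,      # end 2
                           '-S', out_sam])      # SAM output (convert/sort to BAM downstream)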
# Example #2
def main():
    # Parse & interpret command line flags.
    parser = argparse.ArgumentParser(description='Pool multiple .bams together for the same sample. '
                                                 'Note: This is *only* necessary if you sequenced the same sample multiple times.',
                                     epilog="Written by Nick Semenkovich <*****@*****.**> for the Gordon Lab at "
                                            "Washington University in St. Louis: https://gordonlab.wustl.edu.",
                                     usage='%(prog)s [options]',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--input-files', '-i', dest="input_files", metavar='input_dir/', type=str,
                        help='Input files. (Not just a path!)', required=True, nargs='+')
    parser.add_argument('--output-path', '-o', dest="output_path", metavar='output_dir/', type=str,
                        help='Output path.', required=True)

    parser.add_argument("--verbose", "-v", dest="verbose", default=False, action='store_true')

    parser.add_argument("--no-log", "-nl", dest="nolog", default=False, action='store_true',
                        help="Do not create a log file.")

    args = parser.parse_args()

    output_path = _script_helpers.setup_output_path(args.output_path)

    _logshim.startLogger(verbose=args.verbose, noFileLog=args.nolog, outPath=output_path)

    merge_and_rmdup(args.input_files, output_path)
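# merge_and_rmdup isn't shown above; assuming it wraps samtools, the pooling step might look roughly
# like this sketch. The helper name and output file layout are illustrative assumptions.
import os
import subprocess

def merge_and_rmdup_sketch(input_files, output_path):
    """Illustrative only: pool several .bams, then drop PCR duplicates with samtools."""
    merged = os.path.join(output_path, 'pooled.bam')
    deduped = os.path.join(output_path, 'pooled.rmdup.bam')
    # samtools merge <out.bam> <in1.bam> <in2.bam> ...
    subprocess.check_call(['samtools', 'merge', merged] + list(input_files))
    # samtools rmdup <in.bam> <out.bam> (legacy duplicate removal; newer samtools favors markdup)
    subprocess.check_call(['samtools', 'rmdup', merged, deduped])

# Example #3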
def main():
    # Parse & interpret command line flags.
    parser = argparse.ArgumentParser(description='Run a number of standard peak calling algorithms for ATAC-seq data. '
                                                 'Expects de-duplicated, sorted, merged, ChrM-removed data.',
                                     epilog="Written by Nick Semenkovich <*****@*****.**> for the Gordon Lab at "
                                            "Washington University in St. Louis: https://gordonlab.wustl.edu.",
                                     usage='%(prog)s [options]',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--input-path', '-i', dest="input_path", metavar='input_dir/', type=str,
                        help='Input path (or a specific .bam file).', required=True)
    parser.add_argument('--output-path', '-o', dest="output_path", metavar='output_dir/', type=str,
                        help='Output path.', required=True)
    parser.add_argument('--genome', '-g', dest="genome", metavar='genome', type=str,
                        choices=['hs', 'mm', 'ce', 'dm'], help='Genome size to pass to MACS.', required=True)  # TODO: Consider using mm9/mm10, etc. for uniformity?
    parser.add_argument('--no-parallel', '-np', dest="no_parallel", default=False, action='store_true',
                        help='Disable parallel job spawning.')

    parser.add_argument('--skip-bam-indexing', dest="skip_bam_indexing", action='store_true',
                        help='Skip bam indexing (You must have generated indexes independently for peak callers to work!).', required=False)

    parser.add_argument('--skip-macs14', dest="skip_macs14", action='store_true',
                        help='Skip MACS v1.4 peak calling.', required=False)
    parser.add_argument('--skip-macs2', dest="skip_macs2", action='store_true',
                        help='Skip MACS v2 peak calling.', required=False)


    parser.add_argument("--verbose", "-v", dest="verbose", default=False, action='store_true')

    parser.add_argument("--no-log", "-nl", dest="nolog", default=False, action='store_true',
                        help="Do not create a log file.")

    args = parser.parse_args()

    output_path = _script_helpers.setup_output_path(args.output_path)

    log_main = _logshim.startLogger(verbose=args.verbose, noFileLog=args.nolog, outPath=output_path)

    input_files = _script_helpers.validate_input_files(args.input_path)


    # Generate BAM indexes
    if not args.skip_bam_indexing:
        generate_index(input_files, output_path, disable_parallel=args.no_parallel)
    else:
        log_main.warn("Skipping bam index .bai generation as requested.")
        log_main.warn("You must have generated these separately, otherwise peak callers will fail.")

    if not args.skip_macs14:
        # Start with old-school MACS 1.4
        run_macs14(input_files, output_path, args.genome, disable_parallel=args.no_parallel)

    if not args.skip_macs2:
        # Now new MACS 2
        # macs2 callpeak --nomodel -t $BAM -n $OUT --nolambda --keep-dup all --slocal 10000
        run_macs2(input_files, output_path, args.genome, disable_parallel=args.no_parallel)
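# run_macs14, run_macs2, and generate_index aren't shown above (generate_index presumably wraps
# 'samtools index <bam>'). As a hedged sketch only, the macs2 call spelled out in the comment above
# might be assembled like this; the helper name and output naming are illustrative assumptions.
import os
import subprocess

def run_macs2_sketch(bam_file, output_path, genome_size):
    """Illustrative only: run 'macs2 callpeak' with the options from the comment above."""
    name_prefix = os.path.join(output_path, os.path.basename(bam_file) + '.macs2')
    subprocess.check_call(['macs2', 'callpeak',
                           '--nomodel',        # ATAC-seq: skip fragment-model building
                           '-t', bam_file,     # treatment .bam
                           '-n', name_prefix,  # output name prefix
                           '-g', genome_size,  # genome size shortcut, e.g. 'mm'
                           '--nolambda',
                           '--keep-dup', 'all',
                           '--slocal', '10000'])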
# Example #4
def main():
    # Parse & interpret command line flags.
    parser = argparse.ArgumentParser(description='Intelligently merge fastq/fastq.gz files from an Illumina pipeline. '
                                     'Merges all L*_R*_* .fastq.gz files into one per sample.',
                                     epilog="Written by Nick Semenkovich <*****@*****.**> for the Gordon Lab at "
                                            "Washington University in St. Louis: http://gordonlab.wustl.edu.",
                                     usage='%(prog)s [options]',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--input-path', '-i', dest="input_path", metavar='input_dir/', type=str,
                        help='Input path.', required=True)
    parser.add_argument('--output-path', '-o', dest="output_path", metavar='output_dir/', type=str,
                        help='Output path.', required=True)
    parser.add_argument('--no-parallel', '-np', dest="no_parallel", default=False, action='store_true',
                        help='Disable parallel job spawning.')

    # parser.add_argument('--skip-stats', dest="skip_stats", action='store_true',
    #                    help='Skip statistics generation.', required=False)

    parser.add_argument("--verbose", "-v", dest="verbose", default=False, action='store_true')

    parser.add_argument("--no-log", "-nl", dest="nolog", default=False, action='store_true',
                        help="Do not create a log file.")

    args = parser.parse_args()

    output_path = _script_helpers.setup_output_path(args.output_path)

    _logshim.startLogger(verbose=args.verbose, noFileLog=args.nolog, outPath=output_path)

    # Our goal is to intelligently merge .fastq/.fastq.gz output from an Illumina run.
    # The Illumina standard pipeline splits by barcode, with semi-predictable filenames we can use, e.g.
    # IsoA-M1-CD4_S1_L001_I1_001.fastq.gz # index (discard), lane 1
    # IsoA-M1-CD4_S1_L001_R1_001.fastq.gz # end 1, lane 1
    # IsoA-M1-CD4_S1_L001_R2_001.fastq.gz # end 2, lane 1
    # IsoA-M1-CD4_S1_L002_I1_001.fastq.gz # index (discard), lane 2
    # IsoA-M1-CD4_S1_L002_R1_001.fastq.gz # end 1, lane 2
    # ...
    # (An illustrative filename-parsing sketch follows this function.)

    # TODO: Move some lower glob code up so we can test these functions
    merge_strategy = fastq_map_predict(args.input_path, verbose=args.verbose)

    fastq_merge(merge_strategy, args.output_path, disable_parallel=args.no_parallel)
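# fastq_map_predict isn't shown above; the filename convention described in the comments could be
# parsed roughly like this sketch. The regex and the grouping key are illustrative assumptions.
import collections
import glob
import os
import re

# Group Illumina-style fastq files by (sample, read end), discarding index (I1) reads,
# so that all lanes of R1/R2 can be merged per sample.
ILLUMINA_FASTQ_RE = re.compile(r'^(?P<sample>.+)_L(?P<lane>\d{3})_(?P<read>[RI]\d)_\d{3}\.fastq(?:\.gz)?$')

def sketch_fastq_grouping(input_path):
    groups = collections.defaultdict(list)
    for path in sorted(glob.glob(os.path.join(input_path, '*.fastq*'))):
        match = ILLUMINA_FASTQ_RE.match(os.path.basename(path))
        if not match or match.group('read').startswith('I'):
            continue  # skip non-matching files and index reads
        groups[(match.group('sample'), match.group('read'))].append(path)
    return groups  # e.g. {('IsoA-M1-CD4_S1', 'R1'): ['..._L001_R1_001.fastq.gz', '..._L002_R1_001.fastq.gz']}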
# Example #5
def main():
    # Parse & interpret command line flags.
    parser = argparse.ArgumentParser(description='Given input .bam files, fix matepairs, remove duplicates, blacklist bad '
                                                 'regions, and sort the output.',
                                     epilog="Written by Nick Semenkovich <*****@*****.**> for the Gordon Lab at "
                                            "Washington University in St. Louis: http://gordonlab.wustl.edu.",
                                     usage='%(prog)s [options]',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--input-path', '-i', dest="input_path", metavar='input_dir/', type=str,
                        help='Input path.', required=True)
    parser.add_argument('--output-path', '-o', dest="output_path", metavar='output_dir/', type=str,
                        help='Output path.', required=True)
    parser.add_argument('--genome', '-g', dest="genome", metavar='genome', type=str,
                        choices=['mm9', 'mm10', 'hg18', 'hg19'], help='Genome to use for blacklisting.', required=True)
    parser.add_argument('--no-parallel', '-np', dest="no_parallel", default=False, action='store_true',
                        help='Disable parallel job spawning.')

    parser.add_argument("--verbose", "-v", dest="verbose", default=False, action='store_true')

    parser.add_argument("--no-log", "-nl", dest="nolog", default=False, action='store_true',
                        help="Do not create a log file.")

    args = parser.parse_args()

    output_path = _script_helpers.setup_output_path(args.output_path)

    log_main = _logshim.startLogger(verbose=args.verbose, noFileLog=args.nolog, outPath=output_path)


    # Samtools sometimes needs an on-disk temp directory for sorting.
    # This only seems to matter when the data exceed the in-RAM limit set by the MAX_MEM parameter.
    # Sanity check that the temp directory has a reasonable amount of free space.
    temppath = tempfile.gettempdir()
    s = os.statvfs(temppath)
    if ((s.f_bavail * s.f_frsize) / (1024 * 1024)) < 10000:  # ~10 GB; an arbitrary safety margin
        log_main.warn('Temp directory %s doesn\'t have a lot of free space!' % (temppath))


    input_files = glob.glob(args.input_path + "/*.bam")  # Take ALL of the .bams.

    large_filter_fixmate_and_sort(input_files, args.genome, output_path, disable_parallel=args.no_parallel)
    rmdup_and_blacklist(input_files, args.genome, output_path, disable_parallel=args.no_parallel)
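# large_filter_fixmate_and_sort and rmdup_and_blacklist aren't shown above; assuming they wrap
# samtools (>= 1.3 '-o' syntax) and bedtools, one per-file pass might look roughly like this sketch.
# The helper name, blacklist .bed path, and intermediate file names are illustrative assumptions.
import os
import subprocess

def fixmate_rmdup_blacklist_sketch(bam_file, blacklist_bed, output_path):
    """Illustrative only: name-sort, fix matepairs, coordinate-sort, de-duplicate, then blacklist."""
    base = os.path.join(output_path, os.path.basename(bam_file))
    subprocess.check_call(['samtools', 'sort', '-n', '-o', base + '.nsorted.bam', bam_file])
    subprocess.check_call(['samtools', 'fixmate', base + '.nsorted.bam', base + '.fixmate.bam'])
    subprocess.check_call(['samtools', 'sort', '-o', base + '.sorted.bam', base + '.fixmate.bam'])
    subprocess.check_call(['samtools', 'rmdup', base + '.sorted.bam', base + '.rmdup.bam'])
    # Keep only reads that do NOT overlap blacklisted regions (-v), writing BAM back out.
    with open(base + '.filtered.bam', 'wb') as out_fh:
        subprocess.check_call(['bedtools', 'intersect', '-v', '-abam', base + '.rmdup.bam',
                               '-b', blacklist_bed], stdout=out_fh)

# Example #6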
def main():
    # Parse & interpret command line flags.
    parser = argparse.ArgumentParser(
        description=
        'Run a number of standard peak calling algorithms for ATAC-seq data. '
        'Expects de-duplicated, sorted, merged, ChrM-removed data.',
        epilog=
        "Written by Nick Semenkovich <*****@*****.**> for the Gordon Lab at "
        "Washington University in St. Louis: https://gordonlab.wustl.edu.",
        usage='%(prog)s [options]',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--input-path',
                        '-i',
                        dest="input_path",
                        metavar='input_dir/',
                        type=str,
                        help='Input path (or a specific .bam file).',
                        required=True)
    parser.add_argument('--output-path',
                        '-o',
                        dest="output_path",
                        metavar='output_dir/',
                        type=str,
                        help='Output path.',
                        required=True)
    parser.add_argument(
        '--genome',
        '-g',
        dest="genome",
        metavar='genome',
        type=str,
        choices=['hs', 'mm', 'ce', 'dm'],
        help='Genome size to pass to MACS.',
        required=True)  # TODO: Consider using mm9/mm10, etc. for uniformity?
    parser.add_argument('--no-parallel',
                        '-np',
                        dest="no_parallel",
                        default=False,
                        action='store_true',
                        help='Disable parallel job spawning.')

    parser.add_argument(
        '--skip-bam-indexing',
        dest="skip_bam_indexing",
        action='store_true',
        help=
        'Skip bam indexing (You must have generated indexes independently for peak callers to work!).',
        required=False)

    parser.add_argument('--skip-macs14',
                        dest="skip_macs14",
                        action='store_true',
                        help='Skip MACS v1.4 peak calling.',
                        required=False)
    parser.add_argument('--skip-macs2',
                        dest="skip_macs2",
                        action='store_true',
                        help='Skip MACS v2 peak calling.',
                        required=False)

    parser.add_argument("--verbose",
                        "-v",
                        dest="verbose",
                        default=False,
                        action='store_true')

    parser.add_argument("--no-log",
                        "-nl",
                        dest="nolog",
                        default=False,
                        action='store_true',
                        help="Do not create a log file.")

    args = parser.parse_args()

    output_path = _script_helpers.setup_output_path(args.output_path)

    log_main = _logshim.startLogger(verbose=args.verbose,
                                    noFileLog=args.nolog,
                                    outPath=output_path)

    input_files = _script_helpers.validate_input_files(args.input_path)

    # Generate BAM indexes
    if not args.skip_bam_indexing:
        generate_index(input_files,
                       output_path,
                       disable_parallel=args.no_parallel)
    else:
        log_main.warn("Skipping bam index .bai generation as requested.")
        log_main.warn(
            "You must have generated these separately, otherwise peak callers will fail."
        )

    if not args.skip_macs14:
        # Start with old-school MACS 1.4
        run_macs14(input_files,
                   output_path,
                   args.genome,
                   disable_parallel=args.no_parallel)

    if not args.skip_macs2:
        # Now new MACS 2
        # macs2 callpeak --nomodel -t $BAM -n $OUT --nolambda --keep-dup all --slocal 10000
        run_macs2(input_files,
                  output_path,
                  args.genome,
                  disable_parallel=args.no_parallel)
# Example #7
def main():
    # Parse & interpret command line flags.
    parser = argparse.ArgumentParser(description='Convert hdf5 tables from bamliquidator format to TSV count tables '
                                                 'for use in R and elsewhere. (Necessary as rhdf5 doesn\'t support our data structure.)',
                                     epilog="Written by Nick Semenkovich <*****@*****.**> for the Gordon Lab at "
                                            "Washington University in St. Louis: http://gordonlab.wustl.edu.",
                                     usage='%(prog)s [options]',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--input-path', '-i', dest="input_path", metavar='input_dir/', type=str,
                        help='Input path with .h5 files.',
                        required=True)

    parser.add_argument("--overwrite", dest="overwrite", default=False, action='store_true',
                        help='Regenerate and overwrite output .tsv files, even if they already exist.')

    parser.add_argument('--call-genes', dest="call_genes", default=False, action='store_true',
                        help='Instead of a .tsv (with positions as keys), make a .annotated.tsv with nearby genes.')

    parser.add_argument('--normalized', dest="normalized", default=False, action='store_true',
                        help='Store the normalized counts (counts/total reads) instead of the raw read counts.')

    parser.add_argument('--density', dest="density", default=False, action='store_true',
                        help='Store the width-normalized density (counts/total reads/region size) instead of the raw read counts.')

    parser.add_argument('--sizescaled', dest="sizescaled", default=False, action='store_true',
                        help='Store the size scaled counts (counts/feature size) instead of the raw read counts.')

    # Useful for EdgeR/DESeq2, etc., where every locus/position/gene-name must be unique.
    # (An illustrative flattening sketch follows this function.)
    parser.add_argument('--flatten', dest="flatten", default=False, action='store_true',
                        help='Aggregate identical locus IDs and sum their values. '
                             'Think carefully before you sum non-normalized values!')


    genome_choices = sorted(CONFIG['gffs'].keys())
    parser.add_argument('--genome', '-g', dest="genome", metavar='genome', type=str, default=None,
                        choices=genome_choices, help='Genome to use for annotation, one of: %s' % (', '.join(genome_choices)), required=False)


    parser.add_argument("--verbose", "-v", dest="verbose", default=False, action='store_true')

    parser.add_argument("--no-log", "-nl", dest="nolog", default=False, action='store_true',
                        help="Do not create a log file.")

    args = parser.parse_args()

    if args.call_genes and not args.genome:
        parser.error('--genome is required when requesting --call-genes')

    assert((args.density + args.normalized + args.sizescaled) <= 1)

    annotationBedTool = None
    if args.call_genes:
        genome_gff = CONFIG['gffs'][args.genome]
        assert(os.access(genome_gff, os.R_OK))
        annotationBedTool = pybedtools.BedTool(genome_gff)

    # Output path is input path. This also checks that the path is writeable.
    output_path = _script_helpers.setup_output_path(args.input_path)

    _logshim.startLogger(verbose=args.verbose, noFileLog=args.nolog, outPath=output_path)


    input_files = get_input_files(args.input_path)

    parse_h5files(input_files,
                  annotationBedTool=annotationBedTool,
                  overwrite=args.overwrite,
                  flatten=args.flatten,
                  density=args.density,
                  normalized=args.normalized,
                  sizescaled=args.sizescaled)
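# parse_h5files isn't shown above; the --flatten behavior (summing values that share a locus ID)
# might amount to something like this sketch. The function name and the (locus_id, value) row shape
# are illustrative assumptions.
import collections

def flatten_counts_sketch(rows):
    """Illustrative only: aggregate identical locus IDs by summing their values."""
    totals = collections.OrderedDict()
    for locus_id, value in rows:
        totals[locus_id] = totals.get(locus_id, 0) + value
    return totals

# e.g. flatten_counts_sketch([('Gene1', 10), ('Gene2', 3), ('Gene1', 5)])
#      -> OrderedDict([('Gene1', 15), ('Gene2', 3)])

# Example #8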
def main():
    # Parse & interpret command line flags.
    parser = argparse.ArgumentParser(
        description='Given paired-end .fastq/.fastq.gz files, map to a genome.',
        epilog=
        "Written by Nick Semenkovich <*****@*****.**> for the Gordon Lab at "
        "Washington University in St. Louis: https://gordonlab.wustl.edu.",
        usage='%(prog)s [options]',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--input-path',
                        '-i',
                        dest="input_path",
                        metavar='input_dir/',
                        type=str,
                        help='Input path.',
                        required=True)
    parser.add_argument('--output-path',
                        '-o',
                        dest="output_path",
                        metavar='output_dir/',
                        type=str,
                        help='Output path.',
                        required=True)
    parser.add_argument('--genome',
                        '-g',
                        dest="genome",
                        metavar='genome',
                        type=str,
                        choices=['mm9', 'mm10', 'hg18', 'hg19'],
                        help='Genome to use for bowtie2.',
                        required=True)
    parser.add_argument('--no-parallel',
                        '-np',
                        dest="no_parallel",
                        default=False,
                        action='store_true',
                        help='Disable parallel job spawning.')

    parser.add_argument("--verbose",
                        "-v",
                        dest="verbose",
                        default=False,
                        action='store_true')

    parser.add_argument("--no-log",
                        "-nl",
                        dest="nolog",
                        default=False,
                        action='store_true',
                        help="Do not create a log file.")

    args = parser.parse_args()

    output_path = _script_helpers.setup_output_path(args.output_path)

    _logshim.startLogger(verbose=args.verbose,
                         noFileLog=args.nolog,
                         outPath=output_path)

    paired_end_mapping = find_paired_ends(args.input_path,
                                          verbose=args.verbose)

    run_bowtie2(paired_end_mapping, args.genome, output_path)