Beispiel #1
0
def main():
    """Write fixed-size windows for each requested chromosome.

    The size of chromosome ``<c>`` is read as ``int(genome["chr<c>"][1])``
    from the tab-delimited ``genome_file``. One line of the form
    ``chrom<TAB>start<TAB>end`` is written to ``output_file`` per window; the
    end of the last window on a chromosome is clipped to the chromosome size.

    Raises:
        ValueError: if the window size selected for a chromosome is not
            positive.
    """
    def add_args(parser):
        parser.add_argument("-c",
                            "--chromosomes",
                            action="extend_overwrite",
                            type="str_list",
                            default=[str(c)
                                     for c in xrange(1, 20)] + ["X", "M"])
        parser.add_argument("-m", "--mt_window_size", type=int, default=100)
        parser.add_argument("-w", "--window_size", type=int, default=1000000)
        parser.add_argument(
            "--start",
            type=int,
            default=3000000,
            help="Start of first window for autosomal and X chromosomes.")
        parser.add_argument("genome_file", type="readable_file")
        parser.add_argument("output_file", type="writeable_file")

    args = parse(add_args)

    genome = csv_to_dict(args.genome_file, delim="\t")

    with open(args.output_file, "w") as o:
        for chrm in args.chromosomes:
            chrm_name = "chr{0}".format(chrm)
            chrm_size = int(genome[chrm_name][1])

            if chrm == "M":
                # --start is documented as applying to autosomes and X only.
                # Using it for chrM meant that, with the default start, the
                # range below was empty and --mt_window_size had no effect.
                # NOTE(review): 0 is assumed as the chrM origin to match the
                # start/end arithmetic below -- confirm.
                window_size = args.mt_window_size
                first = 0
            else:
                window_size = args.window_size
                first = args.start

            if window_size <= 0:
                raise ValueError(
                    "Window size must be positive for {0}, got {1}".format(
                        chrm_name, window_size))

            # With a positive step every generated position is already below
            # chrm_size, so no per-position bounds check is needed.
            for pos in xrange(first, chrm_size, window_size):
                o.write("{0}\t{1}\t{2}\n".format(
                    chrm_name, pos, min(pos + window_size, chrm_size)))
Beispiel #2
0
def main():
    """Compile a PDF from a project directory and optionally open it.

    The project directory must contain ``latex``, ``sections`` and ``bib``
    sub-directories; each is checked (in that order) before compilation.
    """
    def parse_args(parser):
        parser.add_argument("-c", "--cleanup", action="store_true",
                            default=False)
        parser.add_argument("-w", "--working_dir", type="writeable_dir",
                            default=None)
        parser.add_argument("--pandoc_exe", default="pandoc")
        parser.add_argument("--pdflatex_exe", default="pdflatex")
        parser.add_argument("--open_pdf", action="store_true", default=False)
        parser.add_argument("dir", type="readable_dir")
        parser.add_argument("outfile", type="writeable_file")

    args = parse(parse_args)

    # Resolve and verify the three required sub-directories one at a time so
    # the first missing one is reported.
    required = []
    for sub_name in ("latex", "sections", "bib"):
        sub_path = os.path.join(args.dir, sub_name)
        assert os.path.exists(sub_path), "Missing directory: {0}".format(sub_path)
        required.append(sub_path)
    latex_dir, section_dir, bib_dir = required

    pdf = compile_pdf(latex_dir, section_dir, bib_dir, args.outfile,
                      args.working_dir, args.cleanup, args.pandoc_exe,
                      args.pdflatex_exe)

    if not args.open_pdf:
        return
    open_cmd = "open {0} &".format(pdf)
    bash(open_cmd, catch=False)
Beispiel #3
0
def main(argv=None):
    """Split the lines of a file among handlers selected by pattern.

    Handlers come either from the ``handlers`` attribute of the module loaded
    from ``--pattern_file`` or from one ``DefaultFileHandler`` per
    ``--pattern`` mapping. Lines that match no pattern are written to
    ``--unmatched_file`` when one is given.

    Args:
        argv: argument list forwarded to ``parse``; ``None`` leaves the
            choice of arguments to ``parse``.

    Raises:
        Exception: if the pattern file does not define ``handlers``.
    """
    def add_args(parser):
        parser.add_argument('-a',
                            '--append',
                            action='store_true',
                            default=False)
        parser.add_argument('-h', '--header', default=0,
            help="Either an integer that is the number of header lines in the input file "\
                 "(use negative numbers to suppress copying of headers to output files) or "\
                 "a string that should be used as the header.")
        parser.add_argument('-P',
                            '--pattern_file',
                            type='readable_file',
                            metavar="FILE")
        parser.add_argument('-p',
                            '--pattern',
                            type='mapping',
                            action='append',
                            metavar="PATTERN=OUTPUT")
        parser.add_argument(
            '-u',
            '--unmatched_file',
            type='writeable_file',
            metavar="FILE",
            help="File in which to write lines that do not match any pattern.")
        parser.add_argument('file', type='readable_file', metavar="FILE")

    ns = parse(add_args, args=argv)

    # The untouched default (int 0) still maps to header=None, as before.
    header = None
    if isinstance(ns.header, str):
        try:
            # int() also accepts the negative counts promised by the help
            # text; str.isdigit() rejected them and they were then treated as
            # literal header text.
            header = int(ns.header)
        except ValueError:
            header = [ns.header]

    if ns.pattern_file:
        mod = load_module_from_file(ns.pattern_file)
        if not hasattr(mod, 'handlers'):
            raise Exception("Invalid pattern file: %s" % ns.pattern_file)
        handlers = mod.handlers
    else:
        # Build a real list: a generator expression is always truthy, which
        # made the "no handlers" branch unreachable, and iterating a missing
        # --pattern (None) raised TypeError.
        handlers = [DefaultFileHandler(m[0], m[1], ns.append)
                    for m in (ns.pattern or ())]

    if not handlers:
        # Parenthesised single-argument form prints identically on Python 2
        # and Python 3.
        print("No handlers specified; doing nothing.")
        return

    unmatched = None
    if ns.unmatched_file:
        unmatched = open(ns.unmatched_file, 'w')
    try:
        split(ns.file, handlers, header, unmatched)
    finally:
        if unmatched is not None:
            unmatched.close()
Beispiel #4
0
def main(argv=None):
    """Run ``convert`` from ``infile`` to ``outfile``.

    Args:
        argv: argument list forwarded to ``parse``.
    """
    def register(parser):
        parser.add_argument("-d", "--delim", default=",")
        parser.add_argument("-r", "--replace", default=":-")
        parser.add_argument("infile", type='readable_file')
        parser.add_argument("outfile", type='writeable_file')

    opts = parse(register, args=argv)

    # Open the input first, then the output, and hand both streams to
    # convert() together with the delimiter and replacement options.
    with open(opts.infile, 'rU') as source:
        with open(opts.outfile, 'w') as sink:
            convert(source, sink, opts.delim, opts.replace)
Beispiel #5
0
def main(argv=None):
    """Run ``convert`` from ``infile`` to ``outfile``.

    Args:
        argv: argument list forwarded to ``parse``.
    """
    def register(parser):
        parser.add_argument("-d", "--delim", default=",")
        parser.add_argument("-r", "--replace", default=":-")
        parser.add_argument("infile", type='readable_file')
        parser.add_argument("outfile", type='writeable_file')

    opts = parse(register, args=argv)

    # Open the input first, then the output, and hand both streams to
    # convert() together with the delimiter and replacement options.
    with open(opts.infile, 'rU') as source:
        with open(opts.outfile, 'w') as sink:
            convert(source, sink, opts.delim, opts.replace)
Beispiel #6
0
def main():
    """Grep each FASTA record's sequence in ``seqs`` into ``<id>.txt``.

    For every record in the ``fasta`` file a shell ``grep`` for the record's
    sequence is run against the ``seqs`` file, with its output redirected to
    ``<outdir>/<record id>.txt``.
    """
    def add_opts(parser):
        parser.add_argument("fasta", type="readable_file")
        parser.add_argument("seqs", type="readable_file")
        parser.add_argument("outdir", type="writeable_dir")

    args = parse(add_opts)

    grep_template = 'grep "{0}" "{1}" > "{2}"'

    with open(args.fasta, "rU") as handle:
        for rec in SeqIO.parse(handle, "fasta"):
            target = os.path.join(args.outdir, "{0}.txt".format(rec.id))
            bash(grep_template.format(rec.seq, args.seqs, target))
Beispiel #7
0
def main(argv=None):
    """Read a sequence file, drop its newlines and pass it to ``cg_pct``.

    Args:
        argv: argument list forwarded to ``parse``.
    """
    def register(parser):
        parser.add_argument("-d", "--decimal", action="store_true",
                            default=False)
        parser.add_argument("-w", "--window_size", default=100, type=int)
        parser.add_argument("infile", type='readable_file')
        parser.add_argument("outfile", type='writeable_file')

    opts = parse(register, args=argv)

    with open(opts.infile, 'rU') as handle:
        raw_text = handle.read()

    # Joining the newline-separated pieces removes every "\n" from the text.
    sequence = "".join(raw_text.split("\n"))

    cg_pct(sequence, opts.outfile, opts.window_size, opts.decimal)
Beispiel #8
0
def main():
    """Split a multi-record FASTA file into one file per record.

    Each line starting with ``>`` opens ``<outdir>/<header text>.fasta``;
    that line and all following lines up to the next header are written to
    it.

    Raises:
        ValueError: if non-blank content appears before the first header.
    """
    def add_args(parser):
        parser.add_argument('infile', type='readable_file', metavar='FILE')
        parser.add_argument('outdir', type='writeable_dir', default='.', nargs='?')

    ns = parse(add_args)

    outfile = None
    try:
        for line in fileinput.input(ns.infile):
            if line.startswith(">"):
                if outfile is not None:
                    outfile.close()
                # Strip only the line terminator, so a final header without a
                # trailing newline keeps its last character.
                record_name = line[1:].rstrip("\r\n")
                fname = os.path.join(ns.outdir,
                                     "{0}.fasta".format(record_name))
                outfile = open(fname, 'w')
            elif outfile is None:
                # Nothing is open yet: skip leading blank lines, but report
                # real content instead of failing on None.write().
                if line.strip():
                    raise ValueError(
                        "{0}: content found before the first '>' header".format(
                            ns.infile))
                continue
            outfile.write(line)
    finally:
        # The last record's file was previously never closed.
        if outfile is not None:
            outfile.close()
        fileinput.close()
Beispiel #9
0
def main(argv=None):
    """Read a sequence file, drop its newlines and pass it to ``cg_pct``.

    Args:
        argv: argument list forwarded to ``parse``.
    """
    def register(parser):
        parser.add_argument("-d", "--decimal", action="store_true",
                            default=False)
        parser.add_argument("-w", "--window_size", default=100, type=int)
        parser.add_argument("infile", type='readable_file')
        parser.add_argument("outfile", type='writeable_file')

    opts = parse(register, args=argv)

    with open(opts.infile, 'rU') as handle:
        raw_text = handle.read()

    # Joining the newline-separated pieces removes every "\n" from the text.
    sequence = "".join(raw_text.split("\n"))

    cg_pct(sequence, opts.outfile, opts.window_size, opts.decimal)
Beispiel #10
0
def main():
    """Apply a table of regex substitutions to a file.

    Each row of ``termfile`` supplies a (pattern, replacement) pair; the
    pairs are applied in order with ``re.sub``. The result is written to
    ``outfile``, or back over ``infile`` when no ``outfile`` is given.
    """
    def register(parser):
        parser.add_argument('-d', '--delim', default="\t")
        parser.add_argument('infile', type='readable_file')
        parser.add_argument('termfile', type='readable_file')
        parser.add_argument('outfile', type='writeable_file', nargs='?',
                            default=None)

    opts = parse(register)

    with open(opts.infile, 'rU') as handle:
        text = handle.read()

    for pattern, replacement in csv_to_table(opts.termfile, opts.delim):
        text = re.sub(pattern, replacement, text)

    if opts.outfile:
        destination = opts.outfile
    else:
        destination = opts.infile
    write_file(destination, text)
Beispiel #11
0
def main():
    """Split a multi-record FASTA file into one file per record.

    Each line starting with ``>`` opens ``<outdir>/<header text>.fasta``;
    that line and all following lines up to the next header are written to
    it.

    Raises:
        ValueError: if non-blank content appears before the first header.
    """
    def add_args(parser):
        parser.add_argument('infile', type='readable_file', metavar='FILE')
        parser.add_argument('outdir',
                            type='writeable_dir',
                            default='.',
                            nargs='?')

    ns = parse(add_args)

    outfile = None
    try:
        for line in fileinput.input(ns.infile):
            if line.startswith(">"):
                if outfile is not None:
                    outfile.close()
                # Strip only the line terminator, so a final header without a
                # trailing newline keeps its last character.
                record_name = line[1:].rstrip("\r\n")
                fname = os.path.join(ns.outdir,
                                     "{0}.fasta".format(record_name))
                outfile = open(fname, 'w')
            elif outfile is None:
                # Nothing is open yet: skip leading blank lines, but report
                # real content instead of failing on None.write().
                if line.strip():
                    raise ValueError(
                        "{0}: content found before the first '>' header".format(
                            ns.infile))
                continue
            outfile.write(line)
    finally:
        # The last record's file was previously never closed.
        if outfile is not None:
            outfile.close()
        fileinput.close()
Beispiel #12
0
def main(argv=None):
    """Split the lines of a file among handlers selected by pattern.

    Handlers come either from the ``handlers`` attribute of the module loaded
    from ``--pattern_file`` or from one ``DefaultFileHandler`` per
    ``--pattern`` mapping. Lines that match no pattern are written to
    ``--unmatched_file`` when one is given.

    Args:
        argv: argument list forwarded to ``parse``; ``None`` leaves the
            choice of arguments to ``parse``.

    Raises:
        Exception: if the pattern file does not define ``handlers``.
    """
    def add_args(parser):
        parser.add_argument('-a', '--append', action='store_true', default=False)
        parser.add_argument('-h', '--header', default=0,
            help="Either an integer that is the number of header lines in the input file "\
                 "(use negative numbers to suppress copying of headers to output files) or "\
                 "a string that should be used as the header.")
        parser.add_argument('-P', '--pattern_file', type='readable_file', metavar="FILE")
        parser.add_argument('-p', '--pattern', type='mapping', action='append',
            metavar="PATTERN=OUTPUT")
        parser.add_argument('-u', '--unmatched_file', type='writeable_file', metavar="FILE",
            help="File in which to write lines that do not match any pattern.")
        parser.add_argument('file', type='readable_file', metavar="FILE")

    ns = parse(add_args, args=argv)

    # The untouched default (int 0) still maps to header=None, as before.
    header = None
    if isinstance(ns.header, str):
        try:
            # int() also accepts the negative counts promised by the help
            # text; str.isdigit() rejected them and they were then treated as
            # literal header text.
            header = int(ns.header)
        except ValueError:
            header = [ns.header]

    if ns.pattern_file:
        mod = load_module_from_file(ns.pattern_file)
        if not hasattr(mod, 'handlers'):
            raise Exception("Invalid pattern file: %s" % ns.pattern_file)
        handlers = mod.handlers
    else:
        # Build a real list: a generator expression is always truthy, which
        # made the "no handlers" branch unreachable, and iterating a missing
        # --pattern (None) raised TypeError.
        handlers = [DefaultFileHandler(m[0], m[1], ns.append)
                    for m in (ns.pattern or ())]

    if not handlers:
        # Parenthesised single-argument form prints identically on Python 2
        # and Python 3.
        print("No handlers specified; doing nothing.")
        return

    unmatched = None
    if ns.unmatched_file:
        unmatched = open(ns.unmatched_file, 'w')
    try:
        split(ns.file, handlers, header, unmatched)
    finally:
        if unmatched is not None:
            unmatched.close()
def main():
    """Apply a table of regex substitutions to a file.

    Each row of ``termfile`` supplies a (pattern, replacement) pair; the
    pairs are applied in order with ``re.sub``. The result is written to
    ``outfile``, or back over ``infile`` when no ``outfile`` is given.
    """
    def register(parser):
        parser.add_argument('-d', '--delim', default="\t")
        parser.add_argument('infile', type='readable_file')
        parser.add_argument('termfile', type='readable_file')
        parser.add_argument('outfile', type='writeable_file', nargs='?',
                            default=None)

    opts = parse(register)

    with open(opts.infile, 'rU') as handle:
        text = handle.read()

    for pattern, replacement in csv_to_table(opts.termfile, opts.delim):
        text = re.sub(pattern, replacement, text)

    if opts.outfile:
        destination = opts.outfile
    else:
        destination = opts.infile
    write_file(destination, text)
Beispiel #14
0
                                metavar="COMMAND",
                                default=None,
                                help="Command to run on piped process output.")
        output_group = parser.add_mutually_exclusive_group()
        output_group.add_argument("-r", "--result_file", type="writeable_file", metavar="FILE",
            help="File where all results are written in CSV format (one column for each variable "\
                "value followed by a column with the result). All fields are quoted.")
        output_group.add_argument(
            "-R",
            "--result_file_pattern",
            metavar="PATTERN",
            help=
            "Pattern from which output file is created by interpolation with variable values."
        )

    ns = parse(add_arguments, args=sys.argv[1:i])
    prog = sys.argv[i]
    args = sys.argv[i + 1:]

    argvars = VarArgGenerator()

    if ns.var_file:
        config = SafeConfigParser()
        config.read(ns.var_file)

        if config.has_section('constants'):
            argvars.update(config.items('constants'))

        if config.has_section['variables']:
            argvars.update(
                dict(parse_vars(m) for m in config.items('variables')))
Beispiel #15
0
def main():
    """Feed fixed-length reads into per-k ``SeqDiv`` objects and write CSVs.

    Every line of ``input_file`` is stripped and must have exactly
    ``--read_length`` characters, otherwise the program exits with an error.
    Each read is inserted into one ``SeqDiv`` per requested k-mer size. At
    the end a summary file (and, unless ``--summary_only`` is set, a counts
    file) is written per k-mer size into ``output_dir``. When ``--log`` is
    given, progress is appended to that file every ``--log_interval`` reads.
    """
    def add_args(parser):
        parser.add_argument("-k", "--kmer_sizes", type="int_list",
                            action="extend_overwrite", default=(1, ))
        parser.add_argument("-r", "--read_length", type=int, default=100)
        parser.add_argument("--bigint", action="store_true", default=False)
        parser.add_argument("--log", type="writeable_file", default=None)
        parser.add_argument("--log_interval", type=int, default=100000)
        parser.add_argument("--prefix", default=None)
        parser.add_argument("--summary_only", action="store_true",
                            default=False)
        parser.add_argument("input_file", type="readable_file")
        parser.add_argument("output_dir", type="writeable_dir", nargs="?",
                            default=".")

    args = parse(add_args)

    if args.bigint:
        dtype = np.int64
    else:
        dtype = np.int32

    # Report the summed memory estimate before allocating anything.
    estimates = (SeqDiv.estimate_memory(args.read_length, k, dtype)
                 for k in args.kmer_sizes)
    total_mem = sum(estimates) / 1000000.0
    sys.stderr.write(
        "This program will use up to {0} MiB of memory\n".format(total_mem))

    kmers = {k: SeqDiv(k, dtype) for k in args.kmer_sizes}

    log = None
    start = None
    if args.log is not None:
        from datetime import datetime
        start = datetime.now()
        log = open(args.log, "w", 0)
        log.write("Starting at {0}\n".format(start))
        log.flush()

    reads = fileinput.input(args.input_file, mode="rU")
    for read_num, line in enumerate(reads, 1):
        # TODO: this could be threaded if it's too slow
        seq = line.strip()
        seq_len = len(seq)
        if seq_len != args.read_length:
            message = "Invalid read length at read {0}: expected {1}, actual {2}"
            sys.exit(message.format(read_num, args.read_length, seq_len))
        for counter in kmers.values():
            counter.insert_nocheck(seq, seq_len)

        # Progress is only reported when logging is on and an interval
        # boundary has been reached.
        if log is None or read_num % args.log_interval != 0:
            continue
        elapsed = datetime.now() - start
        hours = round(elapsed.total_seconds() / 3600, 3)
        log.write("Processed {0} reads in {1} hours\n".format(read_num, hours))
        log.flush()

    prefix = args.prefix
    if prefix is None:
        # Default prefix: the input file name without directory or extension.
        base_name = os.path.basename(args.input_file)
        prefix = os.path.splitext(base_name)[0]

    for k, counter in kmers.items():
        summary_name = "{0}_summary_{1}mers.csv".format(prefix, k)
        with open(os.path.join(args.output_dir, summary_name), "w") as out:
            counter.write_summary(out)
        if args.summary_only:
            continue
        counts_name = "{0}_counts_{1}mers.csv".format(prefix, k)
        with open(os.path.join(args.output_dir, counts_name), "w") as out:
            counter.write_counts(out)

    if log is not None:
        log.close()
Beispiel #16
0
def main():
    """Command-line entry point for the LD (linkage disequilibrium) pipeline.

    Parses the options registered in ``add_args`` and then runs each command
    named in ``commands``. Commands always run in the fixed order in which
    they are tested below ("snps", "plink", "process", "plot", "local",
    "process_local", "split", "blocks", "block_ld", "prune_blocks",
    "make_blocks", "tag_snps"), not in the order given on the command line.

    NOTE(review): ``--keep_file`` and ``--block_r2_thresholds`` are parsed
    but never read inside this function -- confirm whether they are consumed
    elsewhere.
    """
    def add_args(parser):
        # Registers every option and positional argument on the parser.
        parser.add_argument("-b", "--bfile", type=readable_file_group(("bed", "bim", "fam")),
            default=None, help="Prefix of plink bfiles (bed, bim and fam).")
        parser.add_argument("-c", "--chromosomes", type=delimited_macro("chrm"), default="F",
            help="Set of chromosomes on which to execute the command(s)")
        parser.add_argument("-f", "--fork_mode", choices=("test","serial","thread","lsf"),
            default="thread", help="How to distribute jobs.")
        parser.add_argument("-i", "--interval_size", type=int, default=1000, metavar="bp",
            help="Bin size for interval statistics (in bp)")
        parser.add_argument("-k", "--keep_file", type="readable_file", metavar="FILE", default=None,
            help="Plink keep file (list of samples to include).")
        parser.add_argument("-s", "--plot_stats", action="extend_overwrite", type="str_list", 
            metavar="LIST", default=("mean","max","pct"),
            help="List of stats for which to create (mean, max, pct)")
        parser.add_argument("-S", "--summary_stat", action="extend_overwrite", 
            choices=("all", "unlinked"), default="unlinked",
            help="How to compute bin summary stats (use all markers or only unlinked markers)")
        parser.add_argument("-p", "--percentile", type=int, default=95, metavar="PCT",
            help="Percentile for r-squared statistics.")
        parser.add_argument("-r", "--r2_bin_size", type=float, default=0.01,
            help="Bin size for r-squared histogram.")
        parser.add_argument("-w", "--window_size", type=int, default=500, metavar="Kb",
            help="Window size (in kb)")
        parser.add_argument("--fork_opts", action="extend_dict", type="mapping_list", default={},
            help="Options specific to the fork mode.")
        parser.add_argument("--unfiltered", action="store_true", default=False,
            help="Assume data files are unfiltered and apply uncalled and maf filters.")
        parser.add_argument("--no_unlinked", action="store_true", default=False,
            help="Don't compute stats for unlinked markers.")
        parser.add_argument("--per_chromosome", action="store_true", default=False,
            help="Whether the data file has been split into one per chromosome.")
        parser.add_argument("--block_r2_thresholds", type=float, nargs=2, default=(0.2, 0.95),
            help="Lower and upper r-square thresholds for identifying haplotype blocks with 'make_blocks' command.")
        parser.add_argument("--output_format", choices=("ped","bed","tped"), default="ped",
            help="Output format for commands that produce PLINK data files.")
        parser.add_argument("--new_plink", default="/Users/johndidion/software/plink_mac/plink")
        parser.add_argument("--old_plink", default="plink")
        parser.add_argument("outdir", type="writeable_dir")
        parser.add_argument("commands", action="extend", nargs="+", choices=COMMANDS,
            help="Commands to run. If none are specified, all will be run.")
    
    ns = parse(add_args)
    
    # Point the geno.plink module at the configured PLINK executables.
    geno.plink.PLINK_CMD = ns.new_plink
    geno.plink.PLINK_OLD = ns.old_plink
    
    # NOTE(review): bimfile, bfile and fname are bound only when --bfile is
    # given. The "snps", "plink", "split", "blocks", "block_ld",
    # "make_blocks" and "tag_snps" branches below read them and would raise
    # NameError without --bfile. famfile is unpacked but not used here.
    if ns.bfile:
        bedfile, bimfile, famfile = ns.bfile
        bfile = os.path.splitext(bedfile)[0]
        fname = os.path.basename(bfile)

    # --window_size is documented in kb; scale it by 1000 (presumably to bp).
    window_size = ns.window_size * 1000
    # Fall back to every known command when the parsed list is empty.
    commands = ns.commands if ns.commands else COMMANDS
    
    mkdir(ns.outdir, overwrite=False)
    
    # NOTE(review): executor is created only when a requested command is in
    # FORK_COMMANDS, yet the exec_shell() calls below use it -- confirm every
    # such command is listed in FORK_COMMANDS.
    if any(c in FORK_COMMANDS for c in commands):
        executor = get_executor(ns.fork_mode, ns.fork_opts)
    
    # partition the genome into windows and generate a list of SNPs in each window
    if "snps" in commands:
        # this command is not forked but is not time-consuming
        # (chr_file is returned but not used afterwards in this function)
        chr_file, win_file = partition_snps(bimfile, ns.outdir, window_size)
    else:
        win_file = os.path.join(ns.outdir, "windows.csv")
    
    # execute the plink --r2 command over snp windows (requires 'snps' command)
    if "plink" in commands:
        cmd_iter = pairwise_ld_command_iter(bfile, ns.outdir, win_file, apply_filters=ns.unfiltered)
        exec_shell(cmd_iter, executor, error_handler=reraise_error)
    
    # process the results of the 'plink' command
    if "process" in commands:
        # this command is not forked. it should be submitted to lsf if run on kure.
        process_ld_files(ns.outdir, ns.percentile, ns.interval_size, ns.r2_bin_size, unlinked=not ns.no_unlinked)
    
    # generate heatmap plots from the results of the 'process' command
    if "plot" in commands:
        bins = os.path.join(ns.outdir, "window_summary.csv")
        # One heatmap per requested statistic, read from <stat>_r2_matrix.csv
        # and written to <stat>_heatmap.pdf inside outdir.
        for s in ns.plot_stats:
            summary_stat = "{0}_{1}".format(ns.summary_stat, s)
            plot_ld_heatmap(
                os.path.join(ns.outdir, "{0}_r2_matrix.csv".format(s)), bins, 
                os.path.join(ns.outdir, "{0}_heatmap.pdf".format(s)), 
                window_size=window_size, summary_stat=summary_stat, tics=True)
    
    if "local" in commands:
        # TODO: execute commands for local LD
        pass
    
    if "process_local" in commands:
        process_local_ld_file(os.path.join(ns.outdir, "local.ld"), os.path.join(ns.outdir, "r2_hist.csv"))
    
    # Tracks whether per-chromosome files exist: either declared by the user
    # or produced by the "split" command below.
    per_chrm = ns.per_chromosome
    
    # split a whole-genome data file into one file per chromosome
    if "split" in commands:
        cmd_iter = split(bfile, ns.outdir, chromosomes=ns.chromosomes, output_format=ns.output_format)
        exec_shell(cmd_iter, executor, error_handler=reraise_error)
        per_chrm = True
    
    # execute the plink --blocks command
    if "blocks" in commands:
        # this command is only forked if run on split files (1 per chr). it should be submitted to lsf if run on kure.
        if per_chrm:
            cmd_iter = per_chrm_iter(bfile, ns.outdir, format_ld_blocks_command, ns.chromosomes, apply_filters=ns.unfiltered)
            exec_shell(cmd_iter, executor, error_handler=reraise_error)
        else:
            cmd = format_ld_blocks_command(bfile, ns.outdir, apply_filters=ns.unfiltered)
            bash(cmd)

    if "block_ld" in commands:
        # Per-chromosome runs look for *.blocks.det one directory level below
        # outdir; otherwise directly in outdir.
        if per_chrm:
            path = os.path.join(ns.outdir, "**", "*.blocks.det")
        else:
            path = os.path.join(ns.outdir, "*.blocks.det")
        
        # Stream the lines of all matching files as a single iterator.
        lines = fileinput.input(glob.glob(path))
        cmd_iter = block_ld(lines, bfile, ns.outdir)
        exec_shell(cmd_iter, executor, error_handler=reraise_error)
        
    if "prune_blocks" in commands:
        prune_blocks(ns.outdir, ns.chromosomes)
    
    if "make_blocks" in commands:
        def arg_iter():
            # Yields (chromosome, <outdir>/chr<chromosome>/<fname>) pairs.
            for chrm in ns.chromosomes:
                yield (chrm, os.path.join(ns.outdir, "chr{0}".format(chrm), fname))
        distribute(call_make_blocks, arg_iter(), mode=ns.fork_mode, **ns.fork_opts)
        
    if "tag_snps" in commands:
        bash(format_merge_tag_lists_command(ns.outdir, fname, ns.chromosomes))
Beispiel #17
0
def main():
    """Command-line entry point for the LD (linkage disequilibrium) pipeline.

    Parses the options registered in ``add_args`` and then runs each command
    named in ``commands``. Commands always run in the fixed order in which
    they are tested below ("snps", "plink", "process", "plot", "local",
    "process_local", "split", "blocks", "block_ld", "prune_blocks",
    "make_blocks", "tag_snps"), not in the order given on the command line.

    NOTE(review): ``--keep_file`` and ``--block_r2_thresholds`` are parsed
    but never read inside this function -- confirm whether they are consumed
    elsewhere.
    """
    def add_args(parser):
        # Registers every option and positional argument on the parser.
        parser.add_argument("-b",
                            "--bfile",
                            type=readable_file_group(("bed", "bim", "fam")),
                            default=None,
                            help="Prefix of plink bfiles (bed, bim and fam).")
        parser.add_argument(
            "-c",
            "--chromosomes",
            type=delimited_macro("chrm"),
            default="F",
            help="Set of chromosomes on which to execute the command(s)")
        parser.add_argument("-f",
                            "--fork_mode",
                            choices=("test", "serial", "thread", "lsf"),
                            default="thread",
                            help="How to distribute jobs.")
        parser.add_argument("-i",
                            "--interval_size",
                            type=int,
                            default=1000,
                            metavar="bp",
                            help="Bin size for interval statistics (in bp)")
        parser.add_argument(
            "-k",
            "--keep_file",
            type="readable_file",
            metavar="FILE",
            default=None,
            help="Plink keep file (list of samples to include).")
        parser.add_argument(
            "-s",
            "--plot_stats",
            action="extend_overwrite",
            type="str_list",
            metavar="LIST",
            default=("mean", "max", "pct"),
            help="List of stats for which to create (mean, max, pct)")
        parser.add_argument(
            "-S",
            "--summary_stat",
            action="extend_overwrite",
            choices=("all", "unlinked"),
            default="unlinked",
            help=
            "How to compute bin summary stats (use all markers or only unlinked markers)"
        )
        parser.add_argument("-p",
                            "--percentile",
                            type=int,
                            default=95,
                            metavar="PCT",
                            help="Percentile for r-squared statistics.")
        parser.add_argument("-r",
                            "--r2_bin_size",
                            type=float,
                            default=0.01,
                            help="Bin size for r-squared histogram.")
        parser.add_argument("-w",
                            "--window_size",
                            type=int,
                            default=500,
                            metavar="Kb",
                            help="Window size (in kb)")
        parser.add_argument("--fork_opts",
                            action="extend_dict",
                            type="mapping_list",
                            default={},
                            help="Options specific to the fork mode.")
        parser.add_argument(
            "--unfiltered",
            action="store_true",
            default=False,
            help=
            "Assume data files are unfiltered and apply uncalled and maf filters."
        )
        parser.add_argument("--no_unlinked",
                            action="store_true",
                            default=False,
                            help="Don't compute stats for unlinked markers.")
        parser.add_argument(
            "--per_chromosome",
            action="store_true",
            default=False,
            help="Whether the data file has been split into one per chromosome."
        )
        parser.add_argument(
            "--block_r2_thresholds",
            type=float,
            nargs=2,
            default=(0.2, 0.95),
            help=
            "Lower and upper r-square thresholds for identifying haplotype blocks with 'make_blocks' command."
        )
        parser.add_argument(
            "--output_format",
            choices=("ped", "bed", "tped"),
            default="ped",
            help="Output format for commands that produce PLINK data files.")
        parser.add_argument(
            "--new_plink",
            default="/Users/johndidion/software/plink_mac/plink")
        parser.add_argument("--old_plink", default="plink")
        parser.add_argument("outdir", type="writeable_dir")
        parser.add_argument(
            "commands",
            action="extend",
            nargs="+",
            choices=COMMANDS,
            help="Commands to run. If none are specified, all will be run.")

    ns = parse(add_args)

    # Point the geno.plink module at the configured PLINK executables.
    geno.plink.PLINK_CMD = ns.new_plink
    geno.plink.PLINK_OLD = ns.old_plink

    # NOTE(review): bimfile, bfile and fname are bound only when --bfile is
    # given. The "snps", "plink", "split", "blocks", "block_ld",
    # "make_blocks" and "tag_snps" branches below read them and would raise
    # NameError without --bfile. famfile is unpacked but not used here.
    if ns.bfile:
        bedfile, bimfile, famfile = ns.bfile
        bfile = os.path.splitext(bedfile)[0]
        fname = os.path.basename(bfile)

    # --window_size is documented in kb; scale it by 1000 (presumably to bp).
    window_size = ns.window_size * 1000
    # Fall back to every known command when the parsed list is empty.
    commands = ns.commands if ns.commands else COMMANDS

    mkdir(ns.outdir, overwrite=False)

    # NOTE(review): executor is created only when a requested command is in
    # FORK_COMMANDS, yet the exec_shell() calls below use it -- confirm every
    # such command is listed in FORK_COMMANDS.
    if any(c in FORK_COMMANDS for c in commands):
        executor = get_executor(ns.fork_mode, ns.fork_opts)

    # partition the genome into windows and generate a list of SNPs in each window
    if "snps" in commands:
        # this command is not forked but is not time-consuming
        # (chr_file is returned but not used afterwards in this function)
        chr_file, win_file = partition_snps(bimfile, ns.outdir, window_size)
    else:
        win_file = os.path.join(ns.outdir, "windows.csv")

    # execute the plink --r2 command over snp windows (requires 'snps' command)
    if "plink" in commands:
        cmd_iter = pairwise_ld_command_iter(bfile,
                                            ns.outdir,
                                            win_file,
                                            apply_filters=ns.unfiltered)
        exec_shell(cmd_iter, executor, error_handler=reraise_error)

    # process the results of the 'plink' command
    if "process" in commands:
        # this command is not forked. it should be submitted to lsf if run on kure.
        process_ld_files(ns.outdir,
                         ns.percentile,
                         ns.interval_size,
                         ns.r2_bin_size,
                         unlinked=not ns.no_unlinked)

    # generate heatmap plots from the results of the 'process' command
    if "plot" in commands:
        bins = os.path.join(ns.outdir, "window_summary.csv")
        # One heatmap per requested statistic, read from <stat>_r2_matrix.csv
        # and written to <stat>_heatmap.pdf inside outdir.
        for s in ns.plot_stats:
            summary_stat = "{0}_{1}".format(ns.summary_stat, s)
            plot_ld_heatmap(os.path.join(ns.outdir,
                                         "{0}_r2_matrix.csv".format(s)),
                            bins,
                            os.path.join(ns.outdir,
                                         "{0}_heatmap.pdf".format(s)),
                            window_size=window_size,
                            summary_stat=summary_stat,
                            tics=True)

    if "local" in commands:
        # TODO: execute commands for local LD
        pass

    if "process_local" in commands:
        process_local_ld_file(os.path.join(ns.outdir, "local.ld"),
                              os.path.join(ns.outdir, "r2_hist.csv"))

    # Tracks whether per-chromosome files exist: either declared by the user
    # or produced by the "split" command below.
    per_chrm = ns.per_chromosome

    # split a whole-genome data file into one file per chromosome
    if "split" in commands:
        cmd_iter = split(bfile,
                         ns.outdir,
                         chromosomes=ns.chromosomes,
                         output_format=ns.output_format)
        exec_shell(cmd_iter, executor, error_handler=reraise_error)
        per_chrm = True

    # execute the plink --blocks command
    if "blocks" in commands:
        # this command is only forked if run on split files (1 per chr). it should be submitted to lsf if run on kure.
        if per_chrm:
            cmd_iter = per_chrm_iter(bfile,
                                     ns.outdir,
                                     format_ld_blocks_command,
                                     ns.chromosomes,
                                     apply_filters=ns.unfiltered)
            exec_shell(cmd_iter, executor, error_handler=reraise_error)
        else:
            cmd = format_ld_blocks_command(bfile,
                                           ns.outdir,
                                           apply_filters=ns.unfiltered)
            bash(cmd)

    if "block_ld" in commands:
        # Per-chromosome runs look for *.blocks.det one directory level below
        # outdir; otherwise directly in outdir.
        if per_chrm:
            path = os.path.join(ns.outdir, "**", "*.blocks.det")
        else:
            path = os.path.join(ns.outdir, "*.blocks.det")

        # Stream the lines of all matching files as a single iterator.
        lines = fileinput.input(glob.glob(path))
        cmd_iter = block_ld(lines, bfile, ns.outdir)
        exec_shell(cmd_iter, executor, error_handler=reraise_error)

    if "prune_blocks" in commands:
        prune_blocks(ns.outdir, ns.chromosomes)

    if "make_blocks" in commands:

        def arg_iter():
            # Yields (chromosome, <outdir>/chr<chromosome>/<fname>) pairs.
            for chrm in ns.chromosomes:
                yield (chrm,
                       os.path.join(ns.outdir, "chr{0}".format(chrm), fname))

        distribute(call_make_blocks,
                   arg_iter(),
                   mode=ns.fork_mode,
                   **ns.fork_opts)

    if "tag_snps" in commands:
        bash(format_merge_tag_lists_command(ns.outdir, fname, ns.chromosomes))
Beispiel #18
0
def main(argv=None):
    """Classify genotypes into homozygous/heterozygous intervals per chromosome.

    Parses the command line, loads the optional sample, exclude and karyotype
    files, reads the genotype matrix, and then for every chromosome found in
    the genotype header writes two CSV files into ``output_dir``:
    ``intervals_chr<name>.csv`` and ``minimal_chr<name>.csv``.

    The genotype file is read as CSV. Its first three rows are consumed as the
    SNP header (SNP ID, chromosome, position), with the first column of every
    row treated as a row label. Each remaining row is one sample: the sample
    name followed by one genotype per SNP.

    Args:
        argv: Argument list forwarded to ``parse`` as ``args``. What ``None``
            means is decided by ``parse`` (presumably "use sys.argv" --
            confirm against its implementation).

    Raises:
        KeyError: If a chromosome named in the genotype header is missing
            from the karyotype.
    """
    def add_opts(parser):
        """Register this script's options and positional arguments."""
        parser.add_argument('-H', '--homozygosity_cutoff', type=int, default=97,
            help="Percent homozygosity required to declare a region homozygous.")
        parser.add_argument('-k', '--karyotype_file', type='readable_file', default=None,
            help="File containing the karyotype.")
        parser.add_argument('-m', '--smoothing_size', type=int, default=20,
            help="Window size to use when smoothing regions.")
        parser.add_argument('-s', '--sample_file', type='readable_file', default=None,
            help="File containing sample information to ")
        parser.add_argument('-w', '--window_size', type=int, default=300,
            help="Number of markers in sliding classification window.")
        parser.add_argument('-W', '--window_slide', type=int, default=1,
            help="Number of markers to slide the window.")
        parser.add_argument('-x', '--exclude_file', type='readable_file', default=None,
            help="File containing regions to ignore.")
        parser.add_argument('--genotype_format', choices=['num', 'bin'], default='num',
            help="Format of genotypes: num = -1/1/2/3/4, bin = -1=N, 0=hom, 1=het.")
        parser.add_argument('--max_smoothing_iterations', type=int, default=100,
            help="Maximimum iterations to spend smoothing a sequence.")
        parser.add_argument('genotype_file', type='readable_file',
            help="File containing genotypes for samples, one sample per row.")
        parser.add_argument('output_dir', type='writeable_dir',
            help="Directory to write results, one file per chromosome.")

    ns = parse(add_opts, args=argv)
    if log.is_debug():
        log.debug("find_intervals.py called with args: {0}".format(ns))

    samples = read_samples(ns.sample_file)
    if log.is_debug():
        log.debug("{0} samples".format(len(samples)))

    exclude = read_exclude(ns.exclude_file)
    if log.is_debug():
        if exclude:
            log.debug("Exclude regions on chromosomes {0}".format(exclude.keys()))
        else:
            log.debug("No exclude regions")

    chrom_sizes = dict((c.name, c.size) for c in karyotype(ns.karyotype_file))
    chromosomes = []
    ordered_samples = []
    genotypes = {}

    with open(ns.genotype_file, 'rU') as f:
        rows = reader(f)

        # The first three rows hold SNP ID, chromosome and position; transpose
        # them so each item describes a single SNP.
        head_iter = zipiter(next(rows) for _ in range(3))
        next(head_iter)  # discard the row-label column
        snps = [SNP(*fields) for fields in head_iter]
        nsnp = len(snps)
        if log.is_debug():
            log.debug("{0} snps".format(nsnp))

        # Record the half-open [start, end) SNP index range of each run of
        # consecutive SNPs that share a chromosome.
        current = None
        start = None
        for i, snp in enumerate(snps):
            if snp.chromosome != current:
                # Compare against None explicitly: the first run starts at
                # index 0, which is falsy, and a plain truthiness test would
                # silently drop the first chromosome.
                if start is not None:
                    chromosomes.append(
                        Chromosome(current, chrom_sizes[current], start, i))
                current = snp.chromosome
                start = i
        # Close the final run; skipped when the header contained no SNPs.
        if start is not None:
            chromosomes.append(
                Chromosome(current, chrom_sizes[current], start, nsnp))
        if log.is_debug():
            log.debug("{0} chromosomes".format(len(chromosomes)))

        for sample in rows:
            name = sample.pop(0)  # pop off the row-label column
            # Use the mapped sample entry when one exists, else the raw name.
            ordered_samples.append(samples.get(name, name))

            for c in chromosomes:
                genotypes.setdefault(c.name, []).append(
                    Genotypes(c.slice(sample), ns.genotype_format))

    min_hom = float(ns.homozygosity_cutoff) / 100
    smoother = Smoother(ns.smoothing_size, ns.max_smoothing_iterations)

    if not os.path.exists(ns.output_dir):
        os.makedirs(ns.output_dir)

    for chrom in chromosomes:
        log.info("Processing chromosome {0}".format(chrom.name))

        # break genotypes into regions, scan each region using a sliding window
        # and classify each window as homozygous or heterozygous
        regions = classify_windows(View(snps, chrom), genotypes[chrom.name],
            wrapiter(exclude.get(chrom.name, None)),
            ns.window_size, ns.window_slide, min_hom, ns.smoothing_size)

        int_file = os.path.join(ns.output_dir, "intervals_chr%s.csv" % chrom.name)
        min_file = os.path.join(ns.output_dir, "minimal_chr%s.csv" % chrom.name)

        # smooth out each region and partition into hom/het intervals
        with open(int_file, 'w') as iout, open(min_file, 'w') as mout:
            int_writer = writer(iout)
            # NOTE(review): 'Martkers' looks like a typo for 'Markers'; it is
            # left unchanged because downstream readers may key on this header.
            int_writer.writerow(('Sample','Start','End','Martkers','Call'))

            min_writer = writer(mout)
            min_writer.writerow(['Start','End','Markers'] + ordered_samples)

            for region in regions:
                partition(region, ordered_samples, smoother, int_writer, min_writer)
Beispiel #19
0
            help="Define a variable whose values come frome FILE")
        parser.add_argument("-w", "--sliding_window", metavar="NAME=LOW,HIGH,STEP", action="dict",
            type="delimited_mapping", help="Define a sliding window variable.")
        pipe_group = parser.add_argument_group("pipes", pipe_help)
        pipe_group.add_argument("--pipe_file_pattern", metavar="PATTERN", default=None,
            help="Process output file to send to pipe_command.")
        pipe_group.add_argument("--pipe_command_pattern", metavar="COMMAND", default=None,
            help="Command to run on piped process output.")
        output_group = parser.add_mutually_exclusive_group()
        output_group.add_argument("-r", "--result_file", type="writeable_file", metavar="FILE",
            help="File where all results are written in CSV format (one column for each variable "\
                "value followed by a column with the result). All fields are quoted.")
        output_group.add_argument("-R", "--result_file_pattern", metavar="PATTERN",
            help="Pattern from which output file is created by interpolation with variable values.")

    ns = parse(add_arguments, args=sys.argv[1:i])
    prog = sys.argv[i]
    args = sys.argv[i+1:]

    argvars = VarArgGenerator()

    if ns.var_file:
        config = SafeConfigParser()
        config.read(ns.var_file)

        if config.has_section('constants'):
            argvars.update(config.items('constants'))

        if config.has_section['variables']:
            argvars.update(dict(parse_vars(m) for m in config.items('variables')))