def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        metadata = {}
        metadata["tool"] = "samtools %s" % alignlib.get_samtools_version()

        jobs = []  # list of (function, args, keywds) for parallel.pyfun
        for bam_filename in bam_filenames:
            x = count_duplicates, (bam_filename,), {}
            jobs.append(x)
        results = parallel.pyfun(jobs, num_procs=num_cores)
        metadata["num_cores"] = num_cores
        assert len(results) == len(bam_filenames)

        handle = open(outfile, 'w')
        header = "Sample", "Duplicated Reads", "Total Reads", "% Duplicated"
        print >>handle, "\t".join(header)
        for i in range(len(bam_filenames)):
            x, sample, x = mlib.splitpath(bam_filenames[i])
            total_reads, dup_reads = results[i]
            perc_dup = float(dup_reads) / total_reads * 100
            perc_dup = "%.2f" % perc_dup
            x = sample, dup_reads, total_reads, perc_dup
            print >>handle, "\t".join(map(str, x))
        handle.close()
        return metadata
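# count_duplicates() is used by the job list above but is not part of this
# excerpt.  A minimal sketch of one possible implementation, assuming it runs
# "samtools flagstat" and returns (total_reads, dup_reads) to match the
# unpacking in the reporting loop; the parsing details are assumptions.
def count_duplicates(bam_filename):
    import subprocess

    output = subprocess.check_output(["samtools", "flagstat", bam_filename])
    total_reads = dup_reads = None
    for line in output.splitlines():
        # flagstat lines look like:
        #   1000000 + 0 in total (QC-passed reads + QC-failed reads)
        #   50000 + 0 duplicates
        parts = line.split()
        if "in total" in line:
            total_reads = int(parts[0]) + int(parts[2])
        elif len(parts) == 4 and parts[3] == "duplicates":
            dup_reads = int(parts[0]) + int(parts[2])
    assert total_reads is not None, "Could not parse flagstat output."
    assert dup_reads is not None, "Could not parse flagstat output."
    return total_reads, dup_reads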
    def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
        import os
        from genomicode import config
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib

        bam_path = in_data.identifier
        assert os.path.exists(bam_path)
        assert os.path.isdir(bam_path)
        filelib.safe_mkdir(out_path)

        metadata = {}
        metadata["tool"] = "samtools %s" % alignlib.get_samtools_version()

        # Find all the BAM files.
        bam_filenames = filelib.list_files_in_path(
            bam_path, endswith=".bam", case_insensitive=True)

        jobs = []  # list of in_filename, out_filename
        for in_filename in bam_filenames:
            p, f = os.path.split(in_filename)
            out_filename = os.path.join(out_path, f)
            assert not os.path.exists(out_filename)
            x = in_filename, out_filename
            jobs.append(x)

        # Symlink the BAM files to the output path.
        for x in jobs:
            in_filename, out_filename = x
            os.symlink(in_filename, out_filename)

        # Index each of the files.
        sq = parallel.quote
        samtools = filelib.which_assert(config.samtools)
        commands = []
        for x in jobs:
            in_filename, out_filename = x
            cmd = [
                sq(samtools),
                "index",
                sq(out_filename),
                ]
            x = " ".join(cmd)
            commands.append(x)
        metadata["commands"] = commands
        parallel.pshell(commands, max_procs=num_cores, path=out_path)

        # TODO: Check for output files.
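        # One way to do the check noted above: "samtools index" writes its
        # index next to each BAM file as <file>.bam.bai, so verify that those
        # files exist and are non-empty (same pattern as the other modules).
        x = [out_filename + ".bai" for (in_filename, out_filename) in jobs]
        filelib.assert_exists_nz_many(x)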
        
        return metadata
Example #3
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import parallel
        from genomicode import alignlib
        from genomicode import filelib
        from Betsy import module_utils as mlib

        bam_node, ref_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        metadata["tool"] = "samtools %s" % alignlib.get_samtools_version()

        # list of (in_filename, err_filename, out_filename)
        jobs = []
        for in_filename in bam_filenames:
            p, f = os.path.split(in_filename)
            sample, ext = os.path.splitext(f)
            err_filename = os.path.join(out_path, "%s.log" % sample)
            out_filename = os.path.join(out_path, "%s.pileup" % sample)
            x = in_filename, err_filename, out_filename
            jobs.append(x)

        # samtools mpileup -f [reference sequence] [BAM file(s)]
        #   > myData.mpileup
        samtools = mlib.findbin("samtools")
        sq = mlib.sq
        commands = []
        for x in jobs:
            in_filename, err_filename, out_filename = x

            x = [
                sq(samtools),
                "mpileup",
                "-f",
                sq(ref.fasta_file_full),
            ]
            x.append(sq(in_filename))
            x = " ".join(map(str, x))
            x = "%s 2> %s 1> %s" % (x, err_filename, out_filename)
            commands.append(x)
        parallel.pshell(commands, max_procs=num_cores)
        metadata["num_cores"] = num_cores
        metadata["commands"] = commands

        x = [x[-1] for x in jobs]
        filelib.assert_exists_nz_many(x)

        return metadata
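# For reference, each command generated above has roughly this shape (the
# sample name and paths are illustrative, not from the original):
#
#   samtools mpileup -f genomes/ref.fa sample1.bam \
#       2> out_path/sample1.log 1> out_path/sample1.pileup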
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        import os
        from genomicode import config
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        #from genomicode import hashlib
        from Betsy import module_utils

        in_filenames = module_utils.find_bam_files(in_data.identifier)
        assert in_filenames, "No .bam files."
        filelib.safe_mkdir(out_path)

        metadata = {}
        metadata["tool"] = "samtools %s" % alignlib.get_samtools_version()

        jobs = []  # list of GenericObject(in_filename, temp_prefix, out_filename)
        #seen = {}
        for i, in_filename in enumerate(in_filenames):
            p, f = os.path.split(in_filename)
            temp_prefix = "temp_%s" % f
            #temp_prefix = "temp_%s" % hashlib.hash_var(f)
            # Make sure no duplicates.
            #assert temp_prefix not in seen
            #seen[temp_prefix] = 1
            #temp_outfilename = "%d.bam" % i
            out_filename = os.path.join(out_path, f)
            x = filelib.GenericObject(
                in_filename=in_filename,
                temp_prefix=temp_prefix,
                #temp_outfilename=temp_outfilename,
                out_filename=out_filename)
            jobs.append(x)

        samtools = filelib.which_assert(config.samtools)

        # Calculate the number of threads per process.
        nc = module_utils.calc_max_procs_from_ram(4, upper_max=num_cores)
        num_threads = max(nc / len(jobs), 1)

        # Make a list of samtools commands.
        # Without -m, takes ~1 Gb per process.
        sq = parallel.quote
        commands = []
        for j in jobs:
            # Usage has changed.  Below no longer valid.
            # samtools sort <in_filename> <out_filestem>
            # .bam automatically added to <out_filestem>, so don't
            # need it.
            #x = out_filename
            #assert x.endswith(".bam")
            #x = x[:-4]
            #out_filestem = x

            x = [
                sq(samtools),
                "sort",
                "-O",
                "bam",
                "-T",
                sq(j.temp_prefix),
                "-m",
                "4G",  # Crashing, so try increasing memory.
                sq(j.in_filename),
                #"-o", sq(j.temp_outfilename),
                "-o",
                sq(j.out_filename),
            ]
            if num_threads > 1:
                x += ["-@", num_threads]
            x = " ".join(map(str, x))
            commands.append(x)
        metadata["commands"] = commands
        metadata["num_cores"] = nc

        parallel.pshell(commands, max_procs=nc)
        #for cmd in commands:
        #    parallel.sshell(cmd)

        #for j in jobs:
        #    # Move the temporary files to the final location.
        #    shutil.move(j.temp_outfilename, j.out_filename)

        # Make sure the analysis completed successfully.
        x = [j.out_filename for j in jobs]
        filelib.assert_exists_nz_many(x)

        return metadata
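# Worked example of the thread arithmetic above (numbers are illustrative):
# if calc_max_procs_from_ram(4, upper_max=num_cores) returns nc=16 (i.e. RAM
# allows 16 concurrent ~4 Gb processes) and there are 8 BAM files, then each
# "samtools sort" gets num_threads = max(16 / 8, 1) = 2 and the command ends
# with "-@ 2".  With more files than processes, num_threads falls back to 1.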
Example #5
def main():
    import os
    import argparse
    import itertools

    from genomicode import filelib
    from genomicode import config
    from genomicode import parallel
    from genomicode import alignlib

    parser = argparse.ArgumentParser(description="")
    parser.add_argument("reference_genome", help="fasta file")

    parser.add_argument("-j",
                        dest="num_procs",
                        type=int,
                        default=1,
                        help="Number of jobs to run in parallel.")
    parser.add_argument(
        "--dry_run",
        action="store_true",
        help="Just display the commands, and don't generate the alignment.")
    parser.add_argument("--window",
                        default=80,
                        type=int,
                        help="Number of bases in alignment.  Default: 80")

    group = parser.add_argument_group(title="Input")
    group.add_argument("--bam_file", help="Indexed BAM file.")
    group.add_argument("--bam_path", help="Path to BAM files.")
    group.add_argument(
        "--position",
        action="append",
        default=[],
        help="Specify a position to view, "
        "e.g. chr20:45,927,663 or chr20:45927663.  1-based coordinates")
    group.add_argument("--position_file",
                       help="Tab-delimited text file with two columns.  "
                       "Column 1 is chromosome, column 2 is position.")

    group = parser.add_argument_group(title="Output")
    group.add_argument("--prefix", help="Pre-pend a prefix to each outfile.")
    group.add_argument(
        "--outpath",
        help="If multiple alignments are generated, this option "
        "directs where to save the output files.")
    group.add_argument(
        "--noclobber",
        action="store_true",
        help="If an output file already exists, don't overwrite it.")

    # Parse the input arguments.
    args = parser.parse_args()
    filelib.assert_exists_nz(args.reference_genome)
    assert args.bam_file or args.bam_path, \
           "Either --bam_file or --bam_path must be provided."
    assert not (args.bam_file and args.bam_path), \
           "Cannot specify both --bam_file and --bam_path."
    if args.bam_file:
        filelib.assert_exists_nz(args.bam_file)
    if args.bam_path:
        assert os.path.exists(args.bam_path)
    if args.position_file:
        filelib.assert_exists_nz(args.position_file)
    if args.outpath and not os.path.exists(args.outpath):
        os.mkdir(args.outpath)
    if args.num_procs < 1 or args.num_procs > 100:
        parser.error("Please specify between 1 and 100 processes.")
    assert args.window >= 1 and args.window < 500

    bam_filenames = []
    if args.bam_file:
        bam_filenames.append(args.bam_file)
    else:
        x = os.listdir(args.bam_path)
        x = [x for x in x if x.endswith(".bam")]
        x = [os.path.join(args.bam_path, x) for x in x]
        bam_filenames = x
    assert bam_filenames, "No bam files found."

    positions = []  # list of (chrom, pos)
    for x in args.position:
        chrom, pos = _parse_position(x)
        positions.append((chrom, pos))
    if args.position_file and os.path.exists(args.position_file):
        for cols in filelib.read_cols(args.position_file):
            assert len(cols) == 2, "Position file should have 2 columns"
            chrom, pos = cols
            pos = int(pos)
            assert pos >= 1
            positions.append((chrom, pos))
    assert positions, "No positions specified."

    # Make the commands.
    assert hasattr(config, "samtools")
    filelib.assert_exists(config.samtools)

    # Make sure we have the right version of samtools.
    # 1.2 (using htslib 1.2.1)
    # 0.1.18 (r982:295)
    version = alignlib.get_samtools_version()
    x = version.split(".")
    assert len(x) >= 2
    major = x[0]
    assert major in ["0", "1"], "Unknown samtools version: %s" % version
    major = int(major)
    assert major >= 1, "Requires samtools >= 1 (Current version: %s)" % version

    commands = []
    for x in itertools.product(bam_filenames, positions):
        bam_filename, (chrom, pos) = x

        p, f = os.path.split(bam_filename)
        sample, e = os.path.splitext(f)

        left = max(pos - args.window / 2, 1)
        pos_str = "%s:%s" % (chrom, left)

        x = "%2s.%9s.%s.html" % (chrom, pos, sample)
        if args.prefix:
            x = "%s.%s" % (args.prefix, x)
        if args.outpath:
            x = os.path.join(args.outpath, x)
        out_filename = x

        if args.noclobber and os.path.exists(out_filename):
            continue

        # samtools tview -d t -p 7:100550778 bam01/196B-lung.bam $FA
        sq = parallel.quote
        x = [
            sq(config.samtools),
            "tview",
            "-d",
            "h",
            "-p",
            pos_str,
            sq(bam_filename),
            sq(args.reference_genome),
        ]
        x = " ".join(x)
        x = "%s >& %s" % (x, sq(out_filename))
        commands.append(x)

    if args.dry_run:
        for x in commands:
            print x
        return

    parallel.pshell(commands, max_procs=args.num_procs)
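# _parse_position() is called in main() but not included in this excerpt.  A
# minimal sketch, assuming it accepts the formats described in the --position
# help text ("chr20:45,927,663" or "chr20:45927663", 1-based) and returns
# (chrom, pos) with pos as an int:
def _parse_position(position_str):
    x = position_str.strip()
    assert ":" in x, "Position should look like <chrom>:<pos>: %s" % x
    chrom, pos = x.split(":", 1)
    pos = int(pos.replace(",", ""))  # allow commas, e.g. 45,927,663
    assert chrom, "Missing chromosome: %s" % position_str
    assert pos >= 1, "Positions are 1-based: %s" % position_str
    return chrom, pos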