Example #1
    def getRunStatement(self, infile, outfile, controlfile):
        """
        Generate a specific run statement for each peakcaller class
        """
        # select location of the spp script to run
        if self.PARAMS_PEAKCALLER["spp_options_idr_script"] == "default":
            executable = IOTools.which("run_spp.R")
        elif self.PARAMS_PEAKCALLER["spp_options_idr_script"] == "nodups":
            executable = IOTools.which("run_spp_nodups.R")
        else:
            executable = self.PARAMS_PEAKCALLER["spp_options_idr_script"]
            if not os.path.exists(executable):
                raise IOError("SPP script not found: %s" % executable)

        # select the threshold for lax peak calling
        if self.PARAMS_PEAKCALLER["spp_options_npeaks"]:
            if self.PARAMS_PEAKCALLER["spp_options_fdr"]:
                raise Exception("Value specified for both SPP options"
                                " -npeaks and -fdr please select one or"
                                " other option, but not both")
            else:
                threshold = "-npeaks=" + \
                    str(self.PARAMS_PEAKCALLER["spp_options_npeaks"])
        elif self.PARAMS_PEAKCALLER["spp_options_fdr"]:
            threshold = "-fdr=" + \
                str(self.PARAMS_PEAKCALLER["spp_options_fdr"])
        else:
            raise Exception("Must specify a value for either"
                            " spp_options_npeaks or spp_options_fdr,"
                            " but not both")

        # build run statement for spp.
        # -savn is output.npeak.file (passed as NULL,
        #                             means filename based on infile)
        # -out is output.result.file
        # -odir defaults to os.path.dirname( infile )
        # -savn is save narrowpeak file
        # -savr is save regionpeak file
        #  (run_spp.R script throws an error if region peak is not output).
        statement = [("Rscript %(executable)s"
                      " -c=%(infile)s"
                      " -i=%(controlfile)s"
                      " %(threshold)s"
                      " -savn"
                      " -savr")]

        # add additional options
        statement.append(self.PARAMS_PEAKCALLER["spp_options_parameters"])

        # specify outfile
        statement.append(" -rf"
                         " -out=/stats/phantomPeakStatsReps.tab"
                         " >& %(outfile)s")

        statement = (" ".join(statement) % locals())

        return statement
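
Every example in this listing resolves external tools through IOTools.which, which is not shown here. Below is a minimal sketch of the behaviour these call sites appear to rely on (return the full path of an executable found on $PATH, otherwise None), written against the standard library rather than the actual CGAT implementation.

import os

def which(program):
    # Sketch only: emulates what the examples expect from IOTools.which,
    # namely the absolute path of `program` if it is executable and on
    # $PATH, otherwise None.
    path, name = os.path.split(program)
    if path:
        if os.path.isfile(program) and os.access(program, os.X_OK):
            return program
        return None
    for directory in os.environ.get("PATH", "").split(os.pathsep):
        candidate = os.path.join(directory, name)
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            return candidate
    return None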
Example #3
def runControlCPC(infile, outfile):
    # farm.py is called from within cpc.sh
    assert IOTools.which("farm.py"), "farm.py needs to be in $PATH for cpc to run"
    # Default cpc parameters don't work with later versions of blast
    E.info("Running cpc with blast version:%s" % IOTools.which("blastx"))

    result_evidence = P.snip(outfile, ".result") + ".evidence"
    working_dir = "lncRNA_control/cpc"
    statement = ("%(pipeline_scriptsdir)s/cpc.sh"
                 " %(infile)s"
                 " %(outfile)s"
                 " %(working_dir)s"
                 " %(result_evidence)s")
    P.run()
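
The %(...)s placeholders in `statement` are left unfilled here; runControlCPC relies on P.run() to interpolate them from the caller's local variables and the pipeline configuration before running the command. A rough illustration of that substitution step with hypothetical stand-in values (the real pipeline supplies these itself):

# Hypothetical stand-in values; in the pipeline these come from PARAMS
# and the local variables of the calling task.
values = dict(
    pipeline_scriptsdir="/path/to/scripts",
    infile="transcripts.fasta",
    outfile="lncRNA_control/cpc/cpc.result",
    working_dir="lncRNA_control/cpc",
    result_evidence="lncRNA_control/cpc/cpc.evidence",
)

statement = ("%(pipeline_scriptsdir)s/cpc.sh"
             " %(infile)s"
             " %(outfile)s"
             " %(working_dir)s"
             " %(result_evidence)s")

print(statement % values)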
Example #5
def checkExecutables(filenames):
    """check for the presence/absence of executables"""

    missing = []

    for filename in filenames:
        if not IOTools.which(filename):
            missing.append(filename)

    if missing:
        raise ValueError("missing executables: %s" % ",".join(missing))
Example #7
def main(argv=sys.argv):

    parser = E.OptionParser(
        version="%prog version: $Id: psl2wiggle.py 2834 2009-11-24 16:11:23Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-b", "--output-filename-pattern", dest="output_filename", type="string",
                      help="filename for output [default=%default]")

    parser.add_option("-o", "--output-format", dest="output_format", type="choice",
                      choices=("bedgraph", "wiggle", "bigbed", "bigwig"),
                      help="output format [default=%default]")

    parser.set_defaults(genome_file=None,
                        typecode=numpy.int16,
                        output_filename=None,
                        output_format="wiggle",
                        test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    typecode = options.typecode

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        counts = {}
        contig_sizes = fasta.getContigSizes(with_synonyms=False)
        E.info("allocating memory for %i contigs and %i bytes" %
               (len(contig_sizes), sum(contig_sizes.values()) * typecode().itemsize))
        for contig, size in contig_sizes.items():
            E.debug("allocating %s: %i bases" % (contig, size))
            counts[contig] = numpy.zeros(size, typecode)

        E.info("allocated memory for %i contigs" % len(fasta))

    else:
        fasta = None
        contig_sizes = {}

    if options.output_format in ("bigwig", "bigbed"):

        if not options.genome_file:
            raise ValueError(
                "please supply genome file for bigwig/bigbed computation.")

        if not options.output_filename:
            raise ValueError(
                "please specify an output file for bigwig/bigbed computation.")

        if options.output_format == "bigwig":
            executable_name = "wigToBigWig"
        elif options.output_format == "bigbed":
            executable_name = "bedToBigBed"
        else:
            raise ValueError("unknown output format `%s`" %
                             options.output_format)

        executable = IOTools.which(executable_name)

        if not executable:
            raise OSError("could not find %s in path." % executable_name)

        tmpdir = tempfile.mkdtemp()
        E.debug("temporary files are in %s" % tmpdir)

        tmpfile_wig = os.path.join(tmpdir, "wig")
        tmpfile_sizes = os.path.join(tmpdir, "sizes")

        # write contig sizes
        outfile_size = open(tmpfile_sizes, "w")
        for contig, size in contig_sizes.items():
            outfile_size.write("%s\t%s\n" % (contig, size))
        outfile_size.close()

        outfile = open(tmpfile_wig, "w")

    else:
        outfile = options.stdout

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, ncontigs, nskipped = 0, 0, 0

    E.info("started counting")

    while 1:

        if options.test and ninput >= options.test:
            break

        match = iterator.next()

        if match is None:
            break

        ninput += 1

        contig = match.mSbjctId

        for start, length in zip(match.mSbjctBlockStarts, match.mBlockSizes):
            counts[contig][start:start + length] += 1

    E.info("finished counting")

    if options.output_format in ("wig", "bigwig"):
        E.info("starting wig output")

        for contig, vals in counts.items():

            E.debug("output for %s" % contig)
            for val, iter in itertools.groupby(enumerate(vals), lambda x: x[1]):
                l = list(iter)
                start, end = l[0][0], l[-1][0]
                val = vals[start]
                if val > 0:
                    # wiggle variableStep positions are one-based
                    outfile.write("variableStep chrom=%s span=%i\n" %
                                  (contig, end - start + 1))
                    outfile.write("%i\t%i\n" % (start + 1, val))

            ncontigs += 1
    elif options.output_format in ("bedgraph", "bigbed"):

        E.info("starting bedgraph output")

        for contig, vals in counts.items():
            E.debug("output for %s" % contig)
            for val, iter in itertools.groupby(enumerate(vals), lambda x: x[1]):
                l = list(iter)
                start, end = l[0][0], l[-1][0]
                val = vals[start]
                if val > 0:
                    outfile.write("%s\t%i\t%i\t%i\n" %
                                  (contig, start, end + 1, val))

            ncontigs += 1

    E.info("finished output")

    if options.output_format in ("bigwig", "bigbed"):
        outfile.close()

        E.info("starting bigwig conversion")
        try:
            retcode = subprocess.call(" ".join((executable,
                                                tmpfile_wig,
                                                tmpfile_sizes,
                                                os.path.abspath(options.output_filename)), ),
                                      shell=True)
            if retcode < 0:
                E.warn("wigToBigWig terminated with signal: %i" % -retcode)
                return -retcode
        except OSError, msg:
            E.warn("Error while executing %s: %s" % (executable_name, msg))
            return 1

        shutil.rmtree(tmpdir)

        E.info("finished bigwig conversion")
Example #8
def main(argv=None):
    """script main.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-o",
                      "--output-format",
                      dest="output_format",
                      type="choice",
                      choices=("bedgraph", "wiggle", "bigbed", "bigwig",
                               "bed"),
                      help="output format [default=%default]")

    parser.add_option("-b",
                      "--output-filename",
                      dest="output_filename",
                      type="string",
                      help="filename for output [default=%default]")

    parser.add_option("-s",
                      "--shift",
                      dest="shift",
                      type="int",
                      help="shift reads by a certain amount (ChIP-Seq) "
                      "[%default]")

    parser.add_option("-e",
                      "--extend",
                      dest="extend",
                      type="int",
                      help="extend reads by a certain amount "
                      "(ChIP-Seq) [%default]")

    parser.add_option("-p",
                      "--span",
                      dest="span",
                      type="int",
                      help="span of a window in wiggle tracks "
                      "[%default]")

    parser.add_option("-m",
                      "--merge-pairs",
                      dest="merge_pairs",
                      action="store_true",
                      help="merge paired-ended reads into a single "
                      "bed interval [default=%default]. ")

    parser.add_option("--max-insert-size",
                      dest="max_insert_size",
                      type="int",
                      help="only merge if insert size less that "
                      "# bases. 0 turns of this filter "
                      "[default=%default].")

    parser.add_option("--min-insert-size",
                      dest="min_insert_size",
                      type="int",
                      help="only merge paired-end reads if they are "
                      "at least # bases apart. "
                      "0 turns of this filter. [default=%default]")

    parser.set_defaults(
        samfile=None,
        output_format="wiggle",
        output_filename=None,
        shift=0,
        extend=0,
        span=1,
        merge_pairs=None,
        min_insert_size=0,
        max_insert_size=0,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) >= 1:
        options.samfile = args[0]

    if not options.samfile:
        raise ValueError("please provide a bam file")

    if len(args) == 2:
        options.output_filename = args[1]

    samfile = pysam.Samfile(options.samfile, "rb")

    contig_sizes = dict(zip(samfile.references, samfile.lengths))

    if options.shift or options.extend:
        if options.output_format != "bigwig":
            raise ValueError(
                "shift and extend only available for bigwig output")

    if options.output_format in ("bigwig", "bigbed"):

        if not options.output_filename:
            raise ValueError(
                "please specify an output file for bigwig/bigbed computation.")

        if options.output_format == "bigwig":
            executable_name = "wigToBigWig"
        elif options.output_format == "bigbed":
            executable_name = "bedToBigBed"
        else:
            raise ValueError("unknown output format `%s`" %
                             options.output_format)

        executable = IOTools.which(executable_name)

        if not executable:
            raise OSError("could not find %s in path." % executable_name)

        tmpdir = tempfile.mkdtemp()
        E.debug("temporary files are in %s" % tmpdir)

        tmpfile_wig = os.path.join(tmpdir, "wig")
        tmpfile_sizes = os.path.join(tmpdir, "sizes")

        # write contig sizes
        outfile_size = open(tmpfile_sizes, "w")
        for contig, size in contig_sizes.items():
            outfile_size.write("%s\t%s\n" % (contig, size))
        outfile_size.close()

        outfile = open(tmpfile_wig, "w")
        E.info("starting output to %s" % tmpfile_wig)

    else:
        outfile = options.stdout
        E.info("starting output to stdout")

    if options.output_format in ("wiggle", "bigwig"):
        # wiggle is one-based, so add 1, also step-size is 1, so need
        # to output all bases
        if options.span == 1:
            outf = lambda outfile, contig, start, end, val: \
                outfile.write(
                    "".join(["%i\t%i\n" % (x, val)
                             for x in xrange(start + 1, end + 1)]))
        else:
            outf = SpanWriter(options.span)

    elif options.output_format in ("bed", "bigbed"):
        # bed is 0-based, open-closed
        outf = lambda outfile, contig, start, end, val: \
            outfile.write("%s\t%i\t%i\t%i\n" % (contig, start, end, val))

    ninput, nskipped, ncontigs = 0, 0, 0

    output_filename = options.output_filename
    if output_filename:
        output_filename = os.path.abspath(output_filename)

    if options.shift > 0 or options.extend > 0 or options.merge_pairs:
        # Workflow 1: convert to bed intervals and use bedtools
        # genomecov to build a coverage file.
        # Convert to bigwig with UCSC tools bedGraph2BigWig

        if options.merge_pairs:
            E.info("merging pairs to temporary file")
            counter = _bam2bed.merge_pairs(
                samfile,
                outfile,
                min_insert_size=options.min_insert_size,
                max_insert_size=options.max_insert_size,
                bed_format=3)
        else:
            # create bed file with shifted tags
            shift, extend = options.shift, options.extend
            shift_extend = shift + extend

            for contig in samfile.references:
                E.debug("output for %s" % contig)
                lcontig = contig_sizes[contig]

                for read in samfile.fetch(contig):
                    pos = read.pos
                    if read.is_reverse:
                        start = max(0, read.pos + read.alen - shift_extend)
                    else:
                        start = max(0, read.pos + shift)

                    # intervals extending beyond contig are removed
                    if start >= lcontig:
                        continue

                    end = min(lcontig, start + extend)
                    outfile.write("%s\t%i\t%i\n" % (contig, start, end))

        outfile.close()

        tmpfile_bed = os.path.join(tmpdir, "bed")
        E.info("computing coverage")
        # calculate coverage - format is bedgraph
        statement = """bedtools genomecov -bg -i %(tmpfile_wig)s
        -g %(tmpfile_sizes)s > %(tmpfile_bed)s""" % locals()
        E.run(statement)

        E.info("converting to bigwig")
        tmpfile_sorted = os.path.join(tmpdir, "sorted")
        statement = """sort -k 1,1 -k2,2n %(tmpfile_bed)s
        > %(tmpfile_sorted)s;
        bedGraphToBigWig %(tmpfile_sorted)s
        %(tmpfile_sizes)s
        %(output_filename)s""" % locals()
        E.run(statement)

        shutil.rmtree(tmpdir)

    else:
        # Workflow 2: use pysam column iterator to build a
        # wig file. Then convert to bigwig of bedgraph file
        # with UCSC tools.
        def column_iter(iterator):

            start = None
            end = 0
            n = None
            for t in iterator:
                if t.pos - end > 1 or n != t.n:
                    if start is not None:
                        yield start, end, n
                    start = t.pos
                    end = t.pos
                    n = t.n
                end = t.pos
            yield start, end, n

        for contig in samfile.references:
            # if contig != "chrX": continue
            E.debug("output for %s" % contig)
            lcontig = contig_sizes[contig]

            if options.output_format in ("wiggle", "bigwig"):
                outfile.write("variableStep chrom=%s span=%i\n" %
                              (contig, options.span))

            for start, end, val in column_iter(samfile.pileup(contig)):

                # patch: there was a problem with bam files and reads
                # overextending at the end. These are usually Ns, but
                # need to check as otherwise wigToBigWig fails.
                if lcontig <= end:
                    E.warn("read extending beyond contig: %s: %i > %i" %
                           (contig, end, lcontig))
                    end = lcontig
                    if start >= end:
                        continue

                if val > 0:
                    outf(outfile, contig, start, end, val)
            ncontigs += 1

        if isinstance(outf, SpanWriter):
            outf.flush(outfile)

        E.info("finished output")

        E.info("ninput=%i, ncontigs=%i, nskipped=%i" %
               (ninput, ncontigs, nskipped))

        if options.output_format in ("bigwig", "bigbed"):
            outfile.close()

            E.info("starting %s conversion" % executable)
            try:
                retcode = subprocess.call(" ".join(
                    (executable, tmpfile_wig, tmpfile_sizes, output_filename)),
                                          shell=True)
                if retcode != 0:
                    E.warn("%s terminated with non-zero exit code: %i" %
                           (executable, retcode))
                    return retcode
            except OSError, msg:
                E.warn("Error while executing %s: %s" % (executable, msg))
                return 1

            shutil.rmtree(tmpdir)

            E.info("finished bigwig conversion")
Example #9
def checkDepedencies(pipeline):

    # check existence of pipeline script
    if not os.access(pipeline, os.R_OK):
        raise IOError("Pipeline %s was not found\n" % pipeline)

    if os.path.isdir(pipeline):
        raise IOError("The given input is a folder, and must be a script\n")

    # parse pipeline script
    with open(pipeline) as f:
        tree = ast.parse(f.read())

    # list to store all statements = ''' <commands> '''
    statements = []

    # inspired by
    # https://docs.python.org/3/library/ast.html#module-ast
    # http://bit.ly/2rDf5xu
    # http://bit.ly/2r0Uv9t
    for node in ast.walk(tree):
        if type(node) is ast.Assign and \
           hasattr(node, 'targets') and \
           hasattr(node.targets[0], 'id') and \
           node.targets[0].id == "statement" and \
           hasattr(node.value, 's'):
            statement = node.value.s
            # clean up statement, code copied from Execution module of Pipeline.py
            statement = " ".join(re.sub("\t+", " ",
                                        statement).split("\n")).strip()
            if statement.endswith(";"):
                statement = statement[:-1]
            statements.append(statement)

    # dictionary where:
    # key = program name
    # value = number of times it has been called
    deps = {}

    # set of names that are not proper deps
    exceptions = [
        'create', 'drop', 'select', 'attach', 'insert', 'module', 'checkpoint',
        'for'
    ]

    for statement in statements:
        for command in statement.split("|"):
            # take program name, thanks http://pythex.org/
            groups = re.match("^\s*([\w|\-|\.]+)", command)
            if groups is not None:
                # program name is first match
                prog_name = groups.group(0)
                # clean up duplicated white spaces
                prog_name = ' '.join(prog_name.split())
                # filter exceptions
                if prog_name.lower() not in exceptions:
                    if prog_name not in deps:
                        deps[prog_name] = 1
                    else:
                        deps[prog_name] += 1

    # list of unmet dependencies
    check_path_failures = []

    # print dictionary ordered by value
    for k in sorted(deps, key=deps.get, reverse=True):
        if IOTools.which(k) is None:
            check_path_failures.append(k)

    return deps, check_path_failures
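
A brief usage sketch for checkDepedencies: point it at a pipeline script, print how often each external program is referenced, and list anything not found on $PATH. The script name below is illustrative.

deps, failures = checkDepedencies("pipeline_example.py")
for prog in sorted(deps, key=deps.get, reverse=True):
    print("%-20s referenced %i time(s)" % (prog, deps[prog]))
if failures:
    print("not found on $PATH: %s" % ", ".join(failures))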
Example #10
def main(argv=None):
    """script main.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("-o", "--output-format", dest="output_format",
                      type="choice",
                      choices=(
                          "bedgraph", "wiggle", "bigbed",
                          "bigwig", "bed"),
                      help="output format [default=%default]")

    parser.add_option("-b", "--output-filename",
                      dest="output_filename", type="string",
                      help="filename for output [default=%default]")

    parser.add_option("-s", "--shift", dest="shift", type="int",
                      help="shift reads by a certain amount (ChIP-Seq) "
                      "[%default]")

    parser.add_option("-e", "--extend", dest="extend", type="int",
                      help="extend reads by a certain amount "
                      "(ChIP-Seq) [%default]")

    parser.add_option("-p", "--span", dest="span", type="int",
                      help="span of a window in wiggle tracks "
                      "[%default]")

    parser.add_option("-m", "--merge-pairs", dest="merge_pairs",
                      action="store_true",
                      help="merge paired-ended reads into a single "
                      "bed interval [default=%default]. ")

    parser.add_option("--max-insert-size", dest="max_insert_size",
                      type="int",
                      help="only merge if insert size less that "
                      "# bases. 0 turns of this filter "
                      "[default=%default].")

    parser.add_option("--min-insert-size", dest="min_insert_size",
                      type="int",
                      help="only merge paired-end reads if they are "
                      "at least # bases apart. "
                      "0 turns of this filter. [default=%default]")

    parser.set_defaults(
        samfile=None,
        output_format="wiggle",
        output_filename=None,
        shift=0,
        extend=0,
        span=1,
        merge_pairs=None,
        min_insert_size=0,
        max_insert_size=0,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)
    if len(args) >= 1:
        options.samfile = args[0]
    if len(args) == 2:
        options.output_filename = args[1]
    if not options.samfile:
        raise ValueError("please provide a bam file")

    # Read BAM file using Pysam
    samfile = pysam.Samfile(options.samfile, "rb")

    # Create temporary files / folders
    tmpdir = tempfile.mkdtemp()
    E.debug("temporary files are in %s" % tmpdir)
    tmpfile_wig = os.path.join(tmpdir, "wig")
    tmpfile_sizes = os.path.join(tmpdir, "sizes")

    # Create dictionary of contig sizes
    contig_sizes = dict(zip(samfile.references, samfile.lengths))
    # write contig sizes
    outfile_size = open(tmpfile_sizes, "w")
    for contig, size in contig_sizes.items():
        outfile_size.write("%s\t%s\n" % (contig, size))
    outfile_size.close()

    # Shift and extend only available for bigwig format
    if options.shift or options.extend:
        if options.output_format != "bigwig":
            raise ValueError(
                "shift and extend only available for bigwig output")

    # Output filename required for bigwig / bigbed computation
    if options.output_format == "bigwig":
        if not options.output_filename:
            raise ValueError(
                "please specify an output file for bigwig computation.")

        # Define executable to use for binary conversion
        if options.output_format == "bigwig":
            executable_name = "wigToBigWig"
        else:
            raise ValueError("unknown output format `%s`" %
                             options.output_format)

        # check required executable file is in the path
        executable = IOTools.which(executable_name)
        if not executable:
            raise OSError("could not find %s in path." % executable_name)

        # Open output file
        outfile = open(tmpfile_wig, "w")
        E.info("starting output to %s" % tmpfile_wig)
    else:
        outfile = options.stdout
        E.info("starting output to stdout")

    # Set up output write functions
    if options.output_format in ("wiggle", "bigwig"):
        # wiggle is one-based, so add 1, also step-size is 1, so need
        # to output all bases
        if options.span == 1:
            outf = lambda outfile, contig, start, end, val: \
                outfile.write(
                    "".join(["%i\t%i\n" % (x, val)
                             for x in xrange(start + 1, end + 1)]))
        else:
            outf = SpanWriter(options.span)
    elif options.output_format == "bedgraph":
        # bed is 0-based, open-closed
        outf = lambda outfile, contig, start, end, val: \
            outfile.write("%s\t%i\t%i\t%i\n" % (contig, start, end, val))

    # initialise counters
    ninput, nskipped, ncontigs = 0, 0, 0

    # set output file name
    output_filename = options.output_filename
    if output_filename:
        output_filename = os.path.abspath(output_filename)

    # shift and extend or merge pairs. Output temp bed file
    if options.shift > 0 or options.extend > 0 or options.merge_pairs:
        # Workflow 1: convert to bed intervals and use bedtools
        # genomecov to build a coverage file.
        # Convert to bigwig with UCSC tools bedGraph2BigWig

        if options.merge_pairs:
            # merge pairs using bam2bed
            E.info("merging pairs to temporary file")
            counter = _bam2bed.merge_pairs(
                samfile,
                outfile,
                min_insert_size=options.min_insert_size,
                max_insert_size=options.max_insert_size,
                bed_format=3)
        else:
            # create bed file with shifted/extended tags
            shift, extend = options.shift, options.extend
            shift_extend = shift + extend

            for contig in samfile.references:
                E.debug("output for %s" % contig)
                lcontig = contig_sizes[contig]

                for read in samfile.fetch(contig):
                    pos = read.pos
                    if read.is_reverse:
                        start = max(0, read.pos + read.alen - shift_extend)
                    else:
                        start = max(0, read.pos + shift)

                    # intervals extending beyond contig are removed
                    if start >= lcontig:
                        continue

                    end = min(lcontig, start + extend)
                    outfile.write("%s\t%i\t%i\n" % (contig, start, end))

        outfile.close()

        # Convert bed file to coverage file (bedgraph)
        tmpfile_bed = os.path.join(tmpdir, "bed")
        E.info("computing coverage")
        # calculate coverage - format is bedgraph
        statement = """bedtools genomecov -bg -i %(tmpfile_wig)s
        -g %(tmpfile_sizes)s > %(tmpfile_bed)s""" % locals()
        E.run(statement)

        # Convert bedgraph to bigwig
        E.info("converting to bigwig")
        tmpfile_sorted = os.path.join(tmpdir, "sorted")
        statement = ("sort -k 1,1 -k2,2n %(tmpfile_bed)s > %(tmpfile_sorted)s;"
                     "bedGraphToBigWig %(tmpfile_sorted)s %(tmpfile_sizes)s "
                     "%(output_filename)s" % locals())
        E.run(statement)

    else:
        # Workflow 2: use pysam column iterator to build a
        # wig file. Then convert to bigwig of bedgraph file
        # with UCSC tools.
        def column_iter(iterator):
            start = None
            end = 0
            n = None
            for t in iterator:
                if t.pos - end > 1 or n != t.n:
                    if start is not None:
                        yield start, end, n
                    start = t.pos
                    end = t.pos
                    n = t.n
                end = t.pos
            yield start, end, n

        # Bedgraph track definition
        if options.output_format == "bedgraph":
            outfile.write("track type=bedGraph\n")

        for contig in samfile.references:
            # if contig != "chrX": continue
            E.debug("output for %s" % contig)
            lcontig = contig_sizes[contig]

            # Write wiggle header
            if options.output_format in ("wiggle", "bigwig"):
                outfile.write("variableStep chrom=%s span=%i\n" %
                              (contig, options.span))

            # Generate pileup per contig using pysam and iterate over columns
            for start, end, val in column_iter(samfile.pileup(contig)):
                # patch: there was a problem with bam files and reads
                # overextending at the end. These are usually Ns, but
                # need to check as otherwise wigToBigWig fails.
                if lcontig <= end:
                    E.warn("read extending beyond contig: %s: %i > %i" %
                           (contig, end, lcontig))
                    end = lcontig
                    if start >= end:
                        continue

                if val > 0:
                    outf(outfile, contig, start, end, val)
            ncontigs += 1

        # Close output file
        if isinstance(outf, SpanWriter):
            outf.flush(outfile)
        E.info("finished output")

        # Report counters
        E.info("ninput=%i, ncontigs=%i, nskipped=%i" %
               (ninput, ncontigs, nskipped))

        # Convert to binary formats
        if options.output_format == "bigwig":
            outfile.close()

            E.info("starting %s conversion" % executable)
            try:
                retcode = subprocess.call(" ".join((executable,
                                                    tmpfile_wig,
                                                    tmpfile_sizes,
                                                    output_filename)),
                                          shell=True)
                if retcode != 0:
                    E.warn("%s terminated with non-zero exit code: %i" %
                           (executable, retcode))
                    return retcode
            except OSError, msg:
                E.warn("Error while executing %s: %s" % (executable, msg))
                return 1
            E.info("finished bigwig conversion")
Example #11
    def isInstalled(self):
        path = IOTools.which(self.tool_definition['executable'])
        if path is None:
            return False
        return True
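
isInstalled only needs a tool_definition mapping with an 'executable' entry. A minimal, hypothetical harness exercising it; it assumes the CGAT IOTools module is importable, as in the examples above.

class ToolCheck(object):
    # Hypothetical owner class; the real tool_definition is presumably
    # loaded from a tool configuration file.
    def __init__(self, tool_definition):
        self.tool_definition = tool_definition

    def isInstalled(self):
        return IOTools.which(self.tool_definition['executable']) is not None

print(ToolCheck({"executable": "samtools"}).isInstalled())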
Example #13
def main(argv=None):
    """script main.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-o",
                      "--output-format",
                      dest="output_format",
                      type="choice",
                      choices=("bedgraph", "wiggle", "bigbed", "bigwig",
                               "bed"),
                      help="output format [default=%default]")

    parser.add_option("-s",
                      "--shift-size",
                      dest="shift",
                      type="int",
                      help="shift reads by a certain amount (ChIP-Seq) "
                      "[%default]")

    parser.add_option("-e",
                      "--extend",
                      dest="extend",
                      type="int",
                      help="extend reads by a certain amount "
                      "(ChIP-Seq) [%default]")

    parser.add_option("-p",
                      "--wiggle-span",
                      dest="span",
                      type="int",
                      help="span of a window in wiggle tracks "
                      "[%default]")

    parser.add_option("-m",
                      "--merge-pairs",
                      dest="merge_pairs",
                      action="store_true",
                      help="merge paired-ended reads into a single "
                      "bed interval [default=%default].")

    parser.add_option("--scale-base",
                      dest="scale_base",
                      type="float",
                      help="number of reads/pairs to scale bigwig file to. "
                      "The default is to scale to 1M reads "
                      "[default=%default]")

    parser.add_option("--scale-method",
                      dest="scale_method",
                      type="choice",
                      choices=(
                          "none",
                          "reads",
                      ),
                      help="scale bigwig output. 'reads' will normalize by "
                      "the total number reads in the bam file that are used "
                      "to construct the bigwig file. If --merge-pairs is used "
                      "the number of pairs output will be used for "
                      "normalization. 'none' will not scale the bigwig file"
                      "[default=%default]")

    parser.add_option("--max-insert-size",
                      dest="max_insert_size",
                      type="int",
                      help="only merge if insert size less that "
                      "# bases. 0 turns of this filter "
                      "[default=%default].")

    parser.add_option("--min-insert-size",
                      dest="min_insert_size",
                      type="int",
                      help="only merge paired-end reads if they are "
                      "at least # bases apart. "
                      "0 turns of this filter. [default=%default]")

    parser.set_defaults(
        samfile=None,
        output_format="wiggle",
        shift=0,
        extend=0,
        span=1,
        merge_pairs=None,
        min_insert_size=0,
        max_insert_size=0,
        scale_method='none',
        scale_base=1000000,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    if len(args) >= 1:
        options.samfile = args[0]
    if len(args) == 2:
        options.output_filename_pattern = args[1]
    if not options.samfile:
        raise ValueError("please provide a bam file")

    # Read BAM file using Pysam
    samfile = pysam.AlignmentFile(options.samfile, "rb")

    # Create temporary files / folders
    tmpdir = tempfile.mkdtemp()
    E.debug("temporary files are in %s" % tmpdir)
    tmpfile_wig = os.path.join(tmpdir, "wig")
    tmpfile_sizes = os.path.join(tmpdir, "sizes")

    # Create dictionary of contig sizes
    contig_sizes = dict(list(zip(samfile.references, samfile.lengths)))
    # write contig sizes
    outfile_size = IOTools.openFile(tmpfile_sizes, "w")
    for contig, size in sorted(contig_sizes.items()):
        outfile_size.write("%s\t%s\n" % (contig, size))
    outfile_size.close()

    # Shift and extend only available for bigwig format
    if options.shift or options.extend:
        if options.output_format != "bigwig":
            raise ValueError(
                "shift and extend only available for bigwig output")

    # Output filename required for bigwig / bigbed computation
    if options.output_format == "bigwig":
        if not options.output_filename_pattern:
            raise ValueError(
                "please specify an output file for bigwig computation.")

        # Define executable to use for binary conversion
        if options.output_format == "bigwig":
            executable_name = "wigToBigWig"
        else:
            raise ValueError("unknown output format `%s`" %
                             options.output_format)

        # check required executable file is in the path
        executable = IOTools.which(executable_name)
        if not executable:
            raise OSError("could not find %s in path." % executable_name)

        # Open output file
        outfile = IOTools.openFile(tmpfile_wig, "w")
        E.info("starting output to %s" % tmpfile_wig)
    else:
        outfile = IOTools.openFile(tmpfile_wig, "w")
        E.info("starting output to stdout")

    # Set up output write functions
    if options.output_format in ("wiggle", "bigwig"):
        # wiggle is one-based, so add 1, also step-size is 1, so need
        # to output all bases
        if options.span == 1:
            outf = lambda outfile, contig, start, end, val: \
                outfile.write(
                    "".join(["%i\t%i\n" % (x, val)
                             for x in range(start + 1, end + 1)]))
        else:
            outf = SpanWriter(options.span)
    elif options.output_format == "bedgraph":
        # bed is 0-based, open-closed
        outf = lambda outfile, contig, start, end, val: \
            outfile.write("%s\t%i\t%i\t%i\n" % (contig, start, end, val))

    # initialise counters
    ninput, nskipped, ncontigs = 0, 0, 0

    # set output file name
    output_filename_pattern = options.output_filename_pattern
    if output_filename_pattern:
        output_filename = os.path.abspath(output_filename_pattern)

    # shift and extend or merge pairs. Output temporary bed file
    if options.shift > 0 or options.extend > 0 or options.merge_pairs:
        # Workflow 1: convert to bed intervals and use bedtools
        # genomecov to build a coverage file.
        # Convert to bigwig with UCSC tools bedGraph2BigWig

        if options.merge_pairs:
            # merge pairs using bam2bed
            E.info("merging pairs to temporary file")
            counter = _bam2bed.merge_pairs(
                samfile,
                outfile,
                min_insert_size=options.min_insert_size,
                max_insert_size=options.max_insert_size,
                bed_format=3)
            E.info("merging results: {}".format(counter))
            if counter.output == 0:
                raise ValueError("no pairs output after merging")
        else:
            # create bed file with shifted/extended tags
            shift, extend = options.shift, options.extend
            shift_extend = shift + extend
            counter = E.Counter()

            for contig in samfile.references:
                E.debug("output for %s" % contig)
                lcontig = contig_sizes[contig]

                for read in samfile.fetch(contig):
                    pos = read.pos
                    if read.is_reverse:
                        start = max(0, read.pos + read.alen - shift_extend)
                    else:
                        start = max(0, read.pos + shift)

                    # intervals extending beyond contig are removed
                    if start >= lcontig:
                        continue

                    end = min(lcontig, start + extend)
                    outfile.write("%s\t%i\t%i\n" % (contig, start, end))
                    counter.output += 1

        outfile.close()

        if options.scale_method == "reads":
            scale_factor = float(options.scale_base) / counter.output

            E.info("scaling: method=%s scale_quantity=%i scale_factor=%f" %
                   (options.scale_method, counter.output, scale_factor))
            scale = "-scale %f" % scale_factor
        else:
            scale = ""

        # Convert bed file to coverage file (bedgraph)
        tmpfile_bed = os.path.join(tmpdir, "bed")
        E.info("computing coverage")
        # calculate coverage - format is bedgraph
        statement = """bedtools genomecov -bg -i %(tmpfile_wig)s %(scale)s
        -g %(tmpfile_sizes)s > %(tmpfile_bed)s""" % locals()
        E.run(statement)

        # Convert bedgraph to bigwig
        E.info("converting to bigwig")
        tmpfile_sorted = os.path.join(tmpdir, "sorted")
        statement = ("sort -k 1,1 -k2,2n %(tmpfile_bed)s > %(tmpfile_sorted)s;"
                     "bedGraphToBigWig %(tmpfile_sorted)s %(tmpfile_sizes)s "
                     "%(output_filename_pattern)s" % locals())
        E.run(statement)

    else:

        # Workflow 2: use pysam column iterator to build a
        # wig file. Then convert to bigwig of bedgraph file
        # with UCSC tools.
        def column_iter(iterator):
            start = None
            end = 0
            n = None
            for t in iterator:
                if t.pos - end > 1 or n != t.n:
                    if start is not None:
                        yield start, end, n
                    start = t.pos
                    end = t.pos
                    n = t.n
                end = t.pos
            yield start, end, n

        if options.scale_method != "none":
            raise NotImplementedError(
                "scaling not implemented for pileup method")

        # Bedgraph track definition
        if options.output_format == "bedgraph":
            outfile.write("track type=bedGraph\n")

        for contig in samfile.references:
            # if contig != "chrX": continue
            E.debug("output for %s" % contig)
            lcontig = contig_sizes[contig]

            # Write wiggle header
            if options.output_format in ("wiggle", "bigwig"):
                outfile.write("variableStep chrom=%s span=%i\n" %
                              (contig, options.span))

            # Generate pileup per contig using pysam and iterate over columns
            for start, end, val in column_iter(samfile.pileup(contig)):
                # patch: there was a problem with bam files and reads
                # overextending at the end. These are usually Ns, but
                # need to check as otherwise wigToBigWig fails.
                if lcontig <= end:
                    E.warn("read extending beyond contig: %s: %i > %i" %
                           (contig, end, lcontig))
                    end = lcontig
                    if start >= end:
                        continue

                if val > 0:
                    outf(outfile, contig, start, end, val)
            ncontigs += 1

        # Close output file
        if isinstance(outf, SpanWriter):
            outf.flush(outfile)
        else:
            outfile.flush()

        E.info("finished output")

        # Report counters
        E.info("ninput=%i, ncontigs=%i, nskipped=%i" %
               (ninput, ncontigs, nskipped))

        # Convert to binary formats
        if options.output_format == "bigwig":
            outfile.close()

            E.info("starting %s conversion" % executable)
            try:
                retcode = subprocess.call(" ".join(
                    (executable, tmpfile_wig, tmpfile_sizes,
                     output_filename_pattern)),
                                          shell=True)
                if retcode != 0:
                    E.warn("%s terminated with non-zero exit code: %i" %
                           (executable, retcode))
                    return retcode
            except OSError as msg:
                E.warn("Error while executing bigwig: %s" % msg)
                return 1
            E.info("finished bigwig conversion")
        else:
            with open(tmpfile_wig) as inf:
                sys.stdout.write(inf.read())

    # Cleanup temp files
    shutil.rmtree(tmpdir)

    E.Stop()
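
column_iter (Examples #8, #10 and #13) collapses consecutive pileup columns with the same depth into (start, end, depth) runs, starting a new run whenever the depth changes or the positions jump by more than one base. A standalone check of that behaviour, with a namedtuple standing in for pysam's pileup columns:

from collections import namedtuple

# Stand-in for a pysam pileup column; only .pos and .n (depth) are used.
Column = namedtuple("Column", "pos n")

def column_iter(iterator):
    start, end, n = None, 0, None
    for t in iterator:
        if t.pos - end > 1 or n != t.n:
            if start is not None:
                yield start, end, n
            start, end, n = t.pos, t.pos, t.n
        end = t.pos
    yield start, end, n

columns = [Column(0, 2), Column(1, 2), Column(2, 2),
           Column(3, 5), Column(10, 1)]
print(list(column_iter(columns)))
# -> [(0, 2, 2), (3, 3, 5), (10, 10, 1)]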