def loadMemeSummary(infiles, outfile):
    '''load information about motifs into database.

    Writes a one-column table (header ``track``) with one row per
    non-empty input file to a temporary file in the current directory
    and hands that file to ``P.load``.

    Arguments
    ---------
    infiles : list
        Filenames of meme results. The ``.meme`` suffix is removed with
        ``P.snip`` to obtain the value written. Empty files are skipped.
    outfile : string
        Passed on to ``P.load`` as the load target.
    '''

    outf = P.get_temp_file(".")

    try:
        try:
            outf.write("track\n")

            for infile in infiles:
                if IOTools.is_empty(infile):
                    continue
                motif = P.snip(infile, ".meme")
                outf.write("%s\n" % motif)
        finally:
            # the loader reads the file by name, so it must be flushed
            # and closed before P.load is called
            outf.close()

        P.load(outf.name, outfile)
    finally:
        # remove the temporary file even if writing or loading failed
        if os.path.exists(outf.name):
            os.unlink(outf.name)
# Example #2
# 0
def loadMotifInformation(infiles, outfile):
    '''load information about motifs into database.

    Writes a one-column table (header ``motif``) with one row per
    non-empty input file to a temporary file in the current directory
    and hands that file to ``P.load`` together with the
    ``--allow-empty-file`` option.

    Arguments
    ---------
    infiles : list
        Filenames of motif files. The ``.motif`` suffix is removed with
        ``P.snip`` to obtain the value written. Empty files are skipped.
    outfile : string
        Passed on to ``P.load`` as the load target.
    '''

    outf = P.get_temp_file(".")

    try:
        try:
            outf.write("motif\n")

            for infile in infiles:
                if IOTools.is_empty(infile):
                    continue
                motif = P.snip(infile, ".motif")
                outf.write("%s\n" % motif)
        finally:
            # the loader reads the file by name, so it must be flushed
            # and closed before P.load is called
            outf.close()

        # the table may contain only the header if all inputs were empty
        P.load(outf.name, outfile, "--allow-empty-file")
    finally:
        # remove the temporary file even if writing or loading failed
        if os.path.exists(outf.name):
            os.unlink(outf.name)
# Example #3
# 0
def loadMemeSummary(infiles, outfile):
    '''load information about motifs into database.

    Writes a two-column table (``method``, ``track``) with one row per
    non-empty input file to a temporary file in the current directory
    and hands that file to ``P.load``.

    Arguments
    ---------
    infiles : list
        Filenames of motif results. The method is the part of the path
        matched before ``<any character>dir/``; the track is the
        basename with the last dot-separated suffix removed. Empty
        files are skipped.
    outfile : string
        Passed on to ``P.load`` as the load target.

    Raises
    ------
    ValueError
        If a filename does not contain a ``*.dir/`` component from which
        the method can be derived.
    '''

    outf = P.get_temp_file(".")

    try:
        try:
            outf.write("method\ttrack\n")

            for infile in infiles:
                if IOTools.is_empty(infile):
                    continue
                found = re.match("(.+).dir/", infile)
                if found is None:
                    # fail with a readable message instead of an
                    # AttributeError on None
                    raise ValueError(
                        "can not derive method from filename %s" % infile)
                method = found.groups()[0]
                track = os.path.basename(".".join(infile.split(".")[:-1]))
                outf.write("%s\t%s\n" % (method, track))
        finally:
            # the loader reads the file by name, so it must be flushed
            # and closed before P.load is called
            outf.close()

        P.load(outf.name, outfile)
    finally:
        # remove the temporary file even if writing or loading failed
        if os.path.exists(outf.name):
            os.unlink(outf.name)
def loadMemeChipSummary(infiles, outfile):
    '''load information about motifs into database.

    Writes a table with the columns ``track``, ``npeaks``, ``width``,
    ``masking`` and ``path`` to a temporary file in the current
    directory and hands that file to ``P.load``.

    Arguments
    ---------
    infiles : list
        Filenames ending in ``.memechip``. After removing the suffix the
        basename must consist of exactly four dot-separated fields
        (track, npeaks, width, masking); otherwise the tuple unpacking
        raises :class:`ValueError`. Empty files are skipped.
    outfile : string
        Passed on to ``P.load`` as the load target.
    '''

    outf = P.get_temp_file(".")

    try:
        try:
            outf.write("track\tnpeaks\twidth\tmasking\tpath\n")

            for infile in infiles:
                if IOTools.is_empty(infile):
                    continue
                fn = P.snip(os.path.basename(infile), ".memechip")

                track, npeaks, width, masking = fn.split(".")
                # all fields are already strings; the suffix-less
                # basename is stored in the ``path`` column
                outf.write(
                    "\t".join((track, npeaks, width, masking, fn)) + "\n")
        finally:
            # the loader reads the file by name, so it must be flushed
            # and closed before P.load is called
            outf.close()

        P.load(outf.name, outfile)
    finally:
        # remove the temporary file even if writing or loading failed
        if os.path.exists(outf.name):
            os.unlink(outf.name)
def runMAST(infiles, outfile):
    '''run mast on all intervals and motifs.

    Collect all results for an E-value up to 10000 so that all
    sequences are output and MAST curves can be computed.

    10000 is a heuristic. The threshold actually used is taken from
    the ``mast_evalue`` parameter substituted into the command line.

    For every non-empty motif file, mast is run once on the foreground
    sequences (*dbfile*) and once on the background sequences
    (*controlfile*). Each mast text report is appended to a temporary
    file, preceded by a ``:: motif = ... ::`` separator line. The
    concatenated reports are finally gzip-compressed into *outfile*.

    Arguments
    ---------
    infiles : tuple
        ``(controlfile, dbfile, motiffiles)``. If *dbfile* is empty,
        *outfile* is touched and nothing is run.
    outfile : string
        Filename of the gzip-compressed output. mast output on
        stdout/stderr is appended to ``<outfile>.log``.

    Raises
    ------
    ValueError
        If *controlfile* does not exist.
    '''

    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles

    if IOTools.is_empty(dbfile):
        P.touch(outfile)
        return

    if not os.path.exists(controlfile):
        raise ValueError("control file %s for %s does not exist" %
                         (controlfile, dbfile))

    # remove previous results
    if os.path.exists(outfile):
        os.remove(outfile)

    # NOTE: the local names used in the statements below (dbfile,
    # controlfile, motiffile, tmpdir, tmpfile, outfile) must not be
    # renamed - they are referenced by the %(...)s placeholders.
    tmpdir = P.get_temp_dir(".")
    tmpfile = P.get_temp_filename(".")

    try:
        for motiffile in motiffiles:
            if IOTools.is_empty(motiffile):
                L.info("skipping empty motif file %s" % motiffile)
                continue

            motif = os.path.splitext(motiffile)[0]

            with IOTools.open_file(tmpfile, "a") as of:
                of.write(":: motif = %s - foreground ::\n" % motif)

            # mast bails if the number of nucleotides gets larger than
            # 2186800982?
            # To avoid this, run db and control file separately.
            statement = '''
        cat %(dbfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
            P.run(statement)

            with IOTools.open_file(tmpfile, "a") as of:
                of.write(":: motif = %s - background ::\n" % motif)

            statement = '''
        cat %(controlfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
            P.run(statement)

        statement = "gzip < %(tmpfile)s > %(outfile)s"
        P.run(statement)
    finally:
        # clean up the scratch space even if one of the jobs failed
        shutil.rmtree(tmpdir, ignore_errors=True)
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)
def buildNUMTs(infile, outfile):
    '''output set of potential nuclear mitochondrial genes (NUMTs).

    This function works by aligning the mitochondrial chromosome
    against genome using exonerate_. This can take a while.

    The raw exonerate hits are stored in ``<outfile>.links.gz``; hits
    with a score of at least ``PARAMS["numts_score"]`` are written to
    *outfile*. If no mitochondrial contig is configured, the contig is
    not part of the genome, or its sequence is empty, an empty
    *outfile* is created instead.

    Arguments
    ---------
    infile : string
       Ignored.
    outfile : filename
       Output in :term:`gtf` format with potential NUMTs.

    '''
    if not PARAMS["numts_mitochrom"]:
        E.info("skipping numts creation")
        IOTools.touch_file(outfile)
        return

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    if PARAMS["numts_mitochrom"] not in fasta:
        E.warn("mitochondrial genome %s not found" % PARAMS["numts_mitochrom"])
        IOTools.touch_file(outfile)
        return

    # NOTE: tmpfile_mito, format, min_score and outfile are referenced
    # by name in the statements below and must not be renamed.
    tmpfile_mito = P.get_temp_filename(".")

    try:
        statement = '''
    cgat index_fasta
           --extract=%(numts_mitochrom)s
           --log=%(outfile)s.log
           %(genome_dir)s/%(genome)s
    > %(tmpfile_mito)s
    '''

        P.run(statement)

        if IOTools.is_empty(tmpfile_mito):
            E.warn("mitochondrial genome empty.")
            IOTools.touch_file(outfile)
            return

        # exonerate --ryo fields, in the order in which they are parsed
        # below
        format = ("qi", "qS", "qab", "qae", "ti", "tS", "tab", "tae", "s",
                  "pi", "C")

        format = "\\\\t".join(["%%%s" % x for x in format])

        # collect all results: use a permissive score for the alignment
        # step, the configured threshold is applied when converting to
        # gtf
        min_score = 100

        # "\\S" yields the same two characters as the previously used
        # invalid escape sequence "\S", without the SyntaxWarning
        statement = '''
    cat %(genome_dir)s/%(genome)s.fasta
    | %(cmd-farm)s --split-at-regex=\"^>(\\S+)\" --chunk-size=1
    --log=%(outfile)s.log
    "exonerate --target %%STDIN%%
              --query %(tmpfile_mito)s
              --model affine:local
              --score %(min_score)i
              --showalignment no --showsugar no --showcigar no
              --showvulgar no
              --ryo \\"%(format)s\\n\\"
    "
    | grep -v -e "exonerate" -e "Hostname"
    | gzip > %(outfile)s.links.gz
    '''

        P.run(statement)
    finally:
        # the extracted mitochondrial sequence is only needed by the
        # jobs above; remove it even if one of them failed
        if os.path.exists(tmpfile_mito):
            os.unlink(tmpfile_mito)

    # convert to gtf
    min_score = PARAMS["numts_score"]

    c = E.Counter()

    with IOTools.open_file("%s.links.gz" % outfile) as inf, \
            IOTools.open_file(outfile, "w") as outf:
        for line in inf:
            (query_contig, query_strand, query_start, query_end,
             target_contig, target_strand, target_start, target_end, score,
             pid, alignment) = line.rstrip("\n").split("\t")

            c.input += 1
            score = int(score)
            if score < min_score:
                c.skipped += 1
                continue

            # coordinates of hits on the reverse strand are swapped so
            # that start < end
            if target_strand == "-":
                target_start, target_end = target_end, target_start

            gff = GTF.Entry()
            gff.contig = target_contig
            gff.start, gff.end = int(target_start), int(target_end)
            assert gff.start < gff.end

            gff.strand = target_strand
            gff.score = score
            gff.feature = "numts"
            gff.gene_id = "%s:%s-%s" % (query_contig, query_start, query_end)
            gff.transcript_id = "%s:%s-%s" % (query_contig, query_start,
                                              query_end)
            outf.write("%s\n" % str(gff))
            c.output += 1

    E.info("filtering numts: %s" % str(c))
def buildPseudogenes(infiles, outfile, dbhandle):
    '''build a set of pseudogenes.

    Transcripts are extracted from the GTF file and designated as
    pseudogenes if:

    * the gene_type or transcript_type contains the phrase
      "pseudo". This is taken from the database.

    * the feature is 'processed_transcript' and has similarity to
      protein coding genes. Similarity is assessed by aligning the
      transcript and peptide set against each other with exonerate_.

    Pseudogenic transcripts can overlap with protein coding
    transcripts.

    For genomes whose name starts with ``dm`` the exonerate step is
    skipped and only entries whose source column matches
    ``pseudogene`` are kept.

    The raw exonerate links are stored in ``<outfile>.links.gz``.

    Arguments
    ---------
    infiles : list
       Filenames of ENSEMBL geneset in :term:`gtf` format
       and associated peptide sequences in :term:`fasta` format.
    outfile : filename
       Output in :term:`gtf` format with inferred or annotated
       pseudogenes.
    dbhandle : object
       Database handle for extracting transcript biotypes.
    '''

    infile_gtf, infile_peptides_fasta = infiles

    # JJ - there are also 'nontranslated_CDS', but no explanation of these
    if PARAMS["genome"].startswith("dm"):
        E.warn("Ensembl dm genome annotations only contain source"
               " 'pseudogenes' - skipping exonerate step")
        statement = """zcat %(infile_gtf)s
        |awk '$2 ~ /pseudogene/'
        | gzip
        > %(outfile)s"""
        P.run(statement)
        return

    # NOTE: tmpfile1, model, infile_gtf, infile_peptides_fasta and
    # outfile are referenced by name in the statements below and must
    # not be renamed.
    tmpfile1 = P.get_temp_filename(shared=True)

    try:
        # collect processed transcripts and save as fasta sequences
        statement = '''
    zcat %(infile_gtf)s
    | awk '$2 ~ /processed/'
    | cgat gff2fasta
            --is-gtf
            --genome-file=%(genome_dir)s/%(genome)s
            --log=%(outfile)s.log
    > %(tmpfile1)s
    '''

        P.run(statement)

        if IOTools.is_empty(tmpfile1):
            E.warn("no pseudogenes found")
            IOTools.touch_file(outfile)
            return

        model = "protein2dna"

        # map processed transcripts against peptide sequences
        # "\\S" yields the same two characters as the previously used
        # invalid escape sequence "\S", without the SyntaxWarning
        statement = '''
    cat %(tmpfile1)s
    | %(cmd-farm)s --split-at-regex=\"^>(\\S+)\" --chunk-size=100
    --log=%(outfile)s.log
    "exonerate --target %%STDIN%%
              --query %(infile_peptides_fasta)s
              --model %(model)s
              --bestn 1
              --score 200
              --ryo \\"%%qi\\\\t%%ti\\\\t%%s\\\\n\\"
              --showalignment no --showsugar no --showcigar no --showvulgar no
    "
    | grep -v -e "exonerate" -e "Hostname"
    | gzip > %(outfile)s.links.gz
    '''

        P.run(statement)
    finally:
        # the transcript sequences are only needed by the jobs above;
        # remove them even if one of the jobs failed
        if os.path.exists(tmpfile1):
            os.unlink(tmpfile1)

    # keep the highest scoring peptide per transcript; on equal scores
    # the later link wins
    best_matches = {}
    with IOTools.open_file("%s.links.gz" % outfile) as inf:
        for line in inf:
            peptide_id, transcript_id, score = line.rstrip("\n").split("\t")
            score = int(score)
            if transcript_id in best_matches and \
               best_matches[transcript_id][0] > score:
                continue
            best_matches[transcript_id] = (score, peptide_id)

    E.info("found %i best links" % len(best_matches))
    new_pseudos = set(best_matches.keys())

    cc = dbhandle.cursor()
    known_pseudos = {
        x[0] for x in cc.execute("""SELECT DISTINCT transcript_id
        FROM transcript_info
        WHERE transcript_biotype like '%pseudo%' OR
        gene_biotype like '%pseudo%' """)
    }

    E.info("pseudogenes from: processed_transcripts=%i, known_pseudos=%i, "
           "intersection=%i" %
           ((len(new_pseudos), len(known_pseudos),
             len(new_pseudos.intersection(known_pseudos)))))

    all_pseudos = new_pseudos.union(known_pseudos)

    c = E.Counter()

    # output all entries of the geneset that belong to a transcript
    # flagged as pseudogene by either criterion
    with IOTools.open_file(outfile, "w") as outf, \
            IOTools.open_file(infile_gtf) as gtf_file:
        for gtf in GTF.iterator(gtf_file):
            c.input += 1
            if gtf.transcript_id not in all_pseudos:
                continue
            c.output += 1
            outf.write("%s\n" % gtf)

    E.info("exons: %s" % str(c))
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The input BAM file is ``options.stdin`` if that is not
    ``sys.stdin``, otherwise the first positional argument, otherwise
    standard input (``-``). Output is written to ``options.stdout`` (or
    standard output) in BAM format, or in SAM format with header if
    ``--output-sam`` is given.

    If ``filter`` is among ``--methods``, alignments are filtered
    according to ``--filter-method``. Otherwise the reads are streamed
    through a chain of generators, one per requested method, and
    written out.

    Raises
    ------
    ValueError
        If more than one positional BAM file is given, if both
        ``keep-list`` and ``remove-list`` are requested, or if a method
        is requested without the option it depends on (read list,
        error rate, minimum read length, minimum average base quality,
        reference bam file, fastq files, downsample size).
    OSError
        If a fastq file given for ``unstrip`` does not exist.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--methods",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=("filter", "keep-first-base", "set-nh",
                               "set-sequence", "strip-sequence",
                               "strip-quality", "unstrip",
                               "unset-unmapped-mapq", "downsample-single",
                               "downsample-paired", "add-sequence-error"),
                      help="methods to apply [%default]")

    parser.add_option("--strip-method",
                      dest="strip_method",
                      type="choice",
                      choices=("all", "match"),
                      help="define which sequences/qualities to strip. "
                      "match means that stripping only applies to entries "
                      "without mismatches (requires NM tag to be present). "
                      "[%default]")

    parser.add_option("--filter-method",
                      dest="filter_methods",
                      action="append",
                      type="choice",
                      choices=('NM', 'CM', "mapped", "unique", "non-unique",
                               "remove-list", "keep-list", "error-rate",
                               "min-read-length", "min-average-base-quality"),
                      help="filter method to apply to remove alignments "
                      "from a bam file. Multiple methods can be supplied "
                      "[%default]")

    parser.add_option("--reference-bam-file",
                      dest="reference_bam",
                      type="string",
                      help="bam-file to filter with [%default]")

    parser.add_option("--force-output",
                      dest="force",
                      action="store_true",
                      help="force processing. Some methods such "
                      "as strip/unstrip will stop processing if "
                      "they think it not necessary "
                      "[%default]")

    parser.add_option("--output-sam",
                      dest="output_sam",
                      action="store_true",
                      help="output in sam format [%default]")

    parser.add_option("--first-fastq-file",
                      "-1",
                      dest="fastq_pair1",
                      type="string",
                      help="fastq file with read information for first "
                      "in pair or unpaired. Used for unstripping sequence "
                      "and quality scores [%default]")

    parser.add_option("--second-fastq-file",
                      "-2",
                      dest="fastq_pair2",
                      type="string",
                      help="fastq file with read information for second "
                      "in pair. Used for unstripping sequence "
                      "and quality scores  [%default]")

    parser.add_option("--downsample",
                      dest="downsample",
                      type="int",
                      help="Number of reads to downsample to")

    parser.add_option(
        "--filename-read-list",
        dest="filename_read_list",
        type="string",
        help=
        "Filename with list of reads to filter if 'keep-list' or 'remove-list' "
        "filter method is chosen [%default]")

    parser.add_option(
        "--error-rate",
        dest="error_rate",
        type="float",
        help="error rate to use as filter. Reads with an error rate "
        "higher than the threshold will be removed [%default]")

    parser.add_option("--minimum-read-length",
                      dest="minimum_read_length",
                      type="int",
                      help="minimum read length when filtering [%default]")

    parser.add_option(
        "--minimum-average-base-quality",
        dest="minimum_average_base_quality",
        type="float",
        help="minimum average base quality when filtering [%default]")

    parser.set_defaults(
        methods=[],
        output_sam=False,
        reference_bam=None,
        filter_methods=[],
        strip_method="all",
        force=False,
        fastq_pair1=None,
        fastq_pair2=None,
        downsample=None,
        random_seed=None,
        filename_read_list=None,
        error_rate=None,
        minimum_read_length=0,
        minimum_average_base_quality=0,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    # determine the input: redirected stdin, positional argument or
    # plain standard input ("-")
    if options.stdin != sys.stdin:
        bamfile = options.stdin.name
    elif args:
        if len(args) > 1:
            raise ValueError("multiple bam files provided in arguments")
        bamfile = args[0]
    else:
        bamfile = "-"

    use_remove_list = "remove-list" in options.filter_methods
    use_keep_list = "keep-list" in options.filter_methods

    if use_remove_list or use_keep_list:
        if use_remove_list and use_keep_list:
            raise ValueError(
                "it is not possible to specify remove-list and keep-list")

        if not options.filename_read_list:
            raise ValueError(
                "filtering by remove-list or keep-list requires "
                "--filename-read-list to be set")

        # lines starting with "#" are treated as comments
        with IOTools.open_file(options.filename_read_list) as inf:
            filter_query_names = set(
                [x.strip() for x in inf.readlines() if not x.startswith("#")])
        E.info("read query_sequence filter list with {} read names".format(
            len(filter_query_names)))

    if "error-rate" in options.filter_methods and not options.error_rate:
        raise ValueError(
            "filtering by error-rate requires --error-rate to be set")

    if "add-sequence-error" in options.methods and not options.error_rate:
        raise ValueError(
            "add-sequence-error requires --error-rate to be set")

    E.info('processing %s' % bamfile)
    # emptiness can only be tested for a real file, not for stdin
    if bamfile != "-" and IOTools.is_empty(bamfile):
        E.warn('ignoring empty file %s' % bamfile)
        E.stop()
        return

    # reading bam from stdin does not work with only the "r" tag
    pysam_in = pysam.AlignmentFile(bamfile, "rb")

    if options.stdout != sys.stdout:
        output_bamfile = options.stdout.name
    else:
        output_bamfile = "-"

    if options.output_sam:
        pysam_out = pysam.AlignmentFile(output_bamfile,
                                        "wh",
                                        template=pysam_in)
    else:
        pysam_out = pysam.AlignmentFile(output_bamfile,
                                        "wb",
                                        template=pysam_in)

    if "filter" in options.methods:
        if use_remove_list or use_keep_list:

            it = pysam_in.fetch(until_eof=True)
            c = E.Counter()
            for read in it:
                c.input += 1
                listed = read.query_name in filter_query_names
                # remove-list drops listed reads, keep-list drops
                # unlisted reads
                if listed == use_remove_list:
                    c.skipped += 1
                    continue
                pysam_out.write(read)
                c.output += 1

            E.info("category\tcounts\n%s\n" % c.asTable())
        else:
            remove_mismatches, colour_mismatches = False, False

            if "NM" in options.filter_methods:
                remove_mismatches = True

            elif "CM" in options.filter_methods:
                remove_mismatches = True
                colour_mismatches = True

            if "min-read-length" in options.filter_methods and \
               options.minimum_read_length == 0:
                raise ValueError(
                    "please specify --minimum-read-length when using "
                    "--filter-method=min-read-length")

            if "min-average-base-quality" in options.filter_methods and \
               options.minimum_average_base_quality == 0:
                raise ValueError(
                    "please specify --minimum-average-base-quality when "
                    "using --filter-method=min-average-base-quality")

            if remove_mismatches:
                if not options.reference_bam:
                    raise ValueError(
                        "requiring reference bam file for removing by "
                        "mismatches")

                pysam_ref = pysam.AlignmentFile(options.reference_bam, "rb")
            else:
                pysam_ref = None

            # filter and flags are the opposite way around
            c = bam2bam_filter_bam(
                pysam_in,
                pysam_out,
                pysam_ref,
                remove_nonunique="unique" in options.filter_methods,
                remove_unique="non-unique" in options.filter_methods,
                remove_contigs=None,
                remove_unmapped="mapped" in options.filter_methods,
                remove_mismatches=remove_mismatches,
                filter_error_rate=options.error_rate,
                colour_mismatches=colour_mismatches,
                minimum_read_length=options.minimum_read_length,
                minimum_average_base_quality=options.
                minimum_average_base_quality)

            if pysam_ref is not None:
                pysam_ref.close()

            options.stdlog.write("category\tcounts\n%s\n" % c.asTable())
    else:

        # set up the modifying iterators
        it = pysam_in.fetch(until_eof=True)

        def nop(x):
            return None

        # function to check if processing should start
        pre_check_f = nop

        if "unset-unmapped-mapq" in options.methods:

            def unset_unmapped_mapq(i):
                for read in i:
                    if read.is_unmapped:
                        read.mapq = 0
                    yield read

            it = unset_unmapped_mapq(it)

        if "set-sequence" in options.methods:

            def set_sequence(i):
                for read in i:
                    # can't get at length of unmapped reads
                    if read.is_unmapped:
                        read.seq = "A"
                        read.qual = "F"
                    else:
                        read.seq = "A" * read.inferred_length
                        read.qual = "F" * read.inferred_length

                    yield read

            it = set_sequence(it)

        if "strip-sequence" in options.methods or "strip-quality" in \
           options.methods:

            def strip_sequence(i):
                for read in i:
                    read.seq = None
                    yield read

            def check_sequence(reads):
                if reads[0].seq is None:
                    return 'no sequence present'
                return None

            def strip_quality(i):
                for read in i:
                    read.qual = None
                    yield read

            def check_quality(reads):
                if reads[0].qual is None:
                    return 'no quality information present'
                return None

            def strip_match(i):
                for read in i:
                    # reads without NM tag are treated as mismatching
                    # and keep their sequence
                    try:
                        nm = read.opt('NM')
                    except KeyError:
                        nm = 1
                    if nm == 0:
                        read.seq = None
                    yield read

            if options.strip_method == "all":
                if "strip-sequence" in options.methods:
                    it = strip_sequence(it)
                    pre_check_f = check_sequence
                elif "strip-quality" in options.methods:
                    it = strip_quality(it)
                    pre_check_f = check_quality
            elif options.strip_method == "match":
                it = strip_match(it)

        if "unstrip" in options.methods:

            def buildReadDictionary(filename):
                if not os.path.exists(filename):
                    raise OSError("file not found: %s" % filename)
                fastqfile = pysam.FastxFile(filename)
                fastq2sequence = {}
                for x in fastqfile:
                    if x.name in fastq2sequence:
                        raise ValueError(
                            "read %s duplicate - can not unstrip" % x.name)

                    fastq2sequence[x.name] = (x.sequence, x.quality)
                return fastq2sequence

            if not options.fastq_pair1:
                raise ValueError("please supply fastq file(s) for unstripping")
            fastq2sequence1 = buildReadDictionary(options.fastq_pair1)
            if options.fastq_pair2:
                fastq2sequence2 = buildReadDictionary(options.fastq_pair2)

            def unstrip_unpaired(i):
                for read in i:
                    read.seq, read.qual = fastq2sequence1[read.qname]
                    yield read

            def unstrip_pair(i):
                for read in i:
                    if read.is_read1:
                        read.seq, read.qual = fastq2sequence1[read.qname]
                    else:
                        read.seq, read.qual = fastq2sequence2[read.qname]
                    yield read

            if options.fastq_pair2:
                it = unstrip_pair(it)
            else:
                it = unstrip_unpaired(it)

        if "set-nh" in options.methods:
            it = SetNH(it)

        # keep first base of reads by changing the cigarstring to
        # '1M' and, in reads mapping to the reverse strand,
        # changes the pos to aend - 1
        # Needs to be refactored to make it more general
        # (last base, midpoint, ..)
        # The method name must match the "keep-first-base" choice of
        # --methods.
        if "keep-first-base" in options.methods:

            def keep_first_base(i):
                for read in i:
                    if read.is_reverse:
                        read.pos = read.aend - 1
                        read.cigarstring = '1M'
                    elif not read.is_unmapped:
                        read.cigarstring = '1M'
                    yield read

            it = keep_first_base(it)

        # read first read and check if processing should continue
        # only possible when not working from stdin
        # Refactoring: use cache to also do a pre-check for
        # stdin input.
        if bamfile != "-":
            # get first read for checking pre-conditions
            first_reads = list(pysam_in.head(1))

            # the checks inspect the first read, so there is nothing
            # to check for a file without reads
            msg = pre_check_f(first_reads) if first_reads else None
            if msg is not None:
                if options.force:
                    E.warn('proccessing continues, though: %s' % msg)
                else:
                    E.warn('processing not started: %s' % msg)
                    pysam_in.close()
                    pysam_out.close()
                    E.stop()
                    return

        if "downsample-single" in options.methods:

            if not options.downsample:
                raise ValueError("Please provide downsample size")

            else:
                down = SubsetBam(pysam_in=it,
                                 downsample=options.downsample,
                                 paired_end=None,
                                 single_end=True,
                                 random_seed=options.random_seed)
                it = down.downsample_single()

        if "downsample-paired" in options.methods:

            if not options.downsample:
                raise ValueError("Please provide downsample size")

            else:
                down = SubsetBam(pysam_in=it,
                                 downsample=options.downsample,
                                 paired_end=True,
                                 single_end=None,
                                 random_seed=options.random_seed)
                it = down.downsample_paired()

        if "add-sequence-error" in options.methods:

            def add_sequence_error(i):
                error_rate = options.error_rate
                map_nuc2var = {"A": "CGT", "C": "AGT", "G": "ACT", "T": "ACG"}
                for read in i:
                    sequence = list(read.query_sequence)
                    # remember the qualities and re-assign them after
                    # the sequence has been replaced
                    quals = read.query_qualities
                    npos = int(math.floor(len(sequence) * error_rate))
                    positions = random.sample(range(len(sequence)), npos)
                    for pos in positions:
                        # bases other than A, C, G, T are left unchanged
                        try:
                            alt = map_nuc2var[sequence[pos]]
                        except KeyError:
                            continue
                        sequence[pos] = alt[random.randint(0, len(alt) - 1)]

                    read.query_sequence = "".join(sequence)
                    read.query_qualities = quals
                    yield read

            it = add_sequence_error(it)

        # continue processing till end
        for read in it:
            pysam_out.write(read)

    # close the files for both the filtering and the modifying branch
    # so that the output is completely written
    pysam_in.close()
    pysam_out.close()

    # write footer and output benchmark information.
    E.stop()
def BedFileVenn(infiles, outfile):
    '''merge :term:`bed` formatted *infiles* by intersection
    and write to *outfile*.

    Only intervals that overlap in all files are retained.
    Interval coordinates are given by the first file in *infiles*.

    * One file: the file is copied to *outfile* unchanged.
    * Two files: intervals of the first file overlapping the second
      are reported (``intersectBed -u``).
    * More than two files: each file is normalized with ``mergeBed``
      (overlapping intervals within a file are merged) and the files
      are intersected incrementally.

    If any of the files involved is empty, an empty *outfile* is
    created. Except for the single-file case, intervals are renumbered
    starting from 1 (column 4).

    Arguments
    ---------
    infiles : list
        Filenames of :term:`bed` formatted files.
    outfile : string
        Filename of the output file in :term:`bed` format.

    Raises
    ------
    ValueError
        If *infiles* is empty.
    '''
    if not infiles:
        raise ValueError("no bed files supplied")

    # NOTE(review): not read in this function - presumably picked up
    # from the local namespace by P.run, like the %(...)s placeholders;
    # confirm before removing.
    to_cluster = True

    if len(infiles) == 1:
        shutil.copyfile(infiles[0], outfile)

    elif len(infiles) == 2:

        if IOTools.is_empty(infiles[0]) or IOTools.is_empty(infiles[1]):
            IOTools.touch_file(outfile)
        else:
            statement = '''
        intersectBed -u -a %s -b %s
        | cut -f 1,2,3,4,5
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        > %%(outfile)s
        ''' % (infiles[0], infiles[1])
            P.run(statement)

    else:

        # need to merge incrementally
        fn = infiles[0]
        if IOTools.is_empty(fn):
            IOTools.touch_file(outfile)
            return

        # NOTE: fn, tmpfile and outfile are referenced by name in the
        # statements below and must not be renamed.
        tmpfile = P.get_temp_filename(".")

        try:
            statement = '''mergeBed -i %(fn)s > %(tmpfile)s'''
            P.run(statement)

            for fn in infiles[1:]:
                # an empty file means an empty intersection
                if IOTools.is_empty(fn):
                    IOTools.touch_file(outfile)
                    return

                statement = '''mergeBed -i %(fn)s | intersectBed -u -a %(tmpfile)s -b stdin > %(tmpfile)s.tmp; mv %(tmpfile)s.tmp %(tmpfile)s'''
                P.run(statement)

            statement = '''cat %(tmpfile)s
        | cut -f 1,2,3,4,5
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        > %(outfile)s '''
            P.run(statement)
        finally:
            # remove the scratch file on success, early return and
            # failure alike
            if os.path.exists(tmpfile):
                os.unlink(tmpfile)
# Example #10
# 0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Without ``--inplace`` a single BAM stream is read from stdin and the
    result is written to stdout (BAM, or SAM with ``--output-sam``).
    With ``--inplace`` every bam-file given as a positional argument is
    rewritten via a temporary file that is moved over the original,
    which is then re-indexed.

    If ``filter`` is among the selected methods only the filter is
    applied; otherwise the remaining methods are chained as generators
    over the reads in a fixed order.

    Raises
    ------
    ValueError
        if ``--inplace`` is given without bam-files or with ``-``, if
        mismatch filtering is requested without a reference bam file,
        if ``unstrip`` is requested without a fastq file, or if
        downsampling is requested without ``--downsample``.
    OSError
        if a fastq file supplied for unstripping does not exist.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--methods",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=("filter", "keep-first-base", "set-nh",
                               "set-sequence", "strip-sequence",
                               "strip-quality", "unstrip",
                               "unset-unmapped-mapq", "downsample-single",
                               "downsample-paired"),
                      help="methods to apply [%default]")

    parser.add_option("--strip-method",
                      dest="strip_method",
                      type="choice",
                      choices=("all", "match"),
                      help="define which sequences/qualities to strip. "
                      "match means that stripping only applies to entries "
                      "without mismatches (requires NM tag to be present). "
                      "[%default]")

    parser.add_option("--filter-method",
                      dest="filter_methods",
                      action="append",
                      type="choice",
                      choices=('NM', 'CM', 'mapped', 'unique', "non-unique"),
                      help="filter method to apply to remove alignments "
                      "from a bam file. Multiple methods can be supplied "
                      "[%default]")

    parser.add_option("--reference-bam-file",
                      dest="reference_bam",
                      type="string",
                      help="bam-file to filter with [%default]")

    parser.add_option("--force-output",
                      dest="force",
                      action="store_true",
                      help="force processing. Some methods such "
                      "as strip/unstrip will stop processing if "
                      "they think it not necessary "
                      "[%default]")

    parser.add_option("--output-sam",
                      dest="output_sam",
                      action="store_true",
                      help="output in sam format [%default]")

    parser.add_option("--inplace",
                      dest="inplace",
                      action="store_true",
                      help="modify bam files in-place. Bam files need "
                      "to be given "
                      "as arguments. Temporary bam files are written "
                      "to /tmp [%default]")

    parser.add_option("--first-fastq-file",
                      "-1",
                      dest="fastq_pair1",
                      type="string",
                      help="fastq file with read information for first "
                      "in pair or unpaired. Used for unstripping sequence "
                      "and quality scores [%default]")

    parser.add_option("--second-fastq-file",
                      "-2",
                      dest="fastq_pair2",
                      type="string",
                      help="fastq file with read information for second "
                      "in pair. Used for unstripping sequence "
                      "and quality scores  [%default]")

    parser.add_option("--downsample",
                      dest="downsample",
                      type="int",
                      help="Number of reads to downsample to")

    parser.set_defaults(methods=[],
                        output_sam=False,
                        reference_bam=None,
                        filter_methods=[],
                        strip_method="all",
                        force=False,
                        inplace=False,
                        fastq_pair1=None,
                        fastq_pair2=None,
                        downsample=None,
                        random_seed=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)
    # random.seed(options.random_seed)
    bamfiles = []

    # a redirected stdin is processed like a named file, but its output
    # still goes to stdout (see below)
    if options.stdin != sys.stdin:
        from_stdin = True
        bamfiles.append(options.stdin.name)
    else:
        from_stdin = False

    if options.inplace:
        bamfiles.extend(args)
        if len(bamfiles) == 0:
            raise ValueError(
                "please one or more bam-files as command line arguments")

        if "-" in bamfiles:
            raise ValueError(
                "can not read from stdin if ``--inplace`` is selected")

    if len(bamfiles) == 0:
        bamfiles = ["-"]

    for bamfile in bamfiles:

        E.info('processing %s' % bamfile)

        if os.path.islink(bamfile):
            E.warn('ignoring link %s' % bamfile)
            continue

        if IOTools.is_empty(bamfile):
            E.warn('ignoring empty file %s' % bamfile)
            continue

        # temporary output file; stays None when writing to stdout so
        # that the clean-up and in-place steps below can tell the two
        # cases apart.
        tmpfile = None

        # reading bam from stdin does not work with only the "r" tag
        pysam_in = pysam.AlignmentFile(bamfile, "rb")
        if bamfile == "-" or (from_stdin and bamfile == options.stdin.name):
            if options.output_sam:
                pysam_out = pysam.AlignmentFile("-", "wh", template=pysam_in)
            else:
                pysam_out = pysam.AlignmentFile("-", "wb", template=pysam_in)
        else:
            # only the file name is needed; pysam opens the file itself
            tmpfile = tempfile.NamedTemporaryFile(delete=False, prefix="ctmp")
            tmpfile.close()

            E.debug("writing temporary bam-file to %s" % tmpfile.name)
            pysam_out = pysam.AlignmentFile(tmpfile.name,
                                            "wb",
                                            template=pysam_in)

        if "filter" in options.methods:

            remove_mismatches, colour_mismatches = False, False

            if "NM" in options.filter_methods:
                remove_mismatches = True

            elif "CM" in options.filter_methods:
                remove_mismatches = True
                colour_mismatches = True

            if remove_mismatches:
                if not options.reference_bam:
                    raise ValueError(
                        "requiring reference bam file for removing by "
                        "mismatches")

                pysam_ref = pysam.AlignmentFile(options.reference_bam, "rb")
            else:
                pysam_ref = None

            # filter and flags are the opposite way around
            c = _bam2bam.filter_bam(pysam_in,
                                    pysam_out,
                                    pysam_ref,
                                    remove_nonunique="unique"
                                    in options.filter_methods,
                                    remove_unique="non-unique"
                                    in options.filter_methods,
                                    remove_contigs=None,
                                    remove_unmapped="mapped"
                                    in options.filter_methods,
                                    remove_mismatches=remove_mismatches,
                                    colour_mismatches=colour_mismatches)

            if pysam_ref:
                pysam_ref.close()

            # do not write to stdlog in the middle of a SAM/BAM stdout stream.
            if options.stdlog != options.stdout:
                E.info("category\tcounts\n%s\n" % c.asTable())
        else:

            # set up the modifying iterators
            it = pysam_in.fetch(until_eof=True)

            # function to check if processing should start; returns a
            # message describing why not, or None to go ahead
            pre_check_f = lambda x: None

            if "unset-unmapped-mapq" in options.methods:

                def unset_unmapped_mapq(i):
                    for read in i:
                        if read.is_unmapped:
                            read.mapq = 0
                        yield read

                it = unset_unmapped_mapq(it)

            if "set-sequence" in options.methods:

                def set_sequence(i):
                    for read in i:
                        # can't get at length of unmapped reads
                        if read.is_unmapped:
                            read.seq = "A"
                            read.qual = "F"
                        else:
                            read.seq = "A" * read.inferred_length
                            read.qual = "F" * read.inferred_length

                        yield read

                it = set_sequence(it)

            if "strip-sequence" in options.methods or "strip-quality" in \
               options.methods:

                def strip_sequence(i):
                    for read in i:
                        read.seq = None
                        yield read

                def check_sequence(reads):
                    if reads[0].seq is None:
                        return 'no sequence present'
                    return None

                def strip_quality(i):
                    for read in i:
                        read.qual = None
                        yield read

                def check_quality(reads):
                    if reads[0].qual is None:
                        return 'no quality information present'
                    return None

                def strip_match(i):
                    for read in i:
                        # a missing NM tag is treated as "has mismatches"
                        # so that the sequence is kept
                        try:
                            nm = read.opt('NM')
                        except KeyError:
                            nm = 1
                        if nm == 0:
                            read.seq = None
                        yield read

                if options.strip_method == "all":
                    if "strip-sequence" in options.methods:
                        it = strip_sequence(it)
                        pre_check_f = check_sequence
                    elif "strip-quality" in options.methods:
                        it = strip_quality(it)
                        pre_check_f = check_quality
                elif options.strip_method == "match":
                    it = strip_match(it)

            if "unstrip" in options.methods:

                def buildReadDictionary(filename):
                    if not os.path.exists(filename):
                        raise OSError("file not found: %s" % filename)
                    fastqfile = pysam.FastxFile(filename)
                    fastq2sequence = {}
                    for x in fastqfile:
                        if x.name in fastq2sequence:
                            raise ValueError(
                                "read %s duplicate - can not unstrip" % x.name)

                        fastq2sequence[x.name] = (x.sequence, x.quality)
                    return fastq2sequence

                if not options.fastq_pair1:
                    raise ValueError(
                        "please supply fastq file(s) for unstripping")
                fastq2sequence1 = buildReadDictionary(options.fastq_pair1)
                if options.fastq_pair2:
                    fastq2sequence2 = buildReadDictionary(options.fastq_pair2)

                def unstrip_unpaired(i):
                    for read in i:
                        read.seq, read.qual = fastq2sequence1[read.qname]
                        yield read

                def unstrip_pair(i):
                    for read in i:
                        if read.is_read1:
                            read.seq, read.qual = fastq2sequence1[read.qname]
                        else:
                            read.seq, read.qual = fastq2sequence2[read.qname]
                        yield read

                if options.fastq_pair2:
                    it = unstrip_pair(it)
                else:
                    it = unstrip_unpaired(it)

            if "set-nh" in options.methods:
                it = SetNH(it)

            # keep first base of reads by changing the cigarstring to
            # '1M' and, in reads mapping to the reverse strand,
            # changes the pos to aend - 1
            # Needs to be refactored to make it more general
            # (last base, midpoint, ..)
            # The method name must match the hyphenated choice accepted
            # by ``--methods``; testing for "keep_first_base" never matched.
            if "keep-first-base" in options.methods:

                def keep_first_base(i):
                    for read in i:
                        if read.is_unmapped:
                            # nothing to truncate; an unmapped read may
                            # carry the reverse flag but has no usable
                            # alignment end to compute a position from
                            pass
                        elif read.is_reverse:
                            read.pos = read.aend - 1
                            read.cigarstring = '1M'
                        else:
                            read.cigarstring = '1M'
                        yield read

                it = keep_first_base(it)

            # read first read and check if processing should continue
            # only possible when not working from stdin
            # Refactoring: use cache to also do a pre-check for
            # stdin input.
            if bamfile != "-":
                # get first read for checking pre-conditions
                first_reads = list(pysam_in.head(1))

                # a file without any reads has nothing to check; the
                # check functions index the first read unconditionally
                msg = pre_check_f(first_reads) if first_reads else None
                if msg is not None:
                    if options.force:
                        E.warn('proccessing continues, though: %s' % msg)
                    else:
                        E.warn('processing not started: %s' % msg)
                        pysam_in.close()
                        pysam_out.close()
                        # do not leave the unused temporary file behind
                        if tmpfile is not None:
                            os.unlink(tmpfile.name)
                        continue

            if "downsample-single" in options.methods:

                if not options.downsample:
                    raise ValueError("Please provide downsample size")

                down = SubsetBam(pysam_in=it,
                                 downsample=options.downsample,
                                 paired_end=None,
                                 single_end=True,
                                 random_seed=options.random_seed)
                it = down.downsample_single()

            if "downsample-paired" in options.methods:

                if not options.downsample:
                    raise ValueError("Please provide downsample size")

                down = SubsetBam(pysam_in=it,
                                 downsample=options.downsample,
                                 paired_end=True,
                                 single_end=None,
                                 random_seed=options.random_seed)
                it = down.downsample_paired()

            # continue processing till end
            for read in it:
                pysam_out.write(read)

        pysam_in.close()
        pysam_out.close()

        # only files that were written to a temporary file can be
        # replaced; output sent to stdout has nothing to move.
        if options.inplace and tmpfile is not None:
            # set date and file permissions according to original
            # Note: currently it will not update user and group.
            original = os.stat(bamfile)
            os.utime(tmpfile.name, (original.st_atime, original.st_mtime))
            os.chmod(tmpfile.name, original.st_mode)
            # move new file over original copy
            shutil.move(tmpfile.name, bamfile)
            # re-index
            pysam.index(bamfile)

    # write footer and output benchmark information.
    E.stop()