Example #1
def intersectBedFiles(infiles, outfile):
    '''merge :term:`bed` formatted *infiles* by intersection
    and write to *outfile*.

    Only intervals that overlap in all files are retained.
    Interval coordinates are given by the first file in *infiles*.

    Bed files are normalized (overlapping intervals within 
    a file are merged) before intersection. 

    Intervals are renumbered starting from 1.
    '''

    if len(infiles) == 1:

        shutil.copyfile(infiles[0], outfile)

    elif len(infiles) == 2:

        if IOTools.isEmpty(infiles[0]) or IOTools.isEmpty(infiles[1]):
            P.touch(outfile)
        else:
            statement = '''
        intersectBed -u -a %s -b %s 
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        | bgzip > %%(outfile)s 
        ''' % (infiles[0], infiles[1])
            P.run()

    else:

        tmpfile = P.getTempFilename(".")

        # need to merge incrementally
        fn = infiles[0]
        if IOTools.isEmpty(infiles[0]):
            P.touch(outfile)
            return

        statement = '''mergeBed -i %(fn)s > %(tmpfile)s'''
        P.run()

        for fn in infiles[1:]:
            if IOTools.isEmpty(fn):
                P.touch(outfile)
                os.unlink(tmpfile)
                return

            statement = '''mergeBed -i %(fn)s | intersectBed -u -a %(tmpfile)s -b stdin > %(tmpfile)s.tmp; mv %(tmpfile)s.tmp %(tmpfile)s'''
            P.run()

        statement = '''cat %(tmpfile)s
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        | bgzip
        > %(outfile)s '''
        P.run()

        os.unlink(tmpfile)
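A minimal sketch of how a merging task like this is typically wired into a ruffus pipeline; the task name and file pattern below are hypothetical, not taken from the original source:

from ruffus import merge

@merge("*.replicated.bed.gz", "combined.intersection.bed.gz")
def combineReplicatedIntervals(infiles, outfile):
    # keep only the intervals present in every input file
    intersectBedFiles(infiles, outfile)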
Example #3
def runTomTom(infile, outfile):
    '''compare ab-initio motifs against known motif databases using tomtom.'''

    tmpdir = P.getTempDir(".")

    to_cluster = True
    databases = " ".join(P.asList(PARAMS["tomtom_databases"]))

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "tomtom", outfile)

    if IOTools.isEmpty(infile):
        E.warn("input is empty - no computation performed")
        P.touch(outfile)
        return

    statement = '''
           tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s > %(outfile)s.log
    '''

    P.run()

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
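Throughout these examples, P.run() executes the most recent `statement` by interpolating `%(name)s` placeholders from the caller's local variables and the global PARAMS dictionary (here supplying `tomtom_options`). A rough standalone sketch of that convention, not the actual P.run() implementation:

def render_statement(statement, local_vars, params):
    # locals shadow PARAMS entries of the same name, which is how
    # `tmpdir`, `databases` and `outfile` above reach the command line
    substitutions = dict(params)
    substitutions.update(local_vars)
    return statement % substitutions

print(render_statement(
    "tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s",
    {"tmpdir": "tmp_xyz", "infile": "motifs.meme"},
    {"tomtom_options": "-min-overlap 5"}))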
Example #4
def runTomTom(infile, outfile):
    '''compare ab-initio motifs against known motif databases using tomtom.'''

    tmpdir = P.getTempDir(".")
    databases = " ".join(P.asList(PARAMS["tomtom_databases"]))

    target_path = os.path.join(os.path.abspath(PARAMS["exportdir"]), "tomtom",
                               outfile)

    if IOTools.isEmpty(infile):
        E.warn("input is empty - no computation performed")
        P.touch(outfile)
        return

    statement = '''
    tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s > %(outfile)s.log
    '''

    P.run()

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
Example #5
def subtractBedFiles(infile, subtractfile, outfile):
    '''subtract intervals in *subtractfile* from *infile*
    and store in *outfile*.
    '''

    if IOTools.isEmpty(subtractfile):
        shutil.copyfile(infile, outfile)
        return
    elif IOTools.isEmpty(infile):
        P.touch(outfile)
        return

    statement = '''
        intersectBed -v -a %(infile)s -b %(subtractfile)s 
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        | bgzip > %(outfile)s ; tabix -p bed %(outfile)s
        '''

    P.run()
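A hypothetical invocation, assuming this function is called from within the surrounding pipeline module; the file names are invented, and the statement above leaves the output bgzip-compressed and tabix-indexed:

subtractBedFiles("peaks.bed.gz", "blacklist.bed.gz",
                 "peaks.filtered.bed.gz")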
Example #7
def loadMemeSummary(infiles, outfile):
    '''load information about motifs into the database.'''

    outf = P.getTempFile(".")

    outf.write("track\n")

    for infile in infiles:
        if IOTools.isEmpty(infile):
            continue
        motif = P.snip(infile, ".meme")
        outf.write("%s\n" % motif)

    outf.close()

    P.load(outf.name, outfile)

    os.unlink(outf.name)
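Examples #7 to #10 share one pattern: write a small TSV to a temporary file, then hand it to P.load(), which wraps csv2db.py and derives the table name from *outfile*. A condensed, hypothetical helper capturing that pattern:

def loadListAsTable(items, outfile, header="track"):
    # hypothetical helper; mirrors the temp-file/load/unlink sequence above
    outf = P.getTempFile(".")
    outf.write("%s\n" % header)
    for item in items:
        outf.write("%s\n" % item)
    outf.close()
    P.load(outf.name, outfile)
    os.unlink(outf.name)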
Example #8
def loadMotifInformation(infiles, outfile):
    '''load information about motifs into the database.'''

    outf = P.getTempFile(".")

    outf.write("motif\n")

    for infile in infiles:
        if IOTools.isEmpty(infile):
            continue
        motif = P.snip(infile, ".motif")
        outf.write("%s\n" % motif)

    outf.close()

    P.load(outf.name, outfile, "--allow-empty")

    os.unlink(outf.name)
Example #9
def loadMemeChipSummary(infiles, outfile):
    '''load information about MEME-ChIP motifs into the database.'''

    outf = P.getTempFile(".")

    outf.write("track\tnpeaks\twidth\tmasking\tpath\n")

    for infile in infiles:
        if IOTools.isEmpty(infile):
            continue
        fn = P.snip(os.path.basename(infile), ".memechip")

        track, npeaks, width, masking = fn.split(".")
        outf.write("\t".join(map(str, (track, npeaks, width, masking, fn))) + "\n")

    outf.close()

    P.load(outf.name, outfile)

    os.unlink(outf.name)
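The split assumes basenames of the form `<track>.<npeaks>.<width>.<masking>.memechip`; for instance (values invented for illustration):

fn = "liver-R1.5000.200.unmasked"  # basename with ".memechip" stripped
track, npeaks, width, masking = fn.split(".")
# track == "liver-R1", npeaks == "5000", width == "200", masking == "unmasked"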
Example #10
def loadMemeSummary(infiles, outfile):
    '''load information about motifs into the database.'''

    outf = P.getTempFile(".")

    outf.write("method\ttrack\n")

    for infile in infiles:
        if IOTools.isEmpty(infile):
            continue
        method = re.match("(.+).dir/", infile).groups()[0]
        track = os.path.basename(".".join(infile.split(".")[:-1]))
        outf.write("%s\t%s\n" % (method, track))

    outf.close()

    P.load(outf.name, outfile)

    os.unlink(outf.name)
Example #11
def buildPseudogenes(infiles, outfile, dbhandle):
    '''build a set of pseudogenes.

    Transcripts are extracted from the GTF file and designated as
    pseudogenes if:

    * the gene_type or transcript_type contains the phrase
      "pseudo". This information is taken from the database.

    * the feature is 'processed_transcript' and has similarity to
      protein coding genes. Similarity is assessed by aligning the
      transcript and peptide set against each other with exonerate_.

    Pseudogenic transcripts can overlap with protein coding
    transcripts.

    Arguments
    ---------
    infiles : list
       Filenames of ENSEMBL geneset in :term:`gtf` format
       and associated peptide sequences in :term:`fasta` format.
    outfile : filename
       Output in :term:`gtf` format with inferred or annotated
       pseudogenes.
    dbhandle : object
       Database handle for extracting transcript biotypes.
    '''

    infile_gtf, infile_peptides_fasta = infiles

    # JJ - there are also 'nontranslated_CDS', but no explanation of these
    if PARAMS["genome"].startswith("dm"):
        E.warn("Ensembl dm genome annotations only contain source"
               " 'pseudogenes' - skipping exonerate step")
        statement = """zcat %(infile_gtf)s
        |awk '$2 ~ /pseudogene/'
        | gzip
        > %(outfile)s"""
        P.run()
        return

    tmpfile1 = P.getTempFilename(shared=True)

    # collect processed transcripts and save as fasta sequences
    statement = '''
    zcat %(infile_gtf)s
    | awk '$2 ~ /processed/'
    | python %(scriptsdir)s/gff2fasta.py
            --is-gtf
            --genome-file=%(genome_dir)s/%(genome)s
            --log=%(outfile)s.log
    > %(tmpfile1)s
    '''

    P.run()

    if IOTools.isEmpty(tmpfile1):
        E.warn("no pseudogenes found")
        os.unlink(tmpfile1)
        P.touch(outfile)
        return

    model = "protein2dna"

    # map processed transcripts against peptide sequences
    statement = '''
    cat %(tmpfile1)s
    | %(cmd-farm)s --split-at-regex=\"^>(\S+)\" --chunk-size=100
    --log=%(outfile)s.log
    "exonerate --target %%STDIN%%
              --query %(infile_peptides_fasta)s
              --model %(model)s
              --bestn 1
              --score 200
              --ryo \\"%%qi\\\\t%%ti\\\\t%%s\\\\n\\"
              --showalignment no --showsugar no --showcigar no --showvulgar no
    "
    | grep -v -e "exonerate" -e "Hostname"
    | gzip > %(outfile)s.links.gz
    '''

    P.run()

    os.unlink(tmpfile1)

    inf = IOTools.openFile("%s.links.gz" % outfile)
    best_matches = {}
    for line in inf:
        peptide_id, transcript_id, score = line[:-1].split("\t")
        score = int(score)
        if transcript_id in best_matches and \
           best_matches[transcript_id][0] > score:
            continue
        best_matches[transcript_id] = (score, peptide_id)

    inf.close()

    E.info("found %i best links" % len(best_matches))
    new_pseudos = set(best_matches.keys())

    cc = dbhandle.cursor()
    known_pseudos = set([
        x[0] for x in cc.execute("""SELECT DISTINCT transcript_id
        FROM transcript_info
        WHERE transcript_biotype like '%pseudo%' OR
        gene_biotype like '%pseudo%' """)
    ])

    E.info("pseudogenes from: processed_transcripts=%i, known_pseudos=%i, "
           "intersection=%i" %
           ((len(new_pseudos), len(known_pseudos),
             len(new_pseudos.intersection(known_pseudos)))))

    all_pseudos = new_pseudos.union(known_pseudos)

    c = E.Counter()

    outf = IOTools.openFile(outfile, "w")
    inf = GTF.iterator(IOTools.openFile(infile_gtf))
    for gtf in inf:
        c.input += 1
        if gtf.transcript_id not in all_pseudos:
            continue
        c.output += 1
        outf.write("%s\n" % gtf)
    outf.close()

    E.info("exons: %s" % str(c))
Example #12
def buildNUMTs(infile, outfile):
    '''output set of potential nuclear mitochondrial genes (NUMTs).

    This function works by aligning the mitochondrial chromosome
    against genome using exonerate_. This can take a while.

    Arguments
    ---------
    infile : string
       Ignored.
    outfile : filename
       Output in :term:`gtf` format with potential NUMTs.

    '''
    if not PARAMS["numts_mitochrom"]:
        E.info("skipping numts creation")
        P.touch(outfile)
        return

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    if PARAMS["numts_mitochrom"] not in fasta:
        E.warn("mitochondrial genome %s not found" % PARAMS["numts_mitochrom"])
        P.touch(outfile)
        return

    tmpfile_mito = P.getTempFilename(".")

    statement = '''
    python %(scriptsdir)s/index_fasta.py
           --extract=%(numts_mitochrom)s
           --log=%(outfile)s.log
           %(genome_dir)s/%(genome)s
    > %(tmpfile_mito)s
    '''

    P.run()

    if IOTools.isEmpty(tmpfile_mito):
        E.warn("mitochondrial genome empty.")
        os.unlink(tmpfile_mito)
        P.touch(outfile)
        return

    format = ("qi", "qS", "qab", "qae", "ti", "tS", "tab", "tae", "s", "pi",
              "C")

    format = "\\\\t".join(["%%%s" % x for x in format])

    # collect all results
    min_score = 100

    statement = '''
    cat %(genome_dir)s/%(genome)s.fasta
    | %(cmd-farm)s --split-at-regex=\"^>(\S+)\" --chunk-size=1
    --log=%(outfile)s.log
    "exonerate --target %%STDIN%%
              --query %(tmpfile_mito)s
              --model affine:local
              --score %(min_score)i
              --showalignment no --showsugar no --showcigar no
              --showvulgar no
              --ryo \\"%(format)s\\n\\"
    "
    | grep -v -e "exonerate" -e "Hostname"
    | gzip > %(outfile)s.links.gz
    '''

    P.run()

    # convert to gtf
    inf = IOTools.openFile("%s.links.gz" % outfile)
    outf = IOTools.openFile(outfile, "w")

    min_score = PARAMS["numts_score"]

    c = E.Counter()

    for line in inf:
        (query_contig, query_strand, query_start, query_end, target_contig,
         target_strand, target_start, target_end, score, pid,
         alignment) = line[:-1].split("\t")

        c.input += 1
        score = int(score)
        if score < min_score:
            c.skipped += 1
            continue

        if target_strand == "-":
            target_start, target_end = target_end, target_start

        gff = GTF.Entry()
        gff.contig = target_contig
        gff.start, gff.end = int(target_start), int(target_end)
        assert gff.start < gff.end

        gff.strand = target_strand
        gff.score = int(score)
        gff.feature = "numts"
        gff.gene_id = "%s:%s-%s" % (query_contig, query_start, query_end)
        gff.transcript_id = "%s:%s-%s" % (query_contig, query_start, query_end)
        outf.write("%s\n" % str(gff))
        c.output += 1

    inf.close()
    outf.close()

    E.info("filtering numts: %s" % str(c))

    os.unlink(tmpfile_mito)
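The doubly-escaped `format` expression above is easier to follow in isolation; each escape layer survives one round of interpolation or shell quoting. A standalone sketch:

fields = ("qi", "qS", "s")
ryo = "\\\\t".join(["%%%s" % x for x in fields])
print(ryo)  # %qi\\t%qS\\t%s
# P.run() substitutes this string for %(format)s; the shell then strips
# one backslash level inside the quoted exonerate command, so exonerate
# receives --ryo "%qi\t%qS\t%s\n" and emits one tab-separated line
# per alignment.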
Example #13
def runMAST(infiles, outfile):
    '''run mast on all intervals and motifs.

    Collect all results for an E-value up to 10000 so that all
    sequences are output and MAST curves can be computed.

    10000 is a heuristic.

    '''

    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles

    if IOTools.isEmpty(dbfile):
        P.touch(outfile)
        return

    if not os.path.exists(controlfile):
        raise ValueError("control file %s for %s does not exist" %
                         (controlfile, dbfile))

    # remove previous results
    if os.path.exists(outfile):
        os.remove(outfile)

    tmpdir = P.getTempDir(".")
    tmpfile = P.getTempFilename(".")

    for motiffile in motiffiles:
        if IOTools.isEmpty(motiffile):
            L.info("skipping empty motif file %s" % motiffile)
            continue

        of = IOTools.openFile(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - foreground ::\n" % motif)
        of.close()

        # mast bails if the number of nucleotides gets larger than
        # 2186800982?
        # To avoid this, run db and control file separately.
        statement = '''
        cat %(dbfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run()

        of = IOTools.openFile(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - background ::\n" % motif)
        of.close()

        statement = '''
        cat %(controlfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run()

    statement = "gzip < %(tmpfile)s > %(outfile)s"
    P.run()

    shutil.rmtree(tmpdir)
    os.unlink(tmpfile)
Example #14
def BedFileVenn(infiles, outfile):
    '''merge :term:`bed` formatted *infiles* by intersection
    and write to *outfile*.

    Only intervals that overlap in all files are retained.
    Interval coordinates are given by the first file in *infiles*.

    Bed files are normalized (overlapping intervals within 
    a file are merged) before intersection. 

    Intervals are renumbered starting from 1.
    '''
    liver, testes = infiles
    liver_name = P.snip(os.path.basename(liver), ".replicated.bed")
    testes_name = P.snip(os.path.basename(testes), ".replicated.bed")
    to_cluster = True

    statement = '''cat %(liver)s %(testes)s | mergeBed -i stdin | awk 'OFS="\\t" {print $1,$2,$3,"CAPseq"NR}' > replicated_intervals/liver.testes.merge.bed;
                   echo "Total merged intervals" > %(outfile)s; cat replicated_intervals/liver.testes.merge.bed | wc -l >> %(outfile)s; 
                   echo "Liver & testes" >> %(outfile)s; intersectBed -a replicated_intervals/liver.testes.merge.bed -b %(liver)s -u | intersectBed -a stdin -b %(testes)s -u > replicated_intervals/liver.testes.shared.bed; cat replicated_intervals/liver.testes.shared.bed | wc -l >> %(outfile)s; 
                   echo "Testes only" >> %(outfile)s; intersectBed -a replicated_intervals/liver.testes.merge.bed -b %(liver)s -v > replicated_intervals/%(testes_name)s.liver.testes.unique.bed; cat replicated_intervals/%(testes_name)s.liver.testes.unique.bed | wc -l >> %(outfile)s; 
                   echo "Liver only" >> %(outfile)s; intersectBed -a replicated_intervals/liver.testes.merge.bed -b %(testes)s -v > replicated_intervals/%(liver_name)s.liver.testes.unique.bed; cat replicated_intervals/%(liver_name)s.liver.testes.unique.bed | wc -l >> %(outfile)s;                   
                   sed -i '{N;s/\\n/\\t/g}' %(outfile)s; '''

    if len(infiles) == 1:
        shutil.copyfile(infiles[0], outfile)

    elif len(infiles) == 2:

        if IOTools.isEmpty(infiles[0]) or IOTools.isEmpty(infiles[1]):
            P.touch(outfile)
        else:
            statement = '''
        intersectBed -u -a %s -b %s 
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        > %%(outfile)s 
        ''' % (infiles[0], infiles[1])
            P.run()

    else:

        tmpfile = P.getTempFilename(".")

        # need to merge incrementally
        fn = infiles[0]
        if IOTools.isEmpty(infiles[0]):
            P.touch(outfile)
            return

        statement = '''mergeBed -i %(fn)s > %(tmpfile)s'''
        P.run()

        for fn in infiles[1:]:
            if IOTools.isEmpty(fn):
                P.touch(outfile)
                os.unlink(tmpfile)
                return

            statement = '''mergeBed -i %(fn)s | intersectBed -u -a %(tmpfile)s -b stdin > %(tmpfile)s.tmp; mv %(tmpfile)s.tmp %(tmpfile)s'''
            P.run()

        statement = '''cat %(tmpfile)s
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        > %(outfile)s '''
        P.run()

        os.unlink(tmpfile)
Example #15
def loadZinba(infile, outfile, bamfile, tablename=None, controlfile=None):
    '''load Zinba results in *tablename*

    This method loads only positive peaks. It filters peaks by p-value,
    q-value and fold change and loads the diagnostic data and
    re-calculates peakcenter, peakval, ... using the supplied bamfile.

    If *tablename* is not given, it will be :file:`<track>_intervals`
    where track is derived from ``infile`` and assumed to end
    in :file:`.zinba`.

    If no peaks were predicted, an empty table is created.

    This method creates :file:`<outfile>.tsv.gz` with the results
    of the filtering.

    This method uses the refined peak locations.

    Zinba peaks can be overlapping. This method does not merge
    overlapping intervals.

    Zinba calls peaks in regions where there are many reads inside
    the control. Thus this method applies a filtering step that
    removes all intervals in which the control contains a peak
    higher than readlength / 2.

    .. note::

       Zinba calls peaks that are overlapping.

    '''

    track = P.snip(os.path.basename(infile), ".zinba")
    folder = os.path.dirname(infile)

    infilename = infile + ".peaks"

    outtemp = P.getTempFile(".")
    tmpfilename = outtemp.name

    outtemp.write("\t".join((
        "interval_id",
        "contig",
        "start",
        "end",
        "npeaks",
        "peakcenter",
        "length",
        "avgval",
        "peakval",
        "nprobes",
        "pvalue",
        "fold",
        "qvalue",
        "macs_summit",
        "macs_nprobes",
    )) + "\n")

    counter = E.Counter()

    if not os.path.exists(infilename):
        E.warn("could not find %s" % infilename)
    elif IOTools.isEmpty(infilename):
        E.warn("no data in %s" % infilename)
    else:
        # filter peaks
        shift = getPeakShiftFromZinba(infile)
        assert shift is not None, \
            "could not determine peak shift from Zinba file %s" % infile

        E.info("%s: found peak shift of %i" % (track, shift))

        samfiles = [pysam.Samfile(bamfile, "rb")]
        offsets = [shift / 2]

        if controlfile:
            controlfiles = [pysam.Samfile(controlfile, "rb")]
            readlength = BamTools.estimateTagSize(controlfile)
            control_max_peakval = readlength // 2
            E.info(
                "removing intervals in which control has peak higher than %i reads"
                % control_max_peakval)
        else:
            controlfiles = None

        id = 0

        # get thresholds
        max_qvalue = float(PARAMS["zinba_fdr_threshold"])

        with IOTools.openFile(infilename, "r") as ins:
            for peak in WrapperZinba.iteratePeaks(ins):

                # filter by qvalue
                if peak.fdr > max_qvalue:
                    counter.removed_qvalue += 1
                    continue

                assert peak.refined_start < peak.refined_end

                # filter by control
                if controlfiles:
                    npeaks, peakcenter, length, avgval, peakval, nreads = countPeaks(
                        peak.contig, peak.refined_start, peak.refined_end,
                        controlfiles, offsets)

                    if peakval > control_max_peakval:
                        counter.removed_control += 1
                        continue

                # output peak
                npeaks, peakcenter, length, avgval, peakval, nreads = countPeaks(
                    peak.contig, peak.refined_start, peak.refined_end,
                    samfiles, offsets)

                outtemp.write("\t".join(
                    map(str, (id, peak.contig, peak.refined_start,
                              peak.refined_end, npeaks, peakcenter, length,
                              avgval, peakval, nreads, 1.0 - peak.posterior,
                              1.0, peak.fdr, peak.refined_start + peak.summit -
                              1, peak.height))) + "\n")
                id += 1
                counter.output += 1

    outtemp.close()

    # output filtering summary
    outf = IOTools.openFile("%s.tsv.gz" % outfile, "w")
    outf.write("category\tcounts\n")
    outf.write("%s\n" % counter.asTable())
    outf.close()

    E.info("%s filtering: %s" % (track, str(counter)))
    if counter.output == 0:
        E.warn("%s: no peaks found" % track)

    # load data into table
    if tablename is None:
        tablename = "%s_intervals" % track

    statement = '''
    python %(scriptsdir)s/csv2db.py %(csv2db_options)s 
              --allow-empty-file
              --add-index=interval_id 
              --add-index=contig,start
              --table=%(tablename)s 
    < %(tmpfilename)s 
    > %(outfile)s
    '''

    P.run()

    os.unlink(tmpfilename)
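A hypothetical wiring of this loader into a ruffus pipeline; the suffixes and the bam file naming scheme are invented for illustration:

from ruffus import transform, suffix

@transform("*.zinba", suffix(".zinba"), ".zinba.load")
def loadZinbaPeaks(infile, outfile):
    bamfile = P.snip(infile, ".zinba") + ".bam"  # assumed naming scheme
    loadZinba(infile, outfile, bamfile)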
Example #16
def buildNUMTs(infile, outfile):
    '''output set of potential nuclear mitochondrial genes (NUMTs).

    This function works by aligning the mitochondrial chromosome
    against genome using exonerate_. This can take a while.

    Arguments
    ---------
    infile : string
       Ignored.
    outfile : filename
       Output in :term:`gtf` format with potential NUMTs.

    '''
    if not PARAMS["numts_mitochrom"]:
        E.info("skipping numts creation")
        P.touch(outfile)
        return

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    if PARAMS["numts_mitochrom"] not in fasta:
        E.warn("mitochondrial genome %s not found" % PARAMS["numts_mitochrom"])
        P.touch(outfile)
        return

    tmpfile_mito = P.getTempFilename(".")

    statement = '''
    python %(scriptsdir)s/index_fasta.py
           --extract=%(numts_mitochrom)s
           --log=%(outfile)s.log
           %(genome_dir)s/%(genome)s
    > %(tmpfile_mito)s
    '''

    P.run()

    if IOTools.isEmpty(tmpfile_mito):
        E.warn("mitochondrial genome empty.")
        os.unlink(tmpfile_mito)
        P.touch(outfile)
        return

    format = ("qi", "qS", "qab", "qae",
              "ti", "tS", "tab", "tae",
              "s",
              "pi",
              "C")

    format = "\\\\t".join(["%%%s" % x for x in format])

    # collect all results
    min_score = 100

    statement = '''
    cat %(genome_dir)s/%(genome)s.fasta
    | %(cmd-farm)s --split-at-regex=\"^>(\S+)\" --chunk-size=1
    --log=%(outfile)s.log
    "exonerate --target %%STDIN%%
              --query %(tmpfile_mito)s
              --model affine:local
              --score %(min_score)i
              --showalignment no --showsugar no --showcigar no
              --showvulgar no
              --ryo \\"%(format)s\\n\\"
    "
    | grep -v -e "exonerate" -e "Hostname"
    | gzip > %(outfile)s.links.gz
    '''

    P.run()

    # convert to gtf
    inf = IOTools.openFile("%s.links.gz" % outfile)
    outf = IOTools.openFile(outfile, "w")

    min_score = PARAMS["numts_score"]

    c = E.Counter()

    for line in inf:
        (query_contig, query_strand, query_start, query_end,
         target_contig, target_strand, target_start, target_end,
         score, pid, alignment) = line[:-1].split("\t")

        c.input += 1
        score = int(score)
        if score < min_score:
            c.skipped += 1
            continue

        if target_strand == "-":
            target_start, target_end = target_end, target_start

        gff = GTF.Entry()
        gff.contig = target_contig
        gff.start, gff.end = int(target_start), int(target_end)
        assert gff.start < gff.end

        gff.strand = target_strand
        gff.score = int(score)
        gff.feature = "numts"
        gff.gene_id = "%s:%s-%s" % (query_contig, query_start, query_end)
        gff.transcript_id = "%s:%s-%s" % (query_contig, query_start, query_end)
        outf.write("%s\n" % str(gff))
        c.output += 1

    inf.close()
    outf.close()

    E.info("filtering numts: %s" % str(c))

    os.unlink(tmpfile_mito)
Example #17
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--set-nh", dest="set_nh", action="store_true",
                      help="sets the NH flag. The file needs to be "
                      "sorted by readname [%default]")

    parser.add_option("--unset-unmapped-mapq", dest="unset_unmapped_mapq",
                      action="store_true",
                      help="sets the mapping quality of unmapped "
                      "reads to 0 [%default]")

    parser.add_option("--set-sequence", dest="set_sequence",
                      action="store_true",
                      help="sets the sequence to 'A's (a valid base) and "
                      "the quality to 'F's "
                      ",which is defined in all fastq scoring schemes "
                      "[%default]")

    parser.add_option("--strip", dest="strip", type="choice",
                      choices=("sequence", "quality", "match"),
                      help = "remove parts of the bam-file. Note that "
                      "stripping the sequence will "
                      "also strip the quality values [%default]")

    parser.add_option("--unstrip", dest="unstrip", action="store_true",
                      help="add sequence and quality into bam file [%default]")

    parser.add_option("--filter", dest="filter",
                      action="append", type="choice",
                      choices=('NM', 'CM', 'mapped', 'unique', "non-unique"),
                      help = "filter bam file. The option denotes "
                      "the property that is  "
                      "used to determine better match [%default]")

    parser.add_option("--reference-bam", dest="reference_bam", type="string",
                      help="bam-file to filter with [%default]")

    parser.add_option("--force", dest="force", action="store_true",
                      help="force processing. Some methods such "
                      "as strip/unstrip will stop processing if "
                      "they think it not necessary "
                      "[%default]")

    parser.add_option("--sam", dest="output_sam", action="store_true",
                      help="output in sam format [%default]")

    parser.add_option("--inplace", dest="inplace", action="store_true",
                      help="modify bam files in-place. Bam files need "
                      "to be given "
                      "as arguments. Temporary bam files are written "
                      "to /tmp [%default]")

    parser.add_option("--fastq1", "-1", dest="fastq_pair1", type="string",
                      help="fastq file with read information for first "
                      "in pair or unpaired [%default]")

    parser.add_option("--fastq2", "-2", dest="fastq_pair2", type="string",
                      help="fastq file with read information for second "
                      "in pair [%default]")

    parser.add_option("--keep-first-base", dest="keep_first_base",
                      action="store_true",
                      help="keep first base of reads such that gtf2table.py "
                      "will only consider the "
                      "first base in its counts.")

    parser.set_defaults(
        filter=[],
        set_nh=False,
        unset_unmapped_mapq=False,
        output_sam=False,
        reference_bam=None,
        strip=None,
        unstrip=None,
        force=False,
        set_sequence=False,
        inplace=False,
        fastq_pair1=None,
        fastq_pair2=None,
        keep_first_base=False
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    bamfiles = []

    if options.stdin != sys.stdin:
        bamfiles.append(options.stdin.name)

    if options.inplace:
        bamfiles.extend(args)
        if len(bamfiles) == 0:
            raise ValueError(
                "please provide one or more bam-files as command line arguments")

        if "-" in bamfiles:
            raise ValueError(
                "can not read from stdin if ``--inplace`` is selected")

    if len(bamfiles) == 0:
        bamfiles = ["-"]

    for bamfile in bamfiles:

        E.info('processing %s' % bamfile)

        if os.path.islink(bamfile):
            E.warn('ignoring link %s' % bamfile)
            continue

        if IOTools.isEmpty(bamfile):
            E.warn('ignoring empty file %s' % bamfile)
            continue

        # reading bam from stdin does not work with only the "r" tag
        pysam_in = pysam.Samfile(bamfile, "rb")

        if bamfile == "-":
            if options.output_sam:
                pysam_out = pysam.Samfile("-", "wh", template=pysam_in)
            else:
                pysam_out = pysam.Samfile("-", "wb", template=pysam_in)
        else:
            if IOTools.isEmpty(bamfile):
                E.warn('skipping empty file %s' % bamfile)
                continue
            tmpfile = tempfile.NamedTemporaryFile(delete=False, prefix="ctmp")
            tmpfile.close()

            E.debug("writing temporary bam-file to %s" % tmpfile.name)
            pysam_out = pysam.Samfile(tmpfile.name, "wb", template=pysam_in)

        if options.filter:

            remove_mismatches, colour_mismatches = False, False

            if "NM" in options.filter:
                remove_mismatches = True

            elif "CM" in options.filter:
                remove_mismatches = True
                colour_mismatches = True

            if remove_mismatches:
                if not options.reference_bam:
                    raise ValueError(
                        "requiring reference bam file for removing by "
                        "mismatches")

                pysam_ref = pysam.Samfile(options.reference_bam, "rb")
            else:
                pysam_ref = None

            # filter and flags are the opposite way around
            c = _bam2bam.filter_bam(
                pysam_in, pysam_out, pysam_ref,
                remove_nonunique="unique" in options.filter,
                remove_unique="non-unique" in options.filter,
                remove_contigs=None,
                remove_unmapped="mapped" in options.filter,
                remove_mismatches=remove_mismatches,
                colour_mismatches=colour_mismatches)

            options.stdlog.write("category\tcounts\n%s\n" % c.asTable())
        else:

            # set up the modifying iterators
            it = pysam_in.fetch(until_eof=True)

            # function to check if processing should start
            pre_check_f = lambda x: None

            if options.unset_unmapped_mapq:
                def unset_unmapped_mapq(i):
                    for read in i:
                        if read.is_unmapped:
                            read.mapq = 0
                        yield read
                it = unset_unmapped_mapq(it)

            if options.set_nh and False:
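                # note: this branch is deliberately disabled by "and False";
                # NH tags are instead set via _bam2bam.SetNH further below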
                def set_nh(i):

                    for key, reads in itertools.groupby(i, lambda x: x.qname):
                        l = list(reads)
                        nh = len(l)
                        for read in l:
                            if not read.is_unmapped:
                                t = dict(read.tags)
                                t['NH'] = nh
                                read.tags = list(t.iteritems())
                            yield read
                it = set_nh(it)

            if options.set_sequence:
                def set_sequence(i):
                    for read in i:
                        # can't get at length of unmapped reads
                        if read.is_unmapped:
                            read.seq = "A"
                            read.qual = "F"
                        else:
                            read.seq = "A" * read.inferred_length
                            read.qual = "F" * read.inferred_length

                        yield read
                it = set_sequence(it)

            if options.strip is not None:
                def strip_sequence(i):
                    for read in i:
                        read.seq = None
                        yield read

                def check_sequence(reads):
                    if reads[0].seq is None:
                        return 'no sequence present'
                    return None

                def strip_quality(i):
                    for read in i:
                        read.qual = None
                        yield read

                def check_quality(reads):
                    if reads[0].qual is None:
                        return 'no quality information present'
                    return None

                def strip_match(i):
                    for read in i:
                        try:
                            nm = read.opt('NM')
                        except KeyError:
                            nm = 1
                        if nm == 0:
                            read.seq = None
                        yield read

                if options.strip == "sequence":
                    it = strip_sequence(it)
                    pre_check_f = check_sequence
                elif options.strip == "quality":
                    it = strip_quality(it)
                    pre_check_f = check_quality
                elif options.strip == "match":
                    it = strip_match(it)

            if options.unstrip:
                def buildReadDictionary(filename):
                    if not os.path.exists(filename):
                        raise OSError("file not found: %s" % filename)
                    fastqfile = pysam.Fastqfile(filename)
                    fastq2sequence = {}
                    for x in fastqfile:
                        if x.name in fastq2sequence:
                            raise ValueError(
                                "read %s duplicate - can not unstrip" % x.name)

                        fastq2sequence[x.name] = (x.sequence, x.quality)
                    return fastq2sequence

                if not options.fastq_pair1:
                    raise ValueError(
                        "please supply fastq file(s) for unstripping")
                fastq2sequence1 = buildReadDictionary(options.fastq_pair1)
                if options.fastq_pair2:
                    fastq2sequence2 = buildReadDictionary(options.fastq_pair2)

                def unstrip_unpaired(i):
                    for read in i:
                        read.seq, read.qual = fastq2sequence1[read.qname]
                        yield read

                def unstrip_pair(i):
                    for read in i:
                        if read.is_read1:
                            read.seq, read.qual = fastq2sequence1[read.qname]
                        else:
                            read.seq, read.qual = fastq2sequence2[read.qname]
                        yield read

                if options.fastq_pair2:
                    it = unstrip_pair(it)
                else:
                    it = unstrip_unpaired(it)

            if options.set_nh:
                it = _bam2bam.SetNH(it)

            # keep first base of reads by changing the cigarstring to
            # '1M' and, in reads mapping to the reverse strand,
            # changes the pos to aend - 1
            if options.keep_first_base:
                def keep_first_base(i):
                    for read in i:
                        if read.is_reverse:
                            read.pos = read.aend - 1
                            read.cigarstring = '1M'
                        elif not read.is_unmapped:
                            read.cigarstring = '1M'
                        yield read
                it = keep_first_base(it)

            # read first read and check if processing should continue
            # only possible when not working from stdin
            if bamfile != "-":
                # get first read for checking pre-conditions
                first_reads = list(pysam_in.head(1))

                msg = pre_check_f(first_reads)
                if msg is not None:
                    if options.force:
                        E.warn('processing continues, though: %s' % msg)
                    else:
                        E.warn('processing not started: %s' % msg)
                        pysam_in.close()
                        pysam_out.close()
                        continue

            # continue processing till end
            for read in it:
                pysam_out.write(read)

            pysam_in.close()
            pysam_out.close()

        if options.inplace:
            # set date and file permissions according to original
            # Note: currently it will not update user and group.
            original = os.stat(bamfile)
            os.utime(tmpfile.name, (original.st_atime, original.st_mtime))
            os.chmod(tmpfile.name, original.st_mode)
            # move new file over original copy
            shutil.move(tmpfile.name, bamfile)
            # re-index
            pysam.index(bamfile)

    # write footer and output benchmark information.
    E.Stop()
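Hypothetical invocations of this script (file names invented; the option names match those defined above):

# python bam2bam.py --strip=sequence < in.bam > stripped.bam
# python bam2bam.py --inplace --unset-unmapped-mapq sample1.bam sample2.bam
# python bam2bam.py --unstrip --fastq1 reads_1.fastq --fastq2 reads_2.fastq \
#     < stripped.bam > restored.bam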
Example #18
def buildPseudogenes(infiles, outfile, dbhandle):
    '''build a set of pseudogenes.

    Transcripts are extracted from the GTF file and designated as
    pseudogenes if:

    * the gene_type or transcript_type contains the phrase
      "pseudo". This information is taken from the database.

    * the feature is 'processed_transcript' and has similarity to
      protein coding genes. Similarity is assessed by aligning the
      transcript and peptide set against each other with exonerate_.

    Pseudogenic transcripts can overlap with protein coding
    transcripts.

    Arguments
    ---------
    infiles : list
       Filenames of ENSEMBL geneset in :term:`gtf` format
       and associated peptide sequences in :term:`fasta` format.
    outfile : filename
       Output in :term:`gtf` format with inferred or annotated
       pseudogenes.
    dbhandle : object
       Database handle for extracting transcript biotypes.
    '''

    infile_gtf, infile_peptides_fasta = infiles

    # JJ - there are also 'nontranslated_CDS', but no explanation of these
    if PARAMS["genome"].startswith("dm"):
        E.warn("Ensembl dm genome annotations only contain source"
               " 'pseudogenes' - skipping exonerate step")
        statement = """zcat %(infile_gtf)s
        |awk '$2 ~ /pseudogene/'
        | gzip
        > %(outfile)s"""
        P.run()
        return

    tmpfile1 = P.getTempFilename(shared=True)

    # collect processed transcripts and save as fasta sequences
    statement = '''
    zcat %(infile_gtf)s
    | awk '$2 ~ /processed/'
    | python %(scriptsdir)s/gff2fasta.py
            --is-gtf
            --genome-file=%(genome_dir)s/%(genome)s
            --log=%(outfile)s.log
    > %(tmpfile1)s
    '''

    P.run()

    if IOTools.isEmpty(tmpfile1):
        E.warn("no pseudogenes found")
        os.unlink(tmpfile1)
        P.touch(outfile)
        return

    model = "protein2dna"

    # map processed transcripts against peptide sequences
    statement = '''
    cat %(tmpfile1)s
    | %(cmd-farm)s --split-at-regex=\"^>(\S+)\" --chunk-size=100
    --log=%(outfile)s.log
    "exonerate --target %%STDIN%%
              --query %(infile_peptides_fasta)s
              --model %(model)s
              --bestn 1
              --score 200
              --ryo \\"%%qi\\\\t%%ti\\\\t%%s\\\\n\\"
              --showalignment no --showsugar no --showcigar no --showvulgar no
    "
    | grep -v -e "exonerate" -e "Hostname"
    | gzip > %(outfile)s.links.gz
    '''

    P.run()

    os.unlink(tmpfile1)

    inf = IOTools.openFile("%s.links.gz" % outfile)
    best_matches = {}
    for line in inf:
        peptide_id, transcript_id, score = line[:-1].split("\t")
        score = int(score)
        if transcript_id in best_matches and \
           best_matches[transcript_id][0] > score:
            continue
        best_matches[transcript_id] = (score, peptide_id)

    inf.close()

    E.info("found %i best links" % len(best_matches))
    new_pseudos = set(best_matches.keys())

    cc = dbhandle.cursor()
    known_pseudos = set([x[0] for x in cc.execute(
        """SELECT DISTINCT transcript_id
        FROM transcript_info
        WHERE transcript_biotype like '%pseudo%' OR
        gene_biotype like '%pseudo%' """)])

    E.info("pseudogenes from: processed_transcripts=%i, known_pseudos=%i, "
           "intersection=%i" % (
               (len(new_pseudos),
                len(known_pseudos),
                len(new_pseudos.intersection(known_pseudos)))))

    all_pseudos = new_pseudos.union(known_pseudos)

    c = E.Counter()

    outf = IOTools.openFile(outfile, "w")
    inf = GTF.iterator(IOTools.openFile(infile_gtf))
    for gtf in inf:
        c.input += 1
        if gtf.transcript_id not in all_pseudos:
            continue
        c.output += 1
        outf.write("%s\n" % gtf)
    outf.close()

    E.info("exons: %s" % str(c))
Example #19
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--methods",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=("filter", "keep-first-base", "set-nh",
                               "set-sequence", "strip-sequence",
                               "strip-quality", "unstrip",
                               "unset-unmapped-mapq", "downsample-single",
                               "downsample-paired"),
                      help="methods to apply [%default]")

    parser.add_option("--strip-method",
                      dest="strip_method",
                      type="choice",
                      choices=("all", "match"),
                      help="define which sequences/qualities to strip. "
                      "match means that stripping only applies to entries "
                      "without mismatches (requires NM tag to be present). "
                      "[%default]")

    parser.add_option("--filter-method",
                      dest="filter_methods",
                      action="append",
                      type="choice",
                      choices=('NM', 'CM', 'mapped', 'unique', "non-unique"),
                      help="filter method to apply to remove alignments "
                      "from a bam file. Multiple methods can be supplied "
                      "[%default]")

    parser.add_option("--reference-bam-file",
                      dest="reference_bam",
                      type="string",
                      help="bam-file to filter with [%default]")

    parser.add_option("--force-output",
                      dest="force",
                      action="store_true",
                      help="force processing. Some methods such "
                      "as strip/unstrip will stop processing if "
                      "they think it not necessary "
                      "[%default]")

    parser.add_option("--output-sam",
                      dest="output_sam",
                      action="store_true",
                      help="output in sam format [%default]")

    parser.add_option("--inplace",
                      dest="inplace",
                      action="store_true",
                      help="modify bam files in-place. Bam files need "
                      "to be given "
                      "as arguments. Temporary bam files are written "
                      "to /tmp [%default]")

    parser.add_option("--first-fastq-file",
                      "-1",
                      dest="fastq_pair1",
                      type="string",
                      help="fastq file with read information for first "
                      "in pair or unpaired. Used for unstripping sequence "
                      "and quality scores [%default]")

    parser.add_option("--second-fastq-file",
                      "-2",
                      dest="fastq_pair2",
                      type="string",
                      help="fastq file with read information for second "
                      "in pair. Used for unstripping sequence "
                      "and quality scores  [%default]")

    parser.add_option("--downsample",
                      dest="downsample",
                      type="int",
                      help="Number of reads to downsample to")

    parser.set_defaults(methods=[],
                        output_sam=False,
                        reference_bam=None,
                        filter_methods=[],
                        strip_method="all",
                        force=False,
                        inplace=False,
                        fastq_pair1=None,
                        fastq_pair2=None,
                        downsample=None,
                        random_seed=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)
    # random.seed(options.random_seed)
    bamfiles = []

    if options.stdin != sys.stdin:
        from_stdin = True
        bamfiles.append(options.stdin.name)
    else:
        from_stdin = False

    if options.inplace:
        bamfiles.extend(args)
        if len(bamfiles) == 0:
            raise ValueError(
                "please provide one or more bam-files as command line arguments")

        if "-" in bamfiles:
            raise ValueError(
                "can not read from stdin if ``--inplace`` is selected")

    if len(bamfiles) == 0:
        bamfiles = ["-"]

    to_stdout = False

    for bamfile in bamfiles:

        E.info('processing %s' % bamfile)

        if os.path.islink(bamfile):
            E.warn('ignoring link %s' % bamfile)
            continue

        if IOTools.isEmpty(bamfile):
            E.warn('ignoring empty file %s' % bamfile)
            continue

        # reading bam from stdin does not work with only the "r" tag
        pysam_in = pysam.AlignmentFile(bamfile, "rb")
        if bamfile == "-" or (from_stdin and bamfile == options.stdin.name):
            to_stdout = True
            if options.output_sam:
                pysam_out = pysam.AlignmentFile("-", "wh", template=pysam_in)
            else:
                pysam_out = pysam.AlignmentFile("-", "wb", template=pysam_in)
        else:
            if IOTools.isEmpty(bamfile):
                E.warn('skipping empty file %s' % bamfile)
                continue
            tmpfile = tempfile.NamedTemporaryFile(delete=False, prefix="ctmp")
            tmpfile.close()

            E.debug("writing temporary bam-file to %s" % tmpfile.name)
            pysam_out = pysam.AlignmentFile(tmpfile.name,
                                            "wb",
                                            template=pysam_in)

        if "filter" in options.methods:

            remove_mismatches, colour_mismatches = False, False

            if "NM" in options.filter_methods:
                remove_mismatches = True

            elif "CM" in options.filter_methods:
                remove_mismatches = True
                colour_mismatches = True

            if remove_mismatches:
                if not options.reference_bam:
                    raise ValueError(
                        "requiring reference bam file for removing by "
                        "mismatches")

                pysam_ref = pysam.AlignmentFile(options.reference_bam, "rb")
            else:
                pysam_ref = None

            # filter and flags are the opposite way around
            c = _bam2bam.filter_bam(pysam_in,
                                    pysam_out,
                                    pysam_ref,
                                    remove_nonunique="unique"
                                    in options.filter_methods,
                                    remove_unique="non-unique"
                                    in options.filter_methods,
                                    remove_contigs=None,
                                    remove_unmapped="mapped"
                                    in options.filter_methods,
                                    remove_mismatches=remove_mismatches,
                                    colour_mismatches=colour_mismatches)

            if pysam_ref:
                pysam_ref.close()

            # do not write to stdlog in the middle of a SAM/BAM stdout stream.
            if options.stdlog != options.stdout:
                E.info("category\tcounts\n%s\n" % c.asTable())
        else:

            # set up the modifying iterators
            it = pysam_in.fetch(until_eof=True)

            # function to check if processing should start
            pre_check_f = lambda x: None

            if "unset-unmapped-mapq" in options.methods:

                def unset_unmapped_mapq(i):
                    for read in i:
                        if read.is_unmapped:
                            read.mapq = 0
                        yield read

                it = unset_unmapped_mapq(it)

            if "set-sequence" in options.methods:

                def set_sequence(i):
                    for read in i:
                        # can't get at length of unmapped reads
                        if read.is_unmapped:
                            read.seq = "A"
                            read.qual = "F"
                        else:
                            read.seq = "A" * read.inferred_length
                            read.qual = "F" * read.inferred_length

                        yield read

                it = set_sequence(it)

            if "strip-sequence" in options.methods or "strip-quality" in \
               options.methods:

                def strip_sequence(i):
                    for read in i:
                        read.seq = None
                        yield read

                def check_sequence(reads):
                    if reads[0].seq is None:
                        return 'no sequence present'
                    return None

                def strip_quality(i):
                    for read in i:
                        read.qual = None
                        yield read

                def check_quality(reads):
                    if reads[0].qual is None:
                        return 'no quality information present'
                    return None

                def strip_match(i):
                    for read in i:
                        try:
                            nm = read.opt('NM')
                        except KeyError:
                            nm = 1
                        if nm == 0:
                            read.seq = None
                        yield read

                if options.strip_method == "all":
                    if "strip-sequence" in options.methods:
                        it = strip_sequence(it)
                        pre_check_f = check_sequence
                    elif "strip-quality" in options.methods:
                        it = strip_quality(it)
                        pre_check_f = check_quality
                elif options.strip_method == "match":
                    it = strip_match(it)

            if "unstrip" in options.methods:

                def buildReadDictionary(filename):
                    if not os.path.exists(filename):
                        raise OSError("file not found: %s" % filename)
                    fastqfile = pysam.FastxFile(filename)
                    fastq2sequence = {}
                    for x in fastqfile:
                        if x.name in fastq2sequence:
                            raise ValueError(
                                "read %s duplicate - can not unstrip" % x.name)

                        fastq2sequence[x.name] = (x.sequence, x.quality)
                    return fastq2sequence

                if not options.fastq_pair1:
                    raise ValueError(
                        "please supply fastq file(s) for unstripping")
                fastq2sequence1 = buildReadDictionary(options.fastq_pair1)
                if options.fastq_pair2:
                    fastq2sequence2 = buildReadDictionary(options.fastq_pair2)

                def unstrip_unpaired(i):
                    for read in i:
                        read.seq, read.qual = fastq2sequence1[read.qname]
                        yield read

                def unstrip_pair(i):
                    for read in i:
                        if read.is_read1:
                            read.seq, read.qual = fastq2sequence1[read.qname]
                        else:
                            read.seq, read.qual = fastq2sequence2[read.qname]
                        yield read

                if options.fastq_pair2:
                    it = unstrip_pair(it)
                else:
                    it = unstrip_unpaired(it)

            if "set-nh" in options.methods:
                it = _bam2bam.SetNH(it)

            # keep the first base of each read by changing the cigar
            # string to '1M' and, for reads mapping to the reverse
            # strand, setting pos to aend - 1.
            # Needs to be refactored to make it more general
            # (last base, midpoint, ...)
            if "keep-first-base" in options.methods:

                def keep_first_base(i):
                    for read in i:
                        if read.is_reverse:
                            read.pos = read.aend - 1
                            read.cigarstring = '1M'
                        elif not read.is_unmapped:
                            read.cigarstring = '1M'
                        yield read

                it = keep_first_base(it)

            # read first read and check if processing should continue
            # only possible when not working from stdin
            # Refactoring: use cache to also do a pre-check for
            # stdin input.
            if bamfile != "-":
                # get first read for checking pre-conditions
                first_reads = list(pysam_in.head(1))

                msg = pre_check_f(first_reads)
                if msg is not None:
                    if options.force:
                        E.warn('processing continues, though: %s' % msg)
                    else:
                        E.warn('processing not started: %s' % msg)
                        pysam_in.close()
                        pysam_out.close()
                        continue

            if "downsample-single" in options.methods:

                if not options.downsample:
                    raise ValueError("Please provide downsample size")

                else:
                    down = SubsetBam(pysam_in=it,
                                     downsample=options.downsample,
                                     paired_end=None,
                                     single_end=True,
                                     random_seed=options.random_seed)
                    it = down.downsample_single()

            if "downsample-paired" in options.methods:

                if not options.downsample:
                    raise ValueError("Please provide downsample size")

                else:
                    down = SubsetBam(pysam_in=it,
                                     downsample=options.downsample,
                                     paired_end=True,
                                     single_end=None,
                                     random_seed=options.random_seed)
                    it = down.downsample_paired()

            # continue processing till end
            for read in it:
                pysam_out.write(read)

        pysam_in.close()
        pysam_out.close()

        if options.inplace:
            # set date and file permissions according to original
            # Note: currently it will not update user and group.
            original = os.stat(bamfile)
            os.utime(tmpfile.name, (original.st_atime, original.st_mtime))
            os.chmod(tmpfile.name, original.st_mode)
            # move new file over original copy
            shutil.move(tmpfile.name, bamfile)
            # re-index
            pysam.index(bamfile)

    # write footer and output benchmark information.
    E.Stop()
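The example above builds its BAM transformations as a chain of small
generator functions over a pysam read iterator. Below is a minimal,
self-contained sketch of that pattern; the file names are illustrative
assumptions, and the legacy pysam attribute names (seq, mapq) follow the
example above.

import pysam

def unset_unmapped_mapq(reads):
    # set MAPQ to 0 for unmapped reads, pass all reads through
    for read in reads:
        if read.is_unmapped:
            read.mapq = 0
        yield read

def strip_sequence(reads):
    # drop the sequence; in pysam this also discards the quality string
    for read in reads:
        read.seq = None
        yield read

with pysam.AlignmentFile("input.bam", "rb") as bam_in:
    with pysam.AlignmentFile("output.bam", "wb", template=bam_in) as bam_out:
        it = bam_in.fetch(until_eof=True)
        it = unset_unmapped_mapq(it)  # each method wraps the previous iterator
        it = strip_sequence(it)
        for read in it:
            bam_out.write(read)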
Example #20
0
File: bam2bam.py Project: SCV/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--methods", dest="methods", type="choice",
                      action="append",
                      choices=("filter",
                               "keep-first-base",
                               "set-nh",
                               "set-sequence",
                               "strip-sequence",
                               "strip-quality",
                               "unstrip",
                               "unset-unmapped-mapq"),
                      help="methods to apply [%default]")

    parser.add_option("--strip-method", dest="strip_method", type="choice",
                      choices=("all", "match"),
                      help="define which sequences/qualities to strip. "
                      "match means that stripping only applies to entries "
                      "without mismatches (requires NM tag to be present). "
                      "[%default]")

    parser.add_option("--filter-method", dest="filter_methods",
                      action="append", type="choice",
                      choices=('NM', 'CM', 'mapped', 'unique', "non-unique"),
                      help="filter method to apply to remove alignments "
                      "from a bam file. Multiple methods can be supplied "
                      "[%default]")

    parser.add_option("--reference-bam-file", dest="reference_bam",
                      type="string",
                      help="bam-file to filter with [%default]")

    parser.add_option("--force-output", dest="force", action="store_true",
                      help="force processing. Some methods such "
                      "as strip/unstrip will stop processing if "
                      "they think it not necessary "
                      "[%default]")

    parser.add_option("--output-sam", dest="output_sam", action="store_true",
                      help="output in sam format [%default]")

    parser.add_option("--inplace", dest="inplace", action="store_true",
                      help="modify bam files in-place. Bam files need "
                      "to be given "
                      "as arguments. Temporary bam files are written "
                      "to /tmp [%default]")

    parser.add_option(
        "--first-fastq-file", "-1", dest="fastq_pair1", type="string",
        help="fastq file with read information for first "
        "in pair or unpaired. Used for unstripping sequence "
        "and quality scores [%default]")

    parser.add_option(
        "--second-fastq-file", "-2", dest="fastq_pair2", type="string",
        help="fastq file with read information for second "
        "in pair. Used for unstripping sequence "
        "and quality scores  [%default]")

    parser.set_defaults(
        methods=[],
        output_sam=False,
        reference_bam=None,
        filter_methods=[],
        strip_method="all",
        force=False,
        inplace=False,
        fastq_pair1=None,
        fastq_pair2=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    bamfiles = []

    if options.stdin != sys.stdin:
        bamfiles.append(options.stdin.name)

    if options.inplace:
        bamfiles.extend(args)
        if len(bamfiles) == 0:
            raise ValueError(
                "please supply one or more bam-files as command line arguments")

        if "-" in bamfiles:
            raise ValueError(
                "can not read from stdin if ``--inplace`` is selected")

    if len(bamfiles) == 0:
        bamfiles = ["-"]

    for bamfile in bamfiles:

        E.info('processing %s' % bamfile)

        if os.path.islink(bamfile):
            E.warn('ignoring link %s' % bamfile)
            continue

        if IOTools.isEmpty(bamfile):
            E.warn('ignoring empty file %s' % bamfile)
            continue

        # reading bam from stdin does not work with only the "r" tag
        pysam_in = pysam.Samfile(bamfile, "rb")

        if bamfile == "-":
            if options.output_sam:
                pysam_out = pysam.Samfile("-", "wh", template=pysam_in)
            else:
                pysam_out = pysam.Samfile("-", "wb", template=pysam_in)
        else:
            if IOTools.isEmpty(bamfile):
                E.warn('skipping empty file %s' % bamfile)
                continue
            tmpfile = tempfile.NamedTemporaryFile(delete=False, prefix="ctmp")
            tmpfile.close()

            E.debug("writing temporary bam-file to %s" % tmpfile.name)
            pysam_out = pysam.Samfile(tmpfile.name, "wb", template=pysam_in)

        if "filter" in options.methods:

            remove_mismatches, colour_mismatches = False, False

            if "NM" in options.filter_methods:
                remove_mismatches = True

            elif "CM" in options.filter_methods:
                remove_mismatches = True
                colour_mismatches = True

            if remove_mismatches:
                if not options.reference_bam:
                    raise ValueError(
                        "requiring reference bam file for removing by "
                        "mismatches")

                pysam_ref = pysam.Samfile(options.reference_bam, "rb")
            else:
                pysam_ref = None

            # note: the filter names and the remove_* flags are inverted:
            # selecting "unique" removes non-unique alignments and
            # "non-unique" removes unique alignments
            c = _bam2bam.filter_bam(
                pysam_in, pysam_out, pysam_ref,
                remove_nonunique="unique" in options.filter_methods,
                remove_unique="non-unique" in options.filter_methods,
                remove_contigs=None,
                remove_unmapped="mapped" in options.filter_methods,
                remove_mismatches=remove_mismatches,
                colour_mismatches=colour_mismatches)

            options.stdlog.write("category\tcounts\n%s\n" % c.asTable())
        else:

            # set up the modifying iterators
            it = pysam_in.fetch(until_eof=True)

            # function to check if processing should start
            pre_check_f = lambda x: None

            if "unset-unmapped-mapq" in options.methods:
                def unset_unmapped_mapq(i):
                    for read in i:
                        if read.is_unmapped:
                            read.mapq = 0
                        yield read
                it = unset_unmapped_mapq(it)

            if "set-sequence" in options.methods:
                def set_sequence(i):
                    for read in i:
                        # can't get at length of unmapped reads
                        if read.is_unmapped:
                            read.seq = "A"
                            read.qual = "F"
                        else:
                            read.seq = "A" * read.inferred_length
                            read.qual = "F" * read.inferred_length

                        yield read
                it = set_sequence(it)

            if "strip-sequence" in options.methods or "strip-quality" in \
               options.methods:
                def strip_sequence(i):
                    for read in i:
                        read.seq = None
                        yield read

                def check_sequence(reads):
                    if reads[0].seq is None:
                        return 'no sequence present'
                    return None

                def strip_quality(i):
                    for read in i:
                        read.qual = None
                        yield read

                def check_quality(reads):
                    if reads[0].qual is None:
                        return 'no quality information present'
                    return None

                def strip_match(i):
                    for read in i:
                        try:
                            nm = read.opt('NM')
                        except KeyError:
                            nm = 1
                        if nm == 0:
                            read.seq = None
                        yield read

                if options.strip_method == "all":
                    if "strip-sequence" in options.methods:
                        it = strip_sequence(it)
                        pre_check_f = check_sequence
                    elif "strip-quality" in options.methods:
                        it = strip_quality(it)
                        pre_check_f = check_quality
                elif options.strip_method == "match":
                    it = strip_match(it)

            if "unstrip" in options.methods:
                def buildReadDictionary(filename):
                    if not os.path.exists(filename):
                        raise OSError("file not found: %s" % filename)
                    fastqfile = pysam.FastxFile(filename)
                    fastq2sequence = {}
                    for x in fastqfile:
                        if x.name in fastq2sequence:
                            raise ValueError(
                                "read %s duplicate - can not unstrip" % x.name)

                        fastq2sequence[x.name] = (x.sequence, x.quality)
                    return fastq2sequence

                if not options.fastq_pair1:
                    raise ValueError(
                        "please supply fastq file(s) for unstripping")
                fastq2sequence1 = buildReadDictionary(options.fastq_pair1)
                if options.fastq_pair2:
                    fastq2sequence2 = buildReadDictionary(options.fastq_pair2)

                def unstrip_unpaired(i):
                    for read in i:
                        read.seq, read.qual = fastq2sequence1[read.qname]
                        yield read

                def unstrip_pair(i):
                    for read in i:
                        if read.is_read1:
                            read.seq, read.qual = fastq2sequence1[read.qname]
                        else:
                            read.seq, read.qual = fastq2sequence2[read.qname]
                        yield read

                if options.fastq_pair2:
                    it = unstrip_pair(it)
                else:
                    it = unstrip_unpaired(it)

            if "set-nh" in options.methods:
                it = _bam2bam.SetNH(it)

            # keep the first base of each read by changing the cigar
            # string to '1M' and, for reads mapping to the reverse
            # strand, setting pos to aend - 1.
            # Needs to be refactored to make it more general
            # (last base, midpoint, ...)
            if "keep-first-base" in options.methods:
                def keep_first_base(i):
                    for read in i:
                        if read.is_reverse:
                            read.pos = read.aend - 1
                            read.cigarstring = '1M'
                        elif not read.is_unmapped:
                            read.cigarstring = '1M'
                        yield read
                it = keep_first_base(it)

            # read first read and check if processing should continue
            # only possible when not working from stdin
            # Refactoring: use cache to also do a pre-check for
            # stdin input.
            if bamfile != "-":
                # get first read for checking pre-conditions
                first_reads = list(pysam_in.head(1))

                msg = pre_check_f(first_reads)
                if msg is not None:
                    if options.force:
                        E.warn('processing continues, though: %s' % msg)
                    else:
                        E.warn('processing not started: %s' % msg)
                        pysam_in.close()
                        pysam_out.close()
                        continue

            # continue processing till end
            for read in it:
                pysam_out.write(read)

            pysam_in.close()
            pysam_out.close()

        if options.inplace:
            # set date and file permissions according to original
            # Note: currently it will not update user and group.
            original = os.stat(bamfile)
            os.utime(tmpfile.name, (original.st_atime, original.st_mtime))
            os.chmod(tmpfile.name, original.st_mode)
            # move new file over original copy
            shutil.move(tmpfile.name, bamfile)
            # re-index
            pysam.index(bamfile)

    # write footer and output benchmark information.
    E.Stop()
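A hedged usage sketch for the command-line interface defined above; the
script location and BAM file names are illustrative assumptions.

import subprocess

# strip sequences in-place from two BAM files; --methods can be repeated
# (action="append") to combine several processing steps
subprocess.check_call([
    "python", "bam2bam.py",
    "--methods=strip-sequence",
    "--strip-method=all",
    "--inplace",
    "sample1.bam", "sample2.bam",
])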
Example #21
0
def loadZinba(infile, outfile, bamfile,
              tablename=None,
              controlfile=None):
    '''load Zinba results in *tablename*

    This method loads only positive peaks. It filters peaks by p-value,
    q-value and fold change and loads the diagnostic data and
    re-calculates peakcenter, peakval, ... using the supplied bamfile.

    If *tablename* is not given, it will be :file:`<track>_intervals`
    where track is derived from ``infile`` and assumed to end
    in :file:`.zinba`.

    If no peaks were predicted, an empty table is created.

    This method creates :file:`<outfile>.tsv.gz` with the results
    of the filtering.

    This method uses the refined peak locations.

    Zinba peaks can be overlapping. This method does not merge
    overlapping intervals.

    Zinba calls peaks in regions where there are many reads in
    the control. This method therefore applies a filtering step,
    removing all intervals in which the control has a peak higher
    than readlength / 2.

    .. note::

       Zinba calls peaks that are overlapping.

    '''

    track = P.snip(os.path.basename(infile), ".zinba")
    folder = os.path.dirname(infile)

    infilename = infile + ".peaks"

    outtemp = P.getTempFile(".")
    tmpfilename = outtemp.name

    outtemp.write("\t".join((
        "interval_id",
        "contig", "start", "end",
        "npeaks", "peakcenter",
        "length",
        "avgval",
        "peakval",
        "nprobes",
        "pvalue", "fold", "qvalue",
        "macs_summit", "macs_nprobes",
    )) + "\n")

    counter = E.Counter()

    if not os.path.exists(infilename):
        E.warn("could not find %s" % infilename)
    elif IOTools.isEmpty(infilename):
        E.warn("no data in %s" % infilename)
    else:
        # filter peaks
        shift = getPeakShiftFromZinba(infile)
        assert shift is not None, \
            "could not determine peak shift from Zinba file %s" % infile

        E.info("%s: found peak shift of %i" % (track, shift))

        samfiles = [pysam.Samfile(bamfile, "rb")]
        offsets = [shift // 2]

        if controlfile:
            controlfiles = [pysam.Samfile(controlfile, "rb")]
            readlength = BamTools.estimateTagSize(controlfile)
            control_max_peakval = readlength // 2
            E.info("removing intervals in which control has peak higher than %i reads" %
                   control_max_peakval)
        else:
            controlfiles = None

        id = 0

        # get thresholds
        max_qvalue = float(PARAMS["zinba_fdr_threshold"])

        with IOTools.openFile(infilename, "r") as ins:
            for peak in WrapperZinba.iteratePeaks(ins):

                # filter by qvalue
                if peak.fdr > max_qvalue:
                    counter.removed_qvalue += 1
                    continue

                assert peak.refined_start < peak.refined_end

                # filter by control
                if controlfiles:
                    npeaks, peakcenter, length, avgval, peakval, nreads = countPeaks(peak.contig,
                                                                                     peak.refined_start,
                                                                                     peak.refined_end,
                                                                                     controlfiles,
                                                                                     offsets)

                    if peakval > control_max_peakval:
                        counter.removed_control += 1
                        continue

                # output peak
                npeaks, peakcenter, length, avgval, peakval, nreads = countPeaks(peak.contig,
                                                                                 peak.refined_start,
                                                                                 peak.refined_end,
                                                                                 samfiles,
                                                                                 offsets)

                outtemp.write("\t".join(map(str, (
                    id, peak.contig, peak.refined_start, peak.refined_end,
                    npeaks, peakcenter, length, avgval, peakval, nreads,
                    1.0 - peak.posterior, 1.0, peak.fdr,
                    peak.refined_start + peak.summit - 1,
                    peak.height))) + "\n")
                id += 1
                counter.output += 1

    outtemp.close()

    # output filtering summary
    outf = IOTools.openFile("%s.tsv.gz" % outfile, "w")
    outf.write("category\tcounts\n")
    outf.write("%s\n" % counter.asTable())
    outf.close()

    E.info("%s filtering: %s" % (track, str(counter)))
    if counter.output == 0:
        E.warn("%s: no peaks found" % track)

    # load data into table
    if tablename is None:
        tablename = "%s_intervals" % track

    statement = '''
    cgat csv2db %(csv2db_options)s 
              --allow-empty-file
              --add-index=interval_id 
              --add-index=contig,start
              --table=%(tablename)s 
    < %(tmpfilename)s 
    > %(outfile)s
    '''

    P.run()

    os.unlink(tmpfilename)
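The control filter described in the docstring reduces to a simple
threshold test. A minimal sketch with hypothetical numbers:

readlength = 36                        # illustrative estimated tag size
control_max_peakval = readlength // 2  # threshold used above

control_peakval = 25                   # hypothetical peak height in control
if control_peakval > control_max_peakval:
    print("interval removed: control peak %i > %i"
          % (control_peakval, control_max_peakval))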
Example #22
0
def runMAST(infiles, outfile):
    '''run mast on all intervals and motifs.

    Collect all results for an E-value up to 10000 so that
    all sequences are output and MAST curves can be computed. 

    10000 is a heuristic.
    '''
    to_cluster = True

    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles

    if IOTools.isEmpty(dbfile):
        P.touch(outfile)
        return

    if not os.path.exists(controlfile):
        raise P.PipelineError(
            "control file %s for %s does not exist" % (controlfile, dbfile))

    # remove previous results
    if os.path.exists(outfile):
        os.remove(outfile)

    tmpdir = P.getTempDir(".")
    tmpfile = P.getTempFilename(".")

    for motiffile in motiffiles:
        if IOTools.isEmpty(motiffile):
            L.info("skipping empty motif file %s" % motiffile)
            continue

        of = IOTools.openFile(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - foreground ::\n" % motif)
        of.close()

        # mast seems to bail if the number of nucleotides exceeds
        # 2186800982. To avoid this, run the db and control file
        # separately.
        statement = '''
        cat %(dbfile)s 
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run()

        of = IOTools.openFile(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - background ::\n" % motif)
        of.close()

        statement = '''
        cat %(controlfile)s 
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run()

    statement = "gzip < %(tmpfile)s > %(outfile)s"
    P.run()

    shutil.rmtree(tmpdir)
    os.unlink(tmpfile)
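runMAST concatenates one MAST run per motif into a single file, marking
each section with a "::" header. A standard-library-only sketch of that
bookkeeping, with illustrative file names:

import os

motiffiles = ["motif_a.meme", "motif_b.meme"]  # illustrative
with open("mast_combined.txt", "a") as of:
    for motiffile in motiffiles:
        motif, _ = os.path.splitext(motiffile)
        of.write(":: motif = %s - foreground ::\n" % motif)
        # the MAST output for this motif would be appended here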
Example #23
0
def BedFileVenn(infiles, outfile):
    '''compute Venn-style overlap counts between two replicated
    :term:`bed` formatted *infiles* (liver and testes) and write
    the counts to *outfile*.

    The two files are merged, and the merged intervals are counted
    as shared, liver-only or testes-only.
    '''
    liver, testes = infiles
    liver_name = P.snip(os.path.basename(liver), ".replicated.bed")
    testes_name = P.snip(os.path.basename(testes), ".replicated.bed")
    to_cluster = True

    # NB: as written, this statement is replaced by the generic
    # intersection logic below before it is ever run.
    statement = '''cat %(liver)s %(testes)s | mergeBed -i stdin | awk 'OFS="\\t" {print $1,$2,$3,"CAPseq"NR}' > replicated_intervals/liver.testes.merge.bed;
                   echo "Total merged intervals" > %(outfile)s; cat replicated_intervals/liver.testes.merge.bed | wc -l >> %(outfile)s; 
                   echo "Liver & testes" >> %(outfile)s; intersectBed -a replicated_intervals/liver.testes.merge.bed -b %(liver)s -u | intersectBed -a stdin -b %(testes)s -u > replicated_intervals/liver.testes.shared.bed; cat replicated_intervals/liver.testes.shared.bed | wc -l >> %(outfile)s; 
                   echo "Testes only" >> %(outfile)s; intersectBed -a replicated_intervals/liver.testes.merge.bed -b %(liver)s -v > replicated_intervals/%(testes_name)s.liver.testes.unique.bed; cat replicated_intervals/%(testes_name)s.liver.testes.unique.bed | wc -l >> %(outfile)s; 
                   echo "Liver only" >> %(outfile)s; intersectBed -a replicated_intervals/liver.testes.merge.bed -b %(testes)s -v > replicated_intervals/%(liver_name)s.liver.testes.unique.bed; cat replicated_intervals/%(liver_name)s.liver.testes.unique.bed | wc -l >> %(outfile)s;                   
                   sed -i '{N;s/\\n/\\t/g}' %(outfile)s; '''

    if len(infiles) == 1:
        shutil.copyfile(infiles[0], outfile)

    elif len(infiles) == 2:

        if IOTools.isEmpty(infiles[0]) or IOTools.isEmpty(infiles[1]):
            P.touch(outfile)
        else:
            statement = '''
        intersectBed -u -a %s -b %s 
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        > %%(outfile)s 
        ''' % (infiles[0], infiles[1])
            P.run()

    else:

        tmpfile = P.getTempFilename(".")

        # need to merge incrementally
        fn = infiles[0]
        if IOTools.isEmpty(infiles[0]):
            P.touch(outfile)
            return

        statement = '''mergeBed -i %(fn)s > %(tmpfile)s'''
        P.run()

        for fn in infiles[1:]:
            if IOTools.isEmpty(infiles[0]):
                P.touch(outfile)
                os.unlink(tmpfile)
                return

            statement = '''mergeBed -i %(fn)s | intersectBed -u -a %(tmpfile)s -b stdin > %(tmpfile)s.tmp; mv %(tmpfile)s.tmp %(tmpfile)s'''
            P.run()

        statement = '''cat %(tmpfile)s
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        > %(outfile)s '''
        P.run()

        os.unlink(tmpfile)
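The fallback branch above keeps only those intervals from the first file
that survive intersection with every other file. A pure-Python
illustration of that idea, with made-up coordinates:

def overlaps(a, b):
    # half-open intervals (start, end) on the same contig
    return a[0] < b[1] and b[0] < a[1]

files = [
    [(10, 50), (100, 200)],   # coordinates are taken from the first file
    [(40, 60), (150, 160)],
    [(45, 55), (190, 300)],
]

result = files[0]
for other in files[1:]:
    result = [iv for iv in result if any(overlaps(iv, o) for o in other)]

print(result)  # [(10, 50), (100, 200)]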