Exemple #1
0
def main(args):
    """Load every FASTA sequence and read the first GFF record.

    Args:
        args: Parsed command-line namespace. ``args.gfffile`` is handed to
            ``GFFFactory`` and ``args.fastafile`` to ``FastaFile``.
    """
    logging.info(" ========> parsing GFFFile")
    logging.info("ARGS %s", args)

    gff_factory = GFFFactory(args.gfffile)
    fastafile = FastaFile(args.fastafile)

    # Index each sequence string by its id; the loop ends when nextSeq()
    # hands back None.
    seqs = {}
    seq = fastafile.nextSeq()
    while seq is not None:
        seqs[seq['id']] = seq['seq']
        seq = fastafile.nextSeq()

    try:
        gff = gff_factory.nextGFF()
    except Exception as e:
        # "except ... as" is accepted by Python 2.6+ and Python 3, unlike the
        # comma form.  The failure is reported on stderr and not re-raised.
        sys.stderr.write("ERROR: %s\n" % e)
Exemple #2
0
def main(args):
    """Print alignment statistics when the average identity is below a cutoff.

    Args:
        args: Parsed command-line namespace. ``args.fastafile`` names the
            alignment file; ``args.pid`` is an optional cutoff that is
            converted with ``int()``.
    """
    logging.info(" ========> process_alnfile.py")
    logging.info("ARGS %s", args)

    ff = FastaFile(args.fastafile)

    # Planned work (from the original author's notes):
    #  - Flag which sequences are present and which are not: get the ids,
    #    sort alphabetically, print the ids joined together.
    #  - Alignment stats: number of sequences, length, coverage for each
    #    sequence, consensus, PID to consensus and mismatches per sequence.
    #  - Specifics: positions where 2 sequences are the same and the other
    #    is not.

    ids = []
    seq = ff.nextSeq()
    while seq is not None:
        ids.append(seq['id'])
        seq = ff.nextSeq()

    # Without --pid the cutoff is 101, presumably above any percent identity
    # so that the stats are always printed -- TODO confirm the value range.
    pid = 101 if args.pid is None else int(args.pid)

    stats = ff.calcStats()

    if stats['av_ungapped_percentid'] < pid:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("\nFILESTATS\t%s\tNumber_of_seqs\t%d\tIDS\t%s" % (
            args.fastafile, len(ids), ','.join(ids)))
        print(stats['outstr'])
        print(stats['avpercentid'])
Exemple #3
0
def main(args):
    """Report, for every FASTA sequence, its best BLAT hit or its absence.

    Features read from ``args.blatfile`` are grouped by consecutive ``qid``;
    ``getBestHit`` picks one feature per group.  Each sequence id from
    ``args.fastafile`` is then printed either as ``FOUNDID`` with its best hit
    or as ``MISSINGID`` with its length.

    Args:
        args: Parsed command-line namespace providing ``blatfile`` and
            ``fastafile``.
    """
    blatobj = BlatFile(args.blatfile)
    fastaobj = FastaFile(args.fastafile)

    # Sequence length keyed by sequence id.
    ids = {}
    seq = fastaobj.nextSeq()
    while seq is not None:
        ids[seq['id']] = len(seq['seq'])
        seq = fastaobj.nextSeq()

    foundhits = {}  # qid -> best feature for that query
    tmpfeat = []    # features of the query currently being collected
    tmpqid = None

    feat = blatobj.nextFeature()
    while feat:
        if tmpqid is not None and tmpfeat and tmpqid != feat.qid:
            # The query id changed: the previous group is complete.
            print("")
            tophit = getBestHit(tmpfeat)
            foundhits[tophit.qid] = tophit
            tmpfeat = []

        tmpfeat.append(feat)
        tmpqid = feat.qid
        feat = blatobj.nextFeature()

    # The final group has no following feature to trigger the flush above;
    # without this step the last query was always reported as missing.
    if tmpfeat:
        tophit = getBestHit(tmpfeat)
        foundhits[tophit.qid] = tophit

    for seqid in ids:
        if seqid not in foundhits:
            print("MISSINGID %s LEN %d" % (seqid, ids[seqid]))
        else:
            tophit = foundhits[seqid]
            print("FOUNDID\t%d\t%d\t%d\t%s" % (tophit.pid, tophit.qcov,
                                               tophit.hcov, tophit))
Exemple #4
0
def main(args):
    """Print the FASTA records selected by ``FastaFilter.filterById``.

    Args:
        args: Parsed command-line namespace. ``args.fastafile`` is the input
            file and ``args.str`` is the filter string passed, together with
            all records, to ``FastaFilter.filterById``.
    """
    logging.info(" ========> filter_fastafile.py")
    logging.info("ARGS %s", args)

    ff = FastaFile(args.fastafile)

    # Read every record up front; filterById receives the whole list.
    seqs = []
    seq = ff.nextSeq()
    while seq is not None:
        seqs.append(seq)
        seq = ff.nextSeq()

    newseqs = FastaFilter.filterById(seqs, args.str)

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(FastaFile.toString(newseqs))
Exemple #5
0
def main(args):
    """Write per-sequence length and single-band cytoband tables.

    Two files are created from ``args.stub``: ``<stub>.chrlen`` with
    ``id<TAB>len`` lines and ``<stub>.cytoband.txt`` with
    ``id<TAB>0<TAB>len<TAB>band0<TAB>band0`` lines, one line per sequence in
    ``args.fastafile``.

    Args:
        args: Parsed command-line namespace providing ``fastafile`` and
            ``stub``.
    """
    # NOTE(review): the banner below names filter_fastafile.py, which looks
    # copied from another script -- confirm the intended name.
    logging.info(" ========> filter_fastafile.py")
    logging.info("ARGS %s", args)

    ff = FastaFile(args.fastafile)
    stub = args.stub

    # Context managers close both outputs even if reading or writing fails.
    with open(stub + ".chrlen", 'w') as flen:
        with open(stub + '.cytoband.txt', 'w') as blen:
            seq = ff.nextSeq()
            while seq is not None:
                flen.write("%s\t%d\n" % (seq['id'], seq['len']))
                blen.write("%s\t0\t%d\tband0\tband0\n" %
                           (seq['id'], seq['len']))
                seq = ff.nextSeq()
Exemple #6
0
def main(args):
    """Compare CDS features on reference contigs with nucmer alignments.

    CDS records are read from ``args.gfffile``, sequences from
    ``args.reffile`` and ``args.queryfile``, and alignments from the MUMmer
    delta file ``args.deltafile``.  For every CDS a group of lines is printed,
    each prefixed with the gene's status: NOALIGN, PARTALIGN, REVSTRAND,
    MUTATION or IDENTICAL.

    Args:
        args: Parsed command-line namespace providing ``deltafile``,
            ``reffile``, ``queryfile`` and ``gfffile``.

    Raises:
        Exception: If the delta file is not NUCMER output, or if it names a
            sequence that is absent from the reference or query FASTA file.
    """
    logging.info(" ========> Converting mummer delta format for %s %s %s",
                 args.deltafile, args.reffile, args.queryfile)
    logging.info("ARGS %s", args)

    ref = FastaFile(args.reffile)
    qry = FastaFile(args.queryfile)
    gff = GFFFactory(args.gfffile)

    gffs = _read_cds_by_contig(gff)
    refseqs = _read_seqs_by_id(ref)
    qryseqs = _read_seqs_by_id(qry)

    alns = _parse_delta(args.deltafile, refseqs, qryseqs,
                        args.reffile, args.queryfile)

    gnum = 1
    for contig in gffs:
        for g in gffs[contig]:
            status, outstr = _describe_gene(g, gnum, alns.get(contig, []))

            for text in outstr:
                print("%-15s %s" % (status, text))
            print("\n")
            gnum = gnum + 1


def _read_cds_by_contig(gff_factory):
    """Return the records whose ``type2`` is "CDS", grouped by ``qid``."""
    cds = {}
    g = gff_factory.nextGFF()
    while g is not None:
        if g.type2 == "CDS":
            cds.setdefault(g.qid, []).append(g)
        g = gff_factory.nextGFF()
    return cds


def _read_seqs_by_id(fastafile):
    """Read *fastafile* to the end and return its records keyed by id."""
    seqs = {}
    seq = fastafile.nextSeq()
    while seq is not None:
        seqs[seq['id']] = seq
        seq = fastafile.nextSeq()
    return seqs


def _extract_region(seq, start, end):
    """Return the aligned stretch of *seq*, reverse-complemented if end <= start.

    NOTE(review): the slice bounds are kept exactly as originally written.
    If start/end are 1-based inclusive (as the delta format suggests) both
    branches drop one base -- confirm against findAlnPos before changing.
    """
    if end > start:
        return seq[start - 1:end - 1]
    return reverse_complement(seq[end:start])


def _parse_delta(deltafile, refseqs, qryseqs, reffile, queryfile):
    """Parse a nucmer delta file into gapped alignments keyed by reference id.

    File layout, as described by the original author's notes:

    * Line 1 lists the two original input files separated by a space.
    * Line 2 gives the alignment data type, either NUCMER or PROMER.
    * Every group of alignments has a FASTA-style header naming the two
      sequences after a '>' followed by their lengths,
      e.g. ``>tagA1 tagB1 500 2000000``.
    * Each alignment starts with a line of seven numbers: start and end in
      the reference, start and end in the query (always in DNA bases), then
      the number of errors (non-identities), similarity errors (non-positive
      match scores) and non-alpha characters (used to count stop codons in
      promer data), e.g. ``5198 22885 5389 23089 20 20 0``.
    * It is followed by signed integers, one per line, terminated by 0.  Each
      is the distance to the next insertion in the reference (positive) or
      deletion in the reference (negative).  With nucmer the delta sequence
      (1, -3, 4, 0) represents an insertion at positions 1 and 7 in the
      reference and an insertion at position 3 in the query:
      A = acgtagctgag$ B = cggtagtgag$ gives
      A = acg.tagctgag$ B = .cggtag.tgag$

    Args:
        deltafile: Path of the delta file.
        refseqs: Reference records keyed by id.
        qryseqs: Query records keyed by id.
        reffile: Reference file name, used only in error messages.
        queryfile: Query file name, used only in error messages.

    Returns:
        dict mapping reference id to a list of ``Feature`` objects whose
        ``hitattr`` holds the gapped 'qseq'/'hseq' records, 'hid',
        'insertpos' and the 'qstrand'/'hstrand' of that alignment.

    Raises:
        Exception: For non-NUCMER files or unknown sequence ids.
    """
    alns = {}
    id1 = None
    id2 = None
    lnum = 0

    with open(deltafile) as fh:
        # readline() instead of "for line in fh": the gap offsets of each
        # alignment are consumed by the inner loop below.
        line = fh.readline()

        while line != "":
            lnum = lnum + 1
            ff = line.rstrip('\n').split(' ')

            if lnum == 1:
                print("Input files [%s][%s]\n" % (ff[0], ff[1]))

            elif lnum == 2:
                alntype = ff[0]
                if alntype != "NUCMER":
                    raise Exception(
                        "Only NUCMER alignments are currently parsed - we have [%s]"
                        % alntype)

            elif ff[0].startswith(">"):
                id1 = ff[0].replace(">", '')
                id2 = ff[1]

            else:
                rstart = int(ff[0])
                rend = int(ff[1])
                qstart = int(ff[2])
                qend = int(ff[3])

                # "q" strand follows the reference coordinates and "h" strand
                # the query coordinates, matching qseq/hseq below.
                qstrand = -1 if rend < rstart else 1
                hstrand = -1 if qend < qstart else 1

                if id1 not in refseqs:
                    raise Exception(
                        "Can't find reference sequence [%s] in ref file [%s]" %
                        (id1, reffile))

                if id2 not in qryseqs:
                    raise Exception(
                        "Can't find query sequence [%s] in query file [%s]" %
                        (id2, queryfile))

                tmprseq = _extract_region(refseqs[id1]['seq'], rstart, rend)
                tmpqseq = _extract_region(qryseqs[id2]['seq'], qstart, qend)

                insertpos = 0
                count = int(fh.readline().rstrip('\n'))

                while count != 0:
                    insertpos = insertpos + abs(count)

                    if count < 0:
                        # Insertion in the query: put a gap in the reference.
                        tmprseq = (tmprseq[:insertpos - 1] + "-" +
                                   tmprseq[insertpos - 1:])
                    else:
                        # Insertion in the reference: put a gap in the query.
                        tmpqseq = (tmpqseq[:insertpos - 1] + "-" +
                                   tmpqseq[insertpos - 1:])

                    count = int(fh.readline().rstrip('\n'))

                seq1 = {'id': id1, 'seq': tmprseq}
                seq2 = {'id': id2, 'seq': tmpqseq}

                # NOTE(review): hard-coded contig id, looks like leftover
                # debugging output; kept so the output is unchanged.
                if (seq1 != seq2 and id1 == "GG739696.1"):
                    print(prettyPrint([seq1, seq2]))

                tmpgff = Feature()
                tmpgff.qid = id1
                tmpgff.qstart = rstart
                tmpgff.qend = rend

                tmpgff.hitattr['qseq'] = seq1
                tmpgff.hitattr['hseq'] = seq2
                # NOTE(review): 'hid' is set to the reference id although
                # 'hseq' is the query sequence (id2) -- confirm intent.
                tmpgff.hitattr['hid'] = id1
                tmpgff.hitattr['insertpos'] = insertpos
                # Kept per alignment so the report can test the strand of the
                # alignment it actually matched.
                tmpgff.hitattr['qstrand'] = qstrand
                tmpgff.hitattr['hstrand'] = hstrand

                alns.setdefault(id1, []).append(tmpgff)

            line = fh.readline()

    return alns


def _describe_gene(g, gnum, candidates):
    """Classify CDS *g* against the alignments on its contig.

    Args:
        g: CDS feature; its ``hitattr`` must hold 'Name' and 'product'.
        gnum: Running gene number used in the report lines.
        candidates: Alignment features on the same contig (may be empty).

    Returns:
        ``(status, lines)`` where *lines* are the report lines for the gene.
    """
    outstr = []
    name = g.hitattr['Name']
    prod = g.hitattr['product']

    foundgff = None
    status = "NEW"

    for aln in candidates:
        if not g.overlaps(aln):
            continue

        if aln.contains(g):
            # When several alignments contain the gene the last one wins.
            foundgff = aln
            continue

        ostart = max(g.qstart, aln.qstart)
        oend = min(g.qend, aln.qend)
        # Floor division keeps the Python 2 integer result on Python 3 too.
        frac = int(100 * (oend - ostart + 1) // (g.qend - g.qstart + 1))

        status = "PARTALIGN"
        outstr.append("============1 Processing gene %d %s %s" %
                      (gnum, name, prod))
        outstr.append("Contig coords from gff file %s %d-%d" %
                      (g.qid, g.qstart, g.qend))
        outstr.append(
            "Partial overlap of %d percent overlap coords are %d %d" %
            (frac, ostart, oend))

    if foundgff is None:
        if status == "NEW":
            status = "NOALIGN"
            outstr.append("============2 Processing gene %d %s %s" %
                          (gnum, name, prod))
            outstr.append("Contig coords from gff file %s %d-%d %s %s" %
                          (g.qid, g.qstart, g.qend, name, prod))
            # Report the gene itself: no alignment is associated with it
            # (the original read a stale or undefined alignment variable).
            outstr.append("ERROR: No align for %s %s qstart/end %d %d %s" %
                          (name, g.qid, g.qstart, g.qend, prod))
        return status, outstr

    if foundgff.hitattr['qstrand'] == -1:
        status = "REVSTRAND"
        outstr.append("===========3 Processing gene %d %s %s" %
                      (gnum, name, prod))
        outstr.append("Contig coords from gff file %s %d-%d %s %s" %
                      (g.qid, g.qstart, g.qend, name, prod))
        outstr.append(
            "ERROR: can't deal with reverse strand reference alignments")
        return status, outstr

    qrec = foundgff.hitattr['qseq']
    hrec = foundgff.hitattr['hseq']

    apos1 = findAlnPos(foundgff, g.qstart)
    apos2 = findAlnPos(foundgff, g.qend)

    if g.strand == 1:
        qseq = qrec['seq'][apos1:apos2]
        hseq = hrec['seq'][apos1:apos2]
    else:
        qseq = reverse_complement(qrec['seq'][apos1 + 1:apos2 + 1])
        hseq = reverse_complement(hrec['seq'][apos1 + 1:apos2 + 1])

    if qseq != hseq:
        status = "MUTATION"
        outstr.append("===========4 Processing gene %d %s %s" %
                      (gnum, name, prod))

        # Label the rows with the ids of the matched alignment rather than
        # whatever header happened to be parsed last.
        outstr.append("DNA alignment\n")
        dna = prettyPrint([{'id': qrec['id'], 'seq': qseq},
                           {'id': hrec['id'], 'seq': hseq}])
        outstr.extend(dna.split('\n'))

        qpep = translate(qseq)
        hpep = translate(hseq)

        pep = prettyPrint([{'id': qrec['id'], 'seq': qpep},
                           {'id': hrec['id'], 'seq': hpep}])
        outstr.append("PEP alignment\n")
        outstr.extend(pep.split('\n'))
    else:
        status = "IDENTICAL"
        outstr.append("============5 Processing gene %d %s %s" %
                      (gnum, name, prod))
        outstr.append("NO CHANGE for this alignment %s %s %s" %
                      (foundgff.qid, name, prod))

    return status, outstr
            gnum = gnum + 1
Exemple #7
0
def main(args):
    """Print length statistics for the sequences in a FASTA file.

    Sequences shorter than ``args.minlen`` are reported on ``MIN`` lines and
    excluded.  For the rest the function prints a running cumulative count per
    length bin, a summary line (count, N50, median, mean) and a table mapping
    cumulative percentage to bin start.

    Args:
        args: Parsed command-line namespace providing ``fastafile``,
            ``binsize`` and ``minlen`` (the latter two are converted with
            ``int()``).
    """
    ff = FastaFile(args.fastafile)
    binsize = int(args.binsize)
    minlen = int(args.minlen)

    print(minlen)

    lens = []
    bins = {}  # bin index -> number of sequences in that bin

    seq = ff.nextSeq()
    while seq is not None:
        seqlen = seq['len']

        if seqlen >= minlen:
            lens.append(seqlen)
            bin_index = int(seqlen // binsize)
            bins[bin_index] = bins.get(bin_index, 0) + 1
        else:
            print("MIN\t%d\t%d" % (minlen, seqlen))

        seq = ff.nextSeq()

    seqnum = len(lens)

    if seqnum == 0:
        # Nothing passed the length filter: emit an all-zero summary instead
        # of dividing by zero.
        print("Num\t%d\tN50\t%d\tMedian\t%d\tMean\t%d" % (0, 0, 0, 0))
        return

    lens.sort()
    totlen = sum(lens)

    # Same element the original picked: the first one past the halfway index.
    median = lens[seqnum // 2]

    # N50: walk from the longest sequence down and stop at the length where
    # the running total first covers half of all bases.  The original walked
    # upwards and tested before adding, which left N50 unset whenever the
    # longest sequence held at least half of the total.
    n50 = 0
    running = 0
    for length in reversed(lens):
        running = running + length
        if 2 * running >= totlen:
            n50 = length
            break

    cumul = {}  # cumulative percent of sequences -> bin start
    tmpcount = 0

    for key in sorted(bins):
        tmpcount = tmpcount + bins[key]
        percent = int(100 * tmpcount // seqnum)
        cumul[percent] = key * binsize
        print("%s %s %s %s" % (tmpcount, seqnum, percent, key * binsize))

    mean = int(totlen // seqnum)

    print("Num\t%d\tN50\t%d\tMedian\t%d\tMean\t%d" %
          (seqnum, n50, median, mean))

    for key in sorted(cumul):
        print("%d\t%d" % (key, cumul[key]))
Exemple #8
0
import re
import os
import sys
import unittest

from datamodel.factory.FastaFile import FastaFile

import importlib

# Print the first record of the FASTA file (argv[1]) whose id equals argv[2],
# then stop.  Nothing is printed when no record matches.
if len(sys.argv) < 3:
    sys.exit("Usage: %s <fastafile> <seqid>" % sys.argv[0])

ff = FastaFile(sys.argv[1])
wanted_id = sys.argv[2]  # avoids shadowing the builtin id()

seq = ff.nextSeq()

while seq is not None:
    if seq['id'] == wanted_id:
        print(FastaFile.toString([seq]))
        # sys.exit() instead of exit(): the latter only exists when the
        # site module is loaded.
        sys.exit()
    seq = ff.nextSeq()
Exemple #9
0
def main(args):
    """Print the profile of an alignment and, optionally, its sequences.

    Records whose id starts with '7' are skipped.  The result of
    ``ff.calcProfile`` on the remaining records is printed; when ``args.trim``
    is truthy each record is then printed in FASTA form.  The process exits
    afterwards.

    Args:
        args: Parsed command-line namespace providing ``fastafile`` and
            ``trim``.
    """
    import sys  # local import: sys.exit() works even without the site module

    logging.info(" ========> trim_alnfile.py")
    logging.info("ARGS %s", args)

    ff = FastaFile(args.fastafile)

    seqs = []
    seq = ff.nextSeq()
    while seq is not None:
        if not seq['id'].startswith('7'):
            seqs.append(seq)
        seq = ff.nextSeq()

    prof = ff.calcProfile(seqs)
    print(prof)

    if args.trim:
        for rec in seqs:
            print(">%s\n%s" % (rec['id'], rec['seq']))

    sys.exit()

    # NOTE(review): the original exited unconditionally at this point, so the
    # all-gap column removal below has never run.  It is kept (unreachable)
    # until it is decided whether the exit belongs inside "if args.trim".
    newseqs = _drop_all_gap_columns(seqs)
    for rec, trimmed in zip(seqs, newseqs):
        print(">%s\n%s" % (rec['id'], trimmed))


def _drop_all_gap_columns(seqs):
    """Return each record's sequence without columns that are '-' in all."""
    if not seqs:
        return []

    seqlen = len(seqs[0]['seq'])
    keep = [i for i in range(seqlen)
            if any(rec['seq'][i] != '-' for rec in seqs)]
    return [''.join(rec['seq'][i] for i in keep) for rec in seqs]