예제 #1
0
def parseGumby(gumbyFile, exonFile, baseSeq):
    """Parse a gumby output file into an alignment.Alignment of gumby blocks.

    The file is read line by line.  A line beginning with "start" opens a
    new block and carries its p-value (last field), length (field 6) and
    score (field 8).  Lines matching "<name> <int> <int> <sequence>" add
    sequence data to the current block; repeated names within a block are
    concatenated.  Each finished block is handed to procData() together
    with the exon list, and whatever procData() returns is appended to the
    result.  Per the original author's note, exon-overlapping blocks and
    blocks that are only gaps on baseSeq are removed; that filtering is
    presumably done inside procData(), which is defined elsewhere.

    Parameters:
        gumbyFile -- path of the gumby output file to read.
        exonFile  -- path of a whitespace-separated annotation file, or
                     None to use no exons.  Only rows whose first field
                     (lower-cased) equals baseSeq are used; fields 3 and 4
                     are read as the integer exon start and end.
        baseSeq   -- name of the base sequence, compared against the
                     lower-cased first field of each exon row.

    Returns:
        The alignment.Alignment built from all blocks; it is empty when
        the gumby file contains no "start" line.
    """
    exons = []
    if exonFile is not None:
        with open(exonFile, "r") as exonHandle:
            for row in exonHandle:
                fields = row.split()
                # Blank lines have no fields; skip them instead of crashing.
                if not fields or fields[0].lower() != baseSeq:
                    continue
                exons.append([int(fields[3]), int(fields[4])])

    seqLineRe = compile("[a-z]+[ ]+[0-9]+[ ]+[0-9]+")
    seqs = {}
    # NOTE(review): pos is never cleared between blocks (unlike seqs), so
    # entries from earlier blocks remain visible to procData -- confirm
    # this is intended.
    pos = {}
    blockIdx = -1  # index of the block being collected; -1 = none seen yet
    pval = length = score = None

    resultLst = alignment.Alignment()
    with open(gumbyFile, "r") as infile:
        for line in infile:
            # Every '*', '<' and '>' on the line is replaced by the gap
            # character '-'.
            line = line.strip()
            line = line.replace("*", "-")
            line = line.replace("<", "-")
            line = line.replace(">", "-")

            if line.startswith("start"):
                # A new header closes the previous block, if there was one.
                if blockIdx != -1:
                    resultLst.extend(procData(baseSeq, exons, blockIdx, seqs,
                                              pos, pval, length, score))
                header = line.split()
                pval = float(header[-1])
                length = int(header[6].strip(","))
                score = int(header[8].strip(","))
                blockIdx += 1
                seqs = {}

            # Deliberately a plain "if", not "elif": every line, including
            # a header line, is tested against the sequence-line pattern.
            if seqLineRe.match(line):
                parts = line.split()
                name = parts[0]
                # Coordinates are stored decremented by one (presumably
                # converting 1-based to 0-based positions).
                start = int(parts[1]) - 1
                end = int(parts[2]) - 1

                if name not in seqs:
                    faseq = Fasta.FastaSeq(name, parts[3])
                    faseq.chrom = name
                    faseq.start = start
                    faseq.end = end
                    seqs[name] = faseq
                else:
                    # Continuation line: append to the existing sequence.
                    seqs[name].nucl += parts[3]
                pos[name] = (name, start, end)

    # Flush the last block.  Without any "start" line there is nothing to
    # flush, and pval/length/score were never read from the file.
    if blockIdx != -1:
        resultLst.extend(procData(baseSeq, exons, blockIdx, seqs, pos,
                                  pval, length, score))
    return resultLst