def parseGumby(gumbyFile, exonFile, baseSeq):
    """Parse a gumby output file into an alignment of gumby blocks.

    The file is read block by block. A line starting with "start" opens a
    new block and carries its statistics; the following lines of the form
    "<name> <start> <end> <sequence>" carry the aligned sequences of that
    block. Every completed block is handed to procData() together with the
    exon list, and whatever procData() returns is appended to the result.
    According to the original comment, dropping blocks that overlap exons
    or that consist only of gaps on baseSeq happens there -- procData is
    not visible from this function, so confirm against its definition.

    Parameters:
        gumbyFile -- path of the gumby output file to parse.
        exonFile  -- path of a whitespace-separated annotation file, or
                     None to use no exons. Only lines whose first field,
                     lower-cased, equals baseSeq are used; fields 4 and 5
                     (index 3 and 4) are read as integer start/end.
                     NOTE(review): column layout looks like GFF -- confirm.
        baseSeq   -- name of the base sequence; compared against the
                     lower-cased first field of the exon file, so it is
                     expected to be lower case itself.

    Returns:
        An alignment.Alignment holding the processed blocks. It is empty
        when the file contains no "start" header at all.
    """
    # Collect [start, end] pairs of the exons annotated on the base sequence.
    exons = []
    if exonFile is not None:
        with open(exonFile, "r") as exonFh:
            for line in exonFh:
                fields = line.split()
                if not fields:
                    # blank line: nothing to compare against baseSeq
                    continue
                if fields[0].lower() != baseSeq:
                    continue
                exons.append([int(fields[3]), int(fields[4])])

    # `compile` is taken from the file's imports (presumably re.compile;
    # the builtin compile() would not accept a single argument).
    seqLineRe = compile(r"[a-z]+[ ]+[0-9]+[ ]+[0-9]+")

    seqs = {}       # sequence name -> Fasta.FastaSeq of the current block
    pos = {}        # sequence name -> (name, start, end); kept across blocks
    blockIdx = -1   # index of the block being read, -1 before the first one
    # Statistics of the current block; set by its "start" header line.
    pval = length = score = None

    resultLst = alignment.Alignment()
    with open(gumbyFile, "r") as infile:
        for line in infile:
            # "*", "<" and ">" are all turned into gap characters.
            line = line.strip()
            line = line.replace("*", "-").replace("<", "-").replace(">", "-")

            if line.startswith("start"):
                # A new header closes the previous block, if there is one.
                if blockIdx != -1:
                    resultLst.extend(procData(baseSeq, exons, blockIdx, seqs,
                                              pos, pval, length, score))
                fields = line.split()
                pval = float(fields[-1])
                length = int(fields[6].strip(","))
                score = int(fields[8].strip(","))
                blockIdx += 1
                seqs = {}

            if seqLineRe.match(line):
                fields = line.split()
                name = fields[0]
                # Both coordinates are shifted down by one
                # (presumably 1-based -> 0-based; verify against procData).
                start = int(fields[1]) - 1
                end = int(fields[2]) - 1
                seq = fields[3]
                if name not in seqs:
                    faseq = Fasta.FastaSeq(name, seq)
                    faseq.chrom = name
                    faseq.start = start
                    faseq.end = end
                    seqs[name] = faseq
                else:
                    # Continuation line: the sequence is wrapped over
                    # several lines, so append to what we already have.
                    seqs[name].nucl += seq
                pos[name] = (name, start, end)

    # Flush the last block. Without any "start" header there are no block
    # statistics to pass on, so nothing is processed.
    if blockIdx != -1:
        resultLst.extend(procData(baseSeq, exons, blockIdx, seqs,
                                  pos, pval, length, score))
    return resultLst