def parseLine(self, line):
    """Parse one tab-separated BLAT PSL alignment line into a Feature.

    The query fields (name, size, start, end) are read from columns 9-12 and
    the target/hit fields from columns 13-16 (0-based).  The score is the
    number of matching bases expressed as an integer percentage of the query
    length; the raw match and mismatch counts are stored in ``hitattr``.

    Args:
        line: a single PSL data line (header lines are not handled here).

    Returns:
        The populated Feature.  ``strand`` is set to 1 for "+" and -1 for
        "-"; any other strand value leaves the Feature's strand untouched.

    Raises:
        IndexError: if the line has fewer than 17 tab-separated columns.
        ValueError: if a numeric column does not hold an integer.
    """
    ff = line.rstrip('\n').split('\t')

    # psLayout version 3
    #
    # match  mis-   rep.   N's  Q gap  Q gap  T gap  T gap  strand  Q     Q     Q      Q    T     T     T      T    block  blockSizes  qStart  tStarts
    #        match  match       count  bases  count  bases          name  size  start  end  name  size  start  end  count
    # ---------------------------------------------------------------------------------------------------------------------------------------------------------------
    # 236 0 0 0 0 0 0 0 + TRINITY_DN4669_c0_g1_i1 237 0 236 Gm16144_ENSMUST00000131093 1843 1272 1508 1 236, 0, 1272,
    # 179 0 0 0 0 0 0 0 - TRINITY_DN4615_c0_g1_i1 317 0 179 Hdhd3_ENSMUST00000037820 2977 0 179 1 179, 138, 0,
    # 183 0 0 0 0 0 0 0 + TRINITY_DN4601_c0_g1_i1 219 36 219 Atp6v1a_ENSMUST00000130036 40052 2211 2394 1

    f = Feature()

    match = int(ff[0])
    mismatch = int(ff[1])
    strand = ff[8]
    qlen = int(ff[10])

    f.qid = ff[9]
    f.type1 = 'blat'
    f.type2 = 'blat'
    f.qstart = int(ff[11])
    f.qend = int(ff[12])
    f.hid = ff[13]
    f.hlen = int(ff[14])
    f.hstart = int(ff[15])
    f.hend = int(ff[16])
    f.qlen = qlen
    f.score = int(100 * match / qlen)

    f.hitattr['match'] = match
    f.hitattr['mismatch'] = mismatch

    # The strand column is always text here, so only the "+"/"-" spellings
    # can ever match; anything else (e.g. ".") leaves f.strand as it was.
    if strand == "+":
        f.strand = 1
    elif strand == "-":
        f.strand = -1

    return f
def postProcessOutput(self):
    """Load the tab-separated hit table in ``self.input_files[0]`` into ``self.data``.

    Runs the parent class's post-processing first, then reads the file line
    by line.  Each non-empty line becomes a Feature built from the first
    twelve columns (query id, hit id, percent identity, alignment length,
    mismatches, gaps, query start/end, hit start/end, e-value, score).  When
    a line carries at least sixteen columns, columns 12-15 are additionally
    stored as query length, hit length, query sequence and hit sequence.

    On success ``self.data`` is a dict mapping each query id to the list of
    its Features in file order.

    Raises:
        IndexError: if a non-empty line has fewer than twelve columns.
        ValueError: if a numeric column cannot be converted.
    """
    super(BlastOutput6ParserAnalysis, self).postProcessOutput()

    data = {}
    with open(self.input_files[0]) as fp:
        for line in fp:
            line = line.rstrip('\n')
            if not line:
                # A blank (e.g. trailing) line carries no hit.
                continue
            ff = line.split('\t')

            feat = Feature()
            feat.qid = ff[0]
            feat.hid = ff[1]
            feat.pid = float(ff[2])
            # ff[3] (alignment length) is not stored on the feature.
            feat.mm = int(ff[4])
            feat.gaps = int(ff[5])
            feat.qstart = int(ff[6])
            feat.qend = int(ff[7])
            feat.hstart = int(ff[8])
            feat.hend = int(ff[9])
            feat.exval = float(ff[10])
            feat.score = float(ff[11])

            # All four optional columns are read together, so require that
            # the last one (index 15) actually exists before touching any.
            if len(ff) > 15:
                feat.qlen = int(ff[12])
                feat.hlen = int(ff[13])
                feat.qseq = ff[14]
                feat.hseq = ff[15]

            data.setdefault(feat.qid, []).append(feat)

    self.data = data
def parseBlastOutput6(file):
    """Parse a tab-separated hit table into Features grouped by query id.

    Each non-empty line becomes a Feature built from the first twelve
    columns (query id, hit id, percent identity, alignment length,
    mismatches, gaps, query start/end, hit start/end, e-value, score).  When
    a line carries at least sixteen columns, columns 12-15 are additionally
    stored as query length, hit length, query sequence and hit sequence.

    Args:
        file: path of the file to read.

    Returns:
        A dict mapping each query id to the list of its Features in file
        order.  An empty file, or one holding only blank lines, yields {}.

    Raises:
        IndexError: if a non-empty line has fewer than twelve columns.
        ValueError: if a numeric column cannot be converted.
    """
    hits = {}

    with open(file) as fp:
        for line in fp:
            line = line.rstrip('\n')
            if not line:
                # A blank (e.g. trailing) line carries no hit.
                continue
            ff = line.split('\t')

            feat = Feature()
            feat.qid = ff[0]
            feat.hid = ff[1]
            feat.pid = float(ff[2])
            # ff[3] (alignment length) is not stored on the feature.
            feat.mm = int(ff[4])
            feat.gaps = int(ff[5])
            feat.qstart = int(ff[6])
            feat.qend = int(ff[7])
            feat.hstart = int(ff[8])
            feat.hend = int(ff[9])
            feat.exval = float(ff[10])
            feat.score = float(ff[11])

            if len(ff) > 15:
                feat.qlen = int(ff[12])
                feat.hlen = int(ff[13])
                feat.qseq = ff[14]
                feat.hseq = ff[15]

            hits.setdefault(feat.qid, []).append(feat)

    return hits
def parseLine(self, line):
    """Parse one tab-separated lastz alignment line into a Feature.

    Columns are read in the order shown in the header below.  The lastz
    score becomes the Feature score; the identity and coverage percentages
    (with the trailing '%' removed) are stored as floats in
    ``hitattr['pid']`` and ``hitattr['cov']``.

    Args:
        line: a single lastz data line.

    Returns:
        The populated Feature.  When both strand columns are "+" or "-",
        ``strand`` is set to 1 if they agree and -1 if they differ;
        otherwise the Feature's strand is left untouched.

    Raises:
        IndexError: if the line has fewer than 15 tab-separated columns.
        ValueError: if a numeric column cannot be converted.
    """
    ff = line.rstrip('\n').split('\t')

    # #score name1    strand1 size1 zstart1 end1 name2   strand2 size2 zstart2 end2 identity idPct coverage  covPct
    # 12413  98004798 +       1579  278     1520 F27C8.1 -       1482  200     1455 709/1185 59.8% 1255/1482 84.7%
    # 15213  98029119 +       1752  526     1572 F27C8.1 -       1482  365     1415 615/1014 60.7% 1050/1482 70.9%

    f = Feature()

    qstrand = ff[2]
    hstrand = ff[7]

    f.qid = ff[1]
    f.qlen = int(ff[3])
    f.qstart = int(ff[4])
    f.qend = int(ff[5])
    f.hid = ff[6]
    f.hlen = int(ff[8])
    f.hstart = int(ff[9])
    f.hend = int(ff[10])
    f.type1 = 'lastz'
    f.type2 = 'lastz'
    f.score = int(ff[0])

    f.hitattr['pid'] = float(ff[12].replace('%', ''))
    f.hitattr['cov'] = float(ff[14].replace('%', ''))

    # Collapse the two per-sequence strands into one relative orientation
    # and record it on the feature (same strands -> 1, opposite -> -1).
    if qstrand in ("+", "-") and hstrand in ("+", "-"):
        f.strand = 1 if qstrand == hstrand else -1

    return f
def parseLine(self, line):
    """Parse one tab-separated GTF-style annotation line into a Feature.

    Columns 0-4 become ``qid``, ``type1``, ``type2``, ``qstart`` and
    ``qend``.  Score, strand and phase are only set when their column is
    not ".".  The attribute column is split on ';' and each ``key "value"``
    pair is stored in ``hitattr`` with surrounding quotes removed; the
    ``transcript_id`` value is also copied to ``hid``.

    Args:
        line: a single annotation line.

    Returns:
        The populated Feature.

    Raises:
        IndexError: if the line has fewer than 9 tab-separated columns.
        ValueError: if a numeric column cannot be converted.
    """
    ff = line.rstrip('\n').split('\t')

    # chr1 unknown CDS 3054734 3054733 . + -1 gene_id "ENSMUSG00000090025"; gene_name "ENSMUSG00000090025"; transcript_id "ENSMUST00000160944";

    f = Feature()
    f.qid = ff[0]
    f.type1 = ff[1]
    f.type2 = ff[2]
    f.qstart = int(ff[3])
    f.qend = int(ff[4])

    if ff[5] != ".":
        f.score = float(ff[5])

    # The strand column is always text, so only "+" and "-" can match.
    if ff[6] == "+":
        f.strand = 1
    elif ff[6] == "-":
        f.strand = -1

    if ff[7] != ".":
        f.phase = int(ff[7])

    for attr in ff[8].split(';'):
        # Split the key from the rest once, so a quoted value that itself
        # contains spaces is kept whole instead of being discarded.
        parts = attr.strip().split(None, 1)
        if len(parts) != 2:
            continue
        key = parts[0]
        val = parts[1].strip().strip('"')
        f.hitattr[key] = val
        if key == "transcript_id":
            f.hid = val

    return f
def createAlignmentGFF(self, id1, id2, rstart, rend, qstart, qend, rstrand,
                       qstrand, tmprseq, tmpqseq, insertpos):
    """Wrap one pairwise alignment in a Feature positioned on the reference.

    The Feature takes ``id1`` as its ``qid`` and ``rstart``/``rend`` as its
    coordinates.  The two aligned sequences are stored in ``hitattr`` as
    ``{'id': ..., 'seq': ...}`` dicts under 'qseq' (``id1``/``tmprseq``) and
    'hseq' (``id2``/``tmpqseq``), together with 'hid' and 'insertpos'.

    ``qstart``, ``qend``, ``rstrand`` and ``qstrand`` are accepted but not
    used by this method.

    Returns:
        The newly created Feature.
    """
    aln = Feature()
    aln.qid = id1
    aln.qstart = rstart
    aln.qend = rend

    aln.hitattr['qseq'] = {'id': id1, 'seq': tmprseq}
    aln.hitattr['hseq'] = {'id': id2, 'seq': tmpqseq}
    # NOTE(review): 'hid' is filled with id1 (the reference id), not id2 -
    # confirm whether that is intended.
    aln.hitattr['hid'] = id1
    aln.hitattr['insertpos'] = insertpos

    return aln
def _read_cds_by_contig(gff):
    """Drain *gff* via nextGFF() and group its CDS features by contig id (qid)."""
    cds = {}
    g = gff.nextGFF()
    while g is not None:
        if g.type2 == "CDS":
            cds.setdefault(g.qid, []).append(g)
        g = gff.nextGFF()
    return cds


def _read_seqs_by_id(fasta):
    """Drain *fasta* via nextSeq() into a dict keyed by each record's 'id'."""
    seqs = {}
    seq = fasta.nextSeq()
    while seq is not None:
        seqs[seq['id']] = seq
        seq = fasta.nextSeq()
    return seqs


def _aligned_region(seq, start, end):
    """Cut the aligned stretch out of *seq*; reverse-complement it unless end > start."""
    # NOTE(review): both slices yield abs(end - start) bases, i.e. one fewer
    # than an inclusive start..end range would.  Left unchanged because the
    # downstream position lookup (findAlnPos) is not visible here - confirm.
    if end > start:
        return seq[start - 1:end - 1]
    return reverse_complement(seq[end:start])


def _read_delta_value(fh):
    """Read the next line of *fh* as one signed integer delta value."""
    return int(fh.readline().rstrip('\n'))


def _parse_delta(deltafile, refseqs, qryseqs, reffile, queryfile):
    """Parse a NUCMER delta file into gapped alignments keyed by reference id.

    Returns a dict mapping each reference sequence id to a list of Features,
    one per alignment, whose ``hitattr`` holds the gapped reference ('qseq')
    and query ('hseq') sequences, 'hid', 'insertpos' and 'refstrand'.
    *reffile* and *queryfile* are only used in error messages.
    """
    alns = {}
    id1 = None
    id2 = None

    with open(deltafile) as fh:
        lnum = 0
        line = fh.readline()

        # Can't use "for line in fh" because the delta values that follow
        # each alignment header are consumed with extra readline() calls.
        while line != "":
            lnum += 1
            ff = line.rstrip('\n').split(' ')

            if lnum == 1:
                # The first line lists the two original input files
                # separated by a space.
                print("Input files [%s][%s]\n" % (ff[0], ff[1]))

            elif lnum == 2:
                # The second line specifies the alignment data type, either
                # NUCMER or PROMER.
                alntype = ff[0]
                if alntype != "NUCMER":
                    raise Exception(
                        "Only NUCMER alignments are currently parsed - we have [%s]"
                        % alntype)

            elif ff[0].startswith(">"):
                # Every grouping of alignment regions has a header, just
                # like the cluster's header in the .cluster file.  This is a
                # FASTA style header and lists the two sequences that
                # produced the following alignments after a '>' and
                # separated by a space.  After the two sequences are the
                # lengths of those sequences in the same order.
                #
                # An example header might look like:
                #     >tagA1 tagB1 500 2000000
                id1 = ff[0].replace(">", '')
                id2 = ff[1]

            else:
                # The four digits are the start and end in the reference
                # sequence respectively and the start and end in the query
                # sequence respectively.  These coordinates are always
                # measured in DNA bases regardless of the alignment data
                # type.  The three digits after the starts and stops are:
                # the number of errors (non-identities), similarity errors
                # (non-positive match scores) and non-alpha characters in
                # the sequence (used to count stop-codons in promer data).
                #
                # An example header might look like:
                #     5198 22885 5389 23089 20 20 0
                rstart = int(ff[0])
                rend = int(ff[1])
                qstart = int(ff[2])
                qend = int(ff[3])

                refstrand = -1 if rend < rstart else 1

                if id1 not in refseqs:
                    raise Exception(
                        "Can't find reference sequence [%s] in ref file [%s]"
                        % (id1, reffile))
                if id2 not in qryseqs:
                    raise Exception(
                        "Can't find query sequence [%s] in query file [%s]"
                        % (id2, queryfile))

                tmprseq = _aligned_region(refseqs[id1]['seq'], rstart, rend)
                tmpqseq = _aligned_region(qryseqs[id2]['seq'], qstart, qend)

                # Each of these headers is followed by a string of signed
                # digits, one per line, with the final line before the next
                # header equaling 0 (zero).  Each digit represents the
                # distance to the next insertion in the reference (positive
                # int) or deletion in the reference (negative int), as
                # measured in DNA bases or amino acids depending on the
                # alignment data type.  For example, with 'nucmer' the delta
                # sequence (1, -3, 4, 0) would represent
                #   - an insertion at positions 1 and 7 in the reference
                #     sequence and
                #   - an insertion at position 3 in the query sequence.
                #
                # Or with letters:
                #     A = acgtagctgag$
                #     B = cggtagtgag$
                #     Delta = (1, -3, 4, 0)
                #     A = acg.tagctgag$
                #     B = .cggtag.tgag$
                insertpos = 0
                count = _read_delta_value(fh)
                while count != 0:
                    insertpos = insertpos + abs(count)
                    if count < 0:
                        # This is an insertion in the query sequence so we
                        # put a - in the ref.
                        tmprseq = tmprseq[:insertpos - 1] + "-" + tmprseq[insertpos - 1:]
                    else:
                        # This is an insertion in the reference sequence so
                        # the - goes in the query.
                        tmpqseq = tmpqseq[:insertpos - 1] + "-" + tmpqseq[insertpos - 1:]
                    count = _read_delta_value(fh)

                seq1 = {'id': id1, 'seq': tmprseq}
                seq2 = {'id': id2, 'seq': tmpqseq}

                # Debug output kept from the original for one hard-coded
                # reference id.
                if seq1 != seq2 and id1 == "GG739696.1":
                    print(prettyPrint([seq1, seq2]))

                tmpgff = Feature()
                tmpgff.qid = id1
                tmpgff.qstart = rstart
                tmpgff.qend = rend
                tmpgff.hitattr['qseq'] = seq1
                tmpgff.hitattr['hseq'] = seq2
                tmpgff.hitattr['hid'] = id1
                tmpgff.hitattr['insertpos'] = insertpos
                # Remember this alignment's own reference orientation so the
                # report can test the alignment it actually uses.
                tmpgff.hitattr['refstrand'] = refstrand

                alns.setdefault(id1, []).append(tmpgff)

            line = fh.readline()

    return alns


def _report_cds(g, gnum, alignments):
    """Classify CDS *g* against the alignments on its contig and print the report.

    Every output line is prefixed with the final status: NOALIGN, PARTALIGN,
    REVSTRAND, MUTATION or IDENTICAL.
    """
    outstr = []
    name = g.hitattr['Name']
    prod = g.hitattr['product']

    foundgff = None
    status = "NEW"

    for aln in alignments:
        if not g.overlaps(aln):
            continue
        if aln.contains(g):
            foundgff = aln
            continue

        # Partial overlap: report the shared interval as a percentage of
        # the CDS length.
        ostart = max(g.qstart, aln.qstart)
        oend = min(g.qend, aln.qend)
        frac = int(100 * (oend - ostart + 1) / (g.qend - g.qstart + 1))
        status = "PARTALIGN"
        outstr.append("============1 Processing gene %d %s %s" % (gnum, name, prod))
        outstr.append("Contig coords from gff file %s %d-%d" % (g.qid, g.qstart, g.qend))
        outstr.append("Partial overlap of %d percent overlap coords are %d %d"
                      % (frac, ostart, oend))

    if foundgff is None:
        if status == "NEW":
            status = "NOALIGN"
            outstr.append("============2 Processing gene %d %s %s" % (gnum, name, prod))
            outstr.append("Contig coords from gff file %s %d-%d %s %s"
                          % (g.qid, g.qstart, g.qend, name, prod))
            # No alignment exists to describe here, so report the CDS's own
            # contig and coordinates.
            outstr.append("ERROR: No align for %s %s qstart/end %d %d %s"
                          % (name, g.qid, g.qstart, g.qend, prod))

    elif foundgff.hitattr['refstrand'] == -1:
        status = "REVSTRAND"
        outstr.append("===========3 Processing gene %d %s %s" % (gnum, name, prod))
        outstr.append("Contig coords from gff file %s %d-%d %s %s"
                      % (g.qid, g.qstart, g.qend, name, prod))
        outstr.append("ERROR: can't deal with reverse strand reference alignments")

    else:
        refaln = foundgff.hitattr['qseq']
        qryaln = foundgff.hitattr['hseq']

        apos1 = findAlnPos(foundgff, g.qstart)
        apos2 = findAlnPos(foundgff, g.qend)

        if g.strand == 1:
            qseq = refaln['seq'][apos1:apos2]
            hseq = qryaln['seq'][apos1:apos2]
        else:
            qseq = reverse_complement(refaln['seq'][apos1 + 1:apos2 + 1])
            hseq = reverse_complement(qryaln['seq'][apos1 + 1:apos2 + 1])

        if qseq != hseq:
            status = "MUTATION"
            outstr.append("===========4 Processing gene %d %s %s" % (gnum, name, prod))
            outstr.append("DNA alignment\n")
            # Label the rows with the ids of the alignment being shown.
            dna = prettyPrint([{'id': refaln['id'], 'seq': qseq},
                               {'id': qryaln['id'], 'seq': hseq}])
            outstr.extend(dna.split('\n'))

            pep = prettyPrint([{'id': refaln['id'], 'seq': translate(qseq)},
                               {'id': qryaln['id'], 'seq': translate(hseq)}])
            outstr.append("PEP alignment\n")
            outstr.extend(pep.split('\n'))
        else:
            status = "IDENTICAL"
            outstr.append("============5 Processing gene %d %s %s" % (gnum, name, prod))
            outstr.append("NO CHANGE for this alignment %s %s %s"
                          % (foundgff.qid, name, prod))

    for entry in outstr:
        print("%-15s %s" % (status, entry))
    print("\n")


def main(args):
    """Compare annotated CDS regions between a reference and a query genome.

    Reads the CDS features from ``args.gfffile``, the sequences from
    ``args.reffile`` and ``args.queryfile``, and the NUCMER alignments from
    ``args.deltafile``.  For every CDS it prints whether the region is not
    aligned, partially aligned, on a reverse-strand reference alignment,
    identical in the query, or mutated (with DNA and peptide alignments).

    Args:
        args: object providing ``deltafile``, ``reffile``, ``queryfile`` and
            ``gfffile`` attributes.

    Raises:
        Exception: if the delta file is not NUCMER output, or names a
            sequence missing from the reference or query file.
    """
    logging.info(" ========> Converting mummer delta format for %s %s %s" %
                 (args.deltafile, args.reffile, args.queryfile))
    logging.info("ARGS %s" % args)

    ref = FastaFile(args.reffile)
    qry = FastaFile(args.queryfile)
    gff = GFFFactory(args.gfffile)

    gffs = _read_cds_by_contig(gff)
    refseqs = _read_seqs_by_id(ref)
    qryseqs = _read_seqs_by_id(qry)

    alns = _parse_delta(args.deltafile, refseqs, qryseqs,
                        args.reffile, args.queryfile)

    gnum = 1
    for contig_id in gffs:
        for g in gffs[contig_id]:
            _report_cds(g, gnum, alns.get(contig_id, []))
            gnum = gnum + 1
def nextGFF(self):
    """Return the next feature parsed from ``self.fh``, or None when done.

    Lines starting with '#' and blank lines are skipped.  Reading stops (and
    None is returned) at a '##FASTA' line or at the end of the input.

    The first eight tab-separated columns fill ``qid``, ``type1``,
    ``type2``, ``qstart``, ``qend``, ``score``, ``strand`` and ``phase``:

    * score "." becomes 0; otherwise it is an int when the text is an
      integer and a float when it is not;
    * strand "+", "-" and "." become 1, -1 and 0; any other value is kept
      as the raw string;
    * phase is kept as the raw string.

    If a ninth column is present it is split on ';' into ``key=value``
    pairs which replace ``hitattr``.  Empty segments are ignored, a value
    may itself contain '=', and a segment without '=' is stored with an
    empty string value.

    Raises:
        Exception: if a data line has fewer than 8 columns.
        ValueError: if start, end or score is not numeric.
    """
    for line in self.fh:
        if line is None:
            return None
        if line.startswith('##FASTA'):
            return None
        if line.startswith('#'):
            continue

        line = line.rstrip('\n')
        if not line.strip():
            continue
        ff = line.split('\t')

        # ##gff-version 3
        # #!gff-spec-version 1.20
        # #!processor NCBI annotwriter
        # #!genome-build ASM72083v1
        # #!genome-build-accession NCBI_Assembly:GCF_000720835.1
        # ##sequence-region NZ_JODT01000001.1 1 388890
        # ##species http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=67256
        # NZ_JODT01000001.1 RefSeq region 1 388890 . + . ID=id0;Dbxref=taxon:67256;collection-date=2010;country=Japan: Suginami%2C Tokyo;culture-collection=NRRL:B-2120;gbkey=Src;isolation-source=garden soil;mol_type=genomic DNA;strain=NRRL B-2120;sub-species=achromogenes
        # NZ_JODT01000001.1 RefSeq gene 283 1188 . - . ID=gene0;Name=IH25_RS0100010;gbkey=Gene;locus_tag=IH25_RS0100010
        # NZ_JODT01000001.1 Protein Homology CDS 283 1188 . - 0 ID=cds0;Parent=gene0;Dbxref=Genbank:WP_030600633.1;Name=WP_030600633.1;gbkey=CDS;product=DeoR faimly transcriptional regulator;protein_id=WP_030600633.1;transl_table=11
        # NZ_JODT01000001.1 RefSeq gene 1391 2839 . - . ID=gene1;Name=IH25_RS0100015;gbkey=Gene;locus_tag=IH25_RS0100015
        # NZ_JODT01000001.1 Protein Homology CDS 1391 2839 . - 0 ID=cds1;Parent=gene1;Dbxref=Genbank:WP_030600636.1;Name=WP_030600636.1;Note=catalyzes the reduction of nonspecific electron acceptors such as 2%2C6-dimethyl-1%2C4-benzoquinone and 5-hydroxy-1%2C4-naphthaquinone%3B does not have lipoamide dehydrogenase activity;gbkey=CDS;product=flavoprotein disulfide reductase;protein_id=WP_030600636.1;transl_table=11
        # NZ_JODT01000001.1 RefSeq gene 2936 3373 . + . ID=gene2;Name=IH25_RS0100020;gbkey=Gene;locus_tag=IH25_RS0100020
        # NZ_JODT01000001.1 Protein Homology CDS 2936 3373 . + 0 ID=cds2;Parent=gene2;Dbxref=Genbank:WP_030600640.1;Name=WP_030600640.1;gbkey=CDS;product=gamma-glutamyl cyclotransferase;protein_id=WP_030600640.1;transl_table=11
        # NZ_JODT01000001.1 RefSeq gene 3499 4323 . + . ID=gene3;Name=IH25_RS0100025;gbkey=Gene;locus_tag=IH25_RS0100025
        # NZ_JODT01000001.1 Protein Homology CDS 3499 4323 . + 0 ID=cds3;Parent=gene3;Dbxref=Genbank:WP_03060

        if len(ff) < 8:
            raise Exception(
                "GFF line needs 8 or more fields to parse [%s]" % line)

        f = Feature()
        f.qid = ff[0]
        f.type1 = ff[1]
        f.type2 = ff[2]
        f.qstart = int(ff[3])
        f.qend = int(ff[4])

        if ff[5] == ".":
            f.score = 0
        else:
            # Integer scores stay ints; a non-integer score is read as a
            # float instead of aborting the parse.
            try:
                f.score = int(ff[5])
            except ValueError:
                f.score = float(ff[5])

        # Unrecognised strand symbols are passed through unchanged.
        f.strand = {"+": 1, "-": -1, ".": 0}.get(ff[6], ff[6])
        f.phase = ff[7]

        if len(ff) > 8:
            hitattr = {}
            for hff in ff[8].split(';'):
                if not hff.strip():
                    # e.g. the empty piece after a trailing ';'
                    continue
                # Split on the first '=' only so the value keeps any '='
                # characters of its own.
                key, _, val = hff.partition('=')
                hitattr[key] = val
            f.hitattr = hitattr

        return f

    return None
qid = fields[17] hid = fields[18] strand = 1 if hend < hstart: strand = -1 tmp = hend hend = hstart hstart = tmp tmpgff = Feature() tmpgff.qid = qid tmpgff.qstart = qstart tmpgff.qend = qend tmpgff.qlen = qlen tmpgff.qcov = qcov tmpgff.hitattr['hid'] = hid tmpgff.hitattr['hstart'] = hstart tmpgff.hitattr['hend'] = hend tmpgff.hitattr['hlen'] = hlen tmpgff.hitattr['hcov'] = hcov tmpgff.pid = pid tmpgff.strand = strand if qid == "10358": print(