def postProcessOutput(self):
    """Parse the BLAST tabular (outfmt 6) result file into ``self.data``.

    The parent class's ``postProcessOutput`` is run first.  Then
    ``self.input_files[0]`` is read line by line; every non-blank line is
    split on tabs and converted into a ``Feature``.

    Columns read (0-based), following the local names used here:
        0 qid, 1 hid, 2 pid, 3 alignment length (ignored), 4 mm, 5 gaps,
        6 qstart, 7 qend, 8 hstart, 9 hend, 10 exval, 11 score
    Optional extra columns, only used when all four are present:
        12 qlen, 13 hlen, 14 qseq, 15 hseq

    On completion ``self.data`` is a dict mapping each query id to the
    list of its ``Feature`` objects, in file order.
    """
    super(BlastOutput6ParserAnalysis, self).postProcessOutput()

    data = {}

    with open(self.input_files[0]) as fp:
        for line in fp:
            line = line.rstrip('\n')

            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            ff = line.split('\t')

            qid = ff[0]
            hid = ff[1]
            pid = float(ff[2])
            mm = int(ff[4])
            gaps = int(ff[5])
            qstart = int(ff[6])
            qend = int(ff[7])
            hstart = int(ff[8])
            hend = int(ff[9])
            exval = float(ff[10])
            score = float(ff[11])

            feat = Feature()

            feat.qid = qid
            feat.qstart = qstart
            feat.qend = qend
            feat.hid = hid
            feat.hstart = hstart
            feat.hend = hend
            feat.pid = pid
            feat.score = score
            feat.mm = mm
            feat.gaps = gaps
            feat.exval = exval

            # The four optional columns are read together, so all of them
            # (indices 12..15) must exist; checking "> 12" alone would
            # raise IndexError on a line with 13-15 columns.
            if len(ff) > 15:
                feat.qlen = int(ff[12])
                feat.hlen = int(ff[13])
                feat.qseq = ff[14]
                feat.hseq = ff[15]

            data.setdefault(qid, []).append(feat)

    self.data = data
def parseLine(self, line):
    """Convert one tab-separated BLAT (PSL) alignment line into a Feature.

    Columns read (0-based):
        0 match, 1 mismatch, 8 strand,
        9 qid, 10 qlen, 11 qstart, 12 qend,
        13 hid, 14 hlen, 15 hstart, 16 hend

    Example input (header lines followed by records):
        psLayout version 3
        match mis-match rep.match N's Qgap-count Qgap-bases Tgap-count Tgap-bases strand Qname Qsize Qstart Qend Tname Tsize Tstart Tend blockcount blockSizes qStarts tStarts
        236 0 0 0 0 0 0 0 + TRINITY_DN4669_c0_g1_i1 237 0 236 Gm16144_ENSMUST00000131093 1843 1272 1508 1 236, 0, 1272,
        179 0 0 0 0 0 0 0 - TRINITY_DN4615_c0_g1_i1 317 0 179 Hdhd3_ENSMUST00000037820 2977 0 179 1 179, 138, 0,

    The returned feature has ``type1``/``type2`` set to ``'blat'``, its
    ``score`` set to ``int(100 * match / qlen)`` and the raw match and
    mismatch counts stored in ``hitattr``.  ``strand`` is set to 1 for
    "+" and -1 for "-"; any other strand value leaves it untouched.

    Raises ValueError / IndexError if a numeric column is not an integer
    or the line has fewer than 17 columns (e.g. a PSL header line).
    """
    line = line.rstrip('\n')
    ff = line.split('\t')

    f = Feature()

    match = int(ff[0])
    mismatch = int(ff[1])
    strand = ff[8]

    qid = ff[9]
    qlen = int(ff[10])
    qstart = int(ff[11])
    qend = int(ff[12])

    hid = ff[13]
    hlen = int(ff[14])
    hstart = int(ff[15])
    hend = int(ff[16])

    f.qid = qid
    f.type1 = 'blat'
    f.type2 = 'blat'
    f.qstart = qstart
    f.qend = qend
    f.hid = hid
    f.hstart = hstart
    f.hend = hend
    f.score = int(100 * match / qlen)

    f.qlen = qlen
    f.hlen = hlen

    f.hitattr['match'] = match
    f.hitattr['mismatch'] = mismatch

    # ``strand`` is always a string here (it comes from str.split), so only
    # the textual symbols can match; comparing it to the integers 1 / -1
    # could never succeed.
    strand_values = {"+": 1, "-": -1}
    if strand in strand_values:
        f.strand = strand_values[strand]

    return f
def parseBlastOutput6(file):
    """Parse a BLAST tabular (outfmt 6) file and group the hits by query.

    Args:
        file: path of the tab-separated BLAST output file to read.

    Returns:
        dict mapping each query id to the list of ``Feature`` objects
        built from that query's lines, in file order.

    Columns read (0-based), following the local names used here:
        0 qid, 1 hid, 2 pid, 3 alignment length (ignored), 4 mm, 5 gaps,
        6 qstart, 7 qend, 8 hstart, 9 hend, 10 exval, 11 score
    Optional extra columns, only used when all four are present:
        12 qlen, 13 hlen, 14 qseq, 15 hseq

    Blank lines are skipped.  A non-blank line with fewer than 12 columns
    or a non-numeric value in a numeric column raises IndexError /
    ValueError.
    """
    hits = {}

    with open(file) as fp:
        for line in fp:
            line = line.rstrip('\n')

            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            ff = line.split('\t')

            qid = ff[0]
            hid = ff[1]
            pid = float(ff[2])
            mm = int(ff[4])
            gaps = int(ff[5])
            qstart = int(ff[6])
            qend = int(ff[7])
            hstart = int(ff[8])
            hend = int(ff[9])
            exval = float(ff[10])
            score = float(ff[11])

            feat = Feature()

            feat.qid = qid
            feat.qstart = qstart
            feat.qend = qend
            feat.hid = hid
            feat.hstart = hstart
            feat.hend = hend
            feat.pid = pid
            feat.score = score
            feat.mm = mm
            feat.gaps = gaps
            feat.exval = exval

            # The optional columns are consumed as a group of four, so all
            # of indices 12..15 must be present.
            if len(ff) > 15:
                feat.qlen = int(ff[12])
                feat.hlen = int(ff[13])
                feat.qseq = ff[14]
                feat.hseq = ff[15]

            hits.setdefault(qid, []).append(feat)

    return hits
def parseLine(self, line):
    """Convert one tab-separated lastz output line into a Feature.

    Expected column layout (0-based index in parentheses):
        score(0) name1(1) strand1(2) size1(3) zstart1(4) end1(5)
        name2(6) strand2(7) size2(8) zstart2(9) end2(10)
        identity(11) idPct(12) coverage(13) covPct(14)

    Example input:
        #score name1 strand1 size1 zstart1 end1 name2 strand2 size2 zstart2 end2 identity idPct coverage covPct
        12413 98004798 + 1579 278 1520 F27C8.1 - 1482 200 1455 709/1185 59.8% 1255/1482 84.7%
        15213 98029119 + 1752 526 1572 F27C8.1 - 1482 365 1415 615/1014 60.7% 1050/1482 70.9%

    name1/size1/zstart1/end1 populate the query fields (qid, qlen, qstart,
    qend) and name2/size2/zstart2/end2 the hit fields (hid, hlen, hstart,
    hend).  ``type1``/``type2`` are set to ``'lastz'``.  idPct and covPct
    are stored, without the '%' sign, as floats in ``hitattr['pid']`` and
    ``hitattr['cov']``.

    ``strand`` is set to 1 when both sequences are on the same strand and
    to -1 when they differ; if either strand column is not "+" or "-" the
    feature's strand is left untouched.
    """
    line = line.rstrip('\n')
    ff = line.split('\t')

    f = Feature()

    qstrand = ff[2]
    hstrand = ff[7]

    qid = ff[1]
    qlen = int(ff[3])
    qstart = int(ff[4])
    qend = int(ff[5])

    hid = ff[6]
    hlen = int(ff[8])
    hstart = int(ff[9])
    hend = int(ff[10])

    f.qid = qid
    f.type1 = 'lastz'
    f.type2 = 'lastz'
    f.qstart = qstart
    f.qend = qend
    f.hid = hid
    f.hstart = hstart
    f.hend = hend
    f.score = int(ff[0])

    f.qlen = qlen
    f.hlen = hlen

    pid = ff[12].replace('%', '')
    cov = ff[14].replace('%', '')

    f.hitattr['pid'] = float(pid)
    f.hitattr['cov'] = float(cov)

    # Store the relative orientation on the feature.  Previously the value
    # was computed into a local variable and then discarded, so the
    # returned feature never carried any strand information.
    valid = ("+", "-")
    if qstrand in valid and hstrand in valid:
        if qstrand == hstrand:
            f.strand = 1
        else:
            f.strand = -1

    return f
def parseLine(self, line):
    """Convert one tab-separated GTF-style annotation line into a Feature.

    Example input:
        chr1 unknown CDS 3054734 3054733 . + -1 gene_id "ENSMUSG00000090025"; gene_name "ENSMUSG00000090025"; transcript_id "ENSMUST00000160944";

    Columns read (0-based):
        0 -> qid, 1 -> type1, 2 -> type2, 3 -> qstart, 4 -> qend,
        5 -> score (float, skipped when "."),
        6 -> strand ("+" -> 1, "-" -> -1, anything else left unset),
        7 -> phase (int, skipped when "."),
        8 -> ';'-separated attributes of the form ``key "value"``

    Each attribute that consists of exactly two space-separated tokens is
    stored in ``hitattr`` with the surrounding double quotes removed from
    the value; attributes of any other shape are ignored.  The value of
    ``transcript_id`` is additionally copied to ``hid``.
    """
    line = line.rstrip('\n')
    ff = line.split('\t')

    f = Feature()

    f.qid = ff[0]
    f.type1 = ff[1]
    f.type2 = ff[2]
    f.qstart = int(ff[3])
    f.qend = int(ff[4])

    if ff[5] != ".":
        # Python has no ``double`` builtin; ``float`` is the
        # double-precision type.
        f.score = float(ff[5])

    # ff[6] is always a string, so only the textual symbols can match;
    # integer comparisons against it could never succeed.
    strand_values = {"+": 1, "-": -1}
    if ff[6] in strand_values:
        f.strand = strand_values[ff[6]]

    if ff[7] != ".":
        f.phase = int(ff[7])

    for feat in ff[8].split(';'):
        tmp = feat.strip().split(' ')

        # Only plain ``key "value"`` pairs are recognised.
        if len(tmp) != 2:
            continue

        key = tmp[0].strip()
        val = tmp[1].strip().strip('"')

        f.hitattr[key] = val

        if key == "transcript_id":
            f.hid = val

    return f
def nextGFF(self):
    """Read lines from ``self.fh`` until one feature line is parsed.

    Lines starting with '#' are skipped, as are blank lines.  Reading
    stops at a line starting with '##FASTA'.

    Example input:
        ##gff-version 3
        #!gff-spec-version 1.20
        #!processor NCBI annotwriter
        ##sequence-region NZ_JODT01000001.1 1 388890
        NZ_JODT01000001.1 RefSeq region 1 388890 . + . ID=id0;Dbxref=taxon:67256;gbkey=Src;mol_type=genomic DNA
        NZ_JODT01000001.1 RefSeq gene 283 1188 . - . ID=gene0;Name=IH25_RS0100010;gbkey=Gene;locus_tag=IH25_RS0100010
        NZ_JODT01000001.1 Protein Homology CDS 283 1188 . - 0 ID=cds0;Parent=gene0;Name=WP_030600633.1;gbkey=CDS;transl_table=11

    Columns read (0-based):
        0 -> qid, 1 -> type1, 2 -> type2, 3 -> qstart, 4 -> qend,
        5 -> score ("." becomes 0; otherwise an int when the text is an
             integer, else a float),
        6 -> strand ("+" -> 1, "-" -> -1, "." -> 0, anything else is kept
             as the raw string),
        7 -> phase (kept as the raw string),
        8 -> optional ';'-separated ``key=value`` attributes, stored as a
             dict in ``hitattr``

    Returns:
        The parsed ``Feature``, or ``None`` when the input is exhausted
        or a '##FASTA' line is reached.

    Raises:
        Exception: if a feature line has fewer than 8 tab-separated
            fields.
    """
    for line in self.fh:
        if line.startswith('##FASTA'):
            return None

        if line.startswith('#'):
            continue

        line = line.rstrip('\n')

        # Blank separator lines carry no feature; skip them rather than
        # reporting them as malformed.
        if not line.strip():
            continue

        ff = line.split('\t')

        if len(ff) < 8:
            raise Exception(
                "GFF line needs 8 or more fields to parse [%s]" % line)

        f = Feature()

        f.qid = ff[0]
        f.type1 = ff[1]
        f.type2 = ff[2]
        f.qstart = int(ff[3])
        f.qend = int(ff[4])

        score = ff[5]
        if score == ".":
            f.score = 0
        else:
            # Integer scores stay ints, as before; decimal scores such as
            # "0.95" or "1e-5" are now accepted instead of raising.
            try:
                f.score = int(score)
            except ValueError:
                f.score = float(score)

        # Unknown strand symbols (e.g. "?") are passed through unchanged.
        strand_values = {"+": 1, "-": -1, ".": 0}
        f.strand = strand_values.get(ff[6], ff[6])

        f.phase = ff[7]

        if len(ff) > 8:
            hitattr = {}

            for hff in ff[8].split(';'):
                # A trailing ';' yields an empty token; ignore it.
                if not hff.strip():
                    continue

                # Split on the first '=' only so values that themselves
                # contain '=' are kept whole.  A token without '=' is
                # stored with an empty value.
                key, _, val = hff.partition('=')
                hitattr[key] = val

            f.hitattr = hitattr

        return f