def postProcessOutput(self):
        """Load the tab-separated BLAST hit table into ``self.data``.

        Runs the parent class's ``postProcessOutput`` first, then reads
        ``self.input_files[0]`` line by line.  Every non-blank row becomes a
        ``Feature`` that is appended to a list keyed by its query id, so
        ``self.data`` ends up as ``{qid: [Feature, ...]}`` in file order.

        Columns read (0-based): 0 query id, 1 hit id, 2 pid, 4 mm, 5 gaps,
        6/7 query start/end, 8/9 hit start/end, 10 exval, 11 score.
        Column 3 is not stored on the feature.  Rows with at least 16
        columns additionally fill ``qlen``, ``hlen``, ``qseq`` and ``hseq``
        from columns 12-15.

        Raises:
            IndexError: if a non-blank row has fewer than 12 columns.
            ValueError: if a numeric column cannot be converted.
        """
        super(BlastOutput6ParserAnalysis,self).postProcessOutput()

        data = {}
        path = self.input_files[0]

        with open(path) as fp:

            for line in fp:

                line = line.rstrip('\n')

                # A blank line (typically a trailing newline) carries no hit;
                # skip it rather than fail on the missing columns.
                if not line.strip():
                    continue

                ff = line.split('\t')

                feat = Feature()

                feat.qid    = ff[0]
                feat.hid    = ff[1]
                feat.pid    = float(ff[2])
                feat.mm     = int(ff[4])
                feat.gaps   = int(ff[5])
                feat.qstart = int(ff[6])
                feat.qend   = int(ff[7])
                feat.hstart = int(ff[8])
                feat.hend   = int(ff[9])
                feat.exval  = float(ff[10])
                feat.score  = float(ff[11])

                # The four optional columns are read together, so require all
                # of them (indices 12-15) before touching any.
                if len(ff) > 15:
                    feat.qlen = int(ff[12])
                    feat.hlen = int(ff[13])
                    feat.qseq = ff[14]
                    feat.hseq = ff[15]

                data.setdefault(feat.qid, []).append(feat)

        self.data = data
Example #2
0
    def parseLine(self,line):
        """Convert one tab-separated BLAT PSL alignment row into a Feature.

        Columns read (0-based): 0 match, 1 mismatch, 8 strand, 9 query id,
        10 query length, 11 query start, 12 query end, 13 hit id,
        14 hit length, 15 hit start, 16 hit end.

        The returned feature has ``type1``/``type2`` set to ``'blat'``,
        ``score`` set to the integer percentage of the query covered by
        matching bases, and ``match``/``mismatch`` stored in ``hitattr``
        (assumes ``Feature()`` already provides a ``hitattr`` mapping).
        ``strand`` is set to 1 for "+" and -1 for "-"; any other value
        (including ".") leaves it untouched.

        Header rows are not handled here; the caller presumably filters them.

        Raises:
            IndexError: if the row has fewer than 17 columns.
            ValueError: if a numeric column cannot be converted.
            ZeroDivisionError: if the query length column is 0.
        """
        ff = line.rstrip('\n').split('\t')

        # Sample data row (psLayout version 3):
        #236	0	0	0	0	0	0	0	+	TRINITY_DN4669_c0_g1_i1	237	0	236	Gm16144_ENSMUST00000131093	1843	1272	1508	1	236,	0,	1272,

        f = Feature()

        match    = int(ff[0])
        mismatch = int(ff[1])
        strand   = ff[8]
        qlen     = int(ff[10])
        hlen     = int(ff[14])

        f.qid    = ff[9]
        f.type1  = 'blat'
        f.type2  = 'blat'
        f.qstart = int(ff[11])
        f.qend   = int(ff[12])

        f.hid    = ff[13]
        f.hstart = int(ff[15])
        f.hend   = int(ff[16])

        f.score = int(100*match/qlen)

        f.qlen  = qlen
        f.hlen  = hlen

        f.hitattr['match'] = match
        f.hitattr['mismatch'] = mismatch

        # The strand column is text, so only the "+" / "-" symbols can occur;
        # comparisons against the integers 1 / -1 could never succeed.
        if strand == "+":
            f.strand = 1
        elif strand == "-":
            f.strand = -1

        return f
    def parseBlastOutput6(file):
        """Parse a tab-separated BLAST hit table into per-query hit lists.

        Args:
            file: path of the tabular file to read.

        Returns:
            dict mapping each query id to the list of ``Feature`` objects
            built from its rows, in file order.

        Columns read (0-based): 0 query id, 1 hit id, 2 pid, 4 mm, 5 gaps,
        6/7 query start/end, 8/9 hit start/end, 10 exval, 11 score.
        Column 3 is not stored on the feature.  Rows with at least 16
        columns additionally fill ``qlen``, ``hlen``, ``qseq`` and ``hseq``
        from columns 12-15.  Blank lines are skipped.

        Raises:
            IndexError: if a non-blank row has fewer than 12 columns.
            ValueError: if a numeric column cannot be converted.
        """
        hits = {}

        with open(file) as fp:

            for line in fp:

                line = line.rstrip('\n')

                # A blank line (typically a trailing newline) carries no hit.
                if not line.strip():
                    continue

                ff = line.split('\t')

                feat = Feature()

                feat.qid    = ff[0]
                feat.hid    = ff[1]
                feat.pid    = float(ff[2])
                feat.mm     = int(ff[4])
                feat.gaps   = int(ff[5])
                feat.qstart = int(ff[6])
                feat.qend   = int(ff[7])
                feat.hstart = int(ff[8])
                feat.hend   = int(ff[9])
                feat.exval  = float(ff[10])
                feat.score  = float(ff[11])

                # Optional columns 12-15 are only read when all are present.
                if len(ff) > 15:
                    feat.qlen = int(ff[12])
                    feat.hlen = int(ff[13])
                    feat.qseq = ff[14]
                    feat.hseq = ff[15]

                hits.setdefault(feat.qid, []).append(feat)

        return hits
Example #4
0
    def parseLine(self, line):
        """Convert one tab-separated lastz alignment row into a Feature.

        Columns read (0-based): 0 score, 1 query id, 2 query strand,
        3 query length, 4 query start, 5 query end, 6 hit id, 7 hit strand,
        8 hit length, 9 hit start, 10 hit end, 12 identity percent,
        14 coverage percent.

        The returned feature has ``type1``/``type2`` set to ``'lastz'`` and
        the two percentages stored as floats under ``hitattr['pid']`` and
        ``hitattr['cov']`` (assumes ``Feature()`` already provides a
        ``hitattr`` mapping).  ``strand`` is 1 when query and hit strands
        agree and -1 when they differ; it is left untouched when either
        strand column is not "+" or "-".

        Raises:
            IndexError: if the row has fewer than 15 columns.
            ValueError: if a numeric column cannot be converted.
        """
        ff = line.rstrip('\n').split('\t')

        ##score  name1   strand1 size1   zstart1 end1    name2   strand2 size2   zstart2 end2    identity        idPct   coverage        covPct
        #12413   98004798        +       1579    278     1520    F27C8.1 -       1482    200     1455    709/1185        59.8%   1255/1482       84.7%

        f = Feature()

        qstrand = ff[2]
        hstrand = ff[7]

        f.qid = ff[1]
        f.type1 = 'lastz'
        f.type2 = 'lastz'
        f.qstart = int(ff[4])
        f.qend = int(ff[5])

        f.hid = ff[6]
        f.hstart = int(ff[9])
        f.hend = int(ff[10])

        f.score = int(ff[0])

        f.qlen = int(ff[3])
        f.hlen = int(ff[8])

        f.hitattr['pid'] = float(ff[12].replace('%', ''))
        f.hitattr['cov'] = float(ff[14].replace('%', ''))

        # Relative orientation is the product of the two strand signs:
        # same strands -> 1, opposite strands -> -1.  Store it on the
        # feature; previously it was computed and then discarded.
        signs = {"+": 1, "-": -1}
        if qstrand in signs and hstrand in signs:
            f.strand = signs[qstrand] * signs[hstrand]

        return f
Example #5
0
    def parseLine(self,line):
        """Convert one tab-separated GTF-style annotation row into a Feature.

        Columns read (0-based): 0 -> ``qid``, 1 -> ``type1``, 2 -> ``type2``,
        3/4 -> ``qstart``/``qend``, 5 score, 6 strand, 7 phase,
        8 attribute list.

        Score, strand and phase are only set when their column is not ".".
        Strand becomes 1 for "+" and -1 for "-".  The attribute column is
        split on ";" into ``key "value"`` pairs; each pair is stored in
        ``hitattr`` with surrounding quotes removed (assumes ``Feature()``
        already provides a ``hitattr`` mapping), and ``transcript_id`` is
        also copied to ``hid``.

        Raises:
            IndexError: if the row has fewer than 9 columns.
            ValueError: if a numeric column cannot be converted.
        """
        ff = line.rstrip('\n').split('\t')

        #chr1	unknown	CDS	3054734	3054733	.	+	-1	gene_id "ENSMUSG00000090025"; gene_name "ENSMUSG00000090025"; transcript_id "ENSMUST00000160944";

        f = Feature()

        f.qid   = ff[0]
        f.type1 = ff[1]
        f.type2 = ff[2]

        f.qstart = int(ff[3])
        f.qend   = int(ff[4])

        if ff[5] != ".":
            # Python has no ``double`` builtin; ``float`` is the
            # double-precision conversion.
            f.score = float(ff[5])

        # The strand column is text, so only the "+" / "-" symbols can occur.
        if ff[6] == "+":
            f.strand = 1
        elif ff[6] == "-":
            f.strand = -1

        if ff[7] != ".":
            f.phase = int(ff[7])

        for feat in ff[8].split(';'):

            # Split on the first run of whitespace only, so quoted values
            # that themselves contain spaces are kept instead of dropped.
            tmp = feat.strip().split(None, 1)

            if len(tmp) == 2:

                key = tmp[0]
                val = tmp[1].strip().strip('"')

                f.hitattr[key] = val

                if key == "transcript_id":
                    f.hid = val

        return f
Example #6
0
    def nextGFF(self):
        """Return the next feature read from ``self.fh``, or None when done.

        Consumes lines from ``self.fh`` until one data row is found.  Lines
        starting with "#" are skipped, and a line starting with "##FASTA"
        stops parsing (returns None).  None is also returned when
        ``self.fh`` is exhausted.

        Columns read (0-based): 0 -> ``qid``, 1 -> ``type1``, 2 -> ``type2``,
        3/4 -> ``qstart``/``qend``, 5 score, 6 strand, 7 -> ``phase`` (kept
        as text), 8 optional attribute list.

        * score "." becomes 0; otherwise it is an int when the text is an
          integer and a float when it is not (e.g. "0.95").
        * strand "+" / "-" / "." becomes 1 / -1 / 0; anything else is kept
          as the raw text.
        * the attribute column is split on ";" into ``key=value`` pairs that
          replace ``hitattr``.  Empty segments (such as after a trailing
          ";") are ignored, a segment without "=" is stored with an empty
          value, and only the first "=" separates key from value.

        Raises:
            Exception: if a data row has fewer than 8 tab-separated fields.
            ValueError: if start, end or score cannot be converted.
        """
        for line in self.fh:

            if line is None:
                return

            # Everything after the ##FASTA directive is not feature data.
            if line.startswith('##FASTA'):
                return None

            if line.startswith('#'):
                continue

            line = line.rstrip('\n')
            ff = line.split('\t')

            # Sample input:
            ##gff-version 3
            ##sequence-region NZ_JODT01000001.1 1 388890
            #NZ_JODT01000001.1       RefSeq  gene    283     1188    .       -       .       ID=gene0;Name=IH25_RS0100010;gbkey=Gene;locus_tag=IH25_RS0100010
            #NZ_JODT01000001.1       Protein Homology        CDS     283     1188    .       -       0       ID=cds0;Parent=gene0;Dbxref=Genbank:WP_030600633.1;Name=WP_030600633.1;gbkey=CDS;product=DeoR faimly transcriptional regulator;protein_id=WP_030600633.1;transl_table=11

            if len(ff) < 8:
                raise Exception(
                    "GFF line needs 8 or more fields to parse [%s]" % line)

            f = Feature()

            f.qid = ff[0]
            f.type1 = ff[1]
            f.type2 = ff[2]
            f.qstart = int(ff[3])
            f.qend = int(ff[4])
            f.phase = ff[7]

            score = ff[5]
            if score == ".":
                f.score = 0
            else:
                # Integer scores stay ints; decimal or exponent scores would
                # make int() fail, so fall back to float for those.
                try:
                    f.score = int(score)
                except ValueError:
                    f.score = float(score)

            # Unrecognised strand symbols are passed through unchanged.
            f.strand = {"+": 1, "-": -1, ".": 0}.get(ff[6], ff[6])

            if len(ff) > 8:

                hitattr = {}

                for hff in ff[8].split(';'):

                    # A trailing ";" yields an empty segment; nothing to store.
                    if not hff.strip():
                        continue

                    key, _, val = hff.partition('=')
                    hitattr[key] = val

                f.hitattr = hitattr

            return f