Python Hit Examples, Bio.SearchIO._model.Hit Python Examples

Example #1

0

Show file

File: blast_xml.py Project: umbrr/biopython

    def _parse_hit(self, root_hit_elem, query_id):
        """Yield a generator object that transforms Iteration_hits XML elements into Hit objects (PRIVATE).

        :param root_hit_elem: root element of the Iteration_hits tag.
        :type root_hit_elem: XML element tag
        :param query_id: QueryResult ID of this Hit
        :type query_id: string

        """
        # Hit level processing
        # Hits are stored in the Iteration_hits tag, with the following
        # DTD
        # <!ELEMENT Hit (
        #        Hit_num,
        #        Hit_id,
        #        Hit_def,
        #        Hit_accession,
        #        Hit_len,
        #        Hit_hsps?)>

        # feed the loop below an empty list so iteration still works
        if root_hit_elem is None:
            root_hit_elem = []

        for hit_elem in root_hit_elem:

            # BLAST sometimes mangles the sequence IDs and descriptions, so we need
            # to extract the actual values.
            raw_hit_id = hit_elem.findtext('Hit_id')
            raw_hit_desc = hit_elem.findtext('Hit_def')
            if not self._use_raw_hit_ids:
                ids, descs, blast_hit_id = _extract_ids_and_descs(raw_hit_id, raw_hit_desc)
            else:
                ids, descs, blast_hit_id = [raw_hit_id], [raw_hit_desc], raw_hit_id

            hit_id, alt_hit_ids = ids[0], ids[1:]
            hit_desc, alt_hit_descs = descs[0], descs[1:]

            hsps = [hsp for hsp in
                    self._parse_hsp(hit_elem.find('Hit_hsps'),
                        query_id, hit_id)]

            hit = Hit(hsps)
            hit.description = hit_desc
            hit._id_alt = alt_hit_ids
            hit._description_alt = alt_hit_descs
            hit.blast_id = blast_hit_id

            for key, val_info in _ELEM_HIT.items():
                value = hit_elem.findtext(key)
                if value is not None:
                    caster = val_info[1]
                    # recast only if value is not intended to be str
                    if value is not None and caster is not str:
                        value = caster(value)
                    setattr(hit, val_info[0], value)

            # delete element after we finish parsing it
            hit_elem.clear()
            yield hit

Example #2

0

Show file

File: hhsuite2_text.py Project: HuttonICS/biopython

    def _create_qresult(self, hit_blocks):
        """Create the Biopython data structures from the parsed data (PRIVATE)."""
        query_id = self.query_id
        hit_dict = OrderedDict()

        for output_index, block in enumerate(hit_blocks):
            hit_id = block['hit_id']

            frag = HSPFragment(hit_id, query_id)
            frag.alphabet = generic_protein
            frag.query_start = block['query_start'] - 1
            frag.query_end = block['query_end']
            frag.hit_start = block['hit_start'] - 1
            frag.hit_end = block['hit_end']
            frag.hit = block['hit_seq']
            frag.query = block['query_seq']

            hsp = HSP([frag])
            hsp.hit_id = hit_id
            hsp.output_index = output_index
            hsp.query_id = query_id
            hsp.hit_description = block['description']
            is_included = True  # Should everything should be included?
            hsp.is_included = is_included
            hsp.evalue = block['evalue']
            hsp.score = block['score']
            hsp.prob = block['prob']

            if hit_id not in hit_dict:
                hit = Hit([hsp], hit_id)
                hit.description = block['description']
                hit.is_included = is_included
                hit.evalue = block['evalue']
                hit.score = block['score']
                hit_dict[hit_id] = hit
            else:
                hit_dict[hit_id].append(hsp)

        qresult = QueryResult(hit_dict.values(), query_id)
        qresult.program = _PROGRAM
        qresult.seq_len = self.seq_len
        return [qresult]

Example #3

0

Show file

File: hmmer2_text.py Project: DunbrackLab/biopython

 def createHit(self, hsp_list):
     hit = Hit(hsp_list)
     hit.id_ = self.id_
     hit.evalue = self.evalue
     hit.bitscore = self.bitscore
     if self.description:
         hit.description = self.description
     hit.domain_obs_num = self.domain_obs_num
     return hit

Example #4

0

Show file

File: hmmer3_domtab.py Project: davidmam/biopython

    def _parse_qresult(self):
        """Generator function that returns QueryResult objects."""
        # state values, determines what to do for each line
        state_EOF = 0
        state_QRES_NEW = 1
        state_QRES_SAME = 3
        state_HIT_NEW = 2
        state_HIT_SAME = 4
        # dummies for initial states
        qres_state = None
        hit_state = None
        file_state = None
        # dummies for initial id caches
        prev_qid = None
        prev_hid = None
        # dummies for initial parsed value containers
        cur, prev = None, None
        hit_list, hsp_list = [], []

        while True:
            # store previous line's parsed values, for every line after the 1st
            if cur is not None:
                prev = cur
                prev_qid = cur_qid
                prev_hid = cur_hid
            # only parse the line if it's not EOF
            if self.line:
                cur = self._parse_row()
                cur_qid = cur['qresult']['id']
                cur_hid = cur['hit']['id']
            else:
                file_state = state_EOF
                # mock ID values since the line is empty
                cur_qid, cur_hid = None, None

            # get the state of hit and qresult
            if prev_qid != cur_qid:
                qres_state = state_QRES_NEW
            else:
                qres_state = state_QRES_SAME
            # new hits are hits with different ids or hits in a new qresult
            if prev_hid != cur_hid or qres_state == state_QRES_NEW:
                hit_state = state_HIT_NEW
            else:
                hit_state = state_HIT_SAME

            # start creating objects after the first line (i.e. prev is filled)
            if prev is not None:
                # each line is basically an HSP with one HSPFragment
                frag = HSPFragment(prev_hid, prev_qid)
                for attr, value in prev['frag'].items():
                    setattr(frag, attr, value)
                hsp = HSP([frag])
                for attr, value in prev['hsp'].items():
                    setattr(hsp, attr, value)
                hsp_list.append(hsp)

                # create hit object when we've finished parsing all its hsps
                # i.e. when hit state is state_HIT_NEW
                if hit_state == state_HIT_NEW:
                    hit = Hit(hsp_list)
                    for attr, value in prev['hit'].items():
                        setattr(hit, attr, value)
                    hit_list.append(hit)
                    hsp_list = []

                # create qresult and yield if we're at a new qresult or EOF
                if qres_state == state_QRES_NEW or file_state == state_EOF:
                    qresult = QueryResult(prev_qid, hits=hit_list)
                    for attr, value in prev['qresult'].items():
                        setattr(qresult, attr, value)
                    yield qresult
                    # if current line is EOF, break
                    if file_state == state_EOF:
                        break
                    hit_list = []

            self.line = self.handle.readline()

Example #5

0

Show file

    def __iter__(self):
        for rec in self.blast_iter:
            # set attributes to SearchIO's
            # get id and desc
            if rec.query.startswith('>'):
                rec.query = rec.query[1:]
            try:
                qid, qdesc = rec.query.split(' ', 1)
            except ValueError:
                qid, qdesc = rec.query, ''
            qdesc = qdesc.replace('\n', '').replace('\r', '')

            qresult = QueryResult(id=qid)
            qresult.program = rec.application.lower()
            qresult.target = rec.database
            qresult.seq_len = rec.query_letters
            qresult.version = rec.version

            # determine alphabet based on program
            if qresult.program == 'blastn':
                alphabet = generic_dna
            elif qresult.program in ['blastp', 'blastx', 'tblastn', 'tblastx']:
                alphabet = generic_protein

            # iterate over the 'alignments' (hits) and the hit table
            for idx, aln in enumerate(rec.alignments):
                # get id and desc
                if aln.title.startswith('> '):
                    aln.title = aln.title[2:]
                elif aln.title.startswith('>'):
                    aln.title = aln.title[1:]
                try:
                    hid, hdesc = aln.title.split(' ', 1)
                except ValueError:
                    hid, hdesc = aln.title, ''
                hdesc = hdesc.replace('\n', '').replace('\r', '')

                # iterate over the hsps and group them in a list
                hsp_list = []
                for bhsp in aln.hsps:
                    frag = HSPFragment(hid, qid)
                    frag.alphabet = alphabet
                    # set alignment length
                    frag.aln_span = bhsp.identities[1]
                    # set frames
                    try:
                        frag.query_frame = int(bhsp.frame[0])
                    except IndexError:
                        if qresult.program in ('blastp', 'tblastn'):
                            frag.query_frame = 0
                        else:
                            frag.query_frame = 1
                    try:
                        frag.hit_frame = int(bhsp.frame[1])
                    except IndexError:
                        if qresult.program in ('blastp', 'tblastn'):
                            frag.hit_frame = 0
                        else:
                            frag.hit_frame = 1
                    # set query coordinates
                    frag.query_start = min(bhsp.query_start,
                                           bhsp.query_end) - 1
                    frag.query_end = max(bhsp.query_start, bhsp.query_end)
                    # set hit coordinates
                    frag.hit_start = min(bhsp.sbjct_start, bhsp.sbjct_end) - 1
                    frag.hit_end = max(bhsp.sbjct_start, bhsp.sbjct_end)
                    # set query, hit sequences and its annotation
                    qseq = ''
                    hseq = ''
                    midline = ''
                    for seqtrio in zip(bhsp.query, bhsp.sbjct, bhsp.match):
                        qchar, hchar, mchar = seqtrio
                        if qchar == ' ' or hchar == ' ':
                            assert all(' ' == x for x in seqtrio)
                        else:
                            qseq += qchar
                            hseq += hchar
                            midline += mchar
                    frag.query, frag.hit = qseq, hseq
                    frag.aln_annotation['similarity'] = midline

                    # create HSP object with the fragment
                    hsp = HSP([frag])
                    hsp.evalue = bhsp.expect
                    hsp.bitscore = bhsp.bits
                    hsp.bitscore_raw = bhsp.score
                    # set gap
                    try:
                        hsp.gap_num = bhsp.gaps[0]
                    except IndexError:
                        hsp.gap_num = 0
                    # set identity
                    hsp.ident_num = bhsp.identities[0]
                    hsp.pos_num = bhsp.positives[0]
                    if hsp.pos_num is None:
                        hsp.pos_num = hsp[0].aln_span

                    hsp_list.append(hsp)

                hit = Hit(hsp_list)
                hit.seq_len = aln.length
                hit.description = hdesc
                qresult.append(hit)

            qresult.description = qdesc
            yield qresult

Example #6

0

Show file

File: test_loaders_similarity.py Project: lmb-embrapa/machado

    def test_store_bio_searchio_blast_record(self):
        """Run Tests - __init__ and store_searchio_blast_record."""
        null_db, created = Db.objects.get_or_create(name="null")
        null_cv, created = Cv.objects.get_or_create(name="null")
        null_dbxref, created = Dbxref.objects.get_or_create(accession="null",
                                                            db=null_db)
        null_cvterm, created = Cvterm.objects.get_or_create(
            name="null",
            cv=null_cv,
            dbxref=null_dbxref,
            is_obsolete=0,
            is_relationshiptype=0,
        )
        null_pub, created = Pub.objects.get_or_create(uniquename="null",
                                                      type=null_cvterm,
                                                      is_obsolete=False)

        test_organism = Organism.objects.create(genus="Mus",
                                                species="musculus")
        test_organism2, created = Organism.objects.get_or_create(
            abbreviation="multispecies",
            genus="multispecies",
            species="multispecies",
            common_name="multispecies",
        )
        # creating test SO term
        test_db = Db.objects.create(name="SO")
        test_cv = Cv.objects.create(name="sequence")
        test_db2 = Db.objects.create(name="RO")
        test_cv2 = Cv.objects.create(name="relationship")
        test_dbxref = Dbxref.objects.create(accession="123456", db=test_db)
        test_dbxref2 = Dbxref.objects.create(accession="7890", db=test_db)
        test_aa_term = Cvterm.objects.create(
            name="polypeptide",
            cv=test_cv,
            dbxref=test_dbxref,
            is_obsolete=0,
            is_relationshiptype=0,
        )
        test_aa_term2 = Cvterm.objects.create(
            name="protein_match",
            cv=test_cv,
            dbxref=test_dbxref2,
            is_obsolete=0,
            is_relationshiptype=0,
        )
        test_dbxref3 = Dbxref.objects.create(accession="1234567", db=test_db)
        Cvterm.objects.create(
            name="match_part",
            cv=test_cv,
            dbxref=test_dbxref3,
            is_obsolete=0,
            is_relationshiptype=0,
        )
        test_dbxref4 = Dbxref.objects.create(accession="12345678", db=test_db2)
        Cvterm.objects.create(
            name="contained in",
            cv=test_cv2,
            dbxref=test_dbxref4,
            is_obsolete=0,
            is_relationshiptype=1,
        )
        test_dbxref5 = Dbxref.objects.create(accession="12345679", db=test_db2)
        Cvterm.objects.create(
            name="in similarity relationship with",
            cv=test_cv2,
            dbxref=test_dbxref5,
            is_obsolete=0,
            is_relationshiptype=1,
        )
        test_dbxref6 = Dbxref.objects.create(accession="22345679", db=test_db2)
        cvterm_translation = Cvterm.objects.create(
            name="translation_of",
            cv=test_cv,
            dbxref=test_dbxref6,
            is_obsolete=0,
            is_relationshiptype=1,
        )
        test_dbxref7 = Dbxref.objects.create(accession="223456", db=test_db)
        test_mrna_term = Cvterm.objects.create(
            name="mRNA",
            cv=test_cv,
            dbxref=test_dbxref7,
            is_obsolete=0,
            is_relationshiptype=0,
        )

        test_db_pfam = Db.objects.create(name="PFAM")
        test_cv_pfam = Cv.objects.create(name="PFAM")
        test_dbxref_pfam_term = Dbxref.objects.create(accession="123",
                                                      db=test_db_pfam)
        test_cvterm_pfam_term = Cvterm.objects.create(
            name="kinase",
            cv=test_cv_pfam,
            dbxref=test_dbxref_pfam_term,
            is_obsolete=0,
            is_relationshiptype=0,
        )

        # creating test features
        feature_db = Db.objects.create(name="FASTA_SOURCE")
        feature_dbxref1 = Dbxref.objects.create(db=feature_db,
                                                accession="feat1")
        feature_dbxref2 = Dbxref.objects.create(db=feature_db,
                                                accession="feat2")
        feature_dbxref3 = Dbxref.objects.create(db=feature_db,
                                                accession="feat3")
        feature_dbxref4 = Dbxref.objects.create(db=feature_db,
                                                accession="feat4")
        feature_dbxref5 = Dbxref.objects.create(db=feature_db,
                                                accession="feat5")
        feature_dbxref1m = Dbxref.objects.create(db=feature_db,
                                                 accession="feat1m")
        feature_dbxref2m = Dbxref.objects.create(db=feature_db,
                                                 accession="feat2m")
        feature_dbxref3m = Dbxref.objects.create(db=feature_db,
                                                 accession="feat3m")
        feature_dbxref4m = Dbxref.objects.create(db=feature_db,
                                                 accession="feat4m")
        feature_dbxref5m = Dbxref.objects.create(db=feature_db,
                                                 accession="feat5m")
        f1 = Feature.objects.create(
            organism=test_organism,
            uniquename="feat1",
            is_analysis=False,
            type_id=test_aa_term.cvterm_id,
            is_obsolete=False,
            dbxref=feature_dbxref1,
            timeaccessioned=datetime.now(),
            timelastmodified=datetime.now(),
        )
        f2 = Feature.objects.create(
            organism=test_organism2,
            uniquename="feat2",
            is_analysis=False,
            type_id=test_aa_term2.cvterm_id,
            is_obsolete=False,
            dbxref=feature_dbxref2,
            timeaccessioned=datetime.now(),
            timelastmodified=datetime.now(),
        )
        f3 = Feature.objects.create(
            organism=test_organism2,
            uniquename="feat3",
            is_analysis=False,
            type_id=test_aa_term2.cvterm_id,
            is_obsolete=False,
            dbxref=feature_dbxref3,
            timeaccessioned=datetime.now(),
            timelastmodified=datetime.now(),
        )
        f4 = Feature.objects.create(
            organism=test_organism,
            uniquename="feat4",
            is_analysis=False,
            type_id=test_aa_term.cvterm_id,
            is_obsolete=False,
            dbxref=feature_dbxref4,
            timeaccessioned=datetime.now(),
            timelastmodified=datetime.now(),
        )
        f5 = Feature.objects.create(
            organism=test_organism2,
            uniquename="feat5",
            is_analysis=False,
            type_id=test_aa_term2.cvterm_id,
            is_obsolete=False,
            dbxref=feature_dbxref5,
            timeaccessioned=datetime.now(),
            timelastmodified=datetime.now(),
        )
        f1m = Feature.objects.create(
            organism=test_organism,
            uniquename="feat1m",
            is_analysis=False,
            type=test_mrna_term,
            is_obsolete=False,
            dbxref=feature_dbxref1m,
            timeaccessioned=datetime.now(),
            timelastmodified=datetime.now(),
        )
        f2m = Feature.objects.create(
            organism=test_organism2,
            uniquename="feat2m",
            is_analysis=False,
            type=test_mrna_term,
            is_obsolete=False,
            dbxref=feature_dbxref2m,
            timeaccessioned=datetime.now(),
            timelastmodified=datetime.now(),
        )
        f3m = Feature.objects.create(
            organism=test_organism2,
            uniquename="feat3m",
            is_analysis=False,
            type=test_mrna_term,
            is_obsolete=False,
            dbxref=feature_dbxref3m,
            timeaccessioned=datetime.now(),
            timelastmodified=datetime.now(),
        )
        f4m = Feature.objects.create(
            organism=test_organism,
            uniquename="feat4m",
            is_analysis=False,
            type=test_mrna_term,
            is_obsolete=False,
            dbxref=feature_dbxref4m,
            timeaccessioned=datetime.now(),
            timelastmodified=datetime.now(),
        )
        f5m = Feature.objects.create(
            organism=test_organism2,
            uniquename="feat5m",
            is_analysis=False,
            type=test_mrna_term,
            is_obsolete=False,
            dbxref=feature_dbxref5m,
            timeaccessioned=datetime.now(),
            timelastmodified=datetime.now(),
        )
        FeatureRelationship.objects.create(subject=f1m,
                                           object=f1,
                                           type=cvterm_translation,
                                           rank=0)
        FeatureRelationship.objects.create(subject=f2m,
                                           object=f2,
                                           type=cvterm_translation,
                                           rank=0)
        FeatureRelationship.objects.create(subject=f3m,
                                           object=f3,
                                           type=cvterm_translation,
                                           rank=0)
        FeatureRelationship.objects.create(subject=f4m,
                                           object=f4,
                                           type=cvterm_translation,
                                           rank=0)
        FeatureRelationship.objects.create(subject=f5m,
                                           object=f5,
                                           type=cvterm_translation,
                                           rank=0)
        FeatureCvterm.objects.create(feature=f3,
                                     cvterm=test_cvterm_pfam_term,
                                     pub=null_pub,
                                     is_not=False,
                                     rank=0)

        test_HSPFragment1 = HSPFragment("feat1", "feat2")
        setattr(test_HSPFragment1, "query_start", 110)
        setattr(test_HSPFragment1, "query_end", 1100)
        setattr(test_HSPFragment1, "aln_span", 990)
        setattr(test_HSPFragment1, "hit_start", 100)
        setattr(test_HSPFragment1, "hit_end", 1000)

        test_HSP1 = HSP([test_HSPFragment1])
        setattr(test_HSP1, "query_id", "feat1")
        setattr(test_HSP1, "hit_id", "feat2")
        setattr(test_HSP1, "bitscore", 1234.0)
        setattr(test_HSP1, "bitscore_raw", 1234)
        setattr(test_HSP1, "evalue", 0.0)
        setattr(test_HSP1, "ident_num", 82)

        test_HIT1 = Hit([test_HSP1])
        setattr(test_HIT1, "accession", "5050")
        setattr(test_HIT1, "seq_len", 2000)

        test_HSPFragment2 = HSPFragment("feat1", "feat3")
        setattr(test_HSPFragment2, "query_start", 210)
        setattr(test_HSPFragment2, "query_end", 2100)
        setattr(test_HSPFragment2, "aln_span", 1890)
        setattr(test_HSPFragment2, "hit_start", 200)
        setattr(test_HSPFragment2, "hit_end", 2000)

        test_HSP2 = HSP([test_HSPFragment2])
        setattr(test_HSP2, "query_id", "feat1")
        setattr(test_HSP2, "hit_id", "feat3")
        setattr(test_HSP2, "bitscore", 234.0)
        setattr(test_HSP2, "bitscore_raw", 234)
        setattr(test_HSP2, "evalue", 0.0)
        setattr(test_HSP2, "ident_num", 72)

        test_HIT2 = Hit([test_HSP2])
        setattr(test_HIT2, "accession", "500")
        setattr(test_HIT2, "seq_len", 4000)

        test_result1 = QueryResult([test_HIT1, test_HIT2], "feat1")
        setattr(test_result1, "seq_len", 3000)
        setattr(test_result1, "blast_id", "feat1")

        # test retrieve_query_from_hsp and retrieve_subject_from_hsp
        # test hsp with no bitscore, bitscore_raw, evalue, and ident_num
        test_HSPFragment3 = HSPFragment("feat4_desc", "feat5_desc")
        setattr(test_HSPFragment3, "query_start", 210)
        setattr(test_HSPFragment3, "query_end", 2100)
        setattr(test_HSPFragment3, "aln_span", 1890)
        setattr(test_HSPFragment3, "hit_start", 200)
        setattr(test_HSPFragment3, "hit_end", 2000)

        test_HSP3 = HSP([test_HSPFragment3])
        setattr(test_HSP3, "query_id", "feat4_desc")
        setattr(test_HSP3, "query_description", "test id=feat4")
        setattr(test_HSP3, "hit_id", "feat5_desc")
        setattr(test_HSP3, "hit_description", "test id=feat5")

        test_HIT3 = Hit([test_HSP3])
        setattr(test_HIT3, "seq_len", 4000)

        test_result2 = QueryResult([test_HIT3], "feat4_desc")
        setattr(test_result2, "seq_len", 3000)
        setattr(test_result2, "blast_id", "feat4_desc")

        # test SimilarityLoader fail
        with self.assertRaises(ImportingError):
            SimilarityLoader(
                filename="similarity.file",
                algorithm="smith-waterman",
                description="command-line example",
                program="blastp",
                input_format="blast-xml",
                programversion="2.2.31+",
                so_query="polypeptide",
                so_subject="protein_match",
                org_query="H**o sapiens",
                org_subject="multispecies multispecies",
            )

        test_blast_file = SimilarityLoader(
            filename="similarity.file",
            algorithm="smith-waterman",
            description="command-line example",
            program="interproscan",
            input_format="interproscan-xml",
            programversion="5",
            so_query="polypeptide",
            so_subject="protein_match",
            org_query="Mus musculus",
            org_subject="multispecies multispecies",
        )

        test_blast_file.store_bio_searchio_query_result(test_result1)
        test_blast_file.store_bio_searchio_query_result(test_result2)

        test_analysis = Analysis.objects.get(sourcename="similarity.file")
        self.assertEqual("interproscan", test_analysis.program)

        test_featureloc = Featureloc.objects.get(srcfeature=f3)

        test_analysisfeature = Analysisfeature.objects.get(
            analysis=test_analysis, feature_id=test_featureloc.feature_id)
        self.assertEqual(234.0, test_analysisfeature.rawscore)
        # test remove_feature
        self.assertTrue(
            Analysis.objects.filter(sourcename="similarity.file").exists())
        call_command("remove_analysis", "--name=similarity.file",
                     "--verbosity=0")
        self.assertFalse(
            Analysis.objects.filter(sourcename="similarity.file").exists())

Example #7

0

Show file

File: hmmer3_text.py Project: GJOHNSON2003/biopython

    def _create_hits(self, hit_attrs, qid, qdesc):
        """Parses a HMMER3 hsp block, beginning with the hsp table."""
        # read through until the beginning of the hsp block
        self._read_until(lambda line: line.startswith('Internal pipeline')
                         or line.startswith('>>'))

        # start parsing the hsp block
        hit_list = []
        while True:
            if self.line.startswith('Internal pipeline'):
                # by this time we should've emptied the hit attr list
                assert len(hit_attrs) == 0
                return hit_list
            assert self.line.startswith('>>')
            hid, hdesc = self.line[len('>> '):].split('  ', 1)
            hdesc = hdesc.strip()

            # read through the hsp table header and move one more line
            self._read_until(lambda line:
                    line.startswith(' ---   ------ ----- --------') or
                    line.startswith('   [No individual domains'))
            self.line = read_forward(self.handle)

            # parse the hsp table for the current hit
            hsp_list = []
            while True:
                # break out of hsp parsing if there are no hits, it's the last hsp
                # or it's the start of a new hit
                if self.line.startswith('   [No targets detected that satisfy') or \
                   self.line.startswith('   [No individual domains') or \
                   self.line.startswith('Internal pipeline statistics summary:') or \
                   self.line.startswith('  Alignments for each domain:') or \
                   self.line.startswith('>>'):

                    hit_attr = hit_attrs.pop(0)
                    hit = Hit(hsp_list)
                    for attr, value in hit_attr.items():
                        if attr == "description":
                            cur_val = getattr(hit, attr)
                            if cur_val and value and cur_val.startswith(value):
                                continue
                        setattr(hit, attr, value)
                    if not hit:
                        hit.query_description = qdesc
                    hit_list.append(hit)
                    break

                parsed = [x for x in self.line.strip().split(' ') if x]
                assert len(parsed) == 16
                # parsed column order:
                # index, is_included, bitscore, bias, evalue_cond, evalue
                # hmmfrom, hmmto, query_ends, hit_ends, alifrom, alito,
                # envfrom, envto, acc_avg
                frag = HSPFragment(hid, qid)
                # set query and hit descriptions if they are defined / nonempty string
                if qdesc:
                    frag.query_description = qdesc
                if hdesc:
                    frag.hit_description = hdesc
                # HMMER3 alphabets are always protein alphabets
                frag.alphabet = generic_protein
                # depending on whether the program is hmmsearch, hmmscan, or phmmer
                # {hmm,ali}{from,to} can either be hit_{from,to} or query_{from,to}
                # for hmmscan, hit is the hmm profile, query is the sequence
                if self._meta.get('program') == 'hmmscan':
                    # adjust 'from' and 'to' coordinates to 0-based ones
                    frag.hit_start = int(parsed[6]) - 1
                    frag.hit_end = int(parsed[7])
                    frag.query_start = int(parsed[9]) - 1
                    frag.query_end = int(parsed[10])
                elif self._meta.get('program') in ['hmmsearch', 'phmmer']:
                    # adjust 'from' and 'to' coordinates to 0-based ones
                    frag.hit_start = int(parsed[9]) - 1
                    frag.hit_end = int(parsed[10])
                    frag.query_start = int(parsed[6]) - 1
                    frag.query_end = int(parsed[7])
                # strand is always 0, since HMMER now only handles protein
                frag.hit_strand = frag.query_strand = 0

                hsp = HSP([frag])
                hsp.domain_index = int(parsed[0])
                hsp.is_included = parsed[1] == '!'
                hsp.bitscore = float(parsed[2])
                hsp.bias = float(parsed[3])
                hsp.evalue_cond = float(parsed[4])
                hsp.evalue = float(parsed[5])
                if self._meta.get('program') == 'hmmscan':
                    # adjust 'from' and 'to' coordinates to 0-based ones
                    hsp.hit_endtype = parsed[8]
                    hsp.query_endtype = parsed[11]
                elif self._meta.get('program') in ['hmmsearch', 'phmmer']:
                    # adjust 'from' and 'to' coordinates to 0-based ones
                    hsp.hit_endtype = parsed[11]
                    hsp.query_endtype = parsed[8]
                # adjust 'from' and 'to' coordinates to 0-based ones
                hsp.env_start = int(parsed[12]) - 1
                hsp.env_end = int(parsed[13])
                hsp.env_endtype = parsed[14]
                hsp.acc_avg = float(parsed[15])

                hsp_list.append(hsp)
                self.line = read_forward(self.handle)

            # parse the hsp alignments
            if self.line.startswith('  Alignments for each domain:'):
                self._parse_aln_block(hid, hit.hsps)

Example #8

0

Show file

File: blast_xml.py Project: Rapternmn/biopython

    def _parse_hit(self, root_hit_elem, query_id):
        """Generator that transforms Iteration_hits XML elements into Hit objects.

        :param root_hit_elem: root element of the Iteration_hits tag.
        :type root_hit_elem: XML element tag
        :param query_id: QueryResult ID of this Hit
        :type query_id: string

        """
        # Hit level processing
        # Hits are stored in the Iteration_hits tag, with the following
        # DTD
        # <!ELEMENT Hit (
        #        Hit_num,
        #        Hit_id,
        #        Hit_def,
        #        Hit_accession,
        #        Hit_len,
        #        Hit_hsps?)>

        # feed the loop below an empty list so iteration still works
        if root_hit_elem is None:
            root_hit_elem = []

        for hit_elem in root_hit_elem:

            # create empty hit object
            hit_id = hit_elem.findtext('Hit_id')
            hit_desc = hit_elem.findtext('Hit_def')
            # handle blast searches against databases with Blast's IDs
            if hit_id.startswith('gnl|BL_ORD_ID|'):
                blast_hit_id = hit_id
                id_desc = hit_desc.split(' ', 1)
                hit_id = id_desc[0]
                try:
                    hit_desc = id_desc[1]
                except IndexError:
                    hit_desc = ''
            else:
                blast_hit_id = ''

            # combine primary ID and defline first before splitting
            full_id_desc = hit_id + ' ' + hit_desc
            id_descs = [(x.strip(), y.strip()) for x, y in \
                    [a.split(' ', 1) for a in full_id_desc.split(' >')]]
            hit_id, hit_desc = id_descs[0] 

            hsps = [hsp for hsp in
                    self._parse_hsp(hit_elem.find('Hit_hsps'),
                        query_id, hit_id)]

            hit = Hit(hsps)
            hit.description = hit_desc
            hit._id_alt = [x[0] for x in id_descs[1:]]
            hit._description_alt = [x[1] for x in id_descs[1:]]
            # blast_hit_id is only set if the hit ID is Blast-generated
            hit._blast_id = blast_hit_id

            for key, val_info in _ELEM_HIT.items():
                value = hit_elem.findtext(key)
                if value is not None:
                    caster = val_info[1]
                    # recast only if value is not intended to be str
                    if value is not None and caster is not str:
                        value = caster(value)
                    setattr(hit, val_info[0], value)

            # delete element after we finish parsing it
            hit_elem.clear()
            yield hit

Example #9

0

Show file

File: FastaIO.py Project: znruss/biopython

    def _parse_hit(self, query_id):
        while True:
            self.line = self.handle.readline()
            if self.line.startswith('>>'):
                break

        state = _STATE_NONE
        strand = None
        hsp_list = []
        hsp = None
        parsed_hsp = None
        hit_desc = None
        seq_len = None
        while True:
            peekline = self.handle.peekline()
            # yield hit if we've reached the start of a new query or
            # the end of the search
            if peekline.strip() in [">>><<<", ">>>///"] or \
                    (not peekline.startswith('>>>') and '>>>' in peekline):
                # append last parsed_hsp['hit']['seq'] line
                if state == _STATE_HIT_BLOCK:
                    parsed_hsp['hit']['seq'] += self.line.strip()
                elif state == _STATE_CONS_BLOCK:
                    hsp.aln_annotation['similarity'] += \
                            self.line.strip('\r\n')
                # process HSP alignment and coordinates
                _set_hsp_seqs(hsp, parsed_hsp, self._preamble['program'])
                hit = Hit(hsp_list)
                hit.description = hit_desc
                hit.seq_len = seq_len
                yield hit, strand
                hsp_list = []
                break
            # yield hit and create a new one if we're still in the same query
            elif self.line.startswith('>>'):
                # try yielding,  if we have hsps
                if hsp_list:
                    _set_hsp_seqs(hsp, parsed_hsp, self._preamble['program'])
                    hit = Hit(hsp_list)
                    hit.description = hit_desc
                    hit.seq_len = seq_len
                    yield hit, strand
                    hsp_list = []
                # try to get the hit id and desc, and handle cases without descs
                try:
                    hit_id, hit_desc = self.line[2:].strip().split(' ', 1)
                except ValueError:
                    hit_id = self.line[2:].strip().split(' ', 1)[0]
                    hit_desc = ''
                # create the HSP object for Hit
                frag = HSPFragment(hit_id, query_id)
                hsp = HSP([frag])
                hsp_list.append(hsp)
                # set or reset the state to none
                state = _STATE_NONE
                parsed_hsp = {'query': {}, 'hit': {}}
            # create and append a new HSP if line starts with '>--'
            elif self.line.startswith('>--'):
                # set seq attributes of previous hsp
                _set_hsp_seqs(hsp, parsed_hsp, self._preamble['program'])
                # and create a new one
                frag = HSPFragment(hit_id, query_id)
                hsp = HSP([frag])
                hsp_list.append(hsp)
                # set the state ~ none yet
                state = _STATE_NONE
                parsed_hsp = {'query': {}, 'hit': {}}
            # this is either query or hit data in the HSP, depending on the state
            elif self.line.startswith('>'):
                if state == _STATE_NONE:
                    # make sure it's the correct query
                    if not query_id.startswith(self.line[1:].split(' ')[0]):
                        raise ValueError("%r vs %r" % (query_id, self.line))
                    state = _STATE_QUERY_BLOCK
                    parsed_hsp['query']['seq'] = ''
                elif state == _STATE_QUERY_BLOCK:
                    # make sure it's the correct hit
                    assert hit_id.startswith(self.line[1:].split(' ')[0])
                    state = _STATE_HIT_BLOCK
                    parsed_hsp['hit']['seq'] = ''
            # check for conservation block
            elif self.line.startswith('; al_cons'):
                state = _STATE_CONS_BLOCK
                hsp.fragment.aln_annotation['similarity'] = ''
            elif self.line.startswith(';'):
                # Fasta outputs do not make a clear distinction between Hit
                # and HSPs, so we check the attribute names to determine
                # whether it belongs to a Hit or HSP
                regx = re.search(_RE_ATTR, self.line.strip())
                name = regx.group(1)
                value = regx.group(2)

                # for values before the '>...' query block
                if state == _STATE_NONE:
                    if name in _HSP_ATTR_MAP:
                        attr_name, caster = _HSP_ATTR_MAP[name]
                        if caster is not str:
                            value = caster(value)
                        if name in ['_ident', '_sim']:
                            value *= 100
                        setattr(hsp, attr_name, value)
                # otherwise, pool the values for processing later
                elif state == _STATE_QUERY_BLOCK:
                    parsed_hsp['query'][name] = value
                elif state == _STATE_HIT_BLOCK:
                    if name == '_len':
                        seq_len = int(value)
                    else:
                        parsed_hsp['hit'][name] = value
                # for values in the hit block
                else:
                    raise ValueError("Unexpected line: %r" % self.line)
            # otherwise, it must be lines containing the sequences
            else:
                assert '>' not in self.line
                # if we're in hit, parse into hsp.hit
                if state == _STATE_HIT_BLOCK:
                    parsed_hsp['hit']['seq'] += self.line.strip()
                elif state == _STATE_QUERY_BLOCK:
                    parsed_hsp['query']['seq'] += self.line.strip()
                elif state == _STATE_CONS_BLOCK:
                    hsp.fragment.aln_annotation['similarity'] += \
                            self.line.strip('\r\n')
                # we should not get here!
                else:
                    raise ValueError("Unexpected line: %r" % self.line)

            self.line = self.handle.readline()

Example #10

0

Show file

    def store_bio_searchio_hit(self, searchio_hit: Hit, target: str) -> None:
        """Store bio searchio hit."""
        organism_obj, created = Organism.objects.get_or_create(
            abbreviation="multispecies",
            genus="multispecies",
            species="multispecies",
            common_name="multispecies",
        )

        if not hasattr(searchio_hit, "accession"):
            searchio_hit.accession = None

        # if interproscan-xml parsing, get db name from Hit.attributes.
        if target == "InterPro":
            db_name = searchio_hit.attributes["Target"].upper()
            # prevents the creation of multiple databases for SIGNALP
            if db_name.startswith("SIGNALP"):
                db_name = "SIGNALP"
            db, created = Db.objects.get_or_create(name=db_name)
        # if blast-xml parsing, db name is self.db ("BLAST_source")
        else:
            db = self.db

        dbxref, created = Dbxref.objects.get_or_create(
            db=db, accession=searchio_hit.id)
        feature, created = Feature.objects.get_or_create(
            organism=organism_obj,
            uniquename=searchio_hit.id,
            type_id=self.so_term_protein_match.cvterm_id,
            name=searchio_hit.accession,
            dbxref=dbxref,
            defaults={
                "is_analysis": False,
                "is_obsolete": False,
                "timeaccessioned": datetime.now(timezone.utc),
                "timelastmodified": datetime.now(timezone.utc),
            },
        )

        for aux_dbxref in searchio_hit.dbxrefs:
            aux_db, aux_term = aux_dbxref.split(":", 1)
            if aux_db == "GO":
                try:
                    term_db = Db.objects.get(name=aux_db.upper())
                    dbxref = Dbxref.objects.get(db=term_db, accession=aux_term)
                    cvterm = Cvterm.objects.get(dbxref=dbxref)
                    FeatureCvterm.objects.get_or_create(
                        feature=feature,
                        cvterm=cvterm,
                        pub=self.pub,
                        is_not=False,
                        rank=0,
                    )
                except ObjectDoesNotExist:
                    self.ignored_goterms.add(aux_dbxref)
            else:
                term_db, created = Db.objects.get_or_create(
                    name=aux_db.upper())
                dbxref, created = Dbxref.objects.get_or_create(
                    db=term_db, accession=aux_term)
                FeatureDbxref.objects.get_or_create(feature=feature,
                                                    dbxref=dbxref,
                                                    is_current=1)

        return None

Example #11

0

Show file

File: blast_xml.py Project: troyxmccall/biopython

    def _parse_hit(self, root_hit_elem, query_id):
        """Generator that transforms Iteration_hits XML elements into Hit objects.

        Arguments:
        root_hit_elem -- Element object of the Iteration_hits tag.
        query_id -- String of QueryResult ID of this Hit

        """
        # Hit level processing
        # Hits are stored in the Iteration_hits tag, with the following
        # DTD
        # <!ELEMENT Hit (
        #        Hit_num,
        #        Hit_id,
        #        Hit_def,
        #        Hit_accession,
        #        Hit_len,
        #        Hit_hsps?)>

        # feed the loop below an empty list so iteration still works
        if root_hit_elem is None:
            root_hit_elem = []

        for hit_elem in root_hit_elem:

            # create empty hit object
            hit_id = hit_elem.findtext('Hit_id')
            hit_desc = hit_elem.findtext('Hit_def')
            # handle blast searches against databases with Blast's IDs
            if hit_id.startswith('gnl|BL_ORD_ID|'):
                blast_hit_id = hit_id
                id_desc = hit_desc.split(' ', 1)
                hit_id = id_desc[0]
                try:
                    hit_desc = id_desc[1]
                except IndexError:
                    hit_desc = ''
            else:
                blast_hit_id = ''

            hsps = [hsp for hsp in
                    self._parse_hsp(hit_elem.find('Hit_hsps'),
                        query_id, hit_id)]

            hit = Hit(hsps)
            hit.description = hit_desc
            # blast_hit_id is only set if the hit ID is Blast-generated
            hit._blast_id = blast_hit_id

            for key, val_info in _ELEM_HIT.items():
                value = hit_elem.findtext(key)
                if value is not None:
                    caster = val_info[1]
                    # recast only if value is not intended to be str
                    if value is not None and caster is not str:
                        value = caster(value)
                    setattr(hit, val_info[0], value)

            # delete element after we finish parsing it
            hit_elem.clear()
            yield hit

Example #12

0

Show file

File: blast_text.py Project: ruda830/biovoice

    def __iter__(self):
        """Iterate over BlastTextParser, yields query results."""
        for rec in self.blast_iter:
            # set attributes to SearchIO's
            # get id and desc
            if rec.query.startswith(">"):
                rec.query = rec.query[1:]
            try:
                qid, qdesc = rec.query.split(" ", 1)
            except ValueError:
                qid, qdesc = rec.query, ""
            qdesc = qdesc.replace("\n", "").replace("\r", "")

            qresult = QueryResult(id=qid)
            qresult.program = rec.application.lower()
            qresult.target = rec.database
            qresult.seq_len = rec.query_letters
            qresult.version = rec.version

            # determine molecule_type based on program
            if qresult.program == "blastn":
                molecule_type = "DNA"
            elif qresult.program in ["blastp", "blastx", "tblastn", "tblastx"]:
                molecule_type = "protein"

            # iterate over the 'alignments' (hits) and the hit table
            for idx, aln in enumerate(rec.alignments):
                # get id and desc
                if aln.title.startswith("> "):
                    aln.title = aln.title[2:]
                elif aln.title.startswith(">"):
                    aln.title = aln.title[1:]
                try:
                    hid, hdesc = aln.title.split(" ", 1)
                except ValueError:
                    hid, hdesc = aln.title, ""
                hdesc = hdesc.replace("\n", "").replace("\r", "")

                # iterate over the hsps and group them in a list
                hsp_list = []
                for bhsp in aln.hsps:
                    frag = HSPFragment(hid, qid)
                    frag.molecule_type = molecule_type
                    # set alignment length
                    frag.aln_span = bhsp.identities[1]
                    # set frames
                    try:
                        frag.query_frame = int(bhsp.frame[0])
                    except IndexError:
                        if qresult.program in ("blastp", "tblastn"):
                            frag.query_frame = 0
                        else:
                            frag.query_frame = 1
                    try:
                        frag.hit_frame = int(bhsp.frame[1])
                    except IndexError:
                        if qresult.program in ("blastp", "tblastn"):
                            frag.hit_frame = 0
                        else:
                            frag.hit_frame = 1
                    # set query coordinates
                    frag.query_start = min(bhsp.query_start, bhsp.query_end) - 1
                    frag.query_end = max(bhsp.query_start, bhsp.query_end)
                    # set hit coordinates
                    frag.hit_start = min(bhsp.sbjct_start, bhsp.sbjct_end) - 1
                    frag.hit_end = max(bhsp.sbjct_start, bhsp.sbjct_end)
                    # set query, hit sequences and its annotation
                    qseq = ""
                    hseq = ""
                    midline = ""
                    for seqtrio in zip(bhsp.query, bhsp.sbjct, bhsp.match):
                        qchar, hchar, mchar = seqtrio
                        if qchar == " " or hchar == " ":
                            assert all(" " == x for x in seqtrio)
                        else:
                            qseq += qchar
                            hseq += hchar
                            midline += mchar
                    frag.query, frag.hit = qseq, hseq
                    frag.aln_annotation["similarity"] = midline

                    # create HSP object with the fragment
                    hsp = HSP([frag])
                    hsp.evalue = bhsp.expect
                    hsp.bitscore = bhsp.bits
                    hsp.bitscore_raw = bhsp.score
                    # set gap
                    try:
                        hsp.gap_num = bhsp.gaps[0]
                    except IndexError:
                        hsp.gap_num = 0
                    # set identity
                    hsp.ident_num = bhsp.identities[0]
                    hsp.pos_num = bhsp.positives[0]
                    if hsp.pos_num is None:
                        hsp.pos_num = hsp[0].aln_span

                    hsp_list.append(hsp)

                hit = Hit(hsp_list)
                hit.seq_len = aln.length
                hit.description = hdesc
                qresult.append(hit)

            qresult.description = qdesc
            yield qresult

Example #13

0

Show file

File: blast_xml.py Project: sjl421/Example-Codes-and-Learning-Tools

    def _parse_hit(self, root_hit_elem, query_id):
        """Generator that transforms Iteration_hits XML elements into Hit objects.

        :param root_hit_elem: root element of the Iteration_hits tag.
        :type root_hit_elem: XML element tag
        :param query_id: QueryResult ID of this Hit
        :type query_id: string

        """
        # Hit level processing
        # Hits are stored in the Iteration_hits tag, with the following
        # DTD
        # <!ELEMENT Hit (
        #        Hit_num,
        #        Hit_id,
        #        Hit_def,
        #        Hit_accession,
        #        Hit_len,
        #        Hit_hsps?)>

        # feed the loop below an empty list so iteration still works
        if root_hit_elem is None:
            root_hit_elem = []

        for hit_elem in root_hit_elem:

            # create empty hit object
            hit_id = hit_elem.findtext('Hit_id')
            hit_desc = hit_elem.findtext('Hit_def')
            # handle blast searches against databases with Blast's IDs
            if hit_id.startswith('gnl|BL_ORD_ID|'):
                blast_hit_id = hit_id
                id_desc = hit_desc.split(' ', 1)
                hit_id = id_desc[0]
                try:
                    hit_desc = id_desc[1]
                except IndexError:
                    hit_desc = ''
            else:
                blast_hit_id = ''

            # combine primary ID and defline first before splitting
            full_id_desc = hit_id + ' ' + hit_desc
            id_descs = [(x.strip(), y.strip()) for x, y in \
                    [a.split(' ', 1) for a in full_id_desc.split(' >')]]
            hit_id, hit_desc = id_descs[0]

            hsps = [
                hsp for hsp in self._parse_hsp(hit_elem.find('Hit_hsps'),
                                               query_id, hit_id)
            ]

            hit = Hit(hsps)
            hit.description = hit_desc
            hit._id_alt = [x[0] for x in id_descs[1:]]
            hit._description_alt = [x[1] for x in id_descs[1:]]
            # blast_hit_id is only set if the hit ID is Blast-generated
            hit._blast_id = blast_hit_id

            for key, val_info in _ELEM_HIT.items():
                value = hit_elem.findtext(key)
                if value is not None:
                    caster = val_info[1]
                    # recast only if value is not intended to be str
                    if value is not None and caster is not str:
                        value = caster(value)
                    setattr(hit, val_info[0], value)

            # delete element after we finish parsing it
            hit_elem.clear()
            yield hit

Example #14

0

Show file

    def test_store_bio_searchio_hit(self):
        """Tests - store bio searchio hit."""
        # create RO term: contained in
        test_db = Db.objects.create(name="RO")
        test_dbxref = Dbxref.objects.create(accession="00002", db=test_db)
        test_cv = Cv.objects.create(name="relationship")
        Cvterm.objects.create(
            name="contained in",
            cv=test_cv,
            dbxref=test_dbxref,
            is_obsolete=0,
            is_relationshiptype=0,
        )

        # create SO terms: protein_match
        test_cv = Cv.objects.create(name="sequence")
        test_db = Db.objects.create(name="SO")
        test_dbxref = Dbxref.objects.create(accession="00001", db=test_db)
        Cvterm.objects.create(
            name="protein_match",
            cv=test_cv,
            dbxref=test_dbxref,
            is_obsolete=0,
            is_relationshiptype=0,
        )
        test_dbxref = Dbxref.objects.create(accession="00002", db=test_db)
        Cvterm.objects.create(
            name="polypeptide",
            cv=test_cv,
            dbxref=test_dbxref,
            is_obsolete=0,
            is_relationshiptype=0,
        )
        # create GO term
        test_db = Db.objects.create(name="GO")
        test_dbxref = Dbxref.objects.create(accession="1234", db=test_db)
        test_cv = Cv.objects.create(name="biological_process")
        Cvterm.objects.create(
            name="GO:1234",
            cv=test_cv,
            dbxref=test_dbxref,
            is_obsolete=0,
            is_relationshiptype=0,
        )

        # create a bio searchio hit
        test_searchio_hit = Hit()
        test_searchio_hit.id = "PF1234"
        test_searchio_hit.accession = "PFAM mock domain"
        test_searchio_hit.attributes["Target"] = "PFAM"
        test_searchio_hit.dbxrefs = [
            "GO:1234", "IPR:IPR012345", "Reactome:R-HSA-12345"
        ]

        Organism.objects.create(genus="test", species="organism")

        # instantiate the loader
        test_feature_file = FeatureLoader(filename="file.name",
                                          source="InterproScan_source")
        # store the bio searchio hit
        # From interproscan
        target = "InterPro"
        test_feature_file.store_bio_searchio_hit(test_searchio_hit, target)

        test_feature = Feature.objects.get(uniquename="PF1234")
        self.assertEqual("PFAM mock domain", test_feature.name)

        test_dbxref = Dbxref.objects.get(accession="IPR012345")
        test_feature_dbxref = FeatureDbxref.objects.get(feature=test_feature,
                                                        dbxref=test_dbxref)
        self.assertEqual(True, test_feature_dbxref.is_current)

        test_cvterm = Cvterm.objects.get(name="GO:1234")
        test_feature_cvterm = FeatureCvterm.objects.get(feature=test_feature,
                                                        cvterm=test_cvterm)
        self.assertEqual(0, test_feature_cvterm.rank)

Example #15

0

Show file

    def _parse_qresult(self):
        """Yield QueryResult objects (PRIVATE)."""
        # state values, determines what to do for each line
        state_EOF = 0
        state_QRES_NEW = 1
        state_QRES_SAME = 3
        state_HIT_NEW = 2
        state_HIT_SAME = 4
        # initial dummy values
        qres_state = None
        file_state = None
        cur_qid, cur_hid = None, None
        prev_qid, prev_hid = None, None
        cur, prev = None, None
        hit_list, hsp_list = [], []

        while True:
            # store previous line's parsed values for all lines after the first
            if cur is not None:
                prev = cur
                prev_qid = cur_qid
                prev_hid = cur_hid
            # only parse the result row if it's not EOF
            if self.line:
                cur = self._parse_row()
                cur_qid = cur["qname"]
                cur_hid = cur["tname"]
            else:
                file_state = state_EOF
                # mock values, since we have nothing to parse
                cur_qid, cur_hid = None, None

            # get the state of hit and qresult
            if prev_qid != cur_qid:
                qres_state = state_QRES_NEW
            else:
                qres_state = state_QRES_SAME
            # new hits are hits with different ids or hits in a new qresult
            if prev_hid != cur_hid or qres_state == state_QRES_NEW:
                hit_state = state_HIT_NEW
            else:
                hit_state = state_HIT_SAME

            if prev is not None:
                # create fragment and HSP and set their attributes
                hsp = _create_hsp(prev_hid, prev_qid, prev)
                hsp_list.append(hsp)

                if hit_state == state_HIT_NEW:
                    # create Hit and set its attributes
                    hit = Hit(hsp_list)
                    hit.seq_len = prev["tsize"]
                    hit_list.append(hit)
                    hsp_list = []

                # create qresult and yield if we're at a new qresult or at EOF
                if qres_state == state_QRES_NEW or file_state == state_EOF:
                    qresult = QueryResult(id=prev_qid)
                    for hit in hit_list:
                        qresult.absorb(hit)
                    qresult.seq_len = prev["qsize"]
                    yield qresult
                    # if we're at EOF, break
                    if file_state == state_EOF:
                        break
                    hit_list = []

            self.line = self.handle.readline()

Example #16

0

Show file

    def _parse_qresult(self):
        # state values
        state_EOF = 0
        state_QRES_NEW = 1
        state_QRES_SAME = 3
        state_HIT_NEW = 2
        state_HIT_SAME = 4
        # initial dummies
        qres_state, hit_state = None, None
        file_state = None
        cur_qid, cur_hid = None, None
        prev_qid, prev_hid = None, None
        cur, prev = None, None
        hit_list, hsp_list = [], []
        # if the file has c4 alignments, use that as the alignment mark
        if self.has_c4_alignment:
            self._ALN_MARK = 'C4 Alignment:'

        while True:
            self.read_until(lambda line: line.startswith(self._ALN_MARK))
            if cur is not None:
                prev = cur
                prev_qid = cur_qid
                prev_hid = cur_hid
            # only parse the result row if it's not EOF
            if self.line:
                assert self.line.startswith(self._ALN_MARK), self.line
                # create temp dicts for storing parsed values
                header = {'qresult': {}, 'hit': {}, 'hsp': {}}
                # if the file has c4 alignments, try to parse the header
                if self.has_c4_alignment:
                    self.read_until(lambda line:
                                    line.strip().startswith('Query:'))
                    header = self._parse_alignment_header()
                # parse the block contents
                cur = self.parse_alignment_block(header)
                cur_qid = cur['qresult']['id']
                cur_hid = cur['hit']['id']
            elif not self.line or self.line.startswith('-- completed '):
                file_state = state_EOF
                cur_qid, cur_hid = None, None

            # get the state of hit and qresult
            if prev_qid != cur_qid:
                qres_state = state_QRES_NEW
            else:
                qres_state = state_QRES_SAME
            # new hits are hits with different ids or hits in a new query
            if prev_hid != cur_hid or qres_state == state_QRES_NEW:
                hit_state = state_HIT_NEW
            else:
                hit_state = state_HIT_SAME

            if prev is not None:
                hsp = _create_hsp(prev_hid, prev_qid, prev['hsp'])
                hsp_list.append(hsp)

                if hit_state == state_HIT_NEW:
                    hit = Hit(hsp_list)
                    for attr, value in prev['hit'].items():
                        setattr(hit, attr, value)
                    hit_list.append(hit)
                    hsp_list = []

                if qres_state == state_QRES_NEW or file_state == state_EOF:
                    qresult = QueryResult(id=prev_qid)
                    for hit in hit_list:
                        # not using append since Exonerate may separate the
                        # same hit if it has different strands
                        qresult.absorb(hit)
                    for attr, value in prev['qresult'].items():
                        setattr(qresult, attr, value)
                    yield qresult
                    if file_state == state_EOF:
                        break
                    hit_list = []

            # only readline() here if we're not parsing C4 alignments
            # C4 alignments readline() is handled by its parse_alignment_block
            # function
            if not self.has_c4_alignment:
                self.line = self.handle.readline()

Example #17

0

Show file

File: blast_xml.py Project: umbrr/biopython

    def _parse_hit(self, root_hit_elem, query_id):
        """Yield a generator object that transforms Iteration_hits XML elements into Hit objects (PRIVATE).

        :param root_hit_elem: root element of the Iteration_hits tag.
        :type root_hit_elem: XML element tag
        :param query_id: QueryResult ID of this Hit
        :type query_id: string

        """
        # Hit level processing
        # Hits are stored in the Iteration_hits tag, with the following
        # DTD
        # <!ELEMENT Hit (
        #        Hit_num,
        #        Hit_id,
        #        Hit_def,
        #        Hit_accession,
        #        Hit_len,
        #        Hit_hsps?)>

        # feed the loop below an empty list so iteration still works
        if root_hit_elem is None:
            root_hit_elem = []

        for hit_elem in root_hit_elem:

            # BLAST sometimes mangles the sequence IDs and descriptions, so we need
            # to extract the actual values.
            raw_hit_id = hit_elem.findtext('Hit_id')
            raw_hit_desc = hit_elem.findtext('Hit_def')
            if not self._use_raw_hit_ids:
                ids, descs, blast_hit_id = _extract_ids_and_descs(
                    raw_hit_id, raw_hit_desc)
            else:
                ids, descs, blast_hit_id = [raw_hit_id], [raw_hit_desc
                                                          ], raw_hit_id

            hit_id, alt_hit_ids = ids[0], ids[1:]
            hit_desc, alt_hit_descs = descs[0], descs[1:]

            hsps = [
                hsp for hsp in self._parse_hsp(hit_elem.find('Hit_hsps'),
                                               query_id, hit_id)
            ]

            hit = Hit(hsps)
            hit.description = hit_desc
            hit._id_alt = alt_hit_ids
            hit._description_alt = alt_hit_descs
            hit.blast_id = blast_hit_id

            for key, val_info in _ELEM_HIT.items():
                value = hit_elem.findtext(key)
                if value is not None:
                    caster = val_info[1]
                    # recast only if value is not intended to be str
                    if value is not None and caster is not str:
                        value = caster(value)
                    setattr(hit, val_info[0], value)

            # delete element after we finish parsing it
            hit_elem.clear()
            yield hit

Example #18

0

Show file

File: hmmer3_tab.py Project: Wilkhu90/AWSLambdaWeedGenomics

    def _parse_qresult(self):
        """Generator function that returns QueryResult objects."""
        # state values, determines what to do for each line
        state_EOF = 0
        state_QRES_NEW = 1
        state_QRES_SAME = 3
        # initial value dummies
        qres_state = None
        file_state = None
        prev_qid = None
        cur, prev = None, None
        # container for Hit objects, used to create QueryResult
        hit_list = []
        cur_qid = None
        while True:
            # store previous line's parsed values for all lines after the first
            if cur is not None:
                prev = cur
                prev_qid = cur_qid
            # only parse the result row if it's not EOF
            # NOTE: we are not parsing the extra '#' lines appended to the end
            # of hmmer31b1 tabular results since storing them in qresult
            # objects means we can not do a single-pass parsing
            if self.line and not self.line.startswith('#'):
                cur = self._parse_row()
                cur_qid = cur['qresult']['id']
            else:
                file_state = state_EOF
                # mock value for cur_qid, since we have nothing to parse
                cur_qid = None

            if prev_qid != cur_qid:
                qres_state = state_QRES_NEW
            else:
                qres_state = state_QRES_SAME

            if prev is not None:
                # since domain tab formats only have 1 Hit per line
                # we always create HSPFragment, HSP, and Hit per line
                prev_hid = prev['hit']['id']

                # create fragment and HSP and set their attributes
                frag = HSPFragment(prev_hid, prev_qid)
                for attr, value in prev['frag'].items():
                    setattr(frag, attr, value)
                hsp = HSP([frag])
                for attr, value in prev['hsp'].items():
                    setattr(hsp, attr, value)

                # create Hit and set its attributes
                hit = Hit([hsp])
                for attr, value in prev['hit'].items():
                    setattr(hit, attr, value)
                hit_list.append(hit)

                # create qresult and yield if we're at a new qresult or at EOF
                if qres_state == state_QRES_NEW or file_state == state_EOF:
                    qresult = QueryResult(hit_list, prev_qid)
                    for attr, value in prev['qresult'].items():
                        setattr(qresult, attr, value)
                    yield qresult
                    # if we're at EOF, break
                    if file_state == state_EOF:
                        break
                    hit_list = []

            self.line = self.handle.readline()

Example #19

0

Show file

File: blast_tab.py Project: sgalpha01/biopython

    def _parse_qresult(self):
        """Yield QueryResult objects (PRIVATE)."""
        # state values, used to determine what to do with each line
        state_EOF = 0
        state_QRES_NEW = 1
        state_QRES_SAME = 3
        state_HIT_NEW = 2
        state_HIT_SAME = 4
        # dummies for initial states
        qres_state = None
        hit_state = None
        file_state = None
        cur_qid = None
        cur_hid = None
        # dummies for initial id caches
        prev_qid = None
        prev_hid = None
        # dummies for initial parsed value containers
        cur, prev = None, None
        hit_list, hsp_list = [], []

        while True:
            # store previous line's parsed values if we've past the first line
            if cur is not None:
                prev = cur
                prev_qid = cur_qid
                prev_hid = cur_hid
            # only parse the line if it's not EOF or not a comment line
            if self.line and not self.line.startswith("#"):
                cur = self._parse_result_row()
                cur_qid = self._get_id(cur["qresult"])
                cur_hid = self._get_id(cur["hit"])
            else:
                file_state = state_EOF
                # mock values for cur_qid and cur_hid since the line is empty
                cur_qid, cur_hid = None, None

            # get the state of hit and qresult
            if prev_qid != cur_qid:
                qres_state = state_QRES_NEW
            else:
                qres_state = state_QRES_SAME
            # new hits are hits with different id or hits in a new qresult
            if prev_hid != cur_hid or qres_state == state_QRES_NEW:
                hit_state = state_HIT_NEW
            else:
                hit_state = state_HIT_SAME

            # we're creating objects for the previously parsed line(s),
            # so nothing is done in the first parsed line (prev == None)
            if prev is not None:
                # every line is essentially an HSP with one fragment, so we
                # create both of these for every line
                frag = HSPFragment(prev_hid, prev_qid)
                for attr, value in prev["frag"].items():
                    # adjust coordinates to Python range
                    # NOTE: this requires both start and end coords to be
                    # present, otherwise a KeyError will be raised.
                    # Without this limitation, we might misleadingly set the
                    # start / end coords
                    for seq_type in ("query", "hit"):
                        if attr == seq_type + "_start":
                            value = min(value,
                                        prev["frag"][seq_type + "_end"]) - 1
                        elif attr == seq_type + "_end":
                            value = max(value,
                                        prev["frag"][seq_type + "_start"])
                    setattr(frag, attr, value)
                # strand and frame setattr require the full parsed values
                # to be set first
                for seq_type in ("hit", "query"):
                    # try to set hit and query frame
                    frame = self._get_frag_frame(frag, seq_type, prev["frag"])
                    setattr(frag, "%s_frame" % seq_type, frame)
                    # try to set hit and query strand
                    strand = self._get_frag_strand(frag, seq_type,
                                                   prev["frag"])
                    setattr(frag, "%s_strand" % seq_type, strand)

                hsp = HSP([frag])
                for attr, value in prev["hsp"].items():
                    setattr(hsp, attr, value)
                hsp_list.append(hsp)

                # create hit and append to temp hit container if hit_state
                # says we're not at the same hit or at a new query
                if hit_state == state_HIT_NEW:
                    hit = Hit(hsp_list)
                    for attr, value in prev["hit"].items():
                        if attr != "id_all":
                            setattr(hit, attr, value)
                        else:
                            # not setting hit ID since it's already set from the
                            # prev_hid above
                            setattr(hit, "_id_alt", value[1:])
                    hit_list.append(hit)
                    hsp_list = []
                # create qresult and yield if we're at a new qresult or EOF
                if qres_state == state_QRES_NEW or file_state == state_EOF:
                    qresult = QueryResult(hit_list, prev_qid)
                    for attr, value in prev["qresult"].items():
                        setattr(qresult, attr, value)
                    yield qresult
                    # if current line is EOF, break
                    if file_state == state_EOF:
                        break
                    hit_list = []

            self.line = self.handle.readline().strip()

Example #20

0

Show file

File: FastaIO.py Project: guru1982/biopython

    def _parse_hit(self, query_id):
        while True:
            self.line = self.handle.readline()
            if self.line.startswith('>>'):
                break

        state = _STATE_NONE
        strand = None
        hsp_list = []
        while True:
            peekline = self.handle.peekline()
            # yield hit if we've reached the start of a new query or
            # the end of the search
            if peekline.strip() in [">>><<<", ">>>///"] or \
                    (not peekline.startswith('>>>') and '>>>' in peekline):
                # append last parsed_hsp['hit']['seq'] line
                if state == _STATE_HIT_BLOCK:
                    parsed_hsp['hit']['seq'] += self.line.strip()
                elif state == _STATE_CONS_BLOCK:
                    hsp.aln_annotation['similarity'] += \
                            self.line.strip('\r\n')
                # process HSP alignment and coordinates
                _set_hsp_seqs(hsp, parsed_hsp, self._preamble['program'])
                hit = Hit(hsp_list)
                hit.description = hit_desc
                hit.seq_len = seq_len
                yield hit, strand
                hsp_list = []
                break
            # yield hit and create a new one if we're still in the same query
            elif self.line.startswith('>>'):
                # try yielding,  if we have hsps
                if hsp_list:
                    _set_hsp_seqs(hsp, parsed_hsp, self._preamble['program'])
                    hit = Hit(hsp_list)
                    hit.description = hit_desc
                    hit.seq_len = seq_len
                    yield hit, strand
                    hsp_list = []
                # try to get the hit id and desc, and handle cases without descs
                try:
                    hit_id, hit_desc = self.line[2:].strip().split(' ', 1)
                except ValueError:
                    hit_id = self.line[2:].strip().split(' ', 1)[0]
                    hit_desc = ''
                # create the HSP object for Hit
                frag = HSPFragment(hit_id, query_id)
                hsp = HSP([frag])
                hsp_list.append(hsp)
                # set or reset the state to none
                state = _STATE_NONE
                parsed_hsp = {'query': {}, 'hit': {}}
            # create and append a new HSP if line starts with '>--'
            elif self.line.startswith('>--'):
                # set seq attributes of previous hsp
                _set_hsp_seqs(hsp, parsed_hsp, self._preamble['program'])
                # and create a new one
                frag = HSPFragment(hit_id, query_id)
                hsp = HSP([frag])
                hsp_list.append(hsp)
                # set the state ~ none yet
                state = _STATE_NONE
                parsed_hsp = {'query': {}, 'hit': {}}
            # this is either query or hit data in the HSP, depending on the state
            elif self.line.startswith('>'):
                if state == _STATE_NONE:
                    # make sure it's the correct query
                    assert query_id.startswith(self.line[1:].split(' ')[0]), \
                            "%r vs %r" % (query_id, self.line)
                    state = _STATE_QUERY_BLOCK
                    parsed_hsp['query']['seq'] = ''
                elif state == _STATE_QUERY_BLOCK:
                    # make sure it's the correct hit
                    assert hit_id.startswith(self.line[1:].split(' ')[0])
                    state = _STATE_HIT_BLOCK
                    parsed_hsp['hit']['seq'] = ''
            # check for conservation block
            elif self.line.startswith('; al_cons'):
                state = _STATE_CONS_BLOCK
                hsp.fragment.aln_annotation['similarity'] = ''
            elif self.line.startswith(';'):
                # Fasta outputs do not make a clear distinction between Hit
                # and HSPs, so we check the attribute names to determine
                # whether it belongs to a Hit or HSP
                regx = re.search(_RE_ATTR, self.line.strip())
                name = regx.group(1)
                value = regx.group(2)

                # for values before the '>...' query block
                if state == _STATE_NONE:
                    if name in _HSP_ATTR_MAP:
                        attr_name, caster = _HSP_ATTR_MAP[name]
                        if caster is not str:
                            value = caster(value)
                        if name in ['_ident', '_sim']:
                            value *= 100
                        setattr(hsp, attr_name, value)
                # otherwise, pool the values for processing later
                elif state == _STATE_QUERY_BLOCK:
                    parsed_hsp['query'][name] = value
                elif state == _STATE_HIT_BLOCK:
                    if name == '_len':
                        seq_len = int(value)
                    else:
                        parsed_hsp['hit'][name] = value
                # for values in the hit block
                else:
                    raise ValueError("Unexpected line: %r" % self.line)
            # otherwise, it must be lines containing the sequences
            else:
                assert '>' not in self.line
                # if we're in hit, parse into hsp.hit
                if state == _STATE_HIT_BLOCK:
                    parsed_hsp['hit']['seq'] += self.line.strip()
                elif state == _STATE_QUERY_BLOCK:
                    parsed_hsp['query']['seq'] += self.line.strip()
                elif state == _STATE_CONS_BLOCK:
                    hsp.fragment.aln_annotation['similarity'] += \
                            self.line.strip('\r\n')
                # we should not get here!
                else:
                    raise ValueError("Unexpected line: %r" % self.line)

            self.line = self.handle.readline()

Example #21

0

Show file

File: hmmer3_text.py Project: sannepost2001/blok4

    def _create_hits(self, hit_attrs, qid, qdesc):
        """Parse a HMMER3 hsp block, beginning with the hsp table (PRIVATE)."""
        # read through until the beginning of the hsp block
        self._read_until(lambda line: line.startswith('Internal pipeline') or
                         line.startswith('>>'))

        # start parsing the hsp block
        hit_list = []
        while True:
            if self.line.startswith('Internal pipeline'):
                # by this time we should've emptied the hit attr list
                assert len(hit_attrs) == 0
                return hit_list
            assert self.line.startswith('>>')
            hid, hdesc = self.line[len('>> '):].split('  ', 1)
            hdesc = hdesc.strip()

            # read through the hsp table header and move one more line
            self._read_until(
                lambda line: line.startswith(' ---   ------ ----- --------') or
                line.startswith('   [No individual domains'))
            self.line = read_forward(self.handle)

            # parse the hsp table for the current hit
            hsp_list = []
            while True:
                # break out of hsp parsing if there are no hits, it's the last hsp
                # or it's the start of a new hit
                if self.line.startswith('   [No targets detected that satisfy') or \
                        self.line.startswith('   [No individual domains') or \
                        self.line.startswith('Internal pipeline statistics summary:') or \
                        self.line.startswith('  Alignments for each domain:') or \
                        self.line.startswith('>>'):

                    hit_attr = hit_attrs.pop(0)
                    hit = Hit(hsp_list)
                    for attr, value in hit_attr.items():
                        if attr == "description":
                            cur_val = getattr(hit, attr)
                            if cur_val and value and cur_val.startswith(value):
                                continue
                        setattr(hit, attr, value)
                    if not hit:
                        hit.query_description = qdesc
                    hit_list.append(hit)
                    break

                parsed = [x for x in self.line.strip().split(' ') if x]
                assert len(parsed) == 16
                # parsed column order:
                # index, is_included, bitscore, bias, evalue_cond, evalue
                # hmmfrom, hmmto, query_ends, hit_ends, alifrom, alito,
                # envfrom, envto, acc_avg
                frag = HSPFragment(hid, qid)
                # set query and hit descriptions if they are defined / nonempty string
                if qdesc:
                    frag.query_description = qdesc
                if hdesc:
                    frag.hit_description = hdesc
                # HMMER3 alphabets are always protein alphabets
                frag.alphabet = generic_protein
                # depending on whether the program is hmmsearch, hmmscan, or phmmer
                # {hmm,ali}{from,to} can either be hit_{from,to} or query_{from,to}
                # for hmmscan, hit is the hmm profile, query is the sequence
                if self._meta.get('program') == 'hmmscan':
                    # adjust 'from' and 'to' coordinates to 0-based ones
                    frag.hit_start = int(parsed[6]) - 1
                    frag.hit_end = int(parsed[7])
                    frag.query_start = int(parsed[9]) - 1
                    frag.query_end = int(parsed[10])
                elif self._meta.get('program') in ['hmmsearch', 'phmmer']:
                    # adjust 'from' and 'to' coordinates to 0-based ones
                    frag.hit_start = int(parsed[9]) - 1
                    frag.hit_end = int(parsed[10])
                    frag.query_start = int(parsed[6]) - 1
                    frag.query_end = int(parsed[7])
                # strand is always 0, since HMMER now only handles protein
                frag.hit_strand = frag.query_strand = 0

                hsp = HSP([frag])
                hsp.domain_index = int(parsed[0])
                hsp.is_included = parsed[1] == '!'
                hsp.bitscore = float(parsed[2])
                hsp.bias = float(parsed[3])
                hsp.evalue_cond = float(parsed[4])
                hsp.evalue = float(parsed[5])
                if self._meta.get('program') == 'hmmscan':
                    # adjust 'from' and 'to' coordinates to 0-based ones
                    hsp.hit_endtype = parsed[8]
                    hsp.query_endtype = parsed[11]
                elif self._meta.get('program') in ['hmmsearch', 'phmmer']:
                    # adjust 'from' and 'to' coordinates to 0-based ones
                    hsp.hit_endtype = parsed[11]
                    hsp.query_endtype = parsed[8]
                # adjust 'from' and 'to' coordinates to 0-based ones
                hsp.env_start = int(parsed[12]) - 1
                hsp.env_end = int(parsed[13])
                hsp.env_endtype = parsed[14]
                hsp.acc_avg = float(parsed[15])

                hsp_list.append(hsp)
                self.line = read_forward(self.handle)

            # parse the hsp alignments
            if self.line.startswith('  Alignments for each domain:'):
                self._parse_aln_block(hid, hit.hsps)

Example #22

0

Show file

File: BlatIO.py Project: olgabot/biopython

    def _parse_qresult(self):
        """Generator function that returns QueryResult objects."""
        # state values, determines what to do for each line
        state_EOF = 0
        state_QRES_NEW = 1
        state_QRES_SAME = 3
        state_HIT_NEW = 2
        state_HIT_SAME = 4
        # initial dummy values
        qres_state = None
        file_state = None
        prev_qid, prev_hid = None, None
        cur, prev = None, None
        hit_list, hsp_list = [], []

        while True:
            # store previous line's parsed values for all lines after the first
            if cur is not None:
                prev = cur
                prev_qid = cur_qid
                prev_hid = cur_hid
            # only parse the result row if it's not EOF
            if self.line:
                cur = self._parse_row()
                cur_qid = cur['qname']
                cur_hid = cur['tname']
            else:
                file_state = state_EOF
                # mock values, since we have nothing to parse
                cur_qid, cur_hid = None, None

            # get the state of hit and qresult
            if prev_qid != cur_qid:
                qres_state = state_QRES_NEW
            else:
                qres_state = state_QRES_SAME
            # new hits are hits with different ids or hits in a new qresult
            if prev_hid != cur_hid or qres_state == state_QRES_NEW:
                hit_state = state_HIT_NEW
            else:
                hit_state = state_HIT_SAME

            if prev is not None:
                # create fragment and HSP and set their attributes
                hsp = _create_hsp(prev_hid, prev_qid, prev)
                hsp_list.append(hsp)

                if hit_state == state_HIT_NEW:
                    # create Hit and set its attributes
                    hit = Hit(hsp_list)
                    hit.seq_len = prev['tsize']
                    hit_list.append(hit)
                    hsp_list = []

                # create qresult and yield if we're at a new qresult or at EOF
                if qres_state == state_QRES_NEW or file_state == state_EOF:
                    qresult = QueryResult(prev_qid)
                    for hit in hit_list:
                        qresult.absorb(hit)
                    qresult.seq_len = prev['qsize']
                    yield qresult
                    # if we're at EOF, break
                    if file_state == state_EOF:
                        break
                    hit_list = []

            self.line = self.handle.readline()

Example #23

0

Show file

File: blast_text.py Project: Ambuj-UF/ConCat-1.0

    def __iter__(self):
        for rec in self.blast_iter:
            # set attributes to SearchIO's
            # get id and desc
            if rec.query.startswith('>'):
                rec.query = rec.query[1:]
            try:
                qid, qdesc = rec.query.split(' ', 1)
            except ValueError:
                qid, qdesc = rec.query, ''
            qdesc = qdesc.replace('\n', '').replace('\r', '')

            qresult = QueryResult(id=qid)
            qresult.program = rec.application.lower()
            qresult.target = rec.database
            qresult.seq_len = rec.query_letters
            qresult.version = rec.version

            # determine alphabet based on program
            if qresult.program == 'blastn':
                alphabet = generic_dna
            elif qresult.program in ['blastp', 'blastx', 'tblastn', 'tblastx']:
                alphabet = generic_protein

            # iterate over the 'alignments' (hits) and the hit table
            for idx, aln in enumerate(rec.alignments):
                # get id and desc
                if aln.title.startswith('> '):
                    aln.title = aln.title[2:]
                elif aln.title.startswith('>'):
                    aln.title = aln.title[1:]
                try:
                    hid, hdesc = aln.title.split(' ', 1)
                except ValueError:
                    hid, hdesc = aln.title, ''
                hdesc = hdesc.replace('\n', '').replace('\r', '')

                # iterate over the hsps and group them in a list
                hsp_list = []
                for bhsp in aln.hsps:
                    frag = HSPFragment(hid, qid)
                    frag.alphabet = alphabet
                    # set alignment length
                    frag.aln_span = bhsp.identities[1]
                    # set frames
                    try:
                        frag.query_frame = int(bhsp.frame[0])
                    except IndexError:
                        if qresult.program in ('blastp', 'tblastn'):
                            frag.query_frame = 0
                        else:
                            frag.query_frame = 1
                    try:
                        frag.hit_frame = int(bhsp.frame[1])
                    except IndexError:
                        if qresult.program in ('blastp', 'tblastn'):
                            frag.hit_frame = 0
                        else:
                            frag.hit_frame = 1
                    # set query coordinates
                    frag.query_start = min(bhsp.query_start,
                            bhsp.query_end) - 1
                    frag.query_end = max(bhsp.query_start, bhsp.query_end)
                    # set hit coordinates
                    frag.hit_start = min(bhsp.sbjct_start,
                            bhsp.sbjct_end) - 1
                    frag.hit_end = max(bhsp.sbjct_start, bhsp.sbjct_end)
                    # set query, hit sequences and its annotation
                    qseq = ''
                    hseq = ''
                    midline = ''
                    for seqtrio in zip(bhsp.query, bhsp.sbjct, bhsp.match):
                        qchar, hchar, mchar = seqtrio
                        if qchar == ' ' or hchar == ' ':
                            assert all(' ' == x for x in seqtrio)
                        else:
                            qseq += qchar
                            hseq += hchar
                            midline += mchar
                    frag.query, frag.hit = qseq, hseq
                    frag.aln_annotation['similarity'] = midline

                    # create HSP object with the fragment
                    hsp = HSP([frag])
                    hsp.evalue = bhsp.expect
                    hsp.bitscore = bhsp.bits
                    hsp.bitscore_raw = bhsp.score
                    # set gap
                    try:
                        hsp.gap_num = bhsp.gaps[0]
                    except IndexError:
                        hsp.gap_num = 0
                    # set identity
                    hsp.ident_num = bhsp.identities[0]
                    hsp.pos_num = bhsp.positives[0]
                    if hsp.pos_num is None:
                        hsp.pos_num = hsp[0].aln_span

                    hsp_list.append(hsp)

                hit = Hit(hsp_list)
                hit.seq_len = aln.length
                hit.description = hdesc
                qresult.append(hit)

            qresult.description = qdesc
            yield qresult

Example #24

0

Show file

File: blast_xml.py Project: harshberia93/biopython

    def _parse_hit(self, root_hit_elem, query_id):
        """Generator that transforms Iteration_hits XML elements into Hit objects.

        Arguments:
        root_hit_elem -- Element object of the Iteration_hits tag.
        query_id -- String of QueryResult ID of this Hit

        """
        # Hit level processing
        # Hits are stored in the Iteration_hits tag, with the following
        # DTD
        # <!ELEMENT Hit (
        #        Hit_num,
        #        Hit_id,
        #        Hit_def,
        #        Hit_accession,
        #        Hit_len,
        #        Hit_hsps?)>

        # feed the loop below an empty list so iteration still works
        if root_hit_elem is None:
            root_hit_elem = []

        for hit_elem in root_hit_elem:

            # create empty hit object
            hit_id = hit_elem.findtext('Hit_id')
            hit_desc = hit_elem.findtext('Hit_def')
            # handle blast searches against databases with Blast's IDs
            if hit_id.startswith('gnl|BL_ORD_ID|'):
                blast_hit_id = hit_id
                id_desc = hit_desc.split(' ', 1)
                hit_id = id_desc[0]
                try:
                    hit_desc = id_desc[1]
                except IndexError:
                    hit_desc = ''
            else:
                blast_hit_id = ''

            hsps = [hsp for hsp in
                    self._parse_hsp(hit_elem.find('Hit_hsps'),
                        query_id, hit_id)]

            hit = Hit(hsps)
            hit.description = hit_desc
            # blast_hit_id is only set if the hit ID is Blast-generated
            hit._blast_id = blast_hit_id

            for key, val_info in _ELEM_HIT.items():
                value = hit_elem.findtext(key)
                if value is not None:
                    caster = val_info[1]
                    # recast only if value is not intended to be str
                    if value is not None and caster is not str:
                        value = caster(value)
                    setattr(hit, val_info[0], value)

            # delete element after we finish parsing it
            hit_elem.clear()
            yield hit

Example #25

0

Show file

    def _parse_hit(self, query_id):
        """Parse hit on query identifier (PRIVATE)."""
        while True:
            line = self.handle.readline()
            if line.startswith(">>"):
                break

        state = _STATE_NONE
        strand = None
        hsp_list = []
        hsp = None
        parsed_hsp = None
        hit_desc = None
        seq_len = None
        while True:
            # yield hit if we've reached the start of a new query or
            # the end of the search
            self.line = self.handle.readline()
            if self.line.strip() in [
                    ">>><<<", ">>>///"
            ] or (not self.line.startswith(">>>") and ">>>" in self.line):
                # append last parsed_hsp['hit']['seq'] line
                if state == _STATE_HIT_BLOCK:
                    parsed_hsp["hit"]["seq"] += line.strip()
                elif state == _STATE_CONS_BLOCK:
                    hsp.aln_annotation["similarity"] += line.strip("\r\n")
                # process HSP alignment and coordinates
                _set_hsp_seqs(hsp, parsed_hsp, self._preamble["program"])
                hit = Hit(hsp_list)
                hit.description = hit_desc
                hit.seq_len = seq_len
                yield hit, strand
                hsp_list = []
                break
            # yield hit and create a new one if we're still in the same query
            elif line.startswith(">>"):
                # try yielding,  if we have hsps
                if hsp_list:
                    _set_hsp_seqs(hsp, parsed_hsp, self._preamble["program"])
                    hit = Hit(hsp_list)
                    hit.description = hit_desc
                    hit.seq_len = seq_len
                    yield hit, strand
                    hsp_list = []
                # try to get the hit id and desc, and handle cases without descs
                try:
                    hit_id, hit_desc = line[2:].strip().split(" ", 1)
                except ValueError:
                    hit_id = line[2:].strip().split(" ", 1)[0]
                    hit_desc = ""
                # create the HSP object for Hit
                frag = HSPFragment(hit_id, query_id)
                hsp = HSP([frag])
                hsp_list.append(hsp)
                # set or reset the state to none
                state = _STATE_NONE
                parsed_hsp = {"query": {}, "hit": {}}
            # create and append a new HSP if line starts with '>--'
            elif line.startswith(">--"):
                # set seq attributes of previous hsp
                _set_hsp_seqs(hsp, parsed_hsp, self._preamble["program"])
                # and create a new one
                frag = HSPFragment(hit_id, query_id)
                hsp = HSP([frag])
                hsp_list.append(hsp)
                # set the state ~ none yet
                state = _STATE_NONE
                parsed_hsp = {"query": {}, "hit": {}}
            # this is either query or hit data in the HSP, depending on the state
            elif line.startswith(">"):
                if state == _STATE_NONE:
                    # make sure it's the correct query
                    if not query_id.startswith(line[1:].split(" ")[0]):
                        raise ValueError(f"{query_id!r} vs {line!r}")
                    state = _STATE_QUERY_BLOCK
                    parsed_hsp["query"]["seq"] = ""
                elif state == _STATE_QUERY_BLOCK:
                    # make sure it's the correct hit
                    assert hit_id.startswith(line[1:].split(" ")[0])
                    state = _STATE_HIT_BLOCK
                    parsed_hsp["hit"]["seq"] = ""
            # check for conservation block
            elif line.startswith("; al_cons"):
                state = _STATE_CONS_BLOCK
                hsp.fragment.aln_annotation["similarity"] = ""
            elif line.startswith(";"):
                # Fasta outputs do not make a clear distinction between Hit
                # and HSPs, so we check the attribute names to determine
                # whether it belongs to a Hit or HSP
                regx = re.search(_RE_ATTR, line.strip())
                name = regx.group(1)
                value = regx.group(2)

                # for values before the '>...' query block
                if state == _STATE_NONE:
                    if name in _HSP_ATTR_MAP:
                        attr_name, caster = _HSP_ATTR_MAP[name]
                        if caster is not str:
                            value = caster(value)
                        if name in ["_ident", "_sim"]:
                            value *= 100
                        setattr(hsp, attr_name, value)
                # otherwise, pool the values for processing later
                elif state == _STATE_QUERY_BLOCK:
                    parsed_hsp["query"][name] = value
                elif state == _STATE_HIT_BLOCK:
                    if name == "_len":
                        seq_len = int(value)
                    else:
                        parsed_hsp["hit"][name] = value
                # for values in the hit block
                else:
                    raise ValueError("Unexpected line: %r" % line)
            # otherwise, it must be lines containing the sequences
            else:
                assert ">" not in line
                # if we're in hit, parse into hsp.hit
                if state == _STATE_HIT_BLOCK:
                    parsed_hsp["hit"]["seq"] += line.strip()
                elif state == _STATE_QUERY_BLOCK:
                    parsed_hsp["query"]["seq"] += line.strip()
                elif state == _STATE_CONS_BLOCK:
                    hsp.fragment.aln_annotation["similarity"] += line.strip(
                        "\r\n")
                # we should not get here!
                else:
                    raise ValueError("Unexpected line: %r" % line)
            line = self.line