def testSort(self):
        """Check that sort() reorders a list built as (h2, h1) into (h1, h2)."""
        hits = HHpredHitList([self.h2, self.h1])

        # Before sorting, the list still reflects the construction order.
        for index, expected in enumerate((self.h2, self.h1)):
            self.assertEqual(hits[index], expected)

        hits.sort()

        # After the in-place sort, h1 has to come first.
        for index, expected in enumerate((self.h1, self.h2)):
            self.assertEqual(hits[index], expected)
    def testSort(self):
        """Verify that an in-place sort() puts h1 in front of h2."""
        first, second = self.h1, self.h2
        hits = HHpredHitList([second, first])

        # Construction order is preserved until sort() is called.
        self.assertEqual(hits[0], second)
        self.assertEqual(hits[1], first)

        hits.sort()

        self.assertEqual(hits[0], first)
        self.assertEqual(hits[1], second)
    def setUp(self):
        """Create two hits with alignments (h1, h2) and a list holding both."""
        super(TestHitList, self).setUp()

        first = HHpredHit(1, 'hit1', 2, 5, 3, 6, 0.5, 10)
        second = HHpredHit(2, 'hit2', 3, 5, 4, 6, 0.2, 10)

        # Each hit is aligned against an identical copy of its own sequence.
        first.add_alignment('A-CD', 'A-CD')
        second.add_alignment('BCD', 'BCD')

        self.h1, self.h2 = first, second
        self.hits = HHpredHitList([first, second])
Exemple #4
0
    def _parse(self, stream, header_only):
        """
        Parse HHpred-style search output, line by line, into a hit list.

        The input is processed as three consecutive sections: the header
        (key/value lines above the hit table), the fixed-width hit summary
        table and the pairwise alignment blocks. Alignment blocks are only
        read when ``self.alignments`` is true; otherwise parsing stops at
        the end of the hit table.

        Args:
            stream: iterable yielding the lines of the output as strings.
            header_only: when true, only the header is parsed and the
                returned list contains no hits.

        Returns:
            An ``HHpredHitList`` on which ``sort()`` has been called. Header
            fields that are recognized are set on it as attributes
            (``query_name``, ``match_columns``, ``no_of_seqs``, ``neff``,
            ``searched_hmms``, ``date``, ``command``).

        Raises:
            HHOutputFormatError: if an alignment block refers to a rank that
                is missing from the hit table, or if ``add_alignment`` fails
                with ``KeyError``/``ValueError`` for a collected alignment.
        """
        # header key -> attribute name on the resulting hit list
        header_attributes = {
            'Query': 'query_name',
            'Match_columns': 'match_columns',
            'No_of_seqs': 'no_of_seqs',
            'Neff': 'neff',
            'Searched_HMMs': 'searched_hmms',
            'Date': 'date',
            'Command': 'command',
        }
        # 'Q '/'T ' rows that carry annotations rather than sequence residues
        q_annotations = ('Q Consensus', 'Q ss_pred', 'Q ss_conf', 'Q ss_dssp')
        t_annotations = ('T Consensus', 'T ss_pred', 'T ss_conf', 'T ss_dssp')

        qlen = None
        in_hits = False
        in_alis = False
        has_alis = False
        c_rank = 0
        header = {}
        hits = {}
        alis = {}

        for line in stream:

            if not in_hits and not in_alis:
                # still above the hit table

                if line.replace(' ', '').startswith('NoHitProbE-value'):
                    if header_only:
                        # nothing below the table header is of interest, so
                        # there is no point in reading the rest of the stream
                        break
                    in_hits = True

                elif line.strip() != '':
                    # parse header data (stuff above the hits table)
                    columns = line.strip().split(None, 1)
                    if len(columns) == 2:

                        identifier, data = columns
                        if identifier in ('Query', 'Command'):
                            data = data.strip()
                        elif identifier == 'Neff':
                            data = float(data)
                        elif identifier in ('Searched_HMMs', 'Match_columns'):
                            data = int(data)

                        header[identifier] = data

                        if identifier == 'Match_columns':
                            # passed to every HHpredHit as the last argument
                            qlen = data

                continue

            if in_hits:
                if not line.strip():
                    # a blank line terminates the hit table (suboptimal way
                    # to handle the block switch)
                    if not self.alignments:
                        break
                    in_hits = False
                    in_alis = True
                    continue
                elif line.strip() == 'Done':
                    break

                # the hit table is fixed-width: fields are taken by column
                description = line[:34].split()
                rank = int(description[0])
                hit_id = description[1]

                pos = line[85:94].strip()
                start, end = map(int, pos.split('-'))

                qpos = line[75:84].strip()
                qstart, qend = map(int, qpos.split('-'))

                # the table value is divided by 100 before being stored
                probability = float(line[35:40]) / 100.0

                hit = HHpredHit(rank, hit_id, start, end, qstart, qend,
                                probability, qlen)

                hit.evalue = float(line[41:48])
                hit.pvalue = float(line[49:56])
                hit.score = float(line[57:63])
                hit.ss_score = float(line[64:69])

                # trailing column looks like "(123)"; drop the parentheses
                hit.slength = int(line[94:].replace('(', '').replace(')', ''))

                hits[hit.rank] = hit
                alis[hit.rank] = {'q': [], 's': []}

            elif in_alis:
                if line.startswith('Done'):
                    break

                elif line.startswith('No '):
                    # start of the alignment block for the given rank
                    c_rank = int(line[3:])
                    if c_rank not in hits:
                        raise HHOutputFormatError(
                            'Alignment {0}. refers to a non-existing hit'.
                            format(c_rank))

                elif line.startswith('>'):
                    hits[c_rank].name = line[1:].strip()

                elif line.startswith('Probab='):
                    for pair in line.split():
                        key, value = pair.split('=')
                        if key == 'Identities':
                            hits[c_rank].identity = float(
                                value.replace('%', ''))
                        elif key == 'Similarity':
                            hits[c_rank].similarity = float(value)
                        elif key == 'Sum_probs':
                            hits[c_rank].prob_sum = float(value)

                elif (line.startswith('Q ')
                        and line[:11].rstrip() not in q_annotations):
                    # residues start at column 22 and run up to the first
                    # whitespace or digit
                    for residue in line[22:]:
                        if residue.isspace() or residue.isdigit():
                            break
                        alis[c_rank]['q'].append(residue)
                        has_alis = True

                elif (line.startswith('T ')
                        and line[:11].rstrip() not in t_annotations):
                    for residue in line[22:]:
                        if residue.isspace() or residue.isdigit():
                            break
                        alis[c_rank]['s'].append(residue)

        if self.alignments and has_alis:
            for rank, rows in alis.items():
                try:
                    hits[rank].add_alignment(rows['q'], rows['s'])

                except (KeyError, ValueError) as er:
                    raise HHOutputFormatError(
                        'Corrupt alignment at hit No {0}.\n {1}'.format(
                            rank, er))

        hit_list = HHpredHitList(hits.values())
        hit_list.sort()

        # add data obtained from the header to the HHpredHitList
        for identifier, data in header.items():
            attribute = header_attributes.get(identifier)
            if attribute is not None:
                setattr(hit_list, attribute, data)

        return hit_list
Exemple #5
0
    def _parse(self, stream, header_only):
        """
        Parse HHpred-style search output, line by line, into a hit list.

        The input is processed as three consecutive sections: the header
        (key/value lines above the hit table), the fixed-width hit summary
        table and the pairwise alignment blocks. Alignment blocks are only
        read when ``self.alignments`` is true; otherwise parsing stops at
        the end of the hit table.

        Args:
            stream: iterable yielding the lines of the output as strings.
            header_only: when true, only the header is parsed and the
                returned list contains no hits.

        Returns:
            An ``HHpredHitList`` on which ``sort()`` has been called. Header
            fields that are recognized are set on it as attributes
            (``query_name``, ``match_columns``, ``no_of_seqs``, ``neff``,
            ``searched_hmms``, ``date``, ``command``).

        Raises:
            HHOutputFormatError: if an alignment block refers to a rank that
                is missing from the hit table, or if ``add_alignment`` fails
                with ``KeyError``/``ValueError`` for a collected alignment.
        """
        # header key -> attribute name on the resulting hit list
        header_attributes = {
            'Query': 'query_name',
            'Match_columns': 'match_columns',
            'No_of_seqs': 'no_of_seqs',
            'Neff': 'neff',
            'Searched_HMMs': 'searched_hmms',
            'Date': 'date',
            'Command': 'command',
        }
        # 'Q '/'T ' rows that carry annotations rather than sequence residues
        q_annotations = ('Q Consensus', 'Q ss_pred', 'Q ss_conf', 'Q ss_dssp')
        t_annotations = ('T Consensus', 'T ss_pred', 'T ss_conf', 'T ss_dssp')

        qlen = None
        in_hits = False
        in_alis = False
        has_alis = False
        c_rank = 0
        header = {}
        hits = {}
        alis = {}

        for line in stream:

            if not in_hits and not in_alis:
                # still above the hit table

                if line.replace(' ', '').startswith('NoHitProbE-value'):
                    if header_only:
                        # nothing below the table header is of interest, so
                        # there is no point in reading the rest of the stream
                        break
                    in_hits = True

                elif line.strip() != '':
                    # parse header data (stuff above the hits table)
                    columns = line.strip().split(None, 1)
                    if len(columns) == 2:

                        identifier, data = columns
                        if identifier in ('Query', 'Command'):
                            data = data.strip()
                        elif identifier == 'Neff':
                            data = float(data)
                        elif identifier in ('Searched_HMMs', 'Match_columns'):
                            data = int(data)

                        header[identifier] = data

                        if identifier == 'Match_columns':
                            # passed to every HHpredHit as the last argument
                            qlen = data

                continue

            if in_hits:
                if not line.strip():
                    # a blank line terminates the hit table (suboptimal way
                    # to handle the block switch)
                    if not self.alignments:
                        break
                    in_hits = False
                    in_alis = True
                    continue
                elif line.strip() == 'Done':
                    break

                # the hit table is fixed-width: fields are taken by column
                description = line[:34].split()
                rank = int(description[0])
                hit_id = description[1]

                pos = line[85:94].strip()
                start, end = map(int, pos.split('-'))

                qpos = line[75:84].strip()
                qstart, qend = map(int, qpos.split('-'))

                # the table value is divided by 100 before being stored
                probability = float(line[35:40]) / 100.0

                hit = HHpredHit(rank, hit_id, start, end, qstart, qend,
                                probability, qlen)

                hit.evalue = float(line[41:48])
                hit.pvalue = float(line[49:56])
                hit.score = float(line[57:63])
                hit.ss_score = float(line[64:69])

                # trailing column looks like "(123)"; drop the parentheses
                hit.slength = int(line[94:].replace('(', '').replace(')', ''))

                hits[hit.rank] = hit
                alis[hit.rank] = {'q': [], 's': []}

            elif in_alis:
                if line.startswith('Done'):
                    break

                elif line.startswith('No '):
                    # start of the alignment block for the given rank
                    c_rank = int(line[3:])
                    if c_rank not in hits:
                        raise HHOutputFormatError(
                            'Alignment {0}. refers to a non-existing hit'.
                            format(c_rank))

                elif line.startswith('>'):
                    hits[c_rank].name = line[1:].strip()

                elif line.startswith('Probab='):
                    for pair in line.split():
                        key, value = pair.split('=')
                        if key == 'Identities':
                            hits[c_rank].identity = float(
                                value.replace('%', ''))
                        elif key == 'Similarity':
                            hits[c_rank].similarity = float(value)
                        elif key == 'Sum_probs':
                            hits[c_rank].prob_sum = float(value)

                elif (line.startswith('Q ')
                        and line[:11].rstrip() not in q_annotations):
                    # residues start at column 22 and run up to the first
                    # whitespace or digit
                    for residue in line[22:]:
                        if residue.isspace() or residue.isdigit():
                            break
                        alis[c_rank]['q'].append(residue)
                        has_alis = True

                elif (line.startswith('T ')
                        and line[:11].rstrip() not in t_annotations):
                    for residue in line[22:]:
                        if residue.isspace() or residue.isdigit():
                            break
                        alis[c_rank]['s'].append(residue)

        if self.alignments and has_alis:
            for rank, rows in alis.items():
                try:
                    hits[rank].add_alignment(rows['q'], rows['s'])

                except (KeyError, ValueError) as er:
                    raise HHOutputFormatError(
                        'Corrupt alignment at hit No {0}.\n {1}'.format(
                            rank, er))

        hit_list = HHpredHitList(hits.values())
        hit_list.sort()

        # add data obtained from the header to the HHpredHitList
        for identifier, data in header.items():
            attribute = header_attributes.get(identifier)
            if attribute is not None:
                setattr(hit_list, attribute, data)

        return hit_list