def _start_Hit(self):
     """Open a new hit: add an empty alignment and description to the record."""
     record = self._blast
     record.alignments.append(Record.Alignment())
     record.descriptions.append(Record.Description())
     # Each hit starts with an empty list of multiple alignments.
     record.multiple_alignment = []
     # Keep handles on the objects that were just appended.
     self._hit, self._descr = record.alignments[-1], record.descriptions[-1]
     self._descr.num_alignments = 0
 def _start_Hsp(self):
     """Open a new HSP on the current hit and a matching multiple alignment."""
     # Relies on self._start_Hit() having run first: it creates self._hit,
     # self._descr and self._blast.multiple_alignment.
     hsp_list = self._hit.hsps
     hsp_list.append(Record.HSP())
     self._hsp = hsp_list[-1]
     self._descr.num_alignments += 1
     mult_list = self._blast.multiple_alignment
     mult_list.append(Record.MultipleAlignment())
     self._mult_al = mult_list[-1]
Example #3
0
 def _start_Hit(self):
     """Begin a new hit on the current BLAST record (PRIVATE)."""
     alignments = self._blast.alignments
     alignments.append(Record.Alignment())
     descriptions = self._blast.descriptions
     descriptions.append(Record.Description())
     # A new hit starts with no multiple alignments.
     self._blast.multiple_alignment = []
     self._hit = alignments[-1]
     self._descr = descriptions[-1]
     self._descr.num_alignments = 0
    def _start_hit(self):
        """Begin a new hit on the current BLAST record (PRIVATE)."""
        record = self._blast
        record.alignments.append(Record.Alignment())
        # XML version 1 uses the plain description class; any other
        # version uses the extended one.
        if self._xml_version == 1:
            self._descr = Record.Description()
        else:
            self._descr = Record.DescriptionExt()
        record.descriptions.append(self._descr)
        record.multiple_alignment = []
        self._hit = record.alignments[-1]

        self._descr.num_alignments = 0
Example #5
0
def blastalignmentfromdict(indict):
    """Rebuild a ``Record.Alignment`` from its dictionary form.

    Every entry except ``'hsps'`` is copied onto the new object as an
    attribute; the ``'hsps'`` entry is converted item by item with
    ``hspfromdict``.  A missing ``'hsps'`` key raises ``KeyError``.
    """
    out = Record.Alignment()
    # Looked up before any attribute is set, so a missing key fails early.
    hsp_dicts = indict['hsps']
    for name, value in indict.items():
        if name != 'hsps':
            setattr(out, name, value)
    out.hsps = [hspfromdict(item) for item in hsp_dicts]
    return out
Example #6
0
def blastfromdict(indict):
    """Rebuild a ``Record.Blast`` from its dictionary form.

    ``'descriptions'`` and ``'alignments'`` entries are converted item by
    item (via ``universalfromdict`` and ``blastalignmentfromdict``); every
    other entry is copied onto the new object as a plain attribute.
    """
    out = Record.Blast()
    for name, value in indict.items():
        if name == 'alignments':
            out.alignments = [blastalignmentfromdict(item) for item in value]
        elif name == 'descriptions':
            out.descriptions = [
                universalfromdict(item, Record.Description) for item in value
            ]
        else:
            setattr(out, name, value)
    return out
 def _start_Iteration(self):
     """Start a new iteration with a fresh, empty BLAST record."""
     new_record = Record.Blast()
     self._blast = new_record
 def reset(self):
     """Clear all parsed data so this parser object can be used again."""
     self._records = []
     self._header = Record.Header()
     params = Record.Parameters()
     self._parameters = params
     # The filter attribute is set here explicitly rather than by the class.
     params.filter = None
Example #9
0
 def _start_Iteration(self):
     """Start a new iteration with an empty BLAST record (PRIVATE)."""
     fresh_record = Record.Blast()
     self._blast = fresh_record
Example #10
0
 def _start_hit_descr_item(self):
     """XML v2. Create an empty item for the hit description being read."""
     descr_item = Record.DescriptionExtItem()
     self._hit_descr_item = descr_item
Example #11
0
 def _start_blast_record(self):
     """Start a new, empty BLAST record (PRIVATE)."""
     blank_record = Record.Blast()
     self._blast = blank_record
Example #12
0
def split_hsp(hsp, pos, blast_stats_calc):
    '''
    Split one HSP into two new HSPs at an alignment column.

    The part before ``pos`` loses trailing columns, and the part from ``pos``
    on loses leading columns, for as long as those columns hold a gap or a
    mismatch.  Coordinates, match line, identities, gaps, score, bit score and
    e-value are then recomputed for both parts.

    :param hsp: HSP to split; its query, sbjct, frame, query_start, query_end,
        sbjct_start and sbjct_end attributes are read.
    :param pos: Position in HSP to split at (relative to start of HSP).
    :param blast_stats_calc: Object providing calculate_alignment_score,
        calculate_bit_score_from_alignment_score and
        calculate_e_value_from_bit_score.
    :return: Two new HSPs; a part left with zero alignment length is returned
        as None.
    :raises RuntimeError: When ``hsp.frame[0]`` is not greater than zero.
    '''
    def trim_ends(q_seq, s_seq, reverse=True):
        '''
        Drop end columns that do not help the alignment.

        Scans from the end (``reverse`` true) or from the start and counts
        columns containing a gap or two different characters, stopping at the
        first column that is neither.  Returns the shortened query, the
        shortened subject and the number of columns dropped.
        '''
        columns = list(zip(q_seq, s_seq))
        if reverse:
            columns.reverse()

        dropped = 0
        for column in columns:
            if '-' not in column and len(set(column)) != 2:
                break
            dropped += 1

        if not dropped:
            return q_seq, s_seq, 0
        if reverse:
            return q_seq[:-dropped], s_seq[:-dropped], dropped
        return q_seq[dropped:], s_seq[dropped:], dropped

    def get_alignment_stats(q_seq, s_seq):
        '''
        Build the match line and count identities, mismatches and gap columns.
        '''
        symbols = []
        identities = 0
        mismatches = 0
        gap_columns = 0

        for idx, q_base in enumerate(q_seq):
            if q_base == '-' or s_seq[idx] == '-':
                symbols.append(' ')
                gap_columns += 1
            elif q_base != s_seq[idx]:
                symbols.append(' ')
                mismatches += 1
            else:
                symbols.append('|')
                identities += 1
        return ''.join(symbols), identities, mismatches, gap_columns

    def residue_span(aligned_len, seq):
        '''
        Distance from first to last residue: aligned length minus gaps minus 1.
        '''
        return aligned_len - seq.count('-') - 1

    # Empty biopython HSP objects for the two parts
    first = Record.HSP()
    second = Record.HSP()

    # Sequences of the new HSPs (the trimmed-column counts are not needed)
    first.query, first.sbjct, _ = trim_ends(hsp.query[:pos], hsp.sbjct[:pos])
    second.query, second.sbjct, _ = trim_ends(hsp.query[pos:],
                                              hsp.sbjct[pos:], False)

    first.align_length = len(first.query)
    second.align_length = len(second.query)

    # Alignment statistics
    first.match, first.identities, mismatches_first, first.gaps = \
        get_alignment_stats(first.query, first.sbjct)
    second.match, second.identities, mismatches_second, second.gaps = \
        get_alignment_stats(second.query, second.sbjct)

    # Positives are set equal to identities
    first.positives = first.identities
    second.positives = second.identities

    # Only a query in positive frame is supported
    if not (hsp.frame[0] > 0):
        raise RuntimeError(
            'Negative frame query?? We have not prepared for this situation!!!'
        )

    first.query_start = hsp.query_start
    first.query_end = first.query_start + residue_span(first.align_length,
                                                       first.query)
    second.query_end = hsp.query_end
    second.query_start = hsp.query_end - residue_span(second.align_length,
                                                      second.query)

    # Subject coordinates grow along the alignment in positive frame and
    # shrink otherwise
    step = 1 if hsp.frame[1] > 0 else -1
    first.sbjct_start = hsp.sbjct_start
    second.sbjct_end = hsp.sbjct_end
    first.sbjct_end = first.sbjct_start + step * residue_span(
        first.align_length, first.sbjct)
    second.sbjct_start = hsp.sbjct_end - step * residue_span(
        second.align_length, second.sbjct)

    # Number of gap openings, query and subject together
    openings_first = (count_gap_openings(first.query) +
                      count_gap_openings(first.sbjct))
    openings_second = (count_gap_openings(second.query) +
                       count_gap_openings(second.sbjct))

    # Score, then bit score, then e-value
    score_of = blast_stats_calc.calculate_alignment_score
    first.score = score_of(first.identities, mismatches_first, first.gaps,
                           openings_first)
    second.score = score_of(second.identities, mismatches_second, second.gaps,
                            openings_second)

    bits_of = blast_stats_calc.calculate_bit_score_from_alignment_score
    first.bits = bits_of(first.score)
    second.bits = bits_of(second.score)

    expect_of = blast_stats_calc.calculate_e_value_from_bit_score
    first.expect = expect_of(first.bits)
    second.expect = expect_of(second.bits)

    first.frame = hsp.frame
    second.frame = hsp.frame

    # A part with nothing left after trimming is reported as None
    return (None if first.align_length == 0 else first,
            None if second.align_length == 0 else second)
Example #13
0
def read_aligns(f, R):
    """
    Read the alignment section of a text BLAST report into ``R``.

    Lines are pulled from ``f`` with ``bread`` until it returns an empty
    string or a line starting with ``Query=`` is met.  Each hit becomes a
    ``Record.Alignment`` appended to ``R.alignments``; each score block below
    it becomes a ``Record.HSP`` appended to that alignment's ``hsps``.

    issue:
     now, genebank switched to using accession instead of gis
     and in blast there are accession.version identifier (and sometimes something different)

    :param f: handle of the report; passed to ``bread`` and
        ``_parse_features``, and queried with ``f.tell()`` for debug logging.
    :param R: record whose ``alignments`` list receives the parsed hits.
    :return: tuple ``(R, txt)`` where ``txt`` is the last line read (an empty
        string or a line starting with ``Query=``).
    :raises ValueError: if the input ends before the ``Length=`` line of a hit.
    """
    parse_regexp_q = '[ACGTUKSYMWRBDHVNacgtuksymwrbdhvn-]+'
    # NOTE: eor is never set to False below, so it never ends a loop by itself.
    eor = True
    txt = bread(f)  # get line after alignments
    while txt and eor:
        if txt[:6] == 'Query=':
            break
        c = re.search(r'(?<=>) *[\S|.]+', txt)
        def_rem = bread(f)
        # the hit definition may span several lines; collect them up to the
        # line carrying "Length="
        while not re.search('Length=', def_rem):
            if def_rem == '':
                # an empty string is treated as end of input here, as in the
                # other loops of this function; without this check the loop
                # would never terminate on a truncated report
                raise ValueError(
                    'unexpected end of input while reading hit definition: '
                    '{}'.format(txt))
            txt += def_rem
            def_rem = bread(f)
        # get new alig object
        curr_alig = Record.Alignment()

        curr_alig.hit_id = c.group()
        curr_alig.hit_def = txt[c.end() + 1:]
        curr_alig.length = int(re.search(r'(?<=Length=)\d+', def_rem).group())

        # draw next line
        txt = bread(f)

        while txt and eor:
            # loop the hsps of the record
            if txt[0] == '>' or txt[:6] == 'Query=':
                break

            curr_hsp = Record.HSP()

            while txt and eor:
                # parse scores
                if txt[:8] == " Score =":
                    # changing "score" to bits
                    # because the score is in "bits" units
                    curr_hsp.bits = float(
                        re.search(r'(?<=Score =) *\d+\.?\d*',
                                  txt).group(0).lstrip())

                    # changing "bits" to score, because the number in brackets is match score (in nucleotide blast
                    #  same as number of identities)
                    curr_hsp.score = int(
                        re.search(r'(?<=bits \() *\d+(?=\))',
                                  txt).group(0).lstrip())
                    # the regexp for scientific format from here:
                    # http://stackoverflow.com/questions/18152597/extract-scientific-number-from-string
                    curr_hsp.expect = float(
                        re.search(
                            r'(?<=Expect =)-? *[0-9]+\.?[0-9]*(?:[Ee] *-? *[0-9]+)?',
                            txt).group(0).lstrip())
                elif txt[:13] == " Identities =":
                    # return only first int in identities field to mimic the XML parser

                    tmp_idtts = [
                        int(i) for i in re.search(
                            r'(?<=Identities =) *\d+/\d+(?= *\()',
                            txt).group(0).lstrip().split('/')
                    ]
                    tmp_gaps = [
                        int(i)
                        for i in re.search(r'(?<=Gaps =) *\d+/\d+(?= *\()',
                                           txt).group(0).lstrip().split('/')
                    ]
                    curr_hsp.identities = tmp_idtts[0]
                    curr_hsp.gaps = tmp_gaps[0]

                    # both fields report the same alignment length
                    assert tmp_gaps[1] == tmp_idtts[1]

                    curr_hsp.align_length = tmp_idtts[1]

                    del tmp_idtts
                    del tmp_gaps

                elif txt[:8] == " Strand=":
                    curr_hsp.strand = tuple(
                        re.search(r'(?<=Strand=)\S+',
                                  txt).group(0).rstrip().split('/'))
                elif txt[:10] == " Features ":
                    # must be enabled even if output isn't used, moves the pointer past the features field
                    parsed_features = _parse_features(f, txt)
                    curr_hsp.features = parsed_features

                elif txt[:5] == "Query":
                    break

                else:
                    ml.debug("line ignored pos {} - {}".format(f.tell(), txt))

                # read next line
                txt = bread(f)

            # start of alignment (hsps) parse block
            # hsps starts with "Score ="
            # NOTE: if no alignment rows follow, the start/end variables used
            # after this loop keep their value from a previous HSP (or are
            # unbound for the very first one).
            qseq = ''
            mid = ''
            sseq = ''
            count = 1
            # both prefixes must be absent to keep reading rows; the original
            # joined the two tests with "or", which is always true
            while txt[:8] != " Score =" and txt[:10] != " Features ":
                # parsing individual hps alignments
                # alignment may:
                #   1) continue with another line triple signaled by Query
                #   2) break
                #       a) continue with next hsps
                #       b) continue with next alignment (different source sequence (organism))
                #       c) be an EOF (this is not signaled) - tail was already parsed

                # read alignment by triples
                if not txt[:5] == 'Query':
                    # this is eof (may not be): skip ahead to the next hit,
                    # the next query, or the end of input
                    while txt != '':
                        if (txt[:6] == 'Query=') or (txt[0] == ">"):
                            break
                        txt = bread(f)
                    break

                # get query start only at first instance
                if count == 1:
                    query_start = int(re.search(r'\d+', txt[5:]).group())

                # match query end at each instance
                query_end = int(re.search(r'\d+$', txt).group())

                # allow lowercase masking sequences
                # add 5 to prevent matching in Query
                q_info = re.search(parse_regexp_q, txt[5:])
                qseq += q_info.group()

                # get middle line, cut to the columns of the query sequence
                txt = bread(f)
                mid += txt[q_info.start() + 5:q_info.end() + 5]

                # get subject line, cut to the same columns
                txt = bread(f)
                sseq += txt[q_info.start() + 5:q_info.end() + 5]

                if count == 1:
                    subject_start = int(re.search(r'\d+', txt[5:]).group())

                subject_end = int(re.search(r'\d+$', txt).group())

                # go next
                txt = bread(f)
                count += 1
                # txt[:1] instead of txt[0]: txt may be empty when the input
                # ends right after an alignment triple
                if (txt[:1] == '>') or (txt[:8] == " Score =") or (
                        txt[:6] == 'Query=') or txt[:10] == " Features ":
                    break

            # if end of iteration save current hsp and go for next one
            curr_hsp.query = qseq
            curr_hsp.match = mid
            curr_hsp.sbjct = sseq
            curr_hsp.query_start = query_start
            curr_hsp.query_end = query_end
            curr_hsp.sbjct_start = subject_start
            curr_hsp.sbjct_end = subject_end

            assert len(qseq) == len(mid) == len(sseq)

            # append hsps to current alignment
            curr_alig.hsps.append(curr_hsp)

        R.alignments.append(curr_alig)

    if not eor and len(R.alignments) == 0:
        R.alignments.append(curr_alig)
    if len(R.alignments) == 0:
        print('no hits parsed')
    return R, txt
Example #14
0
def _parse_blast_body(f, common_info):
    """
    Generate one blast record per query from an open report handle.

    Every yielded record is a deep copy of a ``Record.Blast`` template that
    was filled once by ``update_blast_record(template, common_info)``, then
    completed with the query name, the query length and the alignments read
    by ``read_aligns``.

    web format differ from standalone export format
    standalone 2.2.8+
        output is segmented by query, each query is ended with KH report
        contain all queries, if no hit for given query: "***** No hits found *****"

    web:
        output is segmented by queries, new query is signaled only by "Query="
        if no hits are found, the query is not part of the report and not signaled in any way that it was in the
         input
        if no hits are returned blast will not allow txt file download

    :param f: open handle of the report, read through ``bread``; it must
        support ``tell`` and ``seek``.
    :param common_info: data passed to ``update_blast_record`` to fill the
        template record.
    :return: generator of blast records.
    """

    # start the parse loop
    txt = bread(f)

    record_holder = Record.Blast()
    update_blast_record(record_holder, common_info)
    do_query_name = True
    do_query_length = True
    R = copy.deepcopy(record_holder)
    while txt != '':
        # parse query name
        if do_query_name and re.search('^Query=', txt):
            # it is possible, that no query name is provided (web)
            R.query = re.search(r'(?<=Query=)\s*\S*', txt).group().lstrip()
            do_query_name = False
            txt = bread(f)
            continue

        # parse query length
        if do_query_length and re.search('^Length=', txt):
            R.query_length = int(
                re.search(r'(?<=Length=) *\d+', txt).group().lstrip())
            txt = bread(f)
            do_query_length = False
            continue

        # skip reference, database and table
        # parse hits to sequence
        if txt[:10] == 'ALIGNMENTS' or txt[0] == '>':
            # enter the alignments loop
            if txt[0] == '>':
                # seek before the line as the subsequent function expects it
                # NOTE(review): the extra 2 presumably accounts for characters
                # removed by bread - confirm against bread
                f.seek(f.tell() - len(txt) - 2)
            [R, txt] = read_aligns(f, R)
            yield R
            R = copy.deepcopy(record_holder)
            do_query_name = True
            do_query_length = True
        elif re.search(r'\*{5}\sNo hits found\s\*{5}', txt):
            # query without hits: emit the record and move on
            yield R
            R = copy.deepcopy(record_holder)
            do_query_name = True
            do_query_length = True
            txt = bread(f)
        elif re.search("^Query=", txt):
            # A new query started before the previous one produced alignments
            # or a "No hits found" marker.  Emit the previous record, but keep
            # this line so the next iteration parses the new query name from
            # it; reading ahead here would drop that name.
            yield R
            R = copy.deepcopy(record_holder)
            do_query_name = True
            do_query_length = True
        else:
            # get the next line
            txt = bread(f)