def _create_hsp(hid, qid, hspd):
    """Build one HSP object from the given parsed HSP values (PRIVATE).

    Arguments:
     - hid - hit ID handed to every HSPFragment that is created.
     - qid - query ID handed to every HSPFragment that is created.
     - hspd - dictionary of parsed HSP values. "query_ranges" is always
       read; "hit_ranges", "query_strand" and "hit_strand" are read once
       per query range; "hit", "query", "aln_annotation" and the
       HSP-level attributes copied at the end are optional.

    Returns a single HSP object wrapping every fragment that was built
    (not a list of HSP objects).

    """
    hit_seqs = hspd.get("hit")
    query_seqs = hspd.get("query")
    fragments = []
    # query_ranges drives the loop; hit_ranges is indexed in parallel
    for pos, query_range in enumerate(hspd["query_ranges"]):
        # a missing sequence list gives the fragment an empty sequence
        hit_seq = hit_seqs[pos] if hit_seqs is not None else ""
        query_seq = query_seqs[pos] if query_seqs is not None else ""
        fragment = HSPFragment(hid, qid, hit=hit_seq, query=query_seq)

        # coordinates
        fragment.query_start = query_range[0]
        fragment.query_end = query_range[1]
        hit_range = hspd["hit_ranges"][pos]
        fragment.hit_start = hit_range[0]
        fragment.hit_end = hit_range[1]

        # alignment annotation; copying stops silently at the first
        # annotation that has no entry for this fragment
        try:
            for name, per_fragment in hspd.get("aln_annotation", {}).items():
                fragment.aln_annotation[name] = per_fragment[pos]
        except IndexError:
            pass

        # strands
        fragment.query_strand = hspd["query_strand"]
        fragment.hit_strand = hspd["hit_strand"]

        # a "#" in the similarity line means the fragment is handed to
        # _split_fragment and its pieces are stored instead of the fragment
        similarity = fragment.aln_annotation.get("similarity")
        if similarity is not None and "#" in similarity:
            fragments.extend(_split_fragment(fragment))
            continue

        # try to set frame if there are translation in the alignment
        needs_frame = (
            len(fragment.aln_annotation) > 1 or fragment.query_strand == 0
        )
        if not needs_frame and "vulgar_comp" in hspd:
            needs_frame = bool(re.search(_RE_TRANS, hspd["vulgar_comp"]))
        if needs_frame:
            _set_frame(fragment)

        fragments.append(fragment)

    # if the query is protein, the hit and query sequences are changed from
    # three-letter amino acid codes to one letter, and their coordinates
    # are adjusted accordingly (2 annotations == protein query)
    if len(fragments[0].aln_annotation) == 2:
        fragments = _adjust_aa_seq(fragments)

    hsp = HSP(fragments)
    # copy over whichever hsp-specific attributes were parsed
    hsp_attrs = (
        "score",
        "hit_split_codons",
        "query_split_codons",
        "model",
        "vulgar_comp",
        "cigar_comp",
        "alphabet",
    )
    for name in hsp_attrs:
        if name in hspd:
            setattr(hsp, name, hspd[name])

    return hsp
Ejemplo n.º 2
0
def _create_hsp(hid, qid, hspd):
    """Assemble a single HSP object from the given parsed HSP values.

    One HSPFragment is created for each entry of ``hspd["query_ranges"]``
    and the entry of ``hspd["hit_ranges"]`` at the same index supplies the
    hit coordinates.

    Arguments:
     - hid - hit ID given to each HSPFragment.
     - qid - query ID given to each HSPFragment.
     - hspd - dictionary holding the parsed HSP values.

    Returns the HSP object built from all fragments (one object, not a
    list).

    """
    hit_seq_list = hspd.get("hit")
    query_seq_list = hspd.get("query")
    collected = []
    # iterate over the query ranges; hit ranges are looked up by index
    for i, qrange in enumerate(hspd["query_ranges"]):
        # sequences default to an empty string when none were parsed
        if hit_seq_list is None:
            hit_seq = ""
        else:
            hit_seq = hit_seq_list[i]
        if query_seq_list is None:
            query_seq = ""
        else:
            query_seq = query_seq_list[i]
        piece = HSPFragment(hid, qid, hit=hit_seq, query=query_seq)

        # coordinates
        piece.query_start = qrange[0]
        piece.query_end = qrange[1]
        piece.hit_start = hspd["hit_ranges"][i][0]
        piece.hit_end = hspd["hit_ranges"][i][1]

        # alignment annotation: an annotation without an entry for this
        # index ends the copying without raising
        annotations = hspd.get("aln_annotation", {})
        try:
            for label, values in annotations.items():
                piece.aln_annotation[label] = values[i]
        except IndexError:
            pass

        # strands
        piece.query_strand = hspd["query_strand"]
        piece.hit_strand = hspd["hit_strand"]

        # fragments whose similarity line contains "#" are replaced by
        # whatever _split_fragment returns for them
        similarity_line = piece.aln_annotation.get("similarity")
        if similarity_line is not None and "#" in similarity_line:
            collected.extend(_split_fragment(piece))
            continue

        # try to set frame if there are translation in the alignment
        translated = (
            len(piece.aln_annotation) > 1
            or piece.query_strand == 0
            or ("vulgar_comp" in hspd and re.search(_RE_TRANS, hspd["vulgar_comp"]))
        )
        if translated:
            _set_frame(piece)

        collected.append(piece)

    # 2 annotations == protein query: convert the hit and query sequences
    # from three-letter amino acid codes to one letter and adjust their
    # coordinates accordingly
    first_annotation_count = len(collected[0].aln_annotation)
    if first_annotation_count == 2:
        collected = _adjust_aa_seq(collected)

    hsp = HSP(collected)
    # set hsp-specific attributes that are present in the parsed values
    optional_attrs = (
        "score",
        "hit_split_codons",
        "query_split_codons",
        "model",
        "vulgar_comp",
        "cigar_comp",
        "alphabet",
    )
    for attr_name in optional_attrs:
        if attr_name in hspd:
            setattr(hsp, attr_name, hspd[attr_name])

    return hsp
Ejemplo n.º 3
0
def _create_hsp(hid, qid, hspd):
    """Create one HSP object from the given parsed HSP values.

    Arguments:
     - hid - hit ID used for every HSPFragment.
     - qid - query ID used for every HSPFragment.
     - hspd - dictionary of parsed HSP values; one fragment is created
       per entry of its 'query_ranges' list.

    Returns a single HSP object containing the created fragments (the
    function does not return a list).

    """
    parsed_hit_seqs = hspd.get('hit')
    parsed_query_seqs = hspd.get('query')
    frag_list = []
    # query_ranges is walked here, hit_ranges is read at the same index
    for n, query_coords in enumerate(hspd['query_ranges']):
        # get sequences ('' when none were parsed), create object
        hit_seq = '' if parsed_hit_seqs is None else parsed_hit_seqs[n]
        query_seq = '' if parsed_query_seqs is None else parsed_query_seqs[n]
        new_frag = HSPFragment(hid, qid, hit=hit_seq, query=query_seq)

        # coordinates
        new_frag.query_start = query_coords[0]
        new_frag.query_end = query_coords[1]
        hit_coords = hspd['hit_ranges'][n]
        new_frag.hit_start = hit_coords[0]
        new_frag.hit_end = hit_coords[1]

        # alignment annotation; an IndexError stops the copying quietly
        try:
            for annot_key, annot_values in hspd.get('aln_annotation', {}).items():
                new_frag.aln_annotation[annot_key] = annot_values[n]
        except IndexError:
            pass

        # strands
        new_frag.query_strand = hspd['query_strand']
        new_frag.hit_strand = hspd['hit_strand']

        # a '#' in the homology line: store the pieces produced by
        # _split_fragment instead of the fragment itself
        homology = new_frag.aln_annotation.get('homology')
        if homology is not None and '#' in homology:
            frag_list.extend(_split_fragment(new_frag))
            continue

        # try to set frame if there are translation in the alignment
        if (len(new_frag.aln_annotation) > 1
                or new_frag.query_strand == 0
                or ('vulgar_comp' in hspd
                    and re.search(_RE_TRANS, hspd['vulgar_comp']))):
            _set_frame(new_frag)

        frag_list.append(new_frag)

    # if the query is protein (2 annotations), the hit and query sequences
    # are changed from three-letter amino acid codes to one letter, and
    # their coordinates adjusted accordingly
    if len(frag_list[0].aln_annotation) == 2:
        frag_list = _adjust_aa_seq(frag_list)

    hsp = HSP(frag_list)
    # set hsp-specific attributes, skipping the ones that were not parsed
    for key in ('score', 'hit_split_codons', 'query_split_codons',
                'model', 'vulgar_comp', 'cigar_comp', 'alphabet'):
        if key in hspd:
            setattr(hsp, key, hspd[key])

    return hsp
Ejemplo n.º 4
0
def _create_hsp(hid, qid, psl):
    """Create an HSP object from one parsed PSL row (PRIVATE).

    Arguments:
     - hid - hit ID handed to every HSPFragment.
     - qid - query ID handed to every HSPFragment.
     - psl - dictionary of parsed PSL values. Keys that may be read:
       'strand', 'qstarts', 'tstarts', 'blocksizes', 'qsize', 'tsize',
       'qstart', 'qend', 'tstart', 'tend', 'matches', 'mismatches',
       'repmatches', 'ncount', 'qnuminsert', 'qbaseinsert', 'tnuminsert',
       'tbaseinsert' and, optionally, 'tseqs' and 'qseqs'.

    Returns an HSP object holding one HSPFragment per alignment block.

    Raises AssertionError when the number of blocks, sequences or
    coordinates disagree, or when the assembled HSP does not reproduce the
    row-level start/end values and block sizes.

    """
    # protein flag
    is_protein = _is_protein(psl)
    # strand
    # if query is protein, strand is 0
    if is_protein:
        qstrand = 0
    else:
        qstrand = 1 if psl['strand'][0] == '+' else -1
    # try to get hit strand, if it exists
    try:
        hstrand = 1 if psl['strand'][1] == '+' else -1
    except IndexError:
        hstrand = 1  # hit strand defaults to plus

    # Block sizes of a protein query are scaled by 3 on the hit side (one
    # codon per residue); for any other query the hit-side block sizes
    # equal the query-side ones.
    blocksize_multiplier = 3 if is_protein else 1
    hit_blocksizes = [blocksize_multiplier * size
                      for size in psl['blocksizes']]

    # query block starts
    qstarts = _reorient_starts(psl['qstarts'],
                               psl['blocksizes'], psl['qsize'], qstrand)
    # hit block starts
    if len(psl['strand']) == 2:
        hstarts = _reorient_starts(psl['tstarts'],
                                   hit_blocksizes, psl['tsize'], hstrand)
    else:
        hstarts = psl['tstarts']
    # set query and hit coords
    # this assumes each block has no gaps (which seems to be the case)
    assert len(qstarts) == len(hstarts) == len(psl['blocksizes'])
    # Real lists (not zip objects) are required: the ranges are measured
    # with len() and indexed below, which a Python 3 zip iterator does not
    # support.
    query_range_all = [(start, start + size)
                       for start, size in zip(qstarts, psl['blocksizes'])]
    hit_range_all = [(start, start + size)
                     for start, size in zip(hstarts, hit_blocksizes)]
    # check length of sequences and coordinates, all must match
    if 'tseqs' in psl and 'qseqs' in psl:
        assert len(psl['tseqs']) == len(psl['qseqs']) == \
            len(query_range_all) == len(hit_range_all)
    else:
        assert len(query_range_all) == len(hit_range_all)

    hseqlist = psl.get('tseqs')
    qseqlist = psl.get('qseqs')
    frags = []
    # iterating over query_range_all, but hit_range_all works just as well
    for idx, qcoords in enumerate(query_range_all):
        # an absent or empty sequence list yields an empty sequence
        hseq = '' if not hseqlist else hseqlist[idx]
        qseq = '' if not qseqlist else qseqlist[idx]
        frag = HSPFragment(hid, qid, hit=hseq, query=qseq)
        # set alphabet
        frag.alphabet = generic_dna
        # set coordinates
        frag.query_start = qcoords[0]
        frag.query_end = qcoords[1]
        frag.hit_start = hit_range_all[idx][0]
        frag.hit_end = hit_range_all[idx][1]
        # and strands
        frag.query_strand = qstrand
        frag.hit_strand = hstrand
        frags.append(frag)

    # create hsp object
    hsp = HSP(frags)
    # check if start and end are set correctly
    assert hsp.query_start == psl['qstart']
    assert hsp.query_end == psl['qend']
    assert hsp.hit_start == psl['tstart']
    assert hsp.hit_end == psl['tend']
    # and check block spans as well (hit spans against the scaled sizes)
    assert hsp.query_span_all == psl['blocksizes']
    assert hsp.hit_span_all == hit_blocksizes
    # set its attributes
    hsp.match_num = psl['matches']
    hsp.mismatch_num = psl['mismatches']
    hsp.match_rep_num = psl['repmatches']
    hsp.n_num = psl['ncount']
    hsp.query_gapopen_num = psl['qnuminsert']
    hsp.query_gap_num = psl['qbaseinsert']
    hsp.hit_gapopen_num = psl['tnuminsert']
    hsp.hit_gap_num = psl['tbaseinsert']

    # totals combined from the per-row counters
    hsp.ident_num = psl['matches'] + psl['repmatches']
    hsp.gapopen_num = psl['qnuminsert'] + psl['tnuminsert']
    hsp.gap_num = psl['qbaseinsert'] + psl['tbaseinsert']
    hsp.query_is_protein = is_protein
    hsp.ident_pct = 100.0 - _calc_millibad(psl, is_protein) * 0.1
    hsp.score = _calc_score(psl, is_protein)
    # helper flag, for writing
    hsp._has_hit_strand = len(psl['strand']) == 2

    return hsp
Ejemplo n.º 5
0
def _create_hsp(hid, qid, psl):
    """Create high scoring pair object (PRIVATE).

    Arguments:
     - hid - hit ID given to each HSPFragment.
     - qid - query ID given to each HSPFragment.
     - psl - dictionary of parsed PSL values for one alignment row.

    Returns an HSP object with one HSPFragment per alignment block.

    The assert statements verify that block, sequence and coordinate
    counts agree and that the assembled HSP reproduces the row-level
    start/end values and block sizes.

    """
    query_is_protein = _is_protein(psl)
    strand = psl["strand"]

    # a protein query gets strand 0; otherwise the first strand character
    # decides between plus (1) and minus (-1)
    if query_is_protein:
        qstrand = 0
    elif strand[0] == "+":
        qstrand = 1
    else:
        qstrand = -1

    # the hit strand is the second strand character and defaults to plus
    # when there is none
    try:
        hit_strand_char = strand[1]
    except IndexError:
        hstrand = 1
    else:
        hstrand = 1 if hit_strand_char == "+" else -1

    # hit-side block sizes are three times the listed size for a protein
    # query
    scale = 3 if query_is_protein else 1

    # query block starts
    qstarts = _reorient_starts(
        psl["qstarts"], psl["blocksizes"], psl["qsize"], qstrand
    )
    block_sizes = psl["blocksizes"]
    # hit block starts; only reoriented when a hit strand was given
    if len(strand) == 2:
        hstarts = _reorient_starts(
            psl["tstarts"],
            [scale * size for size in block_sizes],
            psl["tsize"],
            hstrand,
        )
    else:
        hstarts = psl["tstarts"]

    # set query and hit coords
    # this assumes each block has no gaps (which seems to be the case)
    assert len(qstarts) == len(hstarts) == len(block_sizes)
    query_range_all = [
        (start, start + size) for start, size in zip(qstarts, block_sizes)
    ]
    hit_range_all = [
        (start, start + size * scale)
        for start, size in zip(hstarts, block_sizes)
    ]

    # check length of sequences and coordinates, all must match
    if "tseqs" in psl and "qseqs" in psl:
        n_ranges = len(query_range_all)
        assert len(psl["tseqs"]) == len(psl["qseqs"]) == n_ranges == len(hit_range_all)
    else:
        assert len(query_range_all) == len(hit_range_all)

    hit_seqs = psl.get("tseqs")
    query_seqs = psl.get("qseqs")
    frags = []
    # one fragment per block; hit ranges are read at the same index
    for pos, query_range in enumerate(query_range_all):
        hit_seq = hit_seqs[pos] if hit_seqs else ""
        query_seq = query_seqs[pos] if query_seqs else ""
        fragment = HSPFragment(hid, qid, hit=hit_seq, query=query_seq)
        # set alphabet
        fragment.alphabet = generic_dna
        # set coordinates
        fragment.query_start = query_range[0]
        fragment.query_end = query_range[1]
        fragment.hit_start = hit_range_all[pos][0]
        fragment.hit_end = hit_range_all[pos][1]
        # and strands
        fragment.query_strand = qstrand
        fragment.hit_strand = hstrand
        frags.append(fragment)

    # create hsp object
    hsp = HSP(frags)
    # check if start and end are set correctly
    for attr_name, psl_key in (
        ("query_start", "qstart"),
        ("query_end", "qend"),
        ("hit_start", "tstart"),
        ("hit_end", "tend"),
    ):
        assert getattr(hsp, attr_name) == psl[psl_key]
    # and check block spans as well, after scaling the hit spans back down
    hit_spans = [span / scale for span in hsp.hit_span_all]
    assert hit_spans == hsp.query_span_all == psl["blocksizes"]

    # counters copied straight from the row
    for attr_name, psl_key in (
        ("match_num", "matches"),
        ("mismatch_num", "mismatches"),
        ("match_rep_num", "repmatches"),
        ("n_num", "ncount"),
        ("query_gapopen_num", "qnuminsert"),
        ("query_gap_num", "qbaseinsert"),
        ("hit_gapopen_num", "tnuminsert"),
        ("hit_gap_num", "tbaseinsert"),
    ):
        setattr(hsp, attr_name, psl[psl_key])

    # values derived from several counters
    hsp.ident_num = psl["matches"] + psl["repmatches"]
    hsp.gapopen_num = psl["qnuminsert"] + psl["tnuminsert"]
    hsp.gap_num = psl["qbaseinsert"] + psl["tbaseinsert"]
    hsp.query_is_protein = query_is_protein
    hsp.ident_pct = 100.0 - _calc_millibad(psl, query_is_protein) * 0.1
    hsp.score = _calc_score(psl, query_is_protein)
    # helper flag, for writing
    hsp._has_hit_strand = len(psl["strand"]) == 2

    return hsp
Ejemplo n.º 6
0
def _create_hsp(hid, qid, psl):
    """Create an HSP object from one parsed PSL row (PRIVATE).

    Arguments:
     - hid - hit ID handed to every HSPFragment.
     - qid - query ID handed to every HSPFragment.
     - psl - dictionary of parsed PSL values (strand, block starts and
       sizes, sequence sizes, start/end coordinates, match and insert
       counters and, optionally, the 'tseqs' / 'qseqs' block sequences).

    Returns an HSP object holding one HSPFragment per alignment block.

    Raises AssertionError when the number of blocks, sequences or
    coordinates disagree, or when the assembled HSP does not reproduce the
    row-level start/end values and block sizes.

    """
    # protein flag
    is_protein = _is_protein(psl)
    # strand
    # if query is protein, strand is 0
    if is_protein:
        qstrand = 0
    else:
        qstrand = 1 if psl['strand'][0] == '+' else -1
    # try to get hit strand, if it exists
    try:
        hstrand = 1 if psl['strand'][1] == '+' else -1
    except IndexError:
        hstrand = 1  # hit strand defaults to plus

    # For a protein query every block covers three hit positions per unit
    # of block size (one codon per residue), so hit-side sizes are scaled
    # by 3; otherwise they equal the listed block sizes.
    blocksize_multiplier = 3 if is_protein else 1
    hit_blocksizes = [blocksize_multiplier * size
                      for size in psl['blocksizes']]

    # query block starts
    qstarts = _reorient_starts(psl['qstarts'], psl['blocksizes'], psl['qsize'],
                               qstrand)
    # hit block starts
    if len(psl['strand']) == 2:
        hstarts = _reorient_starts(psl['tstarts'], hit_blocksizes,
                                   psl['tsize'], hstrand)
    else:
        hstarts = psl['tstarts']
    # set query and hit coords
    # this assumes each block has no gaps (which seems to be the case)
    assert len(qstarts) == len(hstarts) == len(psl['blocksizes'])
    query_range_all = [(start, start + size)
                       for start, size in zip(qstarts, psl['blocksizes'])]
    hit_range_all = [(start, start + size)
                     for start, size in zip(hstarts, hit_blocksizes)]
    # check length of sequences and coordinates, all must match
    if 'tseqs' in psl and 'qseqs' in psl:
        assert len(psl['tseqs']) == len(psl['qseqs']) == \
            len(query_range_all) == len(hit_range_all)
    else:
        assert len(query_range_all) == len(hit_range_all)

    hseqlist = psl.get('tseqs')
    qseqlist = psl.get('qseqs')
    frags = []
    # iterating over query_range_all, but hit_range_all works just as well
    for idx, qcoords in enumerate(query_range_all):
        # an absent or empty sequence list yields an empty sequence
        hseq = '' if not hseqlist else hseqlist[idx]
        qseq = '' if not qseqlist else qseqlist[idx]
        frag = HSPFragment(hid, qid, hit=hseq, query=qseq)
        # set alphabet
        frag.alphabet = generic_dna
        # set coordinates
        frag.query_start = qcoords[0]
        frag.query_end = qcoords[1]
        frag.hit_start = hit_range_all[idx][0]
        frag.hit_end = hit_range_all[idx][1]
        # and strands
        frag.query_strand = qstrand
        frag.hit_strand = hstrand
        frags.append(frag)

    # create hsp object
    hsp = HSP(frags)
    # check if start and end are set correctly
    assert hsp.query_start == psl['qstart']
    assert hsp.query_end == psl['qend']
    assert hsp.hit_start == psl['tstart']
    assert hsp.hit_end == psl['tend']
    # and check block spans as well (hit spans against the scaled sizes)
    assert hsp.query_span_all == psl['blocksizes']
    assert hsp.hit_span_all == hit_blocksizes
    # set its attributes
    hsp.match_num = psl['matches']
    hsp.mismatch_num = psl['mismatches']
    hsp.match_rep_num = psl['repmatches']
    hsp.n_num = psl['ncount']
    hsp.query_gapopen_num = psl['qnuminsert']
    hsp.query_gap_num = psl['qbaseinsert']
    hsp.hit_gapopen_num = psl['tnuminsert']
    hsp.hit_gap_num = psl['tbaseinsert']

    # totals combined from the per-row counters
    hsp.ident_num = psl['matches'] + psl['repmatches']
    hsp.gapopen_num = psl['qnuminsert'] + psl['tnuminsert']
    hsp.gap_num = psl['qbaseinsert'] + psl['tbaseinsert']
    hsp.query_is_protein = is_protein
    hsp.ident_pct = 100.0 - _calc_millibad(psl, is_protein) * 0.1
    hsp.score = _calc_score(psl, is_protein)
    # helper flag, for writing
    hsp._has_hit_strand = len(psl['strand']) == 2

    return hsp