Ejemplo n.º 1
0
def extract_target(seed_ref, coordinate_ref):
    """ Pull out the part of a seed that lines up with a coordinate reference.

    The seed is padded with zero, one, and two leading dashes, translated,
    and each translation is aligned against the coordinate reference with
    align_it_aa(). The candidate that compares highest (score first) is
    kept, and the seed codons from alignment columns where neither sequence
    has a gap are joined together.

    :param seed_ref: seed reference (nucleotide sequence)
    :param coordinate_ref: coordinate reference (amino acid sequence)
    :return: subsequence of seed_ref that maps to coordinate_ref
    :raises AssertionError: if the best score is less than half the length
        of coordinate_ref (only while assertions are enabled)
    """
    # The placeholder entry guarantees there is always something to unpack.
    candidates = [(-1000000, '', '', 0)]
    for shift in range(3):
        padded_seed = '-' * shift + seed_ref
        alignment = align_it_aa(translate(padded_seed),
                                coordinate_ref,
                                GAP_OPEN_COST,
                                GAP_EXTEND_COST,
                                USE_TERMINAL_COST)
        aligned_seed, aligned_coord, frame_score = alignment
        candidates.append((frame_score, aligned_seed, aligned_coord, shift))
    score, aseed, acoord, frame_index = max(candidates)
    assert score >= len(coordinate_ref) // 2, score

    codons = []
    # Position in seed_ref just past the codon of the current seed amino;
    # starting below zero undoes the dashes that were added for the frame.
    codon_end = -frame_index
    for seed_amino, coord_amino in zip(aseed, acoord):
        if seed_amino != '-':
            codon_end += 3
            if coord_amino != '-':
                codons.append(seed_ref[codon_end - 3:codon_end])
    return ''.join(codons)
Ejemplo n.º 2
0
def find_coord_pos(projects, coord_name, start_pos, end_pos):
    """ Map a range of coordinate reference positions onto an HCV-2 seed.

    Every seed of the 'HCV' project whose name starts with 'HCV-2' is
    translated at nucleotide offsets 0, 1, and 2, and each translation is
    aligned with the coordinate reference. The first alignment with the
    highest score above zero is used to convert the positions.

    :param projects: object providing getReference(name) and
        getProjectSeeds(project_name)
    :param coord_name: name of the coordinate reference to look up
    :param start_pos: position in the coordinate reference, counted from 1,
        where the range starts
    :param end_pos: position in the coordinate reference, counted from 1,
        where the range ends
    :return: (ref_name, ref_start, ref_end); ref_start and ref_end are
        computed from the seed amino acid count and the nucleotide offset,
        and are None when the matching coordinate position never appears
        in the alignment
    :raises TypeError: if no candidate scored above zero, because there is
        no best match to unpack
    """
    coord_seq = projects.getReference(coord_name)
    gap_open = 40
    gap_extend = 10
    use_terminal_gap_penalty = 1
    highest_score = 0
    best_match = None
    for ref_name in sorted(projects.getProjectSeeds('HCV')):
        if not ref_name.startswith('HCV-2'):
            continue
        ref_nuc_seq = projects.getReference(ref_name)
        for nuc_offset in range(3):
            ref_amino_seq = translate(ref_nuc_seq, nuc_offset)
            aligned_coord, aligned_ref, score = align_it_aa(
                coord_seq, ref_amino_seq, gap_open, gap_extend,
                use_terminal_gap_penalty)
            if score > highest_score:
                highest_score = score
                best_match = (ref_name, nuc_offset, aligned_coord, aligned_ref)
    ref_name, nuc_offset, aligned_coord, aligned_ref = best_match
    coord_pos = ref_pos = 0
    ref_start = ref_end = None
    for coord_amino, ref_amino in zip(aligned_coord, aligned_ref):
        if ref_amino != '-':
            ref_pos += 1
        if coord_amino == '-':
            # coord_pos has not advanced, so comparing it again here would
            # overwrite an earlier answer with a later seed position when
            # the seed has an insertion right after the requested position.
            continue
        coord_pos += 1
        if start_pos == coord_pos:
            ref_start = ref_pos * 3 - nuc_offset - 3
        if coord_pos == end_pos:
            ref_end = ref_pos * 3 - nuc_offset
    return ref_name, ref_start, ref_end
Ejemplo n.º 3
0
def extract_target(seed_ref, coordinate_ref):
    """ Return the seed codons that align with the coordinate reference.

    Three candidate translations are made by putting zero, one, or two
    dashes in front of the seed. Each one is aligned to the coordinate
    reference by align_it_aa(), and the candidate tuple that compares
    highest (score first) decides which codons are reported.

    :param seed_ref: seed reference (nucleotide sequence)
    :param coordinate_ref: coordinate reference (amino acid sequence)
    :return: subsequence of seed_ref that maps to coordinate_ref
    :raises AssertionError: when assertions are enabled and the winning
        score is below half the length of coordinate_ref
    """
    winner = (-1000000, '', '', 0)
    for frame in (0, 1, 2):
        translated = translate('-' * frame + seed_ref)
        aligned_seed, aligned_coord, frame_score = align_it_aa(
            translated,
            coordinate_ref,
            GAP_OPEN_COST,
            GAP_EXTEND_COST,
            USE_TERMINAL_COST)
        contender = (frame_score, aligned_seed, aligned_coord, frame)
        if contender > winner:
            winner = contender
    score, aseed, acoord, frame_index = winner
    assert score >= len(coordinate_ref) // 2, score

    pieces = []
    # Start of the next seed codon within seed_ref; it begins below zero
    # to cancel out the dashes that were added in front of the seed.
    codon_start = -frame_index
    for seed_amino, coord_amino in zip(aseed, acoord):
        if seed_amino == '-':
            # Gap in the seed: no codon was consumed.
            continue
        if coord_amino != '-':
            pieces.append(seed_ref[codon_start:codon_start + 3])
        codon_start += 3
    return ''.join(pieces)
Ejemplo n.º 4
0
    def _pair_align(self, reference, query, gap_open=15, gap_extend=5, use_terminal_gap_penalty=1):
        """ Align a query sequence to a reference sequence.

        Delegates to gotoh.align_it_aa(), passing the arguments through in
        the same order.

        @param reference: sequence handed to the aligner first
        @param query: sequence handed to the aligner second
        @param gap_open: gap open setting for the aligner (default 15)
        @param gap_extend: gap extend setting for the aligner (default 5)
        @param use_terminal_gap_penalty: terminal gap flag for the aligner
            (default 1)
        @return: (aligned_ref, aligned_query, score)
        """
        alignment = gotoh.align_it_aa(reference,
                                      query,
                                      gap_open,
                                      gap_extend,
                                      use_terminal_gap_penalty)
        # Unpacking insists on exactly three items and hands back a tuple.
        ref_with_gaps, query_with_gaps, alignment_score = alignment
        return ref_with_gaps, query_with_gaps, alignment_score
Ejemplo n.º 5
0
    def _pair_align(self, reference, query, gap_open=15, gap_extend=5, use_terminal_gap_penalty=1):
        """ Align a query sequence of amino acids to a reference sequence.

        All the work is done by gotoh.align_it_aa(); this method only
        forwards its arguments and repackages the three results.

        @param reference: reference sequence, passed first to the aligner
        @param query: query sequence, passed second to the aligner
        @param gap_open: gap open setting, 15 unless overridden
        @param gap_extend: gap extend setting, 5 unless overridden
        @param use_terminal_gap_penalty: terminal gap flag, 1 unless
            overridden
        @return: (aligned_ref, aligned_query, score)
        """
        aligner_args = (reference,
                        query,
                        gap_open,
                        gap_extend,
                        use_terminal_gap_penalty)
        # noinspection PyUnresolvedReferences
        gapped_ref, gapped_query, total_score = gotoh.align_it_aa(*aligner_args)
        return gapped_ref, gapped_query, total_score
Ejemplo n.º 6
0
    def _pair_align(self, reference, query, gap_open=15, gap_extend=5, use_terminal_gap_penalty=1):
        """ Align a query sequence of amino acids to a reference sequence.

        Thin wrapper around gotoh.align_it_aa() that supplies default
        values for the gap settings.

        @param reference: first sequence given to the aligner
        @param query: second sequence given to the aligner
        @param gap_open: gap open setting (defaults to 15)
        @param gap_extend: gap extend setting (defaults to 5)
        @param use_terminal_gap_penalty: terminal gap flag (defaults to 1)
        @return: (aligned_ref, aligned_query, score)
        """
        # noinspection PyUnresolvedReferences
        result = gotoh.align_it_aa(
            reference, query, gap_open, gap_extend, use_terminal_gap_penalty)
        # Exactly three items are expected back from the aligner.
        ref_alignment, query_alignment, score_value = result
        return ref_alignment, query_alignment, score_value
Ejemplo n.º 7
0
    def align_it_aa(self,
                    seqa,
                    seqb,
                    gap_ini,
                    gap_ext,
                    use_terminal_gap_penalty=False,
                    emulate_rb=False):
        '''
        Returns aligned sequences (with gaps) from the Gotoh algorithm.
        Expects amino acid sequences, see align_it() for nucleotide.

        Both inputs must be matched by the self.valid_nu pattern before any
        alignment is attempted.
        NOTE(review): the attribute name suggests a nucleotide pattern -
        confirm that it accepts amino acid letters.

                Parameters:
                        seqa (string): Amino acid sequence (standard)
                        seqb (string): Another amino acid sequence (seq)
                        gap_ini (int): Gap initialization penalty
                        gap_ext (int): Gap extension penalty
                        use_terminal_gap_penalty (bool): penalize trailing gaps?
                        emulate_rb (bool): use original (Ruby) match/mismatch scores?

                Returns:
                        seqa (string): Aligned sequence a ("" unless status is ok)
                        seqb (string): Aligned sequence b ("" unless status is ok)
                        score (int): alignment score (gap penalties + match/mismatch);
                                always 0 when emulate_rb is set or status is not ok
                        exit_status (AlignItResult): ok, illegal_char, internal_error
        '''
        sa = ""
        sb = ""
        score = 0
        al_status = AlignItResult.internal_error

        try:
            if not (self.valid_nu.search(seqa)
                    and self.valid_nu.search(seqb)):
                al_status = AlignItResult.illegal_char
            elif emulate_rb:
                # The Ruby-compatible aligner reports no score.
                sa, sb = gotoh.align_it_aa_rb(seqa, seqb, gap_ini, gap_ext)
                al_status = AlignItResult.ok
            else:
                sa, sb, score = gotoh.align_it_aa(
                    seqa, seqb, gap_ini, gap_ext,
                    int(use_terminal_gap_penalty))
                al_status = AlignItResult.ok
        except Exception:
            # Report failures through the status code, but let
            # KeyboardInterrupt and SystemExit propagate to the caller.
            al_status = AlignItResult.internal_error

        return sa, sb, score, al_status
Ejemplo n.º 8
0
def find_coord_pos(projects: ProjectConfig,
                   coord_name: str,
                   start_pos: int = None,
                   end_pos: int = None):
    """ Map a range of coordinate reference positions onto a seed reference.

    Nucleotide coordinate references are returned unchanged. For an amino
    acid coordinate reference, every seed region that any project pairs
    with it is translated at nucleotide offsets 0, 1, and 2 and aligned
    with the coordinate reference. The first alignment with the highest
    score above zero is used to convert the positions.

    :param projects: project configuration providing getReference(name)
        and a config dictionary with 'regions' and 'projects' entries
    :param coord_name: name of the coordinate reference
    :param start_pos: position in the coordinate reference, counted from 1,
        where the range starts; None means 1
    :param end_pos: position where the range ends; None means
        len(reference) + 1 for a nucleotide reference, or the last amino
        acid for an amino acid reference
    :return: (coord_name, start_pos, end_pos) for a nucleotide reference,
        otherwise (ref_name, ref_start, ref_end) computed from the seed
        amino acid count and the nucleotide offset
    :raises AssertionError: if start_pos or end_pos never appears in the
        alignment (only while assertions are enabled)
    :raises TypeError: if no seed alignment scored above zero, because
        there is no best match to unpack
    """
    coord_seq = projects.getReference(coord_name)
    if start_pos is None:
        start_pos = 1
    if projects.config['regions'][coord_name]['is_nucleotide']:
        # Already have a nucleotide sequence, nothing to do.
        if end_pos is None:
            end_pos = len(coord_seq) + 1
        return coord_name, start_pos, end_pos
    if end_pos is None:
        # The loop below counts one position per coordinate amino acid, so
        # the count can never exceed len(coord_seq). Defaulting to the last
        # amino acid is what lets the default reach the end of the
        # reference instead of failing the ref_end assertion.
        end_pos = len(coord_seq)
    gap_open = 40
    gap_extend = 10
    use_terminal_gap_penalty = 1
    highest_score = 0
    best_match = None
    ref_names = set()
    for project in projects.config['projects'].values():
        for region in project['regions']:
            if coord_name == region['coordinate_region']:
                ref_names.update(region['seed_region_names'])

    for ref_name in sorted(ref_names):
        ref_nuc_seq = projects.getReference(ref_name)
        for nuc_offset in range(3):
            ref_amino_seq = translate(ref_nuc_seq, nuc_offset)
            aligned_coord, aligned_ref, score = align_it_aa(
                coord_seq, ref_amino_seq, gap_open, gap_extend,
                use_terminal_gap_penalty)
            if score > highest_score:
                highest_score = score
                best_match = (ref_name, nuc_offset, aligned_coord, aligned_ref)
    ref_name, nuc_offset, aligned_coord, aligned_ref = best_match
    coord_pos = ref_pos = 0
    ref_start = ref_end = None
    for coord_amino, ref_amino in zip(aligned_coord, aligned_ref):
        if ref_amino != '-':
            ref_pos += 1
        if coord_amino != '-':
            # Only compare positions on columns that hold a coordinate
            # amino acid, so each position is matched exactly once.
            coord_pos += 1
            if start_pos == coord_pos:
                ref_start = ref_pos * 3 - nuc_offset - 3
            if coord_pos == end_pos:
                ref_end = ref_pos * 3 - nuc_offset
    assert ref_start is not None
    assert ref_end is not None
    return ref_name, ref_start, ref_end
Ejemplo n.º 9
0
def find_coord_pos(projects, coord_name, start_pos, end_pos):
    """ Map a range of coordinate reference positions onto a seed reference.

    Every seed region that any project pairs with the coordinate reference
    is translated at nucleotide offsets 0, 1, and 2 and aligned with the
    coordinate reference. The first alignment with the highest score above
    zero is used to convert the positions.

    :param projects: object providing getReference(name) and a config
        dictionary with a 'projects' entry
    :param coord_name: name of the coordinate reference
    :param start_pos: position in the coordinate reference, counted from 1,
        where the range starts
    :param end_pos: position in the coordinate reference, counted from 1,
        where the range ends
    :return: (ref_name, ref_start, ref_end), computed from the seed amino
        acid count and the nucleotide offset of the best alignment
    :raises AssertionError: if start_pos or end_pos never appears in the
        alignment (only while assertions are enabled)
    :raises TypeError: if no seed alignment scored above zero, because
        there is no best match to unpack
    """
    coord_seq = projects.getReference(coord_name)
    gap_open, gap_extend, use_terminal_gap_penalty = 40, 10, 1

    # Names of all the seeds that are mapped to this coordinate reference.
    seed_names = {
        seed_name
        for project in projects.config['projects'].values()
        for region in project['regions']
        if coord_name == region['coordinate_region']
        for seed_name in region['seed_region_names']}

    top_score = 0
    winner = None
    for seed_name in sorted(seed_names):
        seed_nucs = projects.getReference(seed_name)
        for offset in (0, 1, 2):
            seed_aminos = translate(seed_nucs, offset)
            coord_row, seed_row, score = align_it_aa(coord_seq,
                                                     seed_aminos,
                                                     gap_open,
                                                     gap_extend,
                                                     use_terminal_gap_penalty)
            # Strictly greater, so the earliest of any tied scores is kept.
            if score > top_score:
                top_score = score
                winner = (seed_name, offset, coord_row, seed_row)
    ref_name, nuc_offset, aligned_coord, aligned_ref = winner

    ref_start = None
    ref_end = None
    coord_count = 0
    seed_count = 0
    for coord_amino, seed_amino in zip(aligned_coord, aligned_ref):
        if seed_amino != '-':
            seed_count += 1
        if coord_amino == '-':
            # No coordinate position on this column, nothing to compare.
            continue
        coord_count += 1
        if start_pos == coord_count:
            ref_start = seed_count * 3 - nuc_offset - 3
        if coord_count == end_pos:
            ref_end = seed_count * 3 - nuc_offset
    assert ref_start is not None
    assert ref_end is not None
    return ref_name, ref_start, ref_end
Ejemplo n.º 10
0
def find_best_sequence(sequences):
    """ Find the sequence whose translation aligns best with PSSM_V3LOOP.

    Line breaks are removed from each nucleotide sequence, and '*' and '?'
    are replaced with '-'. The result is translated with zero, one, and two
    leading dashes, and each translation is aligned with PSSM_V3LOOP. Any
    alignment scoring at least 272 is also printed, together with a line
    that marks the differing columns with '*'.

    :param sequences: dictionary of {name: nucleotide sequence}
    :return: (score, name) for the highest score, preferring the name that
        sorts last when scores are tied; (0, None) when there are no
        sequences or every score is negative
    """
    best_score = 0
    best_name = None
    for name, nuc_seq in sorted(sequences.items()):
        nuc_seq = nuc_seq.replace('\n', '')
        nuc_seq = nuc_seq.replace('*', '-').replace('?', '-')
        for frame in range(3):
            offset_seq = '-' * frame + nuc_seq
            aa_seq = aln2counts.translate(offset_seq)
            aseq, aref, score = align_it_aa(aa_seq, PSSM_V3LOOP, GAP_OPEN_COST,
                                            GAP_EXTEND_COST, USE_TERMINAL_COST)
            # Names arrive in ascending order, so >= keeps the highest name
            # among tied scores without ever comparing a name to the
            # initial None, which raises TypeError on Python 3.
            if score >= best_score:
                best_score = score
                best_name = name

            if score >= 272:
                pairs = zip(aseq, aref)
                diffs = [' ' if a == b else '*' for a, b in pairs]
                print('score', score, name)
                print('result ', aseq)
                print('diffs  ',
                      ''.join(diffs) if aseq != aref else 'no diffs')
                print('compare', aref)
    return best_score, best_name