def extract_target(seed_ref, coordinate_ref):
    """ Pull out the part of a seed that lines up with a coordinate reference.

    The seed is translated once for each of the three reading frames, every
    translation is aligned against the coordinate reference, and the
    best-scoring frame decides which codons are kept.

    :param seed_ref: seed reference (nucleotide sequence)
    :param coordinate_ref: coordinate reference (amino acid sequence)
    :return: the codons of seed_ref that are aligned with an amino acid of
        coordinate_ref, joined into a single string
    """
    candidates = [(-1000000, '', '', 0)]
    for shift in range(3):
        shifted_aminos = translate('-' * shift + seed_ref)
        aligned_seed, aligned_coord, frame_score = align_it_aa(
            shifted_aminos,
            coordinate_ref,
            GAP_OPEN_COST,
            GAP_EXTEND_COST,
            USE_TERMINAL_COST)
        candidates.append((frame_score, aligned_seed, aligned_coord, shift))

    # Tuples compare on the score first, so max() picks the best frame.
    score, aseed, acoord, frame_index = max(candidates)
    assert score >= len(coordinate_ref) // 2, score

    codons = []
    # The dashes prepended for the reading frame are not part of seed_ref,
    # so the codon cursor starts that many positions before the seed.
    codon_end = -frame_index
    for seed_amino, coord_amino in zip(aseed, acoord):
        if seed_amino != '-':
            codon_end += 3
            if coord_amino != '-':
                codons.append(seed_ref[codon_end - 3:codon_end])
    return ''.join(codons)
def find_coord_pos(projects, coord_name, start_pos, end_pos):
    """ Map a range of coordinate positions onto the best-matching HCV-2 seed.

    Every seed of the 'HCV' project whose name starts with 'HCV-2' is
    translated with each of the three nucleotide offsets and aligned against
    the coordinate reference. The highest-scoring alignment is then walked
    column by column to convert the coordinate positions to seed positions.

    :param projects: object providing getReference() and getProjectSeeds()
    :param coord_name: name of the coordinate reference
    :param start_pos: coordinate position of the first amino acid, counted
        from 1
    :param end_pos: coordinate position of the last amino acid, counted
        from 1
    :return: (ref_name, ref_start, ref_end) where ref_start and ref_end are
        nucleotide positions in the chosen seed; either one is None when its
        coordinate position does not occur in the alignment
    :raise ValueError: if no seed aligns with a score above zero
    """
    coord_seq = projects.getReference(coord_name)
    gap_open = 40
    gap_extend = 10
    use_terminal_gap_penalty = 1
    highest_score = 0
    best_match = None
    for ref_name in sorted(projects.getProjectSeeds('HCV')):
        if not ref_name.startswith('HCV-2'):
            continue
        ref_nuc_seq = projects.getReference(ref_name)
        for nuc_offset in range(3):
            ref_amino_seq = translate(ref_nuc_seq, nuc_offset)
            aligned_coord, aligned_ref, score = align_it_aa(
                coord_seq,
                ref_amino_seq,
                gap_open,
                gap_extend,
                use_terminal_gap_penalty)
            if score > highest_score:
                highest_score = score
                best_match = (ref_name, nuc_offset, aligned_coord, aligned_ref)
    if best_match is None:
        # Fail with a clear message instead of an unpacking TypeError.
        raise ValueError(
            'No HCV-2 seed aligned to coordinate reference %r.' % (coord_name,))

    ref_name, nuc_offset, aligned_coord, aligned_ref = best_match
    coord_pos = ref_pos = 0
    ref_start = ref_end = None
    for coord_amino, ref_amino in zip(aligned_coord, aligned_ref):
        if ref_amino != '-':
            ref_pos += 1
        if coord_amino == '-':
            # No coordinate position is consumed in this column, so it must
            # not overwrite a start or end that was already recorded.
            continue
        coord_pos += 1
        # NOTE(review): the "- nuc_offset" terms presumably undo the frame
        # shift applied by translate(); confirm against its offset semantics.
        if coord_pos == start_pos:
            ref_start = ref_pos * 3 - nuc_offset - 3
        if coord_pos == end_pos:
            ref_end = ref_pos * 3 - nuc_offset
    return ref_name, ref_start, ref_end
def extract_target(seed_ref, coordinate_ref):
    """ Select the codons of a seed that align with a coordinate reference.

    Each of the three reading frames of the seed is translated and aligned
    with the coordinate reference; the frame with the best alignment is used
    to decide which codons belong to the target.

    :param seed_ref: seed reference (nucleotide sequence)
    :param coordinate_ref: coordinate reference (amino acid sequence)
    :return: subsequence of seed_ref that maps to coordinate_ref
    """
    best = (-1000000, '', '', 0)
    for offset in range(3):
        padded_seed = '-' * offset + seed_ref
        alignment = align_it_aa(translate(padded_seed),
                                coordinate_ref,
                                GAP_OPEN_COST,
                                GAP_EXTEND_COST,
                                USE_TERMINAL_COST)
        aligned_seed, aligned_coord, alignment_score = alignment
        candidate = (alignment_score, aligned_seed, aligned_coord, offset)
        if candidate > best:
            best = candidate

    best_score, best_seed, best_coord, best_offset = best
    assert best_score >= len(coordinate_ref) // 2, best_score

    pieces = []
    # Start before the seed by the number of dashes added for the frame.
    position = -best_offset
    for seed_char, coord_char in zip(best_seed, best_coord):
        if seed_char == '-':
            # Gap in the seed: no nucleotides were consumed.
            continue
        codon_start = position
        position += 3
        if coord_char != '-':
            pieces.append(seed_ref[codon_start:position])
    return ''.join(pieces)
def _pair_align(self,
                reference,
                query,
                gap_open=15,
                gap_extend=5,
                use_terminal_gap_penalty=1):
    """ Run a pairwise alignment of a query against a reference.

    The work is delegated to gotoh.align_it_aa with the given gap settings.

    @return: (aligned_ref, aligned_query, score)
    """
    alignment = gotoh.align_it_aa(reference,
                                  query,
                                  gap_open,
                                  gap_extend,
                                  use_terminal_gap_penalty)
    # Unpack so that anything other than a three-part result fails here.
    ref_aligned, query_aligned, alignment_score = alignment
    return ref_aligned, query_aligned, alignment_score
def _pair_align(self,
                reference,
                query,
                gap_open=15,
                gap_extend=5,
                use_terminal_gap_penalty=1):
    """ Pairwise-align an amino acid query with a reference sequence.

    This is a thin wrapper around gotoh.align_it_aa that supplies default
    gap settings.

    @return: (aligned_ref, aligned_query, score)
    """
    # noinspection PyUnresolvedReferences
    result = gotoh.align_it_aa(reference,
                               query,
                               gap_open,
                               gap_extend,
                               use_terminal_gap_penalty)
    # Unpack so that anything other than a three-part result fails here.
    ref_with_gaps, query_with_gaps, total_score = result
    return ref_with_gaps, query_with_gaps, total_score
def align_it_aa(self, seqa, seqb, gap_ini, gap_ext,
                use_terminal_gap_penalty=False, emulate_rb=False):
    '''
    Returns aligned sequences (with gaps) from the Gotoh algorithm.
    Expects amino acid sequences, see align_it() for nucleotide.

    This method reports problems through the returned status instead of
    raising: any Exception raised while validating or aligning is converted
    to AlignItResult.internal_error.

    Parameters:
        seqa (string): Amino acid sequence (standard)
        seqb (string): Another amino acid sequence (seq)
        gap_ini (int): Gap initialization penalty
        gap_ext (int): Gap extension penalty
        use_terminal_gap_penalty (bool): penalize trailing gaps?
        emulate_rb (bool): use original (Ruby) match/mismatch scores?

    Returns:
        seqa (string): Aligned sequence a, "" unless the status is ok
        seqb (string): Aligned sequence b, "" unless the status is ok
        score (int): alignment score (gap penalties + match/mismatch),
            always 0 when emulate_rb is set or the status is not ok
        exit_status (AlignItResult): ok, illegal_char, internal_error
    '''
    sa = ""
    sb = ""
    score = 0
    try:
        # NOTE(review): the pattern is named valid_nu although the inputs are
        # amino acids; confirm that it accepts amino acid letters.
        if not (self.valid_nu.search(seqa) and self.valid_nu.search(seqb)):
            al_status = AlignItResult.illegal_char
        else:
            if emulate_rb:
                # The Ruby-compatible aligner does not report a score.
                sa, sb = gotoh.align_it_aa_rb(seqa, seqb, gap_ini, gap_ext)
            else:
                sa, sb, score = gotoh.align_it_aa(
                    seqa,
                    seqb,
                    gap_ini,
                    gap_ext,
                    int(use_terminal_gap_penalty))
            al_status = AlignItResult.ok
    except Exception:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit still propagate to the caller.
        al_status = AlignItResult.internal_error
    return sa, sb, score, al_status
def find_coord_pos(projects: ProjectConfig,
                   coord_name: str,
                   start_pos: int = None,
                   end_pos: int = None):
    """ Map a range of coordinate positions onto the best-matching seed.

    For a nucleotide coordinate reference, the positions are returned
    unchanged together with coord_name. Otherwise, every seed region that
    lists coord_name as its coordinate region is translated with each of the
    three nucleotide offsets and aligned against the coordinate reference.
    The highest-scoring alignment is walked column by column to convert the
    coordinate positions to seed positions.

    :param projects: project configuration holding the references
    :param coord_name: name of the coordinate reference
    :param start_pos: coordinate position of the first item, counted from 1;
        defaults to 1
    :param end_pos: coordinate position of the end of the range; defaults to
        len(reference) + 1 for a nucleotide reference, and to the last amino
        acid of an amino acid reference
    :return: (ref_name, ref_start, ref_end); for an amino acid reference,
        ref_start and ref_end are nucleotide positions in the chosen seed
    :raise ValueError: if no seed aligns with a score above zero
    """
    coord_seq = projects.getReference(coord_name)
    is_nucleotide = projects.config['regions'][coord_name]['is_nucleotide']
    if start_pos is None:
        start_pos = 1
    if is_nucleotide:
        # Already have a nucleotide sequence, nothing to do.
        if end_pos is None:
            end_pos = len(coord_seq) + 1
        return coord_name, start_pos, end_pos
    if end_pos is None:
        # The loop below matches end_pos against the position of an amino
        # acid, so the largest value it can ever find is len(coord_seq).
        end_pos = len(coord_seq)

    gap_open = 40
    gap_extend = 10
    use_terminal_gap_penalty = 1
    highest_score = 0
    best_match = None
    ref_names = set()
    for project in projects.config['projects'].values():
        for region in project['regions']:
            if coord_name == region['coordinate_region']:
                ref_names.update(region['seed_region_names'])

    for ref_name in sorted(ref_names):
        ref_nuc_seq = projects.getReference(ref_name)
        for nuc_offset in range(3):
            ref_amino_seq = translate(ref_nuc_seq, nuc_offset)
            aligned_coord, aligned_ref, score = align_it_aa(
                coord_seq,
                ref_amino_seq,
                gap_open,
                gap_extend,
                use_terminal_gap_penalty)
            if score > highest_score:
                highest_score = score
                best_match = (ref_name, nuc_offset, aligned_coord, aligned_ref)
    if best_match is None:
        # Fail with a clear message instead of an unpacking TypeError.
        raise ValueError(
            f'No seed region aligned to coordinate reference {coord_name!r}.')

    ref_name, nuc_offset, aligned_coord, aligned_ref = best_match
    coord_pos = ref_pos = 0
    ref_start = ref_end = None
    for coord_amino, ref_amino in zip(aligned_coord, aligned_ref):
        if ref_amino != '-':
            ref_pos += 1
        if coord_amino == '-':
            # No coordinate position is consumed in this column, so it must
            # not overwrite a start or end that was already recorded.
            continue
        coord_pos += 1
        # NOTE(review): the "- nuc_offset" terms presumably undo the frame
        # shift applied by translate(); confirm against its offset semantics.
        if coord_pos == start_pos:
            ref_start = ref_pos * 3 - nuc_offset - 3
        if coord_pos == end_pos:
            ref_end = ref_pos * 3 - nuc_offset
    assert ref_start is not None
    assert ref_end is not None
    return ref_name, ref_start, ref_end
def find_coord_pos(projects, coord_name, start_pos, end_pos):
    """ Map a range of coordinate positions onto the best-matching seed.

    Every seed region that lists coord_name as its coordinate region is
    translated with each of the three nucleotide offsets and aligned against
    the coordinate reference. The highest-scoring alignment is walked column
    by column to convert the coordinate positions to seed positions.

    :param projects: project configuration holding the references
    :param coord_name: name of the coordinate reference
    :param start_pos: coordinate position of the first amino acid, counted
        from 1
    :param end_pos: coordinate position of the last amino acid, counted
        from 1
    :return: (ref_name, ref_start, ref_end) where ref_start and ref_end are
        nucleotide positions in the chosen seed
    :raise ValueError: if no seed aligns with a score above zero
    """
    coord_seq = projects.getReference(coord_name)
    gap_open = 40
    gap_extend = 10
    use_terminal_gap_penalty = 1
    highest_score = 0
    best_match = None
    ref_names = set()
    for project in projects.config['projects'].values():
        for region in project['regions']:
            if coord_name == region['coordinate_region']:
                ref_names.update(region['seed_region_names'])

    for ref_name in sorted(ref_names):
        ref_nuc_seq = projects.getReference(ref_name)
        for nuc_offset in range(3):
            ref_amino_seq = translate(ref_nuc_seq, nuc_offset)
            aligned_coord, aligned_ref, score = align_it_aa(
                coord_seq,
                ref_amino_seq,
                gap_open,
                gap_extend,
                use_terminal_gap_penalty)
            if score > highest_score:
                highest_score = score
                best_match = (ref_name, nuc_offset, aligned_coord, aligned_ref)
    if best_match is None:
        # Fail with a clear message instead of an unpacking TypeError.
        raise ValueError(
            'No seed region aligned to coordinate reference %r.' % (coord_name,))

    ref_name, nuc_offset, aligned_coord, aligned_ref = best_match
    coord_pos = ref_pos = 0
    ref_start = ref_end = None
    for coord_amino, ref_amino in zip(aligned_coord, aligned_ref):
        if ref_amino != '-':
            ref_pos += 1
        if coord_amino == '-':
            # No coordinate position is consumed in this column, so it must
            # not overwrite a start or end that was already recorded.
            continue
        coord_pos += 1
        # NOTE(review): the "- nuc_offset" terms presumably undo the frame
        # shift applied by translate(); confirm against its offset semantics.
        if coord_pos == start_pos:
            ref_start = ref_pos * 3 - nuc_offset - 3
        if coord_pos == end_pos:
            ref_end = ref_pos * 3 - nuc_offset
    assert ref_start is not None
    assert ref_end is not None
    return ref_name, ref_start, ref_end
def find_best_sequence(sequences):
    """ Find the sequence whose translation aligns best with PSSM_V3LOOP.

    Each sequence is cleaned up, translated in all three reading frames, and
    aligned against PSSM_V3LOOP. Alignments that score at least 272 are also
    printed for inspection.

    :param sequences: {name: nucleotide sequence}
    :return: (score, name) of the best alignment, or (0, None) when no
        alignment scored at least zero
    """
    best_match = None  # (score, name, aligned sequence, aligned reference)
    for name, nuc_seq in sorted(sequences.items()):
        nuc_seq = nuc_seq.replace('\n', '')
        nuc_seq = nuc_seq.replace('*', '-').replace('?', '-')
        for frame in range(3):
            offset_seq = '-' * frame + nuc_seq
            aa_seq = aln2counts.translate(offset_seq)
            aseq, aref, score = align_it_aa(aa_seq,
                                            PSSM_V3LOOP,
                                            GAP_OPEN_COST,
                                            GAP_EXTEND_COST,
                                            USE_TERMINAL_COST)
            candidate = (score, name, aseq, aref)
            if best_match is None:
                # Only accept a first candidate that is not negative, and
                # never compare a candidate against a placeholder holding
                # None: that comparison fails on Python 3 for a zero score.
                if score >= 0:
                    best_match = candidate
            elif candidate > best_match:
                best_match = candidate
            if score >= 272:
                pairs = zip(aseq, aref)
                diffs = [' ' if a == b else '*' for a, b in pairs]
                print('score', score, name)
                print('result ', aseq)
                print('diffs ', ''.join(diffs) if aseq != aref else 'no diffs')
                print('compare', aref)
    if best_match is None:
        return 0, None
    return best_match[:2]