def realign_variable(self, germline_gene, match=3, mismatch=-2, gap_open_penalty=22, gap_extend_penalty=1):
    '''
    Re-align the query against the germline variable gene on both strands.

    BLASTn only offers a restricted set of scoring parameters, which can cause
    the v-gene alignment to be truncated incorrectly. This method locally
    aligns both the query sequence and its reverse complement against
    ``self.germline_seq`` using the supplied scoring parameters, then hands
    the higher-scoring alignment to ``self._process_realignment``.

    When the forward alignment does not score strictly higher than the
    reverse-complement alignment, the query is treated as minus-strand:
    ``self.strand`` is set to 'minus' and ``self.input_sequence`` is replaced
    by the reverse complement.

    Args:
        germline_gene (str): name of the germline variable gene
            (ex: 'IGHV1-2*02').
        match, mismatch, gap_open_penalty, gap_extend_penalty: scoring
            parameters forwarded unchanged to ``local_alignment``.
    '''
    # NOTE(review): the germline lookup below is commented out, so
    # self.germline_seq is presumably populated elsewhere -- confirm.
    # self.germline_seq = self._get_germline_sequence_for_realignment(germline_gene, 'V')
    scoring = dict(match=match,
                   mismatch=mismatch,
                   gap_open_penalty=gap_open_penalty,
                   gap_extend_penalty=gap_extend_penalty)
    forward_aln = local_alignment(self.seq.sequence, self.germline_seq, **scoring)
    rc = self.seq.reverse_complement
    reverse_aln = local_alignment(rc, self.germline_seq, **scoring)

    # forward strand wins only on a strictly better score
    if forward_aln.score > reverse_aln.score:
        self._process_realignment(forward_aln)
        return

    self.strand = 'minus'
    self.input_sequence = rc
    self._process_realignment(reverse_aln)
def _fallback_find_junc_nt_start(self, antibody):
    '''
    Fallback search for the nucleotide position at which the junction starts.

    The germline FR3 (taken from the IMGT-gapped V germline) is locally
    aligned to the oriented input sequence, and the junction start is taken
    to be the position immediately after the end of FR3. Sets
    ``self.fallback_5prime`` to True and writes the alignment to the
    antibody's log.

    Args:
        antibody: object providing ``v.imgt_germline``, ``oriented_input``
            and ``log``.

    Returns:
        The junction start position within ``antibody.oriented_input``.
    '''
    self.fallback_5prime = True

    # FR3 of the IMGT gapped germline: slice 196:309 of the gapped
    # nucleotide sequence, with the '.' gap characters removed
    gapped_germline = antibody.v.imgt_germline.gapped_nt_sequence
    fr3_germline = gapped_germline[196:309].replace('.', '')
    antibody.log('GERM FR3 SEQUENCE:', fr3_germline)

    # the junction starts immediately after the end of FR3
    fr3_aln = local_alignment(antibody.oriented_input, fr3_germline)
    antibody.log(' QUERY: ', fr3_aln.aligned_query)
    antibody.log(' ', fr3_aln.alignment_midline)
    antibody.log('GERM FR3:', fr3_aln.aligned_target)

    # shift the aligned query end by however much of the germline FR3
    # lies beyond the end of the alignment
    aligned_query_end = fr3_aln.query_end
    germline_overhang = len(fr3_germline) - fr3_aln.target_end
    junction_start = aligned_query_end + germline_overhang

    first_codon = antibody.oriented_input[junction_start:junction_start + 3]
    antibody.log('JUNC START:', first_codon, codons[first_codon], junction_start)
    return junction_start
def _fallback_find_junc_nt_end(self, antibody):
    '''
    Fallback search for the nucleotide position at which the junction ends.

    The start of FR4 in the germline J gene is located at the last
    occurrence of the conserved residue ('W' for heavy chains, 'F'
    otherwise) in the germline amino-acid sequence. The germline sequence
    from that point on is locally aligned to the oriented input, and the
    junction end is taken to be the end of the first codon of FR4. Sets
    ``self.fallback_3prime`` to True and logs the junction end codon.

    Args:
        antibody: object providing ``chain``, ``j.imgt_germline``,
            ``oriented_input`` and ``log``.

    Returns:
        The position in ``antibody.oriented_input`` just past the first
        codon of FR4.

    Raises:
        ValueError: if the germline J amino-acid sequence does not contain
            the expected residue, so the start of FR4 cannot be located.
    '''
    self.fallback_3prime = True

    # need to find the start of FR4 in the IMGT germline sequence
    end_res = 'W' if antibody.chain == 'heavy' else 'F'
    j_germline = antibody.j.imgt_germline

    # single pass, remembering the last position holding the residue
    last_idx = None
    for i, res in enumerate(j_germline.ungapped_aa_sequence):
        if res == end_res:
            last_idx = i
    if last_idx is None:
        raise ValueError(
            'Could not locate the start of FR4: residue {!r} was not found in '
            'the germline J amino acid sequence'.format(end_res))

    # IMGT coding start is 1-based, hence the -1; three nucleotides per residue
    fr4_nt_start_pos = (j_germline.coding_start - 1) + (last_idx * 3)
    germ_fr4_sequence = j_germline.gapped_nt_sequence[fr4_nt_start_pos:]

    # find the end of the junction (end of the first codon of FR4)
    aln = local_alignment(antibody.oriented_input, germ_fr4_sequence)
    # back up by the germline offset in case the alignment did not begin
    # at the first germline FR4 position
    fr4_start = aln.query_begin - aln.target_begin
    junc_end_codon = antibody.oriented_input[fr4_start:fr4_start + 3]
    antibody.log('JUNC END:', junc_end_codon, codons[junc_end_codon], fr4_start)
    return fr4_start + 3
def realign_germline(self, antibody, query_start=None, query_end=None):
    '''
    Due to restrictions on the available scoring parameters in BLASTn,
    incorrect truncation of the v-gene alignment can occur. This function
    re-aligns the query sequence with the identified germline gene using
    more appropriate alignment parameters.

    If the assigner has already annotated all four alignment boundaries
    (``self.query_start``, ``self.query_end``, ``self.germline_start``,
    ``self.germline_end``), the bounded query and germline segments are
    globally aligned. Otherwise the (optionally truncated) query is locally
    aligned against the whole germline sequence. A truthy alignment is
    passed to ``self._process_realignment``; a falsy one is reported via
    ``antibody.log``.

    Args:

        antibody: object providing ``oriented_input`` (the raw input
            sequence, correctly oriented) and ``log``.

        query_start (int): 5' position in the oriented input at which the
            sequence should be truncated prior to local alignment with the
            germline sequence.

        query_end (int): 3' position in the oriented input at which the
            sequence should be truncated prior to local alignment with the
            germline sequence.
    '''
    oriented_input = antibody.oriented_input
    germline_seq = self._get_germline_sequence_for_realignment()
    aln_params = self._realignment_scoring_params(self.gene_type)

    assigner_positions = (self.query_start, self.query_end,
                          self.germline_start, self.germline_end)

    # if the alignment start/end positions have been annotated by the assigner,
    # force re-alignment using those parameters
    if all(pos is not None for pos in assigner_positions):
        query = oriented_input.sequence[self.query_start:self.query_end]
        germline = germline_seq[self.germline_start:self.germline_end]
        alignment = global_alignment(query, germline, **aln_params)

    # use local alignment to determine alignment start/end positions if
    # they haven't already been determined by the assigner
    else:
        query = oriented_input.sequence[query_start:query_end]
        alignment = local_alignment(query, germline_seq, **aln_params)

    if alignment:
        # NOTE(review): the `query_start` argument (not self.query_start) is
        # forwarded even after the global-alignment branch -- confirm that
        # _process_realignment expects this.
        self._process_realignment(antibody, alignment, query_start)
    else:
        antibody.log('GERMLINE REALIGNMENT ERROR')
        antibody.log('REALIGNMENT QUERY SEQUENCE:', query)
        antibody.log('QUERY START:', query_start)
        antibody.log('QUERY END:', query_end)
def assign_d(seq, species):
    '''
    Identifies the germline diversity gene for a given sequence.

    The species-specific D-gene FASTA database is loaded, each germline is
    added in both orientations (as read and reverse-complemented), and the
    input is locally aligned against all of them using ``local_alignment``.

    Input is a junction sequence (as a string) and the species of origin.

    Output is a DiversityResult object built from the (up to) five
    highest-scoring alignments, or None if building the result raises an
    IndexError.
    '''
    mod_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    db_file = os.path.join(mod_dir, 'ssw/dbs/{}_D.fasta'.format(species.lower()))

    # context manager guarantees the handle is closed even if parsing fails
    with open(db_file, 'r') as db_handle:
        germs = [Sequence(s) for s in SeqIO.parse(db_handle, 'fasta')]

    # build the reverse complements as a separate list *before* extending,
    # so that we never iterate over `germs` while it is growing
    rc_germs = [Sequence(s.reverse_complement, id=s.id) for s in germs]
    germs.extend(rc_germs)

    alignments = local_alignment(seq, targets=germs,
                                 gap_open_penalty=20, gap_extend_penalty=2)
    alignments.sort(key=lambda x: x.score, reverse=True)
    try:
        return blast.DiversityResult(seq, alignments[:5])
    except IndexError:
        return None
def gapped_imgt_realignment(self):
    '''
    Aligns to gapped IMGT germline sequence. Used to determine
    IMGT-formatted position numberings so that identifying
    antibody regions is simplified.

    Populates ``self.imgt_germline``, ``self.imgt_gapped_alignment``,
    ``self.alignment_reading_frame``, ``self.coding_region`` and
    ``self.aa_sequence``, then attempts IMGT numbering. A failure during
    numbering is reported through ``self.exception`` rather than raised.
    '''
    self.imgt_germline = get_imgt_germlines(species=self.species,
                                            gene_type=self.gene_type,
                                            gene=self.full)
    query = self.germline_alignment.replace('-', '')

    # copy before overriding 'gap_open' so that the mapping returned by the
    # helper is never modified in place (it may be shared -- not visible here)
    aln_params = dict(self._realignment_scoring_params(self.gene_type))
    aln_params['gap_open'] = -11
    aln_matrix = self._get_gapped_imgt_substitution_matrix()
    self.imgt_gapped_alignment = local_alignment(query,
                                                 self.imgt_germline.gapped_nt_sequence,
                                                 matrix=aln_matrix,
                                                 **aln_params)

    # IMGT coding start is 1-based
    frame_offset = (2 * (self.imgt_gapped_alignment.target_begin % 3)) % 3
    self.alignment_reading_frame = frame_offset + (self.imgt_germline.coding_start - 1)

    self.coding_region = self._get_coding_region()
    self.aa_sequence = self._get_aa_sequence()
    try:
        self._imgt_numbering()
    except Exception:
        # best-effort: record the traceback and carry on, but let
        # KeyboardInterrupt / SystemExit propagate
        self.exception('IMGT NUMBERING', traceback.format_exc(), sep='\n')
def _find_junction_nt_end(self, vdj):
    '''
    Locate the junction end within the VDJ nucleotide sequence.

    The J-gene FR4 nucleotide sequence, minus its first three nucleotides,
    is locally aligned (as query) against ``vdj.vdj_nt`` (as target).

    Returns:
        The alignment's ``target_begin``, or None if the alignment is falsy.
    '''
    fr4_tail = vdj.j.regions.nt_seqs['FR4'][3:]
    hit = local_alignment(fr4_tail, vdj.vdj_nt)
    if not hit:
        return None
    return hit.target_begin
def _find_junction_nt_start(self, vdj):
    '''
    Locate the junction start within the VDJ nucleotide sequence.

    The V-gene FR3 nucleotide sequence, minus its last three nucleotides,
    is locally aligned (as query) against ``vdj.vdj_nt`` (as target).

    Returns:
        The alignment's ``target_end`` plus one, or None if the alignment
        is falsy.
    '''
    fr3_head = vdj.v.regions.nt_seqs['FR3'][:-3]
    hit = local_alignment(fr3_head, vdj.vdj_nt)
    if not hit:
        return None
    return hit.target_end + 1
def _get_d_start_position_nt(self, vdj):
    '''
    Return the position at which the D gene starts within the CDR3.

    ``self.d_nt`` is locally aligned (as query) against ``self.cdr3_nt``
    (as target) with a gap-open penalty of 22 and a gap-extend penalty of 1;
    the alignment's ``target_begin`` is returned.

    The ``vdj`` argument is accepted but not used.
    '''
    d_aln = local_alignment(self.d_nt,
                            self.cdr3_nt,
                            gap_open_penalty=22,
                            gap_extend_penalty=1)
    return d_aln.target_begin
def _get_isotype_query_region(self, antibody):
    '''
    Return the part of the oriented input that follows the VDJ region.

    ``antibody.vdj_nt`` is locally aligned (as query) against
    ``antibody.oriented_input`` (as target), and the input is sliced from
    the alignment's ``target_end`` to its end.
    '''
    vdj_nt = antibody.vdj_nt
    full_input = antibody.oriented_input
    vdj_aln = local_alignment(vdj_nt, full_input)
    return full_input[vdj_aln.target_end:]
def _get_alignments(self, antibody, isotype_seqs):
    '''
    Align the post-VDJ region of the input against the isotype sequences.

    Args:
        antibody: passed to ``self._get_isotype_query_region`` to obtain
            the query region.
        isotype_seqs: alignment targets handed to ``local_alignment``.

    Returns:
        A new list of the alignments, ordered by descending ``score``.
    '''
    downstream = self._get_isotype_query_region(antibody)
    hits = local_alignment(downstream,
                           targets=isotype_seqs,
                           gap_open_penalty=22,
                           gap_extend_penalty=1)
    # copy, then sort the copy in place (stable), best score first
    ranked = list(hits)
    ranked.sort(key=lambda hit: hit.score, reverse=True)
    return ranked
def _get_isotype_query_region(self, vdj):
    '''
    Return the part of the raw input that follows the VDJ region.

    ``vdj.vdj_nt`` is locally aligned (as query) against ``vdj.raw_input``
    (as target), and the raw input is sliced from the alignment's
    ``target_end`` to its end.
    '''
    vdj_nt = vdj.vdj_nt
    raw = vdj.raw_input
    vdj_aln = local_alignment(vdj_nt, raw)
    return raw[vdj_aln.target_end:]
def _get_alignments(self, vdj, isotype_seqs):
    '''
    Align the post-VDJ region of the input against the isotype sequences.

    Args:
        vdj: passed to ``self._get_isotype_query_region`` to obtain the
            query region.
        isotype_seqs: alignment targets handed to ``local_alignment``.

    Returns:
        A new list of the alignments, ordered by descending ``score``.
    '''
    downstream = self._get_isotype_query_region(vdj)
    hits = local_alignment(downstream,
                           targets=isotype_seqs,
                           gap_open_penalty=22,
                           gap_extend_penalty=1)
    # copy, then sort the copy in place (stable), best score first
    ranked = list(hits)
    ranked.sort(key=lambda hit: hit.score, reverse=True)
    return ranked