def _choose_best_target_alignment(self, context, interacting_chain_alignments,
                                  potential_target_sequences, chain_id):
    """
    Align every candidate target sequence to template chain `chain_id` and
    return the alignment with the highest percentage identity.

    :param context: template context; queried for the chain's sequence,
                    secondary structure and `template_pdbid`
    :param interacting_chain_alignments: alignments of already placed chains,
                                         keyed by chain id
    :param potential_target_sequences: candidate target sequences, keyed by
                                       target id
    :param chain_id: id of the template chain to find a target for
    :return: the best alignment (its `target_id` attribute is set), or None
             when no candidate produced a usable alignment
    """
    winner = None

    for target_id in potential_target_sequences:
        chain_sequence = context.get_sequence(chain_id)
        chain_secstr = context.get_secondary_structure(chain_id)
        target_sequence = potential_target_sequences[target_id]

        candidate = kmad_aligner.align(chain_sequence, chain_secstr,
                                       target_sequence)
        _log.debug("alignment {} has coverage {} %".format(
            candidate, candidate.get_percentage_coverage()))

        if candidate.get_percentage_coverage() < 90.0:
            # If the coverage is too low, we need to bother interpro.
            template_id = TemplateID(context.template_pdbid, chain_id)
            domain_alignments = domain_aligner.get_domain_alignments(
                target_sequence, None, template_id)

            # Keep the domain alignments that still interact with the
            # chains that were aligned earlier.
            interacting_alignments = [
                ali for ali in domain_alignments
                if self._preserves_interactions(
                    context,
                    ali.target_alignment.replace('-', ''),
                    chain_id,
                    interacting_chain_alignments)
            ]
            _log.debug(
                "preserve interactions with chains {}: filtered {} alignments out of {}"
                .format(interacting_chain_alignments.keys(),
                        len(interacting_alignments),
                        len(domain_alignments)))

            # Prefer interaction-preserving alignments, fall back on all
            # domain alignments, give up on this target if there are none.
            if len(interacting_alignments) > 0:
                pool = interacting_alignments
            elif len(domain_alignments) > 0:
                pool = domain_alignments
            else:
                continue

            joined = self._join_alignments_to_best_template_coverage(pool)
            candidate = kmad_aligner.align(chain_sequence, chain_secstr,
                                           joined.get_target_sequence())

        candidate.target_id = target_id

        if winner is None:
            winner = candidate
        elif winner.get_percentage_identity() < candidate.get_percentage_identity():
            winner = candidate

    return winner
def _choose_best_target_alignment(self, context, interacting_chain_alignments,
                                  potential_target_sequences, chain_id):
    """
    Select the target alignment with the highest percentage identity for
    template chain `chain_id`.

    Each candidate target sequence is aligned to the template chain. When
    that alignment has less than 90% coverage, domain alignments are
    requested instead, joined, and the joined target sequence is realigned.

    :param context: template context (sequence, secondary structure and
                    `template_pdbid` are read from it)
    :param interacting_chain_alignments: dict of chain id to the alignment
                                         already chosen for that chain
    :param potential_target_sequences: dict of target id to target sequence
    :param chain_id: template chain id
    :return: the chosen alignment with `target_id` set, or None
    """
    best_alignment = None

    for target_id in potential_target_sequences:
        template_sequence = context.get_sequence(chain_id)
        template_secstr = context.get_secondary_structure(chain_id)
        target_sequence = potential_target_sequences[target_id]

        alignment = kmad_aligner.align(template_sequence, template_secstr,
                                       target_sequence)
        _log.debug("alignment {} has coverage {} %".format(
            alignment, alignment.get_percentage_coverage()))

        if alignment.get_percentage_coverage() < 90.0:
            # If the coverage is too low, we need to bother interpro.
            domain_alignments = domain_aligner.get_domain_alignments(
                target_sequence, None,
                TemplateID(context.template_pdbid, chain_id))

            interacting_alignments = []
            for domain_ali in domain_alignments:
                ungapped_target = domain_ali.target_alignment.replace('-', '')
                if self._preserves_interactions(context, ungapped_target,
                                                chain_id,
                                                interacting_chain_alignments):
                    interacting_alignments.append(domain_ali)

            _log.debug(
                "preserve interactions with chains {}: filtered {} alignments out of {}"
                .format(interacting_chain_alignments.keys(),
                        len(interacting_alignments),
                        len(domain_alignments)))

            if len(interacting_alignments) > 0:
                usable_alignments = interacting_alignments
            elif len(domain_alignments) > 0:
                # Nothing preserves the interactions: settle for any domain.
                usable_alignments = domain_alignments
            else:
                # No domain alignment at all, this target cannot be used.
                continue

            domain_alignment = \
                self._join_alignments_to_best_template_coverage(usable_alignments)
            alignment = kmad_aligner.align(template_sequence, template_secstr,
                                           domain_alignment.get_target_sequence())

        alignment.target_id = target_id

        if best_alignment is None or \
                best_alignment.get_percentage_identity() < alignment.get_percentage_identity():
            best_alignment = alignment

    return best_alignment
def test_kmad_X():
    """
    Check that aligning a template containing 'X' residues leaves the
    template sequence intact once the gaps are stripped.
    """
    # NOTE(review): the secondary structure string is a single space in this
    # view while the template has 15 residues - confirm the literal was not
    # whitespace-collapsed somewhere along the way.
    template_secstr = " "
    template_seq = "XAXRXLXKXGDAFNR"
    target_seq = "AAAAAAAAAAAAAAA"

    aligned = kmad_aligner.align(template_seq, template_secstr, target_seq)

    ungapped_template = aligned.template_alignment.replace('-', '')
    eq_(ungapped_template, template_seq)
def _clean_search_space(self, checked_ranges, sample_ranges, ok_ranges_alignments):
    """
    Build the next list of ranges to sample by merging overlapping ranges
    that share a template in their hits.

    A merged range is proposed only when it is not among the checked ranges
    (the incoming `sample_ranges` count as checked) and when the
    intersection of the two source ranges aligns to the template in exactly
    the same way in both source alignments and in the merged alignment.

    :param checked_ranges: list of ranges that have been examined already
    :param sample_ranges: list of ranges that were queued for sampling
    :param ok_ranges_alignments: alignments that were accepted, indexed by
                                 range
    :return: list of merged ranges to sample next
    """
    # See if we can merge ranges that have
    # the same template in their blast hits:
    checked_ranges = self._remove_duplicate_ranges(checked_ranges + sample_ranges)
    sample_ranges = []

    shared_hits_ranges = self._find_shared_hits_ranges(ok_ranges_alignments)
    for template_id in shared_hits_ranges:
        ranges = shared_hits_ranges[template_id]

        for i, range_i in enumerate(ranges):
            overlapping_indices = [j for j, range_j in enumerate(ranges)
                                   if j != i and range_j.overlaps_with(range_i)]

            # NOTE(review): every overlapping pair is visited in both orders,
            # so the same merge may be appended twice if merge_with() is
            # symmetric - confirm whether callers rely on that.
            for j in overlapping_indices:
                range_j = ranges[j]
                merged = range_i.merge_with(range_j)

                # Merge only if:
                # - the merge has not already been done
                # - the intersecting parts of the ranges align to
                #   the template in exactly the same way
                # NOTE(review): an earlier comment also listed "the ranges
                # are close together", but no such test is applied here; the
                # overlap and length-difference percentages were computed
                # without being used. Confirm whether a threshold check is
                # missing.
                if merged in checked_ranges:
                    continue

                alignment_i = ok_ranges_alignments[range_i]
                alignment_j = ok_ranges_alignments[range_j]

                template_secstr = dssp.get_secondary_structure(template_id)
                template_sequence = dssp.get_sequence(template_id)

                try:
                    alignment_m = kmad_aligner.align(template_sequence,
                                                     template_secstr,
                                                     merged.get_sub_sequence())
                except Exception:
                    # If kmad fails, then skip this one :(
                    # (Exception rather than a bare except, so that
                    # KeyboardInterrupt and SystemExit still propagate.)
                    _log.warning(traceback.format_exc())
                    continue

                intersected = range_i.get_intersection(range_j)

                # The intersection is shifted to be relative to the start of
                # the range that each alignment was made for.
                intersect_template_sequence_i = \
                    self._get_template_sequence_in_target_range(
                        alignment_i, intersected - range_i.start)
                intersect_template_sequence_j = \
                    self._get_template_sequence_in_target_range(
                        alignment_j, intersected - range_j.start)
                intersect_template_sequence_m = \
                    self._get_template_sequence_in_target_range(
                        alignment_m, intersected - merged.start)

                if intersect_template_sequence_i == intersect_template_sequence_m and \
                        intersect_template_sequence_j == intersect_template_sequence_m:
                    sample_ranges.append(merged)

    return sample_ranges
def test_kmad_X():
    """
    A template with 'X' residues must come out of the aligner unchanged,
    apart from the gap characters that the alignment inserts.
    """
    # NOTE(review): secondary structure is one space here versus a 15
    # residue template - verify that the literal is the intended one.
    template_secstr = " "
    template_seq = "XAXRXLXKXGDAFNR"
    target_seq = "AAAAAAAAAAAAAAA"

    aligned = kmad_aligner.align(template_seq, template_secstr, target_seq)

    template_without_gaps = aligned.template_alignment.replace('-', '')
    eq_(template_without_gaps, template_seq)
def test_kmad():
    """
    Align a template to a near-identical target and check that a non-empty
    alignment comes back, with target and template rows of equal length.
    """
    # NOTE(review): the secondary structure literal is shorter than the
    # template sequence in this view - confirm its whitespace is intact.
    template_secstr = " EE SSHHHHHHHHHHHTTT HHHHHHHHS EE SSS GGG "
    template_seq = "TTCCPSIVARSNFNVCRLPGTPEAICATYTGCIIIPGATCPGDYAN"
    target_seq = "AAACCPSIVARSNFNVCRLPGTPEAICATYTGCIIIPGATCPGSDYAN"

    aligned = kmad_aligner.align(template_seq, template_secstr, target_seq)

    target_row_length = len(aligned.target_alignment)
    ok_(target_row_length > 0)
    eq_(target_row_length, len(aligned.template_alignment))
def _get_hits(self, range_, template_id):
    """
    Blast the sub sequence of `range_` against the template databank and
    return the hits that are still good enough after realignment by kmad.

    For every accepted blast hit, the blast alignment is replaced by the
    kmad alignment of the template sequence to the range's sub sequence.
    A hit is kept when its percentage identity is at least
    `get_min_identity(number of aligned residues)`.

    :param range_: the target range to find hits for
    :param template_id: when not None, only hits with this template are
                        considered; when None, blacklisted templates are
                        skipped instead
    :return: list of the blast alignment objects that passed, an empty list
             if a requested template had no hit at all
    :raises InitError: if `self.template_blast_databank` is not set
    """
    if self.template_blast_databank is None:
        raise InitError("blast databank is not set")

    query_sequence = range_.get_sub_sequence()

    blast_hits = blaster.blastp(query_sequence, self.template_blast_databank)
    _log.debug("{} blast hits to filter".format(len(blast_hits)))

    count_template_hits = 0
    good_hits = []
    for hit_id in blast_hits:
        for alignment in blast_hits[hit_id]:
            hit_template_id = TemplateID(alignment.get_hit_accession_code(),
                                         alignment.get_hit_chain_id())

            if template_id is not None and hit_template_id != template_id:
                continue

            # Counts every hit that matches the requested template (or any
            # hit when no template was requested), before further filtering.
            count_template_hits += 1

            if template_id is None and \
                    blacklister.is_blacklisted(alignment.get_hit_accession_code()):
                continue

            if not dssp.has_secondary_structure(hit_template_id):
                continue

            # Replace the blast hit's alignment with the kmad alignment.
            template_secstr = dssp.get_secondary_structure(hit_template_id)
            template_sequence = dssp.get_sequence(hit_template_id)
            try:
                kmad_alignment = kmad_aligner.align(template_sequence,
                                                    template_secstr,
                                                    query_sequence)
            except Exception:
                # If kmad fails, then skip this one :(
                # (Exception rather than a bare except, so that
                # KeyboardInterrupt and SystemExit still propagate.)
                _log.warning(traceback.format_exc())
                continue

            # The kmad alignment covers the whole template sequence and the
            # whole range, so the positions are reset accordingly.
            alignment.full_query_sequence = range_.sequence
            alignment.query_start = range_.start + 1
            alignment.query_end = range_.end
            alignment.subject_start = 1
            alignment.subject_end = len(template_sequence)
            alignment.query_alignment = kmad_alignment.target_alignment
            alignment.subject_alignment = kmad_alignment.template_alignment

            if alignment.get_percentage_identity() >= \
                    get_min_identity(alignment.count_aligned_residues()):
                good_hits.append(alignment)

    if count_template_hits == 0 and template_id is not None:
        _log.warning("domain sequence {} has no suitable hits with {}".format(query_sequence, template_id))
        return []

    return good_hits
def _get_hits(self, range_, template_id):
    """
    Blast the sub sequence of `range_` against the template databank and
    return the hits that are still good enough after realignment by kmad.

    For every accepted blast hit, the blast alignment is replaced by the
    kmad alignment of the template sequence to the range's sub sequence.
    A hit is kept when its percentage identity is at least
    `get_min_identity(number of aligned residues)`.

    :param range_: the target range to find hits for
    :param template_id: when not None, only hits with this template are
                        considered; when None, blacklisted templates are
                        skipped instead
    :return: list of the blast alignment objects that passed
    :raises InitError: if `self.template_blast_databank` is not set
    """
    if self.template_blast_databank is None:
        raise InitError("blast databank is not set")

    query_sequence = range_.get_sub_sequence()

    blast_hits = blaster.blastp(query_sequence, self.template_blast_databank)
    _log.debug("{} blast hits to filter".format(len(blast_hits)))

    good_hits = []
    for hit_id in blast_hits:
        for alignment in blast_hits[hit_id]:
            hit_template_id = TemplateID(alignment.get_hit_accession_code(),
                                         alignment.get_hit_chain_id())

            if template_id is not None and hit_template_id != template_id:
                continue

            if template_id is None and \
                    blacklister.is_blacklisted(alignment.get_hit_accession_code()):
                continue

            if not dssp.has_secondary_structure(hit_template_id):
                continue

            # Replace the blast hit's alignment with the kmad alignment.
            template_secstr = dssp.get_secondary_structure(hit_template_id)
            template_sequence = dssp.get_sequence(hit_template_id)
            try:
                kmad_alignment = kmad_aligner.align(template_sequence,
                                                    template_secstr,
                                                    query_sequence)
            except Exception:
                # If kmad fails, then skip this one :(
                # (Exception rather than a bare except, so that
                # KeyboardInterrupt and SystemExit still propagate.)
                _log.warning(traceback.format_exc())
                continue

            # The kmad alignment covers the whole template sequence and the
            # whole range, so the positions are reset accordingly.
            alignment.full_query_sequence = range_.sequence
            alignment.query_start = range_.start + 1
            alignment.query_end = range_.end
            alignment.subject_start = 1
            alignment.subject_end = len(template_sequence)
            alignment.query_alignment = kmad_alignment.target_alignment
            alignment.subject_alignment = kmad_alignment.template_alignment

            if alignment.get_percentage_identity() >= \
                    get_min_identity(alignment.count_aligned_residues()):
                good_hits.append(alignment)

    return good_hits
def _preserves_interactions(self, context, candidate_target_segment,
                            candidate_chain_id, interacting_chain_alignments):
    """
    Tell whether a candidate target segment, once aligned to its template
    chain, still covers a residue that interacts with a covered residue of
    one of the already aligned chains.

    :param context: template context; provides sequences, secondary
                    structure, residues and the interaction test
    :param candidate_target_segment: ungapped target sequence to evaluate
    :param candidate_chain_id: template chain the segment is aligned to
    :param interacting_chain_alignments: alignments of the interacting
                                         chains, keyed by chain id
    :return: True as soon as one interacting residue pair is found,
             False otherwise
    """
    # The pdb file in the soup can be different from the blast hit
    # So make an alignment first!
    candidate_alignment = kmad_aligner.align(
        context.get_sequence(candidate_chain_id),
        context.get_secondary_structure(candidate_chain_id),
        candidate_target_segment)

    covered_indices = candidate_alignment.get_covered_template_residues_indices()
    all_candidate_residues = context.get_residues(candidate_chain_id)
    covered_candidate_residues = [all_candidate_residues[index]
                                  for index in covered_indices]

    for chain_id in interacting_chain_alignments:
        partner_alignment = interacting_chain_alignments[chain_id]
        partner_indices = partner_alignment.get_covered_template_residues_indices()
        partner_residues = context.get_residues(chain_id)
        covered_residues = [partner_residues[index]
                            for index in partner_indices]

        _log.debug(
            "checking chain {} {} residues against chain {} {} residues for interaction"
            .format(candidate_chain_id, len(covered_candidate_residues),
                    chain_id, len(covered_residues)))

        # Check every target-covered residue.
        # A single interacting residue pair is enough:
        if any(context.residue_interacts_with(residue, covered_residues)
               for residue in covered_candidate_residues):
            return True

    return False
def _preserves_interactions(self, context, candidate_target_segment,
                            candidate_chain_id, interacting_chain_alignments):
    """
    Check whether the template residues covered by a candidate target
    segment interact with the covered residues of at least one of the
    chains in `interacting_chain_alignments`.

    :param context: template context; provides sequences, secondary
                    structure, residues and the interaction test
    :param candidate_target_segment: ungapped target sequence to evaluate
    :param candidate_chain_id: template chain the segment is aligned to
    :param interacting_chain_alignments: dict of chain id to alignment
    :return: True if at least one interacting residue pair exists, else False
    """
    # The pdb file in the soup can be different from the blast hit
    # So make an alignment first!
    chain_sequence = context.get_sequence(candidate_chain_id)
    chain_secstr = context.get_secondary_structure(candidate_chain_id)
    candidate_alignment = kmad_aligner.align(chain_sequence, chain_secstr,
                                             candidate_target_segment)

    candidate_indices = candidate_alignment.get_covered_template_residues_indices()
    candidate_residues = context.get_residues(candidate_chain_id)
    covered_candidate_residues = [candidate_residues[i] for i in candidate_indices]

    for chain_id in interacting_chain_alignments:
        other_indices = \
            interacting_chain_alignments[chain_id].get_covered_template_residues_indices()
        other_residues = context.get_residues(chain_id)
        covered_residues = [other_residues[i] for i in other_indices]

        _log.debug("checking chain {} {} residues against chain {} {} residues for interaction"
                   .format(candidate_chain_id, len(covered_candidate_residues),
                           chain_id, len(covered_residues)))

        # Check every target-covered residue,
        # stopping at the first interacting residue pair:
        found_interaction = any(
            context.residue_interacts_with(candidate_residue, covered_residues)
            for candidate_residue in covered_candidate_residues)
        if found_interaction:
            return True

    return False
def pick_random_sequences(n):
    """
    Return `n` randomly chosen entries from the fasta file at SPROT_FASTA.

    :param n: number of sequences to pick
    :return: dict of sequence key to sequence, holding `n` entries
    """
    sprot_sequences = parse_fasta(SPROT_FASTA)

    # random.sample() only accepts a sequence as population on python 3.11
    # and later; a dict keys view raises TypeError there. Hence the list.
    keys = random.sample(list(sprot_sequences.keys()), n)

    return {key: sprot_sequences[key] for key in keys}


sequences = pick_random_sequences(10)
for key in sequences:
    # Keep asking until the domain alignments arrive; an HTTPError is
    # retried indefinitely.
    while True:
        try:
            domain_alignments = domain_aligner.get_domain_alignments(
                sequences[key])
            break
        except HTTPError:
            continue

    # Compare the identity of each domain alignment with the identity of
    # aligning the full sequence to that same template.
    for domain_alignment in domain_alignments:
        template_seq = dssp.get_sequence(domain_alignment.template_id)
        template_secstr = dssp.get_secondary_structure(
            domain_alignment.template_id)

        full_alignment = kmad_aligner.align(template_seq, template_secstr,
                                            sequences[key])

        print(key, domain_alignment.template_id,
              domain_alignment.get_percentage_identity(),
              full_alignment.get_percentage_identity())
def _make_alignments(self, main_target_sequence, target_species_id,
                     main_domain_alignment, context, require_resnum):
    """
    Build one alignment per template chain, starting from the main target
    and expanding over the chains that interact with what was aligned so far.

    :param main_target_sequence: sequence of the main target; its sequence
                                 id becomes the target id of the main chains
    :param target_species_id: species id used to look up target sequences
                              for the other chains
    :param main_domain_alignment: the domain alignment of the main target
    :param context: template context (chains, sequences, interactions)
    :param require_resnum: target residue number that the main chain's
                           alignment must cover, or None for no requirement
    :return: dict of template chain id to alignment; chains without a
             suitable target get a poly-A alignment
    :raises RuntimeError: if `require_resnum` is not covered by the
                          alignment of the main template chain
    """
    alignments = {}
    main_chain_id = main_domain_alignment.template_id.chain_id

    # Choose what chains to align the main_target_on
    main_target_chain_ids = self._pick_identical_chains(main_chain_id, context)
    ModelLogger.get_current().add(
        "using template chains {} for the main target sequence".format(main_target_chain_ids))

    for chain_id in main_target_chain_ids:
        local_alignment = kmad_aligner.align(
            context.get_sequence(chain_id),
            context.get_secondary_structure(chain_id),
            main_domain_alignment.get_target_sequence())

        chain_alignment = DomainAlignment(local_alignment.target_alignment,
                                          local_alignment.template_alignment,
                                          main_domain_alignment.range,
                                          main_domain_alignment.template_id)
        chain_alignment.target_id = model_storage.get_sequence_id(main_target_sequence)
        alignments[chain_id] = chain_alignment

    if require_resnum is not None:
        if not alignments[main_chain_id].is_target_residue_covered(require_resnum):
            raise RuntimeError("Cannot align to chain {} so that residue {} is covered"
                               .format(main_chain_id, require_resnum))

    # Try to find and align target sequences for interacting chains in the template,
    # while keeping in mind which residues interact and must thus be covered by the alignment.
    # We expand the set of involved template chains with every iteration,
    # until all template chains have been added.
    while len(alignments) < len(context.get_chain_ids()):

        # First, remember with which aligned chains each candidate chain interacts:
        candidate_chains_interacts_with = {}
        for aligned_chain_id in alignments:
            for interacting_chain_id in context.list_interacting_chains(aligned_chain_id):
                ModelLogger.get_current().add("template chain {} interacts with {}"
                                              .format(aligned_chain_id, interacting_chain_id))

                # Skip those that we've already aligned, to prevent infinite loops:
                if interacting_chain_id in alignments:
                    continue

                candidate_chains_interacts_with.setdefault(
                    interacting_chain_id, []).append(aligned_chain_id)

        if not candidate_chains_interacts_with:
            break  # Nothing more to add

        # iterate over chains that might interact with the chains that are already in the set:
        for candidate_chain_id in candidate_chains_interacts_with:
            interacting_chain_alignments = {}
            for partner_chain_id in candidate_chains_interacts_with[candidate_chain_id]:
                interacting_chain_alignments[partner_chain_id] = alignments[partner_chain_id]

            template_chain_sequence = context.get_sequence(candidate_chain_id)
            # NOTE(review): the secondary structure is requested here, but
            # its value is not used further in this method.
            context.get_secondary_structure(candidate_chain_id)

            potential_target_sequences = self._find_target_sequences(
                template_chain_sequence, target_species_id)
            ModelLogger.get_current().add(
                "choosing target sequence for template chain {} from {}"
                .format(candidate_chain_id, potential_target_sequences.keys()))

            chosen_alignment = self._choose_best_target_alignment(
                context, interacting_chain_alignments,
                potential_target_sequences, candidate_chain_id)

            if chosen_alignment is None:
                # No target fits this chain: fill it up with alanines.
                chosen_alignment = self._make_poly_A_alignment(context, candidate_chain_id)
                chosen_alignment.target_id = "poly-A"
                ModelLogger.get_current().add(
                    "found no target for template chain {}, placing poly-A"
                    .format(candidate_chain_id))

            alignments[candidate_chain_id] = chosen_alignment

    return alignments
s = "" j = 0 for i in range(len(ref)): if ref[i].isalpha(): s += ins[j] j += 1 else: s += ref[i] return s sequence = "CWAVAVAVGNDGAVAVAVWC" secstr = "EEEEEEEE EEEEEEEE" target = "CWAVAVAVAVAVGGGGGGVAVAVAVAVWC" kmad_alignment = kmad_aligner.align(sequence, secstr, target) clustal_alignment = clustal_aligner.align({ 'template': sequence, 'target': target }) print 'kmad' print kmad_alignment.target_alignment print gap_equally(kmad_alignment.template_alignment, secstr) print kmad_alignment.template_alignment print 'clustal' print clustal_alignment.aligned_sequences['target'] print gap_equally(clustal_alignment.aligned_sequences['template'], secstr) print clustal_alignment.aligned_sequences['template']
# Configure the aligners and blast with the settings constants.
domain_aligner.similar_ranges_min_overlap_percentage = SIMILAR_RANGES_MIN_OVERLAP_PERCENTAGE
domain_aligner.similar_ranges_max_length_difference_percentage = SIMILAR_RANGES_MAX_LENGTH_DIFFERENCE_PERCENTAGE
kmad_aligner.kmad_exe = KMAD_EXE
blaster.blastp_exe = BLASTP_EXE


def pick_random_sequences(n):
    """
    Return `n` randomly chosen entries from the fasta file at SPROT_FASTA.

    :param n: number of sequences to pick
    :return: dict of sequence key to sequence, holding `n` entries
    """
    sprot_sequences = parse_fasta(SPROT_FASTA)

    # random.sample() only accepts a sequence as population on python 3.11
    # and later; a dict keys view raises TypeError there. Hence the list.
    keys = random.sample(list(sprot_sequences.keys()), n)

    return {key: sprot_sequences[key] for key in keys}


sequences = pick_random_sequences(10)
for key in sequences:
    # Keep asking until the domain alignments arrive; an HTTPError is
    # retried indefinitely.
    while True:
        try:
            domain_alignments = domain_aligner.get_domain_alignments(sequences[key])
            break
        except HTTPError:
            continue

    # Compare the identity of each domain alignment with the identity of
    # aligning the full sequence to that same template.
    for domain_alignment in domain_alignments:
        template_seq = dssp.get_sequence(domain_alignment.template_id)
        template_secstr = dssp.get_secondary_structure(domain_alignment.template_id)

        full_alignment = kmad_aligner.align(template_seq, template_secstr, sequences[key])

        print(key, domain_alignment.template_id,
              domain_alignment.get_percentage_identity(),
              full_alignment.get_percentage_identity())
def gap_equally(ref, ins):
    """
    Lay the characters of `ins` out over the letter positions of `ref`.

    Every alphabetic character in `ref` is replaced by the next unused
    character of `ins`; every other character of `ref` (a gap, for
    instance) is copied as it is.

    :param ref: reference string that determines where the gaps are
    :param ins: string whose characters are inserted at the letter positions
    :return: string of the same length as `ref`
    :raises IndexError: if `ins` has fewer characters than `ref` has letters
    """
    pieces = []
    next_index = 0
    for character in ref:
        if character.isalpha():
            pieces.append(ins[next_index])
            next_index += 1
        else:
            pieces.append(character)

    return "".join(pieces)


# NOTE(review): the secondary structure literal is shorter than the sequence
# in this view - confirm that its whitespace is intact.
sequence = "CWAVAVAVGNDGAVAVAVWC"
secstr = "EEEEEEEE EEEEEEEE"
target = "CWAVAVAVAVAVGGGGGGVAVAVAVAVWC"

kmad_alignment = kmad_aligner.align(sequence, secstr, target)
clustal_alignment = clustal_aligner.align({'template': sequence, 'target': target})

# Single-argument print calls give the same output on python 2 and 3.
print('kmad')
print(kmad_alignment.target_alignment)
print(gap_equally(kmad_alignment.template_alignment, secstr))
print(kmad_alignment.template_alignment)

print('clustal')
print(clustal_alignment.aligned_sequences['target'])
print(gap_equally(clustal_alignment.aligned_sequences['template'], secstr))
print(clustal_alignment.aligned_sequences['template'])