def parse_ss_def_file(self, contents):
    '''Parse a secondary structure definition file and store the result in self.data.

    This parser is forgiving and allows leading whitespace. Blank lines and
    comment lines (first non-whitespace character '#') are skipped. Every other
    line must contain exactly two whitespace-separated tokens:

      1. a residue range expression, expanded via parse_range; and
      2. a string of secondary structure type characters (case-insensitive).

    self.data is set to a dict mapping each residue position (int) to a sorted
    list of the unique, uppercased secondary structure characters assigned to it.

    Raises RosettaFileParsingException if a line does not have exactly two
    tokens, or if two lines assign conflicting definitions to the same residue.
    '''
    mapping = {}
    for raw_line in contents.split('\n'):
        line = raw_line.strip()
        if not line or line.startswith('#'):
            continue  # blank line or comment
        tokens = line.split()
        if len(tokens) != 2:
            raise RosettaFileParsingException('Lines in a secondary structure definition file must have exactly two entries.')
        positions = parse_range(tokens[0])
        # Normalize the SS assignment so that e.g. 'HhE' and 'EH' compare equal.
        ss = sorted(set(tokens[1].upper()))
        for p in positions:
            # Membership test (not truthiness of the stored value): avoids a
            # double lookup and is correct even if a stored value were falsy.
            if p in mapping and mapping[p] != ss:
                raise RosettaFileParsingException('There are conflicting definitions for residue %d (%s and %s).' % (p, ''.join(mapping[p]), ''.join(ss)))
            mapping[p] = ss
    self.data = mapping
def end_document(self):
    '''Finalize parsing of a SIFTS XML document.

    Builds, from self.residues:
      - atom->UniParc, atom->seqres(PDBe) and seqres->UniParc residue maps per chain
        (stored as SequenceMap / PDBUniParcSequenceMap objects on self);
      - self.uniparc_ids (UniParc IDs seen via the ACC->UPARC UniProt mapping);
      - merged per-chain region mappings (self.region_mapping, ranges coalesced);
      - self.pfam_scop_mapping / self.scop_pfam_mapping (domain cross-references).

    Raises NoSIFTSPDBUniParcMapping / BadSIFTSMapping / Exception when the
    PDB->UniProt residue match is missing or below the acceptable threshold,
    depending on self.require_uniprot_residue_mapping and the known-bad-case lists.
    Ends by calling self._validate().
    '''
    # Exactly one <entry> element is expected per SIFTS document.
    assert(self.counters['entry'] == 1)
    residue_count = 0
    residues_matched = {}        # chain ID -> count of exact PDB/UniProt residue-type matches
    residues_encountered = set() # guards against duplicate PDB residue IDs
    atom_to_uniparc_residue_map = {}
    atom_to_seqres_residue_map = {}
    seqres_to_uniparc_residue_map = {}

    # Collect the UniProt ACs referenced by the residues and resolve each to a
    # single UniParc ID via the (cached) UniProt mapping service.
    UniProtACs = set()
    for r in self.residues:
        if r.UniProtAC:
            UniProtACs.add(r.UniProtAC)
    ACC_to_UPARC_mapping = uniprot_map('ACC', 'UPARC', list(UniProtACs), cache_dir = self.cache_dir)
    assert(sorted(ACC_to_UPARC_mapping.keys()) == sorted(list(UniProtACs)))
    for k, v in ACC_to_UPARC_mapping.iteritems():
        # Each accession must map to exactly one UniParc ID; flatten the list.
        assert(len(v) == 1)
        ACC_to_UPARC_mapping[k] = v[0]

    map_chains = set()
    for r in self.residues:
        if not(r.PDBResidueID.isalnum() and int(r.PDBResidueID.isalnum()) < 0):
            # These are not valid PDB residue IDs - the SIFTS XML convention sometimes
            # assigns negative residue IDs to unobserved residues before the first
            # ATOM record (only if the first residue ID is 1?)
            # NOTE(review): int(r.PDBResidueID.isalnum()) converts a bool (0/1) and can
            # never be < 0, so this condition is always True and the branch is a no-op —
            # presumably int(r.PDBResidueID) was intended; confirm before changing.
            pass

        # Store the PDB->UniProt mapping
        if r.has_pdb_to_uniprot_mapping():
            UniProtAC = r.UniProtAC
            UniParcID = ACC_to_UPARC_mapping[UniProtAC]
            self.uniparc_ids.add(UniParcID)

        full_pdb_residue_ID = r.get_pdb_residue_id()
        PDBChainID = r.PDBChainID
        map_chains.add(PDBChainID)
        residues_matched[PDBChainID] = residues_matched.get(PDBChainID, 0)

        if not r.WasNotObserved:
            # Do not add ATOM mappings when the ATOM data does not exist
            if r.has_pdb_to_uniprot_mapping():
                atom_to_uniparc_residue_map[PDBChainID] = atom_to_uniparc_residue_map.get(PDBChainID, {})
                atom_to_uniparc_residue_map[PDBChainID][full_pdb_residue_ID] = (UniParcID, r.UniProtResidueIndex)
            atom_to_seqres_residue_map[PDBChainID] = atom_to_seqres_residue_map.get(PDBChainID, {})
            atom_to_seqres_residue_map[PDBChainID][full_pdb_residue_ID] = r.PDBeResidueID

        if r.has_pdb_to_uniprot_mapping():
            seqres_to_uniparc_residue_map[PDBChainID] = seqres_to_uniparc_residue_map.get(PDBChainID, {})
            seqres_to_uniparc_residue_map[PDBChainID][r.PDBeResidueID] = (UniParcID, r.UniProtResidueIndex)

        # Make sure we only have at most one match per PDB residue
        assert(full_pdb_residue_ID not in residues_encountered)
        residues_encountered.add(full_pdb_residue_ID)

        # Count the number of exact sequence matches, resolving the 3-letter code
        # through the canonical, modified, protonated and non-canonical tables in turn.
        PDBResidue3AA = r.PDBResidue3AA
        pdb_residue_type = residue_type_3to1_map.get(PDBResidue3AA) or self.modified_residues.get(PDBResidue3AA) or protonated_residue_type_3to1_map.get(PDBResidue3AA) or non_canonical_amino_acids.get(PDBResidue3AA)
        if r.has_pdb_to_uniprot_mapping():
            if pdb_residue_type == r.UniProtResidue1AA:
                residues_matched[PDBChainID] += 1
        residue_count += 1

    # Create the SequenceMaps
    for c in map_chains:
        if residues_matched[c] > 0:
            # 1IR3 has chains A,
            # Chain A has mappings from atom and seqres (PDBe) residues to UniParc as usual
            # Chain B (18 residues long) has mappings from atom to seqres residues but not to UniParc residues
            self.atom_to_uniparc_sequence_maps[c] = PDBUniParcSequenceMap.from_dict(atom_to_uniparc_residue_map[c])
            self.seqres_to_uniparc_sequence_maps[c] = PDBUniParcSequenceMap.from_dict(seqres_to_uniparc_residue_map[c])
        self.atom_to_seqres_sequence_maps[c] = SequenceMap.from_dict(atom_to_seqres_residue_map[c])

    # Check the match percentage
    total_residues_matched = sum([residues_matched[c] for c in residues_matched.keys()])
    if total_residues_matched == 0:
        if self.pdb_id and self.pdb_id in NoSIFTSPDBUniParcMappingCases:
            if self.require_uniprot_residue_mapping:
                raise NoSIFTSPDBUniParcMapping('The PDB file %s has a bad or missing SIFTS mapping at the time of writing.' % self.pdb_id)
            else:
                colortext.error('Warning: The PDB file %s has a a bad or missing SIFTS mapping at the time of writing so there is no PDB -> UniProt residue mapping.' % self.pdb_id)
        else:
            if self.require_uniprot_residue_mapping:
                raise Exception('No residue information matching PDB residues to UniProt residues was found.')
            else:
                colortext.error('Warning: No residue information matching PDB residues to UniProt residues was found.')
    else:
        percentage_matched = float(total_residues_matched)*100.0/float(residue_count)
        if percentage_matched < self.acceptable_sequence_percentage_match:
            if self.pdb_id and self.pdb_id in BadSIFTSMappingCases:
                raise BadSIFTSMapping('The PDB file %s has a known bad SIFTS mapping at the time of writing.' % self.pdb_id)
            else:
                raise Exception('Expected %.2f%% sequence match on matched residues but the SIFTS results only gave us %.2f%%.' % (self.acceptable_sequence_percentage_match, percentage_matched))

    # Merge the ranges for the region mappings i.e. so [1-3],[3-86] becomes [1-86]
    region_mapping = self.region_mapping
    for chain_id, chain_details in region_mapping.iteritems():
        for dbSource, source_details in chain_details.iteritems():
            for dbAccessionId, range_list in source_details.iteritems():
                source_details[dbAccessionId] = merge_range_pairs(range_list)

    # Check to see if the expected numbering schemes hold
    for k, v in expected_residue_numbering_schemes.iteritems():
        if self.region_map_coordinate_systems.get(k):
            assert(self.region_map_coordinate_systems[k] == set([v]))

    # Cross-reference Pfam and SCOP domains per chain: two domains are linked when
    # they share more than 10 residues and the overlap fraction of either domain
    # reaches self.domain_overlap_cutoff.
    pfam_scop_mapping = {}
    scop_pfam_mapping = {}
    for chain_id, chain_details in region_mapping.iteritems():
        if chain_details.get('Pfam') and chain_details.get('SCOP'):
            for pfamAccessionId, pfam_range_lists in chain_details['Pfam'].iteritems():
                pfam_residues = parse_range(','.join(['%d-%d' % (r[0], r[1]) for r in pfam_range_lists]))
                for scopAccessionId, scop_range_lists in chain_details['SCOP'].iteritems():
                    scop_residues = parse_range(','.join(['%d-%d' % (r[0], r[1]) for r in scop_range_lists]))
                    num_same_residues = len(set(pfam_residues).intersection(set(scop_residues)))
                    if num_same_residues > 10:
                        Pfam_match_quality = float(num_same_residues) / float(len(pfam_residues))
                        SCOP_match_quality = float(num_same_residues) / float(len(scop_residues))
                        if (Pfam_match_quality >= self.domain_overlap_cutoff) or (SCOP_match_quality >= self.domain_overlap_cutoff):
                            pfam_scop_mapping[pfamAccessionId] = pfam_scop_mapping.get(pfamAccessionId, DomainMatch(pfamAccessionId, 'Pfam'))
                            pfam_scop_mapping[pfamAccessionId].add(scopAccessionId, 'SCOP', SCOP_match_quality)
                            scop_pfam_mapping[scopAccessionId] = scop_pfam_mapping.get(scopAccessionId, DomainMatch(scopAccessionId, 'SCOP'))
                            scop_pfam_mapping[scopAccessionId].add(pfamAccessionId, 'Pfam', Pfam_match_quality)
    self.pfam_scop_mapping = pfam_scop_mapping
    self.scop_pfam_mapping = scop_pfam_mapping

    self._validate()