Example #1
    def parse_ss_def_file(self, contents):
        '''This parser is forgiving and allows leading whitespace.'''
        mapping = {}
        for line in (l.strip() for l in contents.split('\n')):
            if not line or line.startswith('#'):
                continue
            tokens = line.split()
            if len(tokens) != 2:
                raise RosettaFileParsingException('Lines in a secondary structure definition file must have exactly two entries.')

            positions = parse_range(tokens[0])
            ss = sorted(set(tokens[1].upper()))
            for p in positions:
                if mapping.get(p) and mapping[p] != ss:
                    raise RosettaFileParsingException('There are conflicting definitions for residue %d (%s and %s).' % (p, ''.join(mapping[p]), ''.join(ss)))
                mapping[p] = ss
        self.data = mapping
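
The expected input is a plain-text file with one residue range and one set of allowed secondary structure codes per line; blank lines and #-comments are skipped. Below is a minimal, self-contained sketch of that convention. parse_range and RosettaFileParsingException come from the surrounding codebase; the simple parse_range stand-in here is an assumption for illustration only, not the project's implementation.

    # Hypothetical stand-in for the codebase's parse_range: expands '1-3,5' into [1, 2, 3, 5].
    def parse_range(spec):
        positions = []
        for part in spec.split(','):
            if '-' in part:
                start, end = part.split('-')
                positions.extend(range(int(start), int(end) + 1))
            else:
                positions.append(int(part))
        return positions

    contents = '''
    # residue range    allowed secondary structure types
    1-3    H
    4      HE
    '''

    mapping = {}
    for line in (l.strip() for l in contents.split('\n')):
        if not line or line.startswith('#'):
            continue
        positions, ss = line.split()
        for p in parse_range(positions):
            mapping[p] = sorted(set(ss.upper()))

    print(mapping)  # maps residues 1-3 to ['H'] and residue 4 to ['E', 'H']
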
Example #2
    def end_document(self):
        assert(self.counters['entry'] == 1)

        residue_count = 0
        residues_matched = {}
        residues_encountered = set()
        atom_to_uniparc_residue_map = {}
        atom_to_seqres_residue_map = {}
        seqres_to_uniparc_residue_map = {}

        UniProtACs = set()
        for r in self.residues:
            if r.UniProtAC:
                UniProtACs.add(r.UniProtAC)

        ACC_to_UPARC_mapping = uniprot_map('ACC', 'UPARC', list(UniProtACs), cache_dir = self.cache_dir)
        assert(sorted(ACC_to_UPARC_mapping.keys()) == sorted(list(UniProtACs)))
        for k, v in ACC_to_UPARC_mapping.iteritems():
            assert(len(v) == 1)
            ACC_to_UPARC_mapping[k] = v[0]

        map_chains = set()
        for r in self.residues:
            if not r.PDBResidueID.isalnum():
                # These are not valid PDB residue IDs - the SIFTS XML convention sometimes assigns negative residue IDs
                # to unobserved residues before the first ATOM record (only if the first residue ID is 1?)
                pass

            # Store the PDB->UniProt mapping
            if r.has_pdb_to_uniprot_mapping():
                UniProtAC = r.UniProtAC
                UniParcID = ACC_to_UPARC_mapping[UniProtAC]
                self.uniparc_ids.add(UniParcID)

            full_pdb_residue_ID = r.get_pdb_residue_id()
            PDBChainID = r.PDBChainID
            map_chains.add(PDBChainID)
            residues_matched[PDBChainID] = residues_matched.get(PDBChainID, 0)

            if not r.WasNotObserved:
                # Do not add ATOM mappings when the ATOM data does not exist
                if r.has_pdb_to_uniprot_mapping():
                    atom_to_uniparc_residue_map[PDBChainID] = atom_to_uniparc_residue_map.get(PDBChainID, {})
                    atom_to_uniparc_residue_map[PDBChainID][full_pdb_residue_ID] = (UniParcID, r.UniProtResidueIndex)

                atom_to_seqres_residue_map[PDBChainID] = atom_to_seqres_residue_map.get(PDBChainID, {})
                atom_to_seqres_residue_map[PDBChainID][full_pdb_residue_ID] = r.PDBeResidueID

            if r.has_pdb_to_uniprot_mapping():
                seqres_to_uniparc_residue_map[PDBChainID] = seqres_to_uniparc_residue_map.get(PDBChainID, {})
                seqres_to_uniparc_residue_map[PDBChainID][r.PDBeResidueID] = (UniParcID, r.UniProtResidueIndex)

            # Make sure we only have at most one match per PDB residue
            assert(full_pdb_residue_ID not in residues_encountered)
            residues_encountered.add(full_pdb_residue_ID)

            # Count the number of exact sequence matches
            PDBResidue3AA = r.PDBResidue3AA
            pdb_residue_type = (residue_type_3to1_map.get(PDBResidue3AA)
                                or self.modified_residues.get(PDBResidue3AA)
                                or protonated_residue_type_3to1_map.get(PDBResidue3AA)
                                or non_canonical_amino_acids.get(PDBResidue3AA))
            if r.has_pdb_to_uniprot_mapping():
                if pdb_residue_type == r.UniProtResidue1AA:
                    residues_matched[PDBChainID] += 1
            residue_count += 1

        # Create the SequenceMaps
        for c in map_chains:
            if residues_matched[c] > 0:
                # 1IR3 has chains A and B:
                # Chain A has mappings from atom and seqres (PDBe) residues to UniParc as usual
                # Chain B (18 residues long) has mappings from atom to seqres residues but not to UniParc residues
                self.atom_to_uniparc_sequence_maps[c] = PDBUniParcSequenceMap.from_dict(atom_to_uniparc_residue_map[c])
                self.seqres_to_uniparc_sequence_maps[c] = PDBUniParcSequenceMap.from_dict(seqres_to_uniparc_residue_map[c])
            self.atom_to_seqres_sequence_maps[c] = SequenceMap.from_dict(atom_to_seqres_residue_map[c])

        # Check the match percentage
        total_residues_matched = sum(residues_matched.values())
        if total_residues_matched == 0:
            if self.pdb_id and self.pdb_id in NoSIFTSPDBUniParcMappingCases:
                if self.require_uniprot_residue_mapping:
                    raise NoSIFTSPDBUniParcMapping('The PDB file %s has a bad or missing SIFTS mapping at the time of writing.' % self.pdb_id)
                else:
                    colortext.error('Warning: The PDB file %s has a bad or missing SIFTS mapping at the time of writing so there is no PDB -> UniProt residue mapping.' % self.pdb_id)
            else:
                if self.require_uniprot_residue_mapping:
                    raise Exception('No residue information matching PDB residues to UniProt residues was found.')
                else:
                    colortext.error('Warning: No residue information matching PDB residues to UniProt residues was found.')
        else:
            percentage_matched = float(total_residues_matched)*100.0/float(residue_count)
            if percentage_matched < self.acceptable_sequence_percentage_match:
                if self.pdb_id and self.pdb_id in BadSIFTSMappingCases:
                    raise BadSIFTSMapping('The PDB file %s has a known bad SIFTS mapping at the time of writing.' % self.pdb_id)
                else:
                    raise Exception('Expected %.2f%% sequence match on matched residues but the SIFTS results only gave us %.2f%%.' % (self.acceptable_sequence_percentage_match, percentage_matched))

        # Merge the ranges for the region mappings i.e. so [1-3],[3-86] becomes [1-86]
        region_mapping = self.region_mapping
        for chain_id, chain_details in region_mapping.iteritems():
            for dbSource, source_details in chain_details.iteritems():
                for dbAccessionId, range_list in source_details.iteritems():
                    source_details[dbAccessionId] = merge_range_pairs(range_list)

        # Check to see if the expected numbering schemes hold
        for k, v in expected_residue_numbering_schemes.iteritems():
            if self.region_map_coordinate_systems.get(k):
                assert(self.region_map_coordinate_systems[k] == set([v]))

        pfam_scop_mapping = {}
        scop_pfam_mapping = {}
        for chain_id, chain_details in region_mapping.iteritems():
            if chain_details.get('Pfam') and chain_details.get('SCOP'):
                for pfamAccessionId, pfam_range_lists in chain_details['Pfam'].iteritems():
                    pfam_residues = parse_range(','.join(['%d-%d' % (r[0], r[1]) for r in pfam_range_lists]))
                    for scopAccessionId, scop_range_lists in chain_details['SCOP'].iteritems():
                        scop_residues = parse_range(','.join(['%d-%d' % (r[0], r[1]) for r in scop_range_lists]))
                        num_same_residues = len(set(pfam_residues).intersection(set(scop_residues)))
                        if num_same_residues > 10:
                            Pfam_match_quality = float(num_same_residues) / float(len(pfam_residues))
                            SCOP_match_quality = float(num_same_residues) / float(len(scop_residues))
                            if (Pfam_match_quality >= self.domain_overlap_cutoff) or (SCOP_match_quality >= self.domain_overlap_cutoff):
                                pfam_scop_mapping[pfamAccessionId] = pfam_scop_mapping.get(pfamAccessionId, DomainMatch(pfamAccessionId, 'Pfam'))
                                pfam_scop_mapping[pfamAccessionId].add(scopAccessionId, 'SCOP', SCOP_match_quality)
                                scop_pfam_mapping[scopAccessionId] = scop_pfam_mapping.get(scopAccessionId, DomainMatch(scopAccessionId, 'SCOP'))
                                scop_pfam_mapping[scopAccessionId].add(pfamAccessionId, 'Pfam', Pfam_match_quality)

        self.pfam_scop_mapping = pfam_scop_mapping
        self.scop_pfam_mapping = scop_pfam_mapping

        self._validate()
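
The range-merging step above ("[1-3],[3-86] becomes [1-86]") relies on merge_range_pairs from the surrounding codebase. The sketch below is a minimal, self-contained illustration of that behaviour under the assumption that ranges are inclusive (start, end) pairs and that overlapping or directly adjacent pairs should be coalesced; it is not the project's implementation.

    def merge_range_pairs(pairs):
        # Assumption for illustration: merge overlapping or adjacent inclusive (start, end) pairs.
        merged = []
        for start, end in sorted(pairs):
            if merged and start <= merged[-1][1] + 1:
                merged[-1] = (merged[-1][0], max(merged[-1][1], end))
            else:
                merged.append((start, end))
        return merged

    print(merge_range_pairs([(1, 3), (3, 86)]))  # [(1, 86)]
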