def families(params='published params.csv', sanity_file=None):
    '''Build a Family for every protein that has a sequence alignment.

    Structure files are found with the glob structures/aligned_*.pdb and
    alignments with the glob "gonnet aligned/* with *.clu" (both relative
    to the current working directory). The id of each file is the four
    characters captured from its name.

    params is passed through unchanged as the last argument of every
    Family.

    sanity_file, when not None, is a path that is overwritten with the
    repr() of the three lookup tables (structure paths, alignment paths,
    template sequence names), one table per line.

    Returns a CIDict mapping each alignment's id to its Family. Every
    alignment id is looked up in the structure table without a fallback.
    '''
    def index_by_pdbid(glob_pattern, id_regex):
        # Key every globbed path by the four-character id in its name
        found = glob.glob(glob_pattern)
        return CIDict([(re.match(id_regex, found_path).group(1), found_path)
                       for found_path in found])

    # PDBID -> path of the aligned structure
    stru_path = index_by_pdbid('structures/aligned_*.pdb',
                               r'structures[/\\]aligned_(....)\.pdb')

    # PDBID -> path of the multiple sequence alignment
    msa_path = index_by_pdbid('gonnet aligned/* with *.clu',
                              r'gonnet aligned[/\\](....) with .*\.clu')

    # PDBID -> name of the structure's own sequence within the alignment
    template_id = CIDict([(msa_id, 'template_' + msa_id.upper())
                          for msa_id in msa_path.keys()])

    if sanity_file is not None:
        tables = (stru_path, msa_path, template_id)
        with open(sanity_file, 'w') as handle:
            handle.write('\n'.join([repr(table) for table in tables]))

    # One Family per alignment
    result = []
    for msa_id in msa_path.keys():
        family = Family(msa_id, stru_path[msa_id], msa_path[msa_id],
                        template_id[msa_id], params)
        result.append((msa_id, family))
    return CIDict(result)
def create_session(workingdir, load=True):
    '''Load the aligned structures and trim them down to the dataset.

    workingdir is the folder whose 'structures' subfolder is handed to
    groups_from_folder together with the pattern 'aligned_(.*).pdb';
    load is forwarded to groups_from_folder unchanged.

    A structure is kept only if the folder 'non ppi residues' (relative
    to the current working directory, not to workingdir) contains a file
    whose name starts with its upper-cased id followed by '.csv'. Dropped
    structures are removed from the returned mapping and deleted through
    cmd.delete. 1E54 is always removed afterwards.

    Side effects: stores the mapping on stored.groupdict, runs
    cs_make_selections on it, and switches every object from the lines
    to the cartoon representation.

    Returns the CIDict of the remaining groups.

    NOTE(review): the final del groupdict['1E54'] is unguarded, so this
    presumably fails when 1E54 is not among the loaded structures -
    confirm against CIDict.
    '''
    # Load structures:
    groupdict = CIDict(groups_from_folder(workingdir + '/structures',
                                          ['aligned_(.*).pdb'],
                                          load=load))

    # Collect the ids that have a spreadsheet in 'non ppi residues'.
    # The pattern is a raw string so that \d and \. reach the regex
    # engine without relying on invalid string escapes.
    included_proteins = set()
    for filename in os.listdir('non ppi residues'):
        match = re.match(r'(\d...)\.csv', filename)
        if match is not None:
            included_proteins.add(match.group(1))

    # Remove structures not in the dataset. Iterate over a snapshot of
    # the keys, because entries are deleted inside the loop.
    for pdbid in list(groupdict.keys()):
        if pdbid.upper() not in included_proteins:
            del groupdict[pdbid]
            cmd.delete(pdbid)

    # In case something goes wrong, so you can look at the work in progress:
    stored.groupdict = groupdict

    # Delete 1E54, since its interface is included in the non_ppi dataset
    cmd.delete('1E54')
    del groupdict['1E54']

    cs_make_selections(groupdict)

    # Change from line to cartoon representation
    cmd.hide('lines', '*')
    cmd.show('cartoon', '*')

    return groupdict
print('Alignments loaded... ' + repr(oracles)) # Calculate the moments for the pdb sequences pdb_moments = CIDict([(structure.get_id(), moment(structure, resi_lists[structure.get_id()], centers[structure.get_id()], partial(calculator_adapter, calc), oracles[structure.get_id()].pdb_sequence())) for structure in structures]) print('pdb moments calculated! ' + repr(pdb_moments)) # Calculate the family moments, that is, the moments for all # sequences in the alignments family_moments = CIDict((pdbid, list()) for pdbid in alignments.keys()) for pdbid in family_moments.keys(): for seq_index in range(len(oracles[pdbid].get_alignment())): # Calculate the moment family_moment = moment(structure_dict[pdbid], resi_lists[pdbid], centers[pdbid], partial(calculator_adapter, calc), oracles[pdbid].sequence(seq_index)) # Calculate the %identity with the pdb sequence pdb_sequence = oracles[pdbid].get_pdb_seq_record().seq sequence = oracles[pdbid].get_alignment()[seq_index].seq normalized_distance = matrices.compare(pdb_sequence, sequence, identity) seq_id = oracles[pdbid].get_alignment()[seq_index].id
# Retrieve Daniel's aligned structures structures = CIDict() parser = PDBParser() with warnings.catch_warnings(): # When importing Daniel's aligned structures, the PDBParser gives # warnings about "invalid or missing" b factors and occupancies # There are so many that if you let it display warnings it'll never # finish parsing warnings.simplefilter('ignore') for pdbid in ('1A0S', '1AF6'): structures.update({pdbid: \ parser.get_structure(pdbid, 'aligned_{}.pdb'.format(pdbid))}) z_coords = CIDict() for pdbid in structures.keys(): z_coords.update({pdbid: list()}) iter_sequence = iter(sequences[pdbid]) for residue in structures[pdbid].get_residues(): try: calpha = residue.child_dict['CA'] except KeyError: # HOH and other heteroatoms will not have a C-alpha # They will also not have a corresponding letter in the sequence # So, skip them continue resi = residue.get_id()[1] z = calpha.get_coord()[2] # This structure should have the same sequence as the one
class Calculator(object):
    '''
    Carries out ez-beta calculations using a set of parameters given to it
    at initialization.

    The set of parameters must be a spreadsheet given as an iterable of
    rows: a list of lists, or an iterator that yields rows. The first row
    must contain the one-letter codes of each amino acid for which
    parameters are to be given; blank cells in that row are skipped.
    Underneath each letter is a column containing its parameters in this
    order:
        Curve type ('gaussian' or 'sigmoidal')
        E0/Emin
        Zmid/Zmin
        n/sigma

    Calculating pseudo-energies:
        calculate(self, resn, z): gives pseudoenergy given a one-letter or
        three-letter code for an amino acid, and a z coordinate

    The "normalize" option is vestigial - it used to express an energy as
    a fraction of the maximum possible energy that that kind of residue
    can have. That doesn't really make much sense, but it is still here so
    that old scripts can be rerun if needed.
    '''

    def __init__(self, iterable, normalize = False):
        '''
        Read the parameter spreadsheet into self.ref.

        iterable: the rows of the spreadsheet, in the order described in
            the class docstring. Exactly five rows are consumed; fewer
            rows end in StopIteration.
        normalize: stored as self.normalize and consulted by calculate().

        After construction self.ref maps each letter to a dict holding
        'curve' plus either 'e0', 'zmid', 'n' (sigmoidal) or 'emin',
        'zmin', 'sigma' (gaussian), all converted with float().
        '''
        self.normalize = normalize
        self.ref = CIDict()
        colmap = CIDict()

        # iter() lets a plain list of rows work as well as an iterator;
        # next() works on both Python 2 and Python 3 iterators.
        rows = iter(iterable)

        # Header row: remember which column belongs to which letter
        for column, letter in enumerate(next(rows)):
            if letter != '':
                self.ref.update({letter: dict()})
                colmap.update({letter: column})

        # Second row: the curve type of every letter
        curvetypes = next(rows)
        for letter, column in colmap.items():
            self.ref[letter]['curve'] = curvetypes[column]

        # The next three rows mean different things depending on the
        # curve type of the column they sit in.
        for parameter in [{'sigmoidal': 'e0', 'gaussian': 'emin'},
                          {'sigmoidal': 'zmid', 'gaussian': 'zmin'},
                          {'sigmoidal': 'n', 'gaussian': 'sigma'}]:
            paramrow = next(rows)
            for letter in self.ref.keys():
                curvetype = self.ref[letter]['curve']
                self.ref[letter][parameter[curvetype]] = \
                    float(paramrow[colmap[letter]])

    def calculate(self, resn, z):
        '''
        gives pseudoenergy given a one-letter or three-letter code for an
        amino acid, and a z coordinate

        Only abs(z) enters the formulas, so the result is the same for z
        and -z.

        Raises a NoParameters exception when you use an amino acid that it
        doesn't have parameters for; always have some way of handling this
        when you call this method!

        Raises ValueError when the stored curve type is neither 'gaussian'
        nor 'sigmoidal'.

        NOTE(review): a three-letter code is translated through one_letter
        before the NoParameters handling, so a code unknown to that table
        fails in that lookup instead - confirm whether callers rely on it.
        '''
        if len(resn) == 3:
            resn = one_letter[resn]

        try:
            params = self.ref[resn]
        except KeyError:
            raise NoParameters('No parameters for resn ' + str(resn))

        curve = params['curve']
        depth = abs(z)

        if curve == 'gaussian':
            exponent = -((depth - params['zmin']) ** 2) \
                       / (2 * params['sigma'] ** 2)
            output = params['emin'] * math.exp(exponent)
        elif curve == 'sigmoidal':
            output = params['e0'] \
                     / (1 + (depth / params['zmid']) ** params['n'])
        else:
            # Without this branch the method would die further down with
            # an UnboundLocalError that hides the real problem.
            raise ValueError('Unknown curve type ' + repr(curve)
                             + ' for resn ' + str(resn))

        if self.normalize:
            # Normalized trends are high energy in middle of the membrane
            # for sigmoidal, high energy in the head-group region for
            # gaussian. For aromatics and small hydrophobics (anything with
            # negative E0 or Emin) these trends should be reversed
            if curve == 'gaussian':
                scale = params['emin']
            else:
                scale = params['e0']
            output /= scale
            if scale < 0:
                output = 1 - output

        return output
# I expect there to only be one pdbid assert len(pdbid_set) == 1, 'more than 1 pdbid in one spreadsheet' pdbid = list(pdbid_set)[0] weights.update({pdbid: spreadsheet}) # selections maps pdbids to sets of resis selections = CIDict() for pdbid, spreadsheet in weights.items(): def not_blank(string): return string != '' resis = filter(not_blank, spreadsheet.get_column('resi')) selections.update({pdbid: set(resis)}) # A new global variable for looping over these proteins asymmetric_dataset = CIDict([(pdbid, groupdict[pdbid]) \ for pdbid in weights.keys()]) # Make the spreadsheets available through groupdict for pdbid, group in asymmetric_dataset.items(): group.non_ppi = weights[pdbid] # Make selection for pdbid, selection in selections.items(): cmd.select(pdbid.upper() + '.non_ppi', 'none') for resi in selection: cmd.select(pdbid.upper() + '.non_ppi', '{0}.molecule & i. {1} | {0}.non_ppi' \ .format(pdbid.upper(),resi)) finally: