def create_session(workingdir, load=True):
    '''Load the aligned structures and prepare the session for viewing.

    Builds a CIDict of groups from the ``aligned_*.pdb`` files found in
    ``workingdir + '/structures'``, removes every structure that has no
    matching ``<pdbid>.csv`` file in the 'non ppi residues' folder,
    removes 1E54, makes the selections and switches the display from
    lines to cartoons.

    workingdir -- folder that contains the 'structures' subfolder
    load       -- passed straight through to groups_from_folder

    Returns the dictionary of the groups that are left.

    Note: 'non ppi residues' is looked up relative to the current working
    directory, not relative to ``workingdir``.
    '''
    # Load structures:
    groupdict = CIDict(groups_from_folder(workingdir + '/structures',
                                          ['aligned_(.*).pdb'],
                                          load=load))

    # Collect the ids of the proteins in the dataset. They are upper-cased
    # because they are compared against pdbid.upper() below; without this a
    # lower-case csv filename would never match anything.
    included_proteins = set()
    for filename in os.listdir('non ppi residues'):
        match = re.match(r'(\d...)\.csv', filename)
        if match is not None:
            included_proteins.add(match.group(1).upper())

    # Remove structures not in the dataset. Iterate over a snapshot of the
    # keys: entries are deleted inside the loop, and deleting from a
    # dictionary while iterating over a live view of its keys is an error.
    for pdbid in list(groupdict.keys()):
        if pdbid.upper() not in included_proteins:
            del groupdict[pdbid]
            cmd.delete(pdbid)

    # In case something goes wrong, so you can look at the work in progress:
    stored.groupdict = groupdict

    # Delete 1E54, since its interface is included in the non_ppi dataset.
    # The deletion from groupdict is unguarded, so 1E54 is expected to
    # still be present at this point.
    cmd.delete('1E54')
    del groupdict['1E54']

    cs_make_selections(groupdict)

    # Change from line to cartoon representation
    cmd.hide('lines', '*')
    cmd.show('cartoon', '*')

    return groupdict
def families(params='published params.csv', sanity_file=None):
    '''Build a Family for every alignment found on disk.

    Structures are taken from 'structures/aligned_*.pdb' and multiple
    sequence alignments from 'gonnet aligned/* with *.clu'; in both cases
    the four characters captured from the path are used as the PDBID.

    params      -- handed unchanged to every Family
    sanity_file -- if given, the three lookup tables (structure paths,
                   alignment paths, template ids) are written to this
                   path as text, one repr per line

    Returns a CIDict mapping each PDBID that has an alignment to its
    Family.
    '''
    def captured_ids(pattern, paths):
        # First regex group of every path, in the same order as the paths
        return [re.match(pattern, path).group(1) for path in paths]

    # PDBID -> path of the aligned structure
    structure_files = glob.glob('structures/aligned_*.pdb')
    structure_ids = captured_ids(r'structures[/\\]aligned_(....)\.pdb',
                                 structure_files)
    stru_path = CIDict(zip(structure_ids, structure_files))

    # PDBID -> path of the multiple sequence alignment
    alignment_files = glob.glob('gonnet aligned/* with *.clu')
    alignment_ids = captured_ids(r'gonnet aligned[/\\](....) with .*\.clu',
                                 alignment_files)
    msa_path = CIDict(zip(alignment_ids, alignment_files))

    # PDBID -> name of the structure's own sequence
    template_id = CIDict((key, 'template_' + key.upper())
                         for key in msa_path.keys())

    # Optional dump of the three tables for checking by eye
    if sanity_file is not None:
        with open(sanity_file, 'w') as dump:
            dump.write(repr(stru_path))
            dump.write('\n')
            dump.write(repr(msa_path))
            dump.write('\n')
            dump.write(repr(template_id))

    # One Family per alignment
    return CIDict((key, Family(key,
                               stru_path[key],
                               msa_path[key],
                               template_id[key],
                               params))
                  for key in msa_path.keys())
def load_centers(iterable):
    '''Read named centers from rows and return them as numpy arrays.

    iterable -- rows whose first item is a name and whose second item is
                the text form of a vector; rows with an empty name are
                ignored

    Returns a CIDict mapping each name to a numpy array.

    Parsing of the text: the first character is dropped, the rest is
    split on whitespace, and the last character of every piece is dropped
    before it is converted to a float.
    '''
    centers = CIDict()

    # First pass: collect the raw text for every named row
    for row in iterable:
        name = row[0]
        if name == '':
            continue
        centers.update({name: row[1]})

    # Second pass: replace each piece of text by the vector it describes
    for name, text in centers.items():
        pieces = text[1:].split()
        centers[name] = np.array([float(piece[:-1]) for piece in pieces])

    return centers
def __init__(self, iterable, normalize = False):
    '''Read the parameter spreadsheet row by row.

    iterable  -- the rows of the spreadsheet, either as an iterator (for
                 example a csv reader) or as any other iterable such as a
                 list of lists. Five rows are consumed, in this order:
                 one-letter codes, curve types ('sigmoidal' or
                 'gaussian'), e0/emin, zmid/zmin, n/sigma.
    normalize -- stored as self.normalize

    Fills self.ref, which maps each letter to a dictionary holding
    'curve' plus the three parameters named for that curve type.
    Raises StopIteration if there are fewer than five rows.
    '''
    self.normalize = normalize
    self.ref = CIDict()
    colmap = CIDict()

    # iter() + next() instead of iterable.next(): works for lists as
    # well as iterators. An iterator is returned unchanged by iter(), so
    # it is consumed exactly as before.
    rows = iter(iterable)

    # Header row: remember the column of every non-blank letter
    for column, letter in enumerate(next(rows)):
        if letter != '':
            self.ref.update({letter: dict()})
            colmap.update({letter: column})

    # Second row: curve type of each letter
    curvetypes = next(rows)
    for letter, column in colmap.items():
        self.ref[letter].update({'curve': curvetypes[column]})

    # Next three rows: the parameters. The name a value is stored under
    # depends on the curve type of its column.
    for parameter in [{'sigmoidal': 'e0', 'gaussian': 'emin'},
                      {'sigmoidal': 'zmid', 'gaussian': 'zmin'},
                      {'sigmoidal': 'n', 'gaussian': 'sigma'}]:
        paramrow = next(rows)
        for letter in self.ref.keys():
            curvetype = self.ref[letter]['curve']
            self.ref[letter].update({parameter[curvetype]: \
                                     float(paramrow[colmap[letter]])})
def selections_by_index(iterable):
    '''Split rows into index selections and test sequences.

    iterable -- rows laid out as: key, sequence, then any number of
                items that can be converted to int

    Returns a pair of CIDicts (selections, test_sequences). selections
    maps each key to the list of ints, test_sequences maps each key to
    the second item of its row.
    '''
    selections = CIDict()
    test_sequences = CIDict()
    for line in iterable:
        indices = [int(item) for item in line[2:]]
        selections.update({line[0]: indices})
        test_sequences.update({line[0]: line[1]})
    return selections, test_sequences
# One AlignmentOracle per alignment, keyed by pdbid. Every oracle is told
# that the pdb sequence is the one named 'chaina'.
oracles = CIDict([(pdbid, AlignmentOracle(alignment, pdb_name = 'chaina'))\
                  for pdbid, alignment in alignments.items()])
print('Alignments loaded... ' + repr(oracles))

# Calculate the moments for the pdb sequences
# Keyed by structure.get_id(); the same id is used to look up the
# structure's residue list, center and oracle.
pdb_moments = CIDict([(structure.get_id(),
                       moment(structure,
                              resi_lists[structure.get_id()],
                              centers[structure.get_id()],
                              partial(calculator_adapter, calc),
                              oracles[structure.get_id()].pdb_sequence()))
                      for structure in structures])
print('pdb moments calculated! ' + repr(pdb_moments))

# Calculate the family moments, that is, the moments for all
# sequences in the alignments
# Every pdbid starts out with an empty list.
family_moments = CIDict((pdbid, list()) for pdbid in alignments.keys())
for pdbid in family_moments.keys():
    # One pass per sequence in this pdbid's alignment
    for seq_index in range(len(oracles[pdbid].get_alignment())):
        # Calculate the moment
        family_moment = moment(structure_dict[pdbid],
                               resi_lists[pdbid],
                               centers[pdbid],
                               partial(calculator_adapter, calc),
                               oracles[pdbid].sequence(seq_index))
        # Calculate the %identity with the pdb sequence
        pdb_sequence = oracles[pdbid].get_pdb_seq_record().seq
        sequence = oracles[pdbid].get_alignment()[seq_index].seq
        # NOTE(review): presumably the identity-based distance between the
        # two sequences - confirm against matrices.compare
        normalized_distance = matrices.compare(pdb_sequence, sequence,
                                               identity)
from __future__ import division
from Bio import AlignIO
from sundries import CIDict
from sundries import one_letter
from Bio.PDB import PDBParser
import warnings
import csv

# Retrieve the sequences from the BBTMOUT alignment, including -'s for gaps
# NOTE(review): the file read here is 'Swiss-PDB structural alignment.aln';
# confirm that this is the alignment the comment above refers to.
bbtm_align= list(AlignIO.read('Swiss-PDB structural alignment.aln', 'clustal'))

# Assuming the first is 1A0S, the second is 1AF6:
sequences = CIDict((('1A0S',str(bbtm_align[0].seq)),
                    ('1AF6',str(bbtm_align[1].seq))))

# Check that I'm right about the first being 1A0S, the second being 1AF6
# For each sequence, collect its first five non-gap letters.
firstfive_of = CIDict()
for pdbid, sequence in sequences.items():
    firstfive_of.update({pdbid: ''})
    for letter in sequence:
        # '-' marks a gap in the alignment; skip it
        if letter != '-':
            firstfive_of[pdbid] += letter
        if len(firstfive_of[pdbid]) == 5:
            break

# Message that reports the letters found for both sequences
assertion_error_message = 'wrong aligned sequences in sequences dictionary'
assertion_error_message += ": 1a0s's first five are {},"\
                           .format(firstfive_of['1a0s'])\
                           +" and 1af6's first five are {}"\
                           .format(firstfive_of['1af6'])
import csv # Retrieve the sequences from the BBTMOUT alignment, including -'s for gaps bbtm_align = list(AlignIO.read('1a0s 1af6 pairwise bbtmout align.clu', 'clustal')) # Assuming the first is 1A0S, the second is 1AF6: sequences = CIDict((('1A0S',str(bbtm_align[0].seq)), ('1AF6',str(bbtm_align[1].seq)))) # Check that I'm right about the first being 1A0S, the second being 1AF6 assert sequences['1A0S'][:8] == 'SGFEFHGY' \ and sequences['1AF6'][7:11] == 'VDFH',\ 'wrong aligned sequences in "sequences" dictionary' # Retrieve Daniel's aligned structures structures = CIDict() parser = PDBParser() with warnings.catch_warnings(): # When importing Daniel's aligned structures, the PDBParser gives # warnings about "invalid or missing" b factors and occupancies # There are so many that if you let it display warnings it'll never # finish parsing warnings.simplefilter('ignore') for pdbid in ('1A0S', '1AF6'): structures.update({pdbid: \ parser.get_structure(pdbid, 'aligned_{}.pdb'.format(pdbid))}) z_coords = CIDict() for pdbid in structures.keys(): z_coords.update({pdbid: list()}) iter_sequence = iter(sequences[pdbid])
class Calculator(object):
    '''
    Carries out ez-beta calculations using a set of parameters given to it
    at initialization.

    The set of parameters must be a spreadsheet represented as a list of
    lists, with the inner lists representing rows (an iterator over rows,
    such as a csv reader, works too). The first row must contain the
    one-letter codes of each amino acid for which parameters are to be
    given. Underneath each letter is a column containing its parameters
    in this order:
        Curve type ('gaussian' or 'sigmoidal')
        E0/Emin
        Zmid/Zmin
        n/sigma

    Calculating pseudo-energies:
    calculate(self, resn, z): gives pseudoenergy given a one-letter or
    three-letter code for an amino acid, and a z coordinate

    The "normalize" option is vestigial - I used to calculate what
    fraction an energy is of the maximum possible energy that that kind
    of residue can have. But, that doesn't really make much sense. It's
    still here so that I can rerun my old scripts if I need to.
    '''

    def __init__(self, iterable, normalize = False):
        '''
        Read the parameter spreadsheet row by row.

        iterable: the rows of the spreadsheet; five rows are consumed
        normalize: stored as self.normalize, see the class docstring

        Fills self.ref, which maps each letter to a dictionary holding
        'curve' plus the three parameters named for that curve type.
        Raises StopIteration if there are fewer than five rows.
        '''
        self.normalize = normalize
        self.ref = CIDict()
        colmap = CIDict()

        # iter() + next() instead of iterable.next(): a list of lists, as
        # promised in the class docstring, has no next() method. An
        # iterator is returned unchanged by iter(), so it is consumed
        # exactly as before.
        rows = iter(iterable)

        # Header row: remember the column of every non-blank letter
        for column, letter in enumerate(next(rows)):
            if letter != '':
                self.ref.update({letter: dict()})
                colmap.update({letter: column})

        # Second row: curve type of each letter
        curvetypes = next(rows)
        for letter, column in colmap.items():
            self.ref[letter].update({'curve': curvetypes[column]})

        # Next three rows: the parameters. The name a value is stored
        # under depends on the curve type of its column.
        for parameter in [{'sigmoidal': 'e0', 'gaussian': 'emin'},
                          {'sigmoidal': 'zmid', 'gaussian': 'zmin'},
                          {'sigmoidal': 'n', 'gaussian': 'sigma'}]:
            paramrow = next(rows)
            for letter in self.ref.keys():
                curvetype = self.ref[letter]['curve']
                self.ref[letter].update({parameter[curvetype]: \
                                         float(paramrow[colmap[letter]])})

    def calculate(self, resn, z):
        '''
        gives pseudoenergy given a one-letter or three-letter code for an
        amino acid, and a z coordinate

        Only the absolute value of z is used.

        Raises a NoParameters exception when you use an amino acid that
        it doesn't have parameters for; always have some way of handling
        this when you call this method!
        '''
        # Three-letter codes are translated to one-letter codes first
        if len(resn) == 3:
            resn = one_letter[resn]

        try:
            params = self.ref[resn]
        except KeyError:
            raise NoParameters('No parameters for resn ' + str(resn))

        if params['curve'] == 'gaussian':
            output = params['emin'] * \
                     math.exp(-1*(abs(z)-params['zmin'])**2 \
                              /(2*params['sigma']**2))
        elif params['curve'] == 'sigmoidal':
            output = params['e0']/(1+(abs(z)/params['zmid'])**params['n'])

        if self.normalize:
            if params['curve'] == 'gaussian':
                output /= params['emin']
                # Normalized trends are high energy in middle of the
                # membrane for sigmoidal, high energy in the head-group
                # region for gaussian. For aromatics and small
                # hydrophobics (anything with negative E0 or Emin) these
                # trends should be reversed
                if params['emin'] < 0:
                    output = 1 - output
            if params['curve'] == 'sigmoidal':
                output /= params['e0']
                if params['e0'] < 0:
                    output = 1 - output

        return output
# Remove 1E54, since its interface is included in the non_ppi dataset # This will throw an exception if 1E54 isn't in the list, so don't # worry about case pdbids.remove('1E54') # Actually, I don't want any of the oligomers, the strand count # difficulty is too much weighing on my mind pdbids.remove('1A0S') pdbids.remove('1QD6') pdbids.remove('2J1N') pdbids.remove('2O4V') # Make group objects. I'm going to be associating a lot of stuff with # each protein, it's easiest to just group them together. groupdict = CIDict([(pdbid, Group(pdbid)) for pdbid in pdbids]) # The slow part: load structures. structure_dir = '../pymol/structures' def filename(pdbid): return structure_dir + '/aligned_{}.pdb'.format(pdbid) # Daniel's aligned structures give "invalid/missing occupancy" and # "invalid/missing B factor" warnings - thousands of them! Have to filter # warnings or the structures won't get loaded with warnings.catch_warnings(): warnings.simplefilter('ignore') for group in groupdict.values(): group.structure = PDBParser().get_structure(group.name, filename(group.name)) print('structures loaded after ' + str(time.time() - start))
def filename_match(filename):
    # True if the filename starts with a digit, any three characters and
    # '.csv'. re.match only anchors at the start, so longer names that
    # begin this way match as well.
    return re.match('\d...\.csv', filename) is not None
# Keep only the matching filenames and turn them into paths inside 'folder'
weight_file_filenames = filter(filename_match, filename_superset)
weight_file_paths = [folder + '/' + filename \
                     for filename in weight_file_filenames]

# Make spreadsheets
spreadsheets = [biodata.Spreadsheet(filename,
                                    phrasebook = phrasebooks['weights'])\
                for filename in weight_file_paths]

# weights maps pdbids to spreadsheets
weights = CIDict()
for spreadsheet in spreadsheets:
    # Blank cells in the 'pdbid' column are ignored
    pdbid_list = spreadsheet.get_column('pdbid')
    pdbid_filtered = filter(lambda x: x != '', pdbid_list)
    pdbid_set = set(pdbid_filtered)
    # I expect there to only be one pdbid
    # (the assertion also fails, with the same message, when there is none)
    assert len(pdbid_set) == 1, 'more than 1 pdbid in one spreadsheet'
    pdbid = list(pdbid_set)[0]
    weights.update({pdbid: spreadsheet})

# selections maps pdbids to sets of resis
selections = CIDict()
for pdbid, spreadsheet in weights.items():
    def not_blank(string):
        # True for every string except the empty one
        return string != ''
    # Non-blank entries of the 'resi' column of this spreadsheet
    resis = filter(not_blank, spreadsheet.get_column('resi'))
def selections_by_resi(iterable):
    '''Turn rows into a table of residue selections.

    iterable -- rows laid out as: key, then any number of items that can
                be converted to int

    Returns a CIDict mapping each key to the list of ints from its row.
    '''
    selections = CIDict()
    for line in iterable:
        resis = [int(item) for item in line[1:]]
        selections.update({line[0]: resis})
    return selections