def selections_by_index(iterable):
    '''
    Build two case-insensitive lookups from rows of a selections table.

    Each row is indexed as follows:
        row[0]  - the key both dictionaries are indexed by
        row[1]  - the test sequence stored for that key
        row[2:] - entries converted with int() to form the selection

    Returns a tuple (selections, test_sequences). Later rows overwrite
    earlier rows that share a key.
    '''
    selections = CIDict()
    test_sequences = CIDict()
    for row in iterable:
        name = row[0]
        indices = [int(entry) for entry in row[2:]]
        selections.update({name: indices})
        test_sequences.update({name: row[1]})
    return selections, test_sequences
def load_centers(iterable):
    '''
    Read (name, vector-text) rows into a CIDict of numpy arrays.

    Rows whose first cell is the empty string are skipped. For every
    remaining row, row[0] is the key and row[1] is a textual vector.
    The text is parsed by dropping its first character, splitting on
    whitespace, dropping the last character of every token and converting
    what is left with float().
    '''
    centers = CIDict()

    # First pass: collect the raw text, letting later rows win.
    for row in iterable:
        if row[0] == '':
            continue
        centers.update({row[0]: row[1]})

    # Second pass: replace each textual vector with a numpy array.
    # The last character of every token is discarded (presumably a
    # trailing ',' or ')' - confirm against the input files). The original
    # author noted that this can clip the last digit of the third
    # component, which was accepted because that component is expected to
    # always be 0.0.
    for name, text in centers.items():
        tokens = text[1:].split()
        centers[name] = np.array([float(token[:-1]) for token in tokens])

    return centers
def __init__(self, iterable, normalize = False):
    '''
    Read a parameter spreadsheet into self.ref.

    iterable yields the rows of the spreadsheet. It may be any iterable
    (a list of lists, a csv.reader, ...); exactly five rows are consumed:

        1. one-letter amino acid codes (blank cells are ignored)
        2. curve type per column: 'gaussian' or 'sigmoidal'
        3. E0 (sigmoidal) or Emin (gaussian)
        4. Zmid (sigmoidal) or Zmin (gaussian)
        5. n (sigmoidal) or sigma (gaussian)

    normalize is stored unchanged on the instance.

    Raises StopIteration if fewer than five rows are available, KeyError
    if a curve type is neither 'sigmoidal' nor 'gaussian', and ValueError
    if a parameter cell cannot be converted with float().
    '''
    self.normalize = normalize
    self.ref = CIDict()
    colmap = CIDict()

    # iter()/next() instead of iterable.next(): a plain list has no
    # .next() method, and Python 3 iterators spell it __next__. An
    # iterator passed in is returned unchanged by iter(), so the caller's
    # iterator is advanced exactly as before.
    rows = iter(iterable)

    # Header row: remember which column each residue letter lives in.
    for column, letter in enumerate(next(rows)):
        if letter != '':
            self.ref.update({letter: dict()})
            colmap.update({letter: column})

    # Second row: the curve type for each residue.
    curvetypes = next(rows)
    for letter, column in colmap.items():
        self.ref[letter].update({'curve': curvetypes[column]})

    # Remaining three rows: the name each value is stored under depends
    # on the residue's curve type.
    for parameter in [{'sigmoidal': 'e0', 'gaussian': 'emin'},
                      {'sigmoidal': 'zmid', 'gaussian': 'zmin'},
                      {'sigmoidal': 'n', 'gaussian': 'sigma'}]:
        paramrow = next(rows)
        for letter in self.ref.keys():
            curvetype = self.ref[letter]['curve']
            self.ref[letter].update({parameter[curvetype]: \
                                     float(paramrow[colmap[letter]])})
from sundries import one_letter
from Bio.PDB import PDBParser
import warnings
import csv

# Load the aligned sequences from the BBTMOUT alignment. Gaps are kept in
# the sequence strings as '-' characters.
bbtm_align = list(
    AlignIO.read('Swiss-PDB structural alignment.aln', 'clustal')
)

# Assumption: the first record of the alignment is 1A0S and the second is
# 1AF6.
sequences = CIDict((
    ('1A0S', str(bbtm_align[0].seq)),
    ('1AF6', str(bbtm_align[1].seq)),
))

# Verify that assumption by comparing the first five non-gap residues of
# each aligned sequence against the known starts of the two proteins.
firstfive_of = CIDict()
for pdbid, sequence in sequences.items():
    firstfive_of.update({pdbid: ''})
    firstfive_of[pdbid] += sequence.replace('-', '')[:5]

assertion_error_message = ''.join([
    'wrong aligned sequences in sequences dictionary',
    ": 1a0s's first five are {},".format(firstfive_of['1a0s']),
    " and 1af6's first five are {}".format(firstfive_of['1af6']),
])
assert (firstfive_of['1a0s'] == 'SGFEF'
        and firstfive_of['1af6'] == 'VDFHG'), assertion_error_message
class Calculator(object):
    '''
    Carries out ez-beta calculations using a set of parameters given to it
    at initialization.

    The set of parameters must be a spreadsheet represented as an iterable
    of rows (for example a list of lists, or a csv.reader). The first row
    must contain the one-letter codes of each amino acid for which
    parameters are to be given. Underneath each letter is a column
    containing its parameters in this order:
        Curve type ('gaussian' or 'sigmoidal')
        E0/Emin
        Zmid/Zmin
        n/sigma

    Calculating pseudo-energies:
        calculate(self, resn, z): gives pseudoenergy given a one-letter or
        three-letter code for an amino acid, and a z coordinate

    The "normalize" option is vestigial - I used to calculate what fraction
    an energy is of the maximum possible energy that that kind of residue
    can have. But, that doesn't really make much sense. It's still here so
    that I can rerun my old scripts if I need to.
    '''

    def __init__(self, iterable, normalize = False):
        '''
        Read the parameter spreadsheet into self.ref.

        Exactly five rows are consumed from iterable (letters, curve
        types, then the three parameter rows). Blank cells in the letter
        row are ignored.

        Raises StopIteration if fewer than five rows are available,
        KeyError if a curve type is neither 'sigmoidal' nor 'gaussian',
        and ValueError if a parameter cell cannot be converted with
        float().
        '''
        self.normalize = normalize
        self.ref = CIDict()
        colmap = CIDict()

        # iter()/next() instead of iterable.next(): a plain list has no
        # .next() method, and Python 3 iterators spell it __next__. An
        # iterator passed in is returned unchanged by iter(), so the
        # caller's iterator is advanced exactly as before.
        rows = iter(iterable)

        # Header row: remember which column each residue letter lives in.
        for column, letter in enumerate(next(rows)):
            if letter != '':
                self.ref.update({letter: dict()})
                colmap.update({letter: column})

        # Second row: the curve type for each residue.
        curvetypes = next(rows)
        for letter, column in colmap.items():
            self.ref[letter].update({'curve': curvetypes[column]})

        # Remaining three rows: the name each value is stored under
        # depends on the residue's curve type.
        for parameter in [{'sigmoidal': 'e0', 'gaussian': 'emin'},
                          {'sigmoidal': 'zmid', 'gaussian': 'zmin'},
                          {'sigmoidal': 'n', 'gaussian': 'sigma'}]:
            paramrow = next(rows)
            for letter in self.ref.keys():
                curvetype = self.ref[letter]['curve']
                self.ref[letter].update({parameter[curvetype]: \
                                         float(paramrow[colmap[letter]])})

    def calculate(self, resn, z):
        '''
        gives pseudoenergy given a one-letter or three-letter code for an
        amino acid, and a z coordinate

        The sign of z is ignored (abs(z) is used). The curves are:
            gaussian:  emin * exp(-(|z| - zmin)**2 / (2 * sigma**2))
            sigmoidal: e0 / (1 + (|z| / zmid)**n)

        Raises a NoParameters exception when you use an amino acid that it
        doesn't have parameters for; always have some way of handling this
        when you call this method!
        '''
        # Three-letter codes are translated through the one_letter table
        # before the parameter lookup.
        if len(resn) == 3:
            resn = one_letter[resn]
        try:
            params = self.ref[resn]
        except KeyError:
            raise NoParameters('No parameters for resn ' + str(resn))

        if params['curve'] == 'gaussian':
            output = params['emin'] * \
                     math.exp(-1*(abs(z)-params['zmin'])**2 \
                              /(2*params['sigma']**2))
        elif params['curve'] == 'sigmoidal':
            output = params['e0']/(1+(abs(z)/params['zmid'])**params['n'])

        if self.normalize:
            if params['curve'] == 'gaussian':
                output /= params['emin']
                # Normalized trends are high energy in middle of the
                # membrane for sigmoidal, high energy in the head-group
                # region for gaussian. For aromatics and small
                # hydrophobics (anything with negative E0 or Emin) these
                # trends should be reversed
                if params['emin'] < 0:
                    output = 1 - output
            if params['curve'] == 'sigmoidal':
                output /= params['e0']
                if params['e0'] < 0:
                    output = 1 - output

        return output
for residue in group.below_18_res]
        # NOTE(review): the statement above is the tail of a list
        # comprehension that starts before this excerpt; the indentation
        # of the next line is reconstructed - confirm against the file.
        csv.writer(f).writerow(row_to_write)

# Selection of residues that were not removed manually. Manually
# removed residues look like they are on the inside of the protein, or
# on the water-facing rather than lipid-facing part of their surface.
# Very subjective, but a lot of stuff that definitely needed to be removed
# was removed.
manually_removed = CIDict()
# Each row of removed.csv is read as: row[0] is the pdbid, row[1:] are
# the residue numbers. 'rb' is the Python 2 mode for csv input.
with open('removed.csv', 'rb') as f:
    for row in csv.reader(f):
        pdbid = row[0]
        # resis is a generator expression, so it can be consumed only
        # once; it is consumed by the set() call in the loop below.
        resis = (int(i) for i in row[1:])
        manually_removed.update({pdbid: resis})

# Attach to every group the set of its manually removed residue numbers.
# NOTE(review): a group whose name has no row in removed.csv presumably
# raises KeyError here - confirm against CIDict.
for group in groupdict.values():
    group.manually_removed = set(manually_removed[group.name])

def not_removed(group, residue):
    # True when the residue's number (element 1 of residue.get_id()) is
    # not among the group's manually removed residue numbers.
    return residue.get_id()[1] not in group.manually_removed

# Store on each group the result of selecting with not_removed.
for group in groupdict.values():
    group.not_removed_res = group.selection(not_removed)

# Start writing one row per group, beginning with the group's name.
# 'wb' is the Python 2 mode for csv output.
with open('selections/not removed.csv', 'wb') as f:
    for group in groupdict.values():
        row_to_write = list()
        row_to_write.append(group.name)
# Make spreadsheets spreadsheets = [biodata.Spreadsheet(filename, phrasebook = phrasebooks['weights'])\ for filename in weight_file_paths] # weights maps pdbids to spreadsheets weights = CIDict() for spreadsheet in spreadsheets: pdbid_list = spreadsheet.get_column('pdbid') pdbid_filtered = filter(lambda x: x != '', pdbid_list) pdbid_set = set(pdbid_filtered) # I expect there to only be one pdbid assert len(pdbid_set) == 1, 'more than 1 pdbid in one spreadsheet' pdbid = list(pdbid_set)[0] weights.update({pdbid: spreadsheet}) # selections maps pdbids to sets of resis selections = CIDict() for pdbid, spreadsheet in weights.items(): def not_blank(string): return string != '' resis = filter(not_blank, spreadsheet.get_column('resi')) selections.update({pdbid: set(resis)}) # A new global variable for looping over these proteins asymmetric_dataset = CIDict([(pdbid, groupdict[pdbid]) \ for pdbid in weights.keys()]) # Make the spreadsheets available through groupdict for pdbid, group in asymmetric_dataset.items():
def selections_by_resi(iterable):
    '''
    Build a case-insensitive lookup of residue selections.

    For each row, row[0] is the key and every remaining entry is
    converted with int() to form the list stored under that key. Later
    rows overwrite earlier rows that share a key.
    '''
    selections = CIDict()
    for row in iterable:
        name = row[0]
        resis = [int(entry) for entry in row[1:]]
        selections.update({name: resis})
    return selections