def selections_by_index(iterable):
    '''
    Split rows of the form [key, sequence, index, index, ...] into two
    case-insensitive dictionaries.

    Returns a (selections, test_sequences) pair: selections maps each
    row's first cell to the list of its remaining cells (from the third
    onward) converted to int, and test_sequences maps the same first cell
    to the row's second cell, unchanged.
    '''
    selections = CIDict()
    test_sequences = CIDict()
    for row in iterable:
        key = row[0]
        indices = list(map(int, row[2:]))
        selections.update({key: indices})
        test_sequences.update({key: row[1]})
    return selections, test_sequences
def load_centers(iterable):
    '''
    Build a case-insensitive dictionary mapping each row's first cell to
    a numpy array parsed from the row's second cell.

    Rows whose first cell is the empty string are ignored.

    The second cell is a textual vector such as '(1.0, 2.0, 0.0)'. It is
    parsed by dropping its first character, splitting on whitespace, and
    dropping the last character of every piece (the separating comma, or
    the closing bracket) before converting to float. If the text has no
    character after its last component, that component loses its final
    digit instead; the original author accepted this because the third
    component is always 0.0.
    '''
    centers = CIDict()
    for row in iterable:
        if row[0] == '':
            continue
        centers.update({row[0]: row[1]})
    # Replace each stored string with its numeric array, in place.
    for name, text in centers.items():
        pieces = text[1:].split()
        centers[name] = np.array([float(piece[:-1]) for piece in pieces])
    return centers
    def __init__(self, iterable, normalize = False):
        '''
        Read per-residue curve parameters from five consecutive rows.

        iterable: rows (sequences of cells), either as an iterator such as
        a csv reader or as a plain list of rows. The rows are, in order:
        residue letters (blank cells are skipped), curve types
        ('sigmoidal' or 'gaussian'), then e0/emin, zmid/zmin and n/sigma.
        normalize: stored unchanged on self.normalize.

        Raises StopIteration if fewer than five rows are supplied,
        KeyError if a curve type is neither 'sigmoidal' nor 'gaussian',
        and ValueError if a parameter cell cannot be converted to float.
        '''

        self.normalize = normalize

        # iter() + next() instead of the Python 2-only .next() method, so
        # that plain lists of rows work as well as iterators.
        rows = iter(iterable)

        self.ref = CIDict()
        colmap = CIDict()
        for column, letter in enumerate(next(rows)):
            if letter != '':
                self.ref.update({letter: dict()})
                colmap.update({letter: column})
        curvetypes = next(rows)
        for letter, column in colmap.items():
            self.ref[letter].update({'curve': curvetypes[column]})
        # Each parameter row means something different depending on the
        # curve type of the column it is read from.
        for parameter in [{'sigmoidal': 'e0', 'gaussian': 'emin'},
                          {'sigmoidal': 'zmid', 'gaussian': 'zmin'},
                          {'sigmoidal': 'n', 'gaussian': 'sigma'}]:
            paramrow = next(rows)
            for letter in self.ref.keys():
                curvetype = self.ref[letter]['curve']
                self.ref[letter].update({parameter[curvetype]: \
                                         float(paramrow[colmap[letter]])})
from sundries import one_letter
from Bio.PDB import PDBParser
import warnings
import csv

# Retrieve the sequences from the BBTMOUT alignment, including -'s for gaps
bbtm_align = list(AlignIO.read('Swiss-PDB structural alignment.aln',
                               'clustal'))
# Assuming the first is 1A0S, the second is 1AF6:
sequences = CIDict((('1A0S', str(bbtm_align[0].seq)),
                    ('1AF6', str(bbtm_align[1].seq))))

# Check that assumption (first record 1A0S, second record 1AF6) by
# comparing the first five non-gap letters of each aligned sequence with
# the prefixes hard-coded below.
firstfive_of = CIDict()
for pdbid, sequence in sequences.items():
    # Gaps are written as '-'; drop them, then keep the first five letters.
    firstfive_of.update({pdbid: sequence.replace('-', '')[:5]})

assertion_error_message = ('wrong aligned sequences in sequences dictionary'
                           ": 1a0s's first five are {},"
                           " and 1af6's first five are {}"
                           .format(firstfive_of['1a0s'],
                                   firstfive_of['1af6']))

# Raised explicitly rather than with an assert statement so that the check
# still runs when Python is started with -O.
if not (firstfive_of['1a0s'] == 'SGFEF'
        and firstfive_of['1af6'] == 'VDFHG'):
    raise AssertionError(assertion_error_message)
class Calculator(object):
    '''
    Carries out ez-beta calculations using a set of parameters given to
    it at initialization.

    The set of parameters must be a spreadsheet represented as rows of
    cells: either a list of lists, with the inner lists representing rows,
    or an iterator over such rows (for example a csv reader). The first row
    must contain the one-letter codes of each amino acid for which
    parameters are to be given. Underneath each letter is a column
    containing its parameters in this order:
    Curve type ('gaussian' or 'sigmoidal')
    E0/Emin
    Zmid/Zmin
    n/sigma

    Calculating pseudo-energies:
    calculate(self, resn, z): gives pseudoenergy given a one-letter or
    three-letter code for an amino acid, and a z coordinate

    The "normalize" option is vestigial - I used to calculate what fraction
    an energy is of the maximum possible energy that that kind of residue
    can have. But, that doesn't really make much sense. It's still here
    so that I can rerun my old scripts if I need to.
    '''

    def __init__(self, iterable, normalize = False):
        '''
        Read the parameter spreadsheet described in the class docstring.

        iterable: the five rows, as a list of rows or an iterator of rows.
        normalize: stored unchanged on self.normalize; see calculate.

        Raises StopIteration if fewer than five rows are supplied,
        KeyError if a curve type is neither 'sigmoidal' nor 'gaussian',
        and ValueError if a parameter cell cannot be converted to float.
        '''

        self.normalize = normalize

        # iter() + next() instead of the Python 2-only .next() method, so
        # that the documented list-of-lists input works as well as
        # iterators such as csv readers.
        rows = iter(iterable)

        # self.ref maps residue letter -> {'curve': ..., <parameters>}
        # colmap maps residue letter -> column index in the spreadsheet
        self.ref = CIDict()
        colmap = CIDict()
        for column, letter in enumerate(next(rows)):
            if letter != '':
                self.ref.update({letter: dict()})
                colmap.update({letter: column})
        curvetypes = next(rows)
        for letter, column in colmap.items():
            self.ref[letter].update({'curve': curvetypes[column]})
        # Each parameter row means something different depending on the
        # curve type of the column it is read from.
        for parameter in [{'sigmoidal': 'e0', 'gaussian': 'emin'},
                          {'sigmoidal': 'zmid', 'gaussian': 'zmin'},
                          {'sigmoidal': 'n', 'gaussian': 'sigma'}]:
            paramrow = next(rows)
            for letter in self.ref.keys():
                curvetype = self.ref[letter]['curve']
                self.ref[letter].update({parameter[curvetype]: \
                                         float(paramrow[colmap[letter]])})

    def calculate(self, resn, z):
        '''
        gives pseudoenergy given a one-letter or
        three-letter code for an amino acid, and a z coordinate

        Only the absolute value of z is used.

        gaussian:  emin * exp(-(|z| - zmin)**2 / (2 * sigma**2))
        sigmoidal: e0 / (1 + (|z| / zmid)**n)

        With self.normalize set, the energy is divided by emin (gaussian)
        or e0 (sigmoidal), and replaced by one minus that ratio when the
        divisor is negative. A divisor of exactly zero raises
        ZeroDivisionError.

        Raises a NoParameters exception when you use an amino acid that
        it doesn't have parameters for; always have some way of handling
        this when you call this method!
        A three-letter code is first translated through one_letter; a
        failure of that lookup is not converted to NoParameters.
        '''

        if len(resn) == 3:
            resn = one_letter[resn]

        try:
            params = self.ref[resn]
        except KeyError:
            raise NoParameters('No parameters for resn ' + str(resn))

        if params['curve'] == 'gaussian':
            output = params['emin'] * \
                     math.exp(-1*(abs(z)-params['zmin'])**2 \
                                 /(2*params['sigma']**2))
        elif params['curve'] == 'sigmoidal':
            output = params['e0']/(1+(abs(z)/params['zmid'])**params['n'])

        if self.normalize:
            # Normalized trends are high energy in middle of the membrane
            # for sigmoidal, high energy in the head-group region for
            # gaussian. For aromatics and small hydrophobics (anything with
            # negative E0 or Emin) these trends should be reversed
            if params['curve'] == 'gaussian':
                output /= params['emin']
                if params['emin'] < 0:
                    output = 1 - output
            elif params['curve'] == 'sigmoidal':
                output /= params['e0']
                if params['e0'] < 0:
                    output = 1 - output

        return output
                         for residue in group.below_18_res]
        csv.writer(f).writerow(row_to_write)


# Selection of residues that were not removed manually. Manually
# removed residues look like they are on the inside of the protein, or
# on the water-facing rather than lipid-facing part of their surface.
# The choice is very subjective, but much of what was removed definitely
# needed to be removed.

# Each row of removed.csv is a pdbid followed by the residue numbers that
# were removed by hand for that protein.
manually_removed = CIDict()
with open('removed.csv', 'rb') as f:
    for row in csv.reader(f):
        pdbid = row[0]
        # A list rather than a generator expression: a generator can be
        # consumed only once, so any second read of this entry would
        # silently come back empty.
        resis = [int(i) for i in row[1:]]
        manually_removed.update({pdbid: resis})

for group in groupdict.values():
    group.manually_removed = set(manually_removed[group.name])

    
def not_removed(group, residue):
    '''
    Return True unless the residue's number - the second element of
    residue.get_id() - is in group.manually_removed.
    '''
    resi = residue.get_id()[1]
    return not (resi in group.manually_removed)

# Store on every group the result of passing not_removed to its
# selection method.
# NOTE(review): presumably selection() returns the residues for which
# not_removed(group, residue) is true - confirm against the group class.
for group in groupdict.values():
    group.not_removed_res = group.selection(not_removed)

with open('selections/not removed.csv', 'wb') as f:
    for group in groupdict.values():
        row_to_write = list()
        row_to_write.append(group.name)
    # Make spreadsheets
    spreadsheets = [biodata.Spreadsheet(filename,
                                        phrasebook = phrasebooks['weights'])\
                    for filename in weight_file_paths]

    # weights maps pdbids to spreadsheets
    weights = CIDict()
    for spreadsheet in spreadsheets:
        pdbid_list = spreadsheet.get_column('pdbid')
        pdbid_filtered = filter(lambda x: x != '', pdbid_list)
        pdbid_set = set(pdbid_filtered)
        # I expect there to only be one pdbid
        assert len(pdbid_set) == 1, 'more than 1 pdbid in one spreadsheet'
        pdbid = list(pdbid_set)[0]
        weights.update({pdbid: spreadsheet})

    # selections maps pdbids to sets of resis
    selections = CIDict()
    for pdbid, spreadsheet in weights.items():
        def not_blank(string):
            return string != ''
        resis = filter(not_blank, spreadsheet.get_column('resi'))
        selections.update({pdbid: set(resis)})

    # A new global variable for looping over these proteins
    asymmetric_dataset = CIDict([(pdbid, groupdict[pdbid]) \
                                 for pdbid in weights.keys()])

    # Make the spreadsheets available through groupdict
    for pdbid, group in asymmetric_dataset.items():
def selections_by_resi(iterable):
    '''
    Turn rows of the form [key, resi, resi, ...] into a case-insensitive
    dictionary mapping each row's first cell to the list of its remaining
    cells converted to int.
    '''
    selections = CIDict()
    for row in iterable:
        key = row[0]
        selections.update({key: list(map(int, row[1:]))})
    return selections