def create_session(workingdir, load = True):
    # Load structures:
    groupdict = CIDict(groups_from_folder(workingdir + '/structures',
                                   ['aligned_(.*).pdb'], load = load))
                                   
    # Remove structures not in the dataset
    included_proteins = set()
    for filename in os.listdir('non ppi residues'):
        match = re.match(r'(\d...)\.csv', filename)
        if match is not None:
            included_proteins.add(match.group(1))
    
    # Iterate over a copy of the keys so entries can be deleted safely
    for pdbid in list(groupdict.keys()):
        if pdbid.upper() not in included_proteins:
            del groupdict[pdbid]
            cmd.delete(pdbid)

    # In case something goes wrong, so you can look at the work in progress:
    stored.groupdict = groupdict
    
    # Delete 1E54, since its interface is included in the non_ppi dataset
    cmd.delete('1E54')
    del groupdict['1E54']
    
    cs_make_selections(groupdict)    
    
    # Change from line to cartoon representation
    cmd.hide('lines','*')
    cmd.show('cartoon','*')

    return groupdict
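
# A hedged usage sketch: create_session is meant to be run inside PyMOL
# (it relies on cmd and stored). The working-directory path below is
# hypothetical:
#   groupdict = create_session('/path/to/workingdir')
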
def families(params='published params.csv', sanity_file=None):
    '''Return a dictionary of all families in the dataset.'''
    # Map PDBIDs to paths of structures
    stru_path_list = glob.glob('structures/aligned_*.pdb')
    match_str = r'structures[/\\]aligned_(....)\.pdb'
    pdbids = [re.match(match_str, path).group(1) \
              for path in stru_path_list]
    stru_path = CIDict(zip(pdbids, stru_path_list))
    
    # Map PDBIDS to paths of multiple sequence alignments
    msa_path_list = glob.glob('gonnet aligned/* with *.clu')
    match_str = r'gonnet aligned[/\\](....) with .*\.clu'
    pdbids = [re.match(match_str, path).group(1) for path in msa_path_list]
    msa_path = CIDict(zip(pdbids, msa_path_list))
    
    # Map PDBIDs to names of sequences of the structure
    template_id = CIDict((pdbid, 'template_' + pdbid.upper())
                          for pdbid in msa_path.keys())
    
    if sanity_file is not None:
        with open(sanity_file, 'w') as f:
            f.write(repr(stru_path))
            f.write('\n')
            f.write(repr(msa_path))
            f.write('\n')
            f.write(repr(template_id))
    # Create the families
    return CIDict((pdbid, Family(pdbid, stru_path[pdbid], msa_path[pdbid],
                          template_id[pdbid], params))
                   for pdbid in msa_path.keys())
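
# A sketch of the directory layout families() expects, inferred from the
# glob patterns above (the names are illustrative, not verbatim):
#   structures/aligned_1A0S.pdb             - one aligned structure per protein
#   gonnet aligned/1A0S with homologs.clu   - one MSA per protein
# Example call, with a hypothetical sanity-check file:
#   family_dict = families('published params.csv', sanity_file='sanity.txt')
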
def load_centers(iterable):  
    dict_ = CIDict()
    for row in iterable:
        if row[0] != '':
            dict_.update({row[0]:row[1]})
    for key, value in dict_.items():
        # Turn textual representations of vectors like '(1.0, 2.0, 0.0)'
        # into NumPy arrays. Will cut off the last digit of the third
        # component, but I don't care because the third component will
        # always be 0.0
        dict_[key] = np.array([float(y[:-1]) for y in value[1:].split()])
    return dict_
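
# A minimal sketch of the input load_centers() expects (the data is made
# up, and the exact vector serialization in the real files may differ):
#   rows = [['1A0S', '(1.0, 2.0, 0.0)'], ['', 'rows with blank pdbids are skipped']]
#   centers = load_centers(rows)
#   centers['1a0s']  ->  array([1., 2., 0.])   (CIDict lookups ignore case)
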
def selections_by_index(iterable):
    selections = CIDict()
    test_sequences = CIDict()
    for line in iterable:
        selections.update({line[0]: [int(x) for x in line[2:]]})
        test_sequences.update({line[0]: line[1]})
    return selections, test_sequences
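
# A hedged usage sketch, assuming the selections live in a CSV file whose
# rows look like: pdbid, test-sequence name, resi, resi, ...
# (the filename is hypothetical):
#   import csv
#   with open('selections by index.csv') as f:
#       selections, test_sequences = selections_by_index(csv.reader(f))
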
oracles = CIDict([(pdbid, AlignmentOracle(alignment, pdb_name = 'chaina'))\
                  for pdbid, alignment in alignments.items()])
print('Alignments loaded... ' + repr(oracles))

# Calculate the moments for the pdb sequences
pdb_moments = CIDict([(structure.get_id(),
                       moment(structure, resi_lists[structure.get_id()],
                              centers[structure.get_id()],
                              partial(calculator_adapter, calc),
                              oracles[structure.get_id()].pdb_sequence()))
                      for structure in structures])
print('pdb moments calculated! ' + repr(pdb_moments))

# Calculate the family moments, that is, the moments for all 
# sequences in the alignments
family_moments = CIDict((pdbid, list()) for pdbid in alignments.keys())

for pdbid in family_moments.keys():
    for seq_index in range(len(oracles[pdbid].get_alignment())):
        # Calculate the moment
        family_moment = moment(structure_dict[pdbid], resi_lists[pdbid],
                               centers[pdbid],
                               partial(calculator_adapter, calc),
                               oracles[pdbid].sequence(seq_index))

        # Calculate the %identity with the pdb sequence
        pdb_sequence = oracles[pdbid].get_pdb_seq_record().seq
        sequence = oracles[pdbid].get_alignment()[seq_index].seq
        normalized_distance = matrices.compare(pdb_sequence, sequence,
                                               identity)
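        # The excerpt is truncated here; presumably the loop then stores
        # each result, along the lines of (an assumption, not the original
        # code):
        #   family_moments[pdbid].append((normalized_distance, family_moment))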
from __future__ import division
from Bio import AlignIO
from sundries import CIDict
from sundries import one_letter
from Bio.PDB import PDBParser
import warnings
import csv

# Retrieve the sequences from the Swiss-PDB structural alignment, including
# -'s for gaps
bbtm_align= list(AlignIO.read('Swiss-PDB structural alignment.aln',
                          'clustal'))
# Assuming the first is 1A0S, the second is 1AF6:
sequences = CIDict((('1A0S',str(bbtm_align[0].seq)),
                    ('1AF6',str(bbtm_align[1].seq))))

# Check that I'm right about the first being 1A0S, the second being 1AF6
firstfive_of = CIDict()
for pdbid, sequence in sequences.items():
    firstfive_of.update({pdbid: ''})
    for letter in sequence:
        if letter != '-':
            firstfive_of[pdbid] += letter
        if len(firstfive_of[pdbid]) == 5:
            break

assertion_error_message = 'wrong aligned sequences in sequences dictionary'
assertion_error_message += ": 1a0s's first five are {},"\
                           .format(firstfive_of['1a0s'])\
                           +" and 1af6's first five are {}"\
                           .format(firstfive_of['1af6'])
import csv

# Retrieve the sequences from the BBTMOUT alignment, including -'s for gaps
bbtm_align = list(AlignIO.read('1a0s 1af6 pairwise bbtmout align.clu',
                          'clustal'))
# Assuming the first is 1A0S, the second is 1AF6:
sequences = CIDict((('1A0S',str(bbtm_align[0].seq)),
                    ('1AF6',str(bbtm_align[1].seq))))

# Check that I'm right about the first being 1A0S, the second being 1AF6
assert sequences['1A0S'][:8] == 'SGFEFHGY' \
       and sequences['1AF6'][7:11] == 'VDFH',\
       'wrong aligned sequences in "sequences" dictionary'

# Retrieve Daniel's aligned structures
structures = CIDict()
parser = PDBParser()
with warnings.catch_warnings():
    # When importing Daniel's aligned structures, the PDBParser gives
    # warnings about "invalid or missing" B factors and occupancies.
    # There are so many that, if you let it display them, parsing will
    # never finish.
    warnings.simplefilter('ignore')
    for pdbid in ('1A0S', '1AF6'):
        structures.update({pdbid: \
                           parser.get_structure(pdbid,
                           'aligned_{}.pdb'.format(pdbid))})
z_coords = CIDict()
for pdbid in structures.keys():
    z_coords.update({pdbid: list()})
    iter_sequence = iter(sequences[pdbid])
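    # The excerpt is truncated here. A plausible continuation (an
    # assumption, not the original code): walk the structure's residues
    # in step with the gapped sequence, recording one CA z coordinate per
    # alignment column and None for gap columns:
    #   residues = iter(structures[pdbid].get_residues())
    #   for letter in iter_sequence:
    #       if letter == '-':
    #           z_coords[pdbid].append(None)
    #       else:
    #           z_coords[pdbid].append(next(residues)['CA'].get_coord()[2])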
class Calculator(object):
    '''
    Carries out ez-beta calculations using a set of parameters given to
    it at initialization.
    
    The set of parameters must be a spreadsheet represented as a list of
    lists, with the inner lists representing rows. The first row must
    contain the one-letter codes of each amino acid for which parameters
    are to be given. Underneath each letter is a column containing its
    parameters, in this order:
    Curve type ('gaussian' or 'sigmoidal')
    E0/Emin
    Zmid/Zmin
    n/sigma

    Calculating pseudo-energies:
    calculate(self, resn, z): gives pseudoenergy given a one-letter or
    three-letter code for an amino acid, and a z coordinate

    The "normalize" option is vestigial - I used to calculate what fraction
    an energy is of the maximum possible energy that that kind of residue
    can have. But, that doesn't really make much sense. It's still here
    so that I can rerun my old scripts if I need to.
    '''
    
    def __init__(self, iterable, normalize = False):

        self.normalize = normalize

        self.ref = CIDict()
        colmap = CIDict()
        for column, letter in enumerate(next(iterable)):
            if letter != '':
                self.ref.update({letter: dict()})
                colmap.update({letter: column})
        curvetypes = next(iterable)
        for letter, column in colmap.items():
            self.ref[letter].update({'curve': curvetypes[column]})
        for parameter in [{'sigmoidal': 'e0', 'gaussian': 'emin'},
                          {'sigmoidal': 'zmid', 'gaussian': 'zmin'},
                          {'sigmoidal': 'n', 'gaussian': 'sigma'}]:
            paramrow = next(iterable)
            for letter in self.ref.keys():
                curvetype = self.ref[letter]['curve']
                self.ref[letter].update({parameter[curvetype]: \
                                         float(paramrow[colmap[letter]])})

    def calculate(self, resn, z):
        '''
        gives pseudoenergy given a one-letter or
        three-letter code for an amino acid, and a z coordinate
        
        Raises a NoParameters exception when you use an amino acid that
        it doesn't have parameters for; always have some way of handling
        this when you call this method!
        '''

        if len(resn) == 3:
            resn = one_letter[resn]        

        try:
            params = self.ref[resn]
        except KeyError:
            raise NoParameters('No parameters for resn ' + str(resn))
        
        if params['curve'] == 'gaussian':
            output = params['emin'] * \
                     math.exp(-1*(abs(z)-params['zmin'])**2 \
                                 /(2*params['sigma']**2))
        elif params['curve'] == 'sigmoidal':
            output = params['e0']/(1+(abs(z)/params['zmid'])**params['n'])
        else:
            # Fail loudly on an unrecognized curve type instead of letting
            # 'output' be unbound below
            raise ValueError('unrecognized curve type ' + repr(params['curve']))
        
        if self.normalize:
            if params['curve'] == 'gaussian':
                output /= params['emin']
                # Normalized trends are high energy in middle of the membrane
                # for sigmoidal, high energy in the head-group region for
                # gaussian. For aromatics and small hydrophobics (anything with
                # negative E0 or Emin) these trends should be reversed
                if params['emin'] < 0:
                    output = 1 - output

            if params['curve'] == 'sigmoidal':
                output /= params['e0']
                if params['e0'] < 0:
                    output = 1 - output

        return output
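
# A hedged usage sketch for Calculator. The parameter values below are
# made up for illustration; real ones come from the params spreadsheet.
#   example_rows = iter([['', 'A', 'W'],
#                        ['', 'sigmoidal', 'gaussian'],
#                        ['', '-0.5', '-0.9'],    # e0 / emin
#                        ['', '10.0', '12.5'],    # zmid / zmin
#                        ['', '4.0', '3.0']])     # n / sigma
#   calc = Calculator(example_rows)
#   calc.calculate('TRP', 11.0)   # three-letter codes work too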
# Remove 1E54, since its interface is included in the non_ppi dataset.
# list.remove raises an exception if 1E54 isn't in the list, so a case
# mismatch can't slip through silently
pdbids.remove('1E54')

# Actually, I don't want any of the oligomers; the strand-count
# difficulty weighs too much on my mind
pdbids.remove('1A0S')
pdbids.remove('1QD6')
pdbids.remove('2J1N')
pdbids.remove('2O4V')

# Make group objects. I'm going to be associating a lot of stuff with
# each protein, so it's easiest to group them together.
groupdict = CIDict([(pdbid, Group(pdbid)) for pdbid in pdbids])

# The slow part: load structures.
structure_dir = '../pymol/structures'
def filename(pdbid):
    return structure_dir + '/aligned_{}.pdb'.format(pdbid)
# Daniel's aligned structures give "invalid/missing occupancy" and
# "invalid/missing B factor" warnings - thousands of them! Have to filter
# warnings or the structures won't get loaded
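# (Newer Biopython versions can suppress these directly with
# PDBParser(QUIET=True).)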
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    for group in groupdict.values():
        group.structure = PDBParser().get_structure(group.name,
                                                    filename(group.name))

print('structures loaded after ' + str(time.time() - start))
    def filename_match(filename):
        return re.match(r'\d...\.csv', filename) is not None

    weight_file_filenames = filter(filename_match, filename_superset)
    weight_file_paths = [folder + '/' + filename \
                         for filename in weight_file_filenames]


    # Make spreadsheets
    spreadsheets = [biodata.Spreadsheet(path,
                                        phrasebook = phrasebooks['weights'])
                    for path in weight_file_paths]

    # weights maps pdbids to spreadsheets
    weights = CIDict()
    for spreadsheet in spreadsheets:
        pdbid_list = spreadsheet.get_column('pdbid')
        pdbid_filtered = filter(lambda x: x != '', pdbid_list)
        pdbid_set = set(pdbid_filtered)
        # I expect there to only be one pdbid
        assert len(pdbid_set) == 1, 'more than 1 pdbid in one spreadsheet'
        pdbid = list(pdbid_set)[0]
        weights.update({pdbid: spreadsheet})

    # selections maps pdbids to sets of resis
    selections = CIDict()
    for pdbid, spreadsheet in weights.items():
        def not_blank(string):
            return string != ''
        resis = filter(not_blank, spreadsheet.get_column('resi'))


def selections_by_resi(iterable):
    selections = CIDict()
    for line in iterable:
        selections.update({line[0]: [int(x) for x in line[1:]]})
    return selections
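
# A hedged usage sketch, assuming rows of the form: pdbid, resi, resi, ...
# (the filename is hypothetical):
#   import csv
#   with open('selections by resi.csv') as f:
#       selections = selections_by_resi(csv.reader(f))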