Ejemplo n.º 1
0
def audit(debug=False,source='calcs/auditor.py'):
    """
    Command-line entry point for the calculation auditor.

    The auditor code is loaded from `source` with `makeface.import_remote`, and the
    callable stored under the `CalcsAuditor` key of the loaded namespace is invoked
    with the `debug` flag. Nothing is returned.

    Raises an Exception when `source` is not an existing file.
    """
    #---guard: refuse to continue without the auditor source on disk
    if not os.path.isfile(source):
        raise Exception('requires source code at %s'%source)
    #---the loader is imported lazily, only once we know there is something to load
    from makeface import import_remote
    remote_namespace = import_remote(source)
    auditor_entry = remote_namespace['CalcsAuditor']
    auditor_entry(debug=debug)
Ejemplo n.º 2
0
def interpret_docker_instructions(config, mods=None):
    """
    Load a docker configuration file and return the instructions it generates.

    The file at `config` is loaded with `makeface.import_remote` and the callable
    stored under its `interpreter` key is called with `mods=mods`; the value it
    returns is handed back unchanged.

    Raises an Exception when the file is named "config.py" or cannot be found.
    """
    forbidden_basename = 'config.py'
    if os.path.basename(config) == forbidden_basename:
        raise Exception(
            'you cannot call the config file "config.py" or we have an import failure'
        )
    # import_remote wraps exec and discards builtins
    from makeface import import_remote
    if os.path.isfile(config):
        remote_namespace = import_remote(os.path.join('./', config))
        interpreter = remote_namespace['interpreter']
        # validators would be applied to the interpreter output here
        return interpreter(mods=mods)
    raise Exception('cannot find %s' % config)
def get_lipid_resnames():
    """
    Brief utility for getting the lipid names from automacs.

    Loads `amx/amx` through `makeface.import_remote`, sets `force_field` on the
    loaded `state` object to 'charmm', builds a `Landscape` with `cwd='amx/'` and
    returns the result of `objects_by_category('lipid')`.

    Raises an Exception if the remote import of `amx/amx` fails.
    """
    import makeface
    #---get an automacs landscape
    #---! DEV. needs a clone and make to work
    try:
        mod = makeface.import_remote('amx/amx')
    #---only ordinary errors are relabeled so that KeyboardInterrupt and SystemExit still propagate
    except Exception:
        raise Exception(
            'please clone a copy of automacs next to omni in `amx`')
    mod['state'].force_field = 'charmm'
    land = mod['Landscape'](cwd='amx/')
    #---the landscape reports the lipids by category
    return land.objects_by_category('lipid')
Ejemplo n.º 4
0
def contacts(grofile,trajfile,**kwargs):

    """
    Identify, catalog, and count contacts in a simulation.
    Note that this code was developed to mimic the data structures used by hydrogen bonding and salt bridging
    codes, and stores the various contacts up to a very high level of specificity (i.e. the specific residue
    and atom names).

    Parameters
    ----------
    grofile, trajfile
        Passed directly to `MDAnalysis.Universe`.
    **kwargs
        sn : required key (a KeyError is raised without it) although it is not used below.
        workspace : required; `workspace.vars` must contain a non-empty `force_field`.
        calc : required; `calc['specs']` must contain `cutoff` and may contain `subject`
            (default 'protein') and `object` (default 'lipid'; 'lipid' or 'protein' only).
        debug : optional, default False. Runs one frame, opens ipdb and then exits.
        run_parallel : optional, default True. Selects the joblib loop over the serial loop.

    Returns
    -------
    (result, attrs)
        `result` is a dictionary with the keys `bonds`, `observations`, `valid_frames`, `nframes`,
        `resnames`, `subject_residues_resnames`, `targets_residues_resnames`,
        `subject_residues_resids`, `nmols`, `times`, `pairs_resid_resname`,
        `counts_resid_resname` and `counts_resid_resname_singleton`. `attrs` is an empty dictionary.

    Raises
    ------
    Exception
        If `amx/amx` cannot be imported, if `force_field` is missing from the workspace variables,
        or if the `object` spec is neither 'lipid' nor 'protein'.

    Side effects
    ------------
    Assigns the module globals `vecs`, `coords_subj`, `coords_targ`, `bonds`, `obs` and `bonds_red`,
    which are declared global here before the compute functions are called.
    """

    #---unpack
    sn = kwargs['sn']
    work = kwargs['workspace']
    calc = kwargs['calc']
    debug = kwargs.get('debug',False)
    run_parallel = kwargs.get('run_parallel',True)

    #---settings
    lenscale = 10.0
    #---distance cutoff stays in angstroms until the compute function
    distance_cutoff = calc['specs']['cutoff']
    subject_selection = calc['specs'].get('subject','protein')
    object_flag = calc['specs'].get('object','lipid')

    #---prepare universe
    uni = MDAnalysis.Universe(grofile,trajfile)
    nframes = len(uni.trajectory)
    start_job_time = time.time()

    #---save topology for later
    #---the unique residue names are restored to their order of first appearance
    _,idx,counts = np.unique(uni.residues.resnames,return_index=True,return_counts=True)
    resnames = uni.residues.resnames[np.sort(idx)]
    resnames_master = np.array(resnames)
    rescounts = counts[np.argsort(idx)]

    import makeface
    #---get an automacs landscape with a little help from the user
    #---only ordinary errors are relabeled so that KeyboardInterrupt and SystemExit still propagate
    try:
        mod = makeface.import_remote('amx/amx')
    except Exception:
        raise Exception('please clone a copy of automacs next to omni in `amx`. '
            'you must also run `make setup all` from that directory to get force field files.')
    ff_name = work.vars.get('force_field',None)
    if not ff_name: raise Exception('we must be very careful with the residue naming. '
        'you must add `force_field` to the `variables` dictionary in your metadata to continue.')
    #---NOTE(review): the state is always set to charmm while the Landscape receives ff_name. confirm
    mod['state'].force_field = 'charmm'
    Landscape = mod['Landscape']
    land = Landscape(cwd='amx/',ff=ff_name)

    #---get the subject of the calculation, the thing we wish to study the contacts of
    #---...typically the protein
    #---! need to add resid redundancy checks possibly
    if subject_selection=='lipids':
        #---typically the subject selection goes right to MDAnalysis but this routine allows for lipid-lipid
        #---...contacts which can be filtered for salt bridges. note that this feature is solely for ptdins
        target_resnames = land.objects_by_category('lipid')
        subject_selection = '(%s) and not name H*'%' or '.join(['resname %s'%i for i in target_resnames])
    subject = uni.select_atoms(subject_selection)
    #---get the objects
    if object_flag=='lipid':
        #---objects from the landscape returns resnames
        target_resnames = land.objects_by_category('lipid')
        #---explicitly ignore hydrogen contacts here
        targets = uni.select_atoms('(%s) and not name H*'%
            ' or '.join(['resname %s'%i for i in target_resnames]))
    elif object_flag=='protein':
        target_resnames = np.unique(uni.resnames)
        #---explicitly ignore hydrogen contacts here
        targets = uni.select_atoms('(%s) and not name H*'%
            ' or '.join(['resname %s'%i for i in target_resnames]))
    else: raise Exception('not set up for object %s'%object_flag)

    #---prepare coordinates for each frame
    st = time.time()
    #---the coordinate caches are globals (the compute function is called with the frame index only)
    global vecs,coords_subj,coords_targ
    vecs,coords_subj,coords_targ,times = [],[],[],[]
    #---purposefully profligate with the memory so this goes quickly
    for fr in range(nframes):
        status('caching coordinates',tag='compute',i=fr,looplen=nframes,start=st)
        #---indexing the trajectory moves the universe to this frame
        uni.trajectory[fr]
        times.append(uni.trajectory.time)
        vecs.append(uni.dimensions[:3]/lenscale)
        coords_subj.append(subject.positions/lenscale)
        coords_targ.append(targets.positions/lenscale)
    status('completed caching in %.1f minutes'%((time.time()-st)/60.),tag='status')

    #---debug
    compute_function = contacts_framewise
    if debug:
        fr = 50
        incoming = compute_function(fr,distance_cutoff=distance_cutoff,debug=True)
        import ipdb;ipdb.set_trace()
        #---the sys module has no `quit` so we leave via exit after the debugger session
        sys.exit()

    #---compute loop
    start = time.time()
    out_args = {'distance_cutoff':distance_cutoff}
    if run_parallel:
        incoming = Parallel(n_jobs=8,verbose=10 if debug else 0)(
            delayed(compute_function,has_shareable_memory)(fr,**out_args)
            for fr in framelooper(nframes,start=start))
    else:
        incoming = []
        for fr in framelooper(nframes):
            incoming.append(compute_function(fr,**out_args))

    #---chompdown
    #---get valid frames, namely those which report at least one subject index
    valid_frames = np.where([len(i['subjects'])>0 for i in incoming])[0]
    obs_by_frames = np.array([len(incoming[i]['subjects']) for i in valid_frames]).astype(int)
    #---concatenate the subject/target indices across all frames
    subject_cat = np.concatenate([incoming[i]['subjects'] for i in valid_frames]).astype(int)
    target_cat = np.concatenate([incoming[i]['targets'] for i in valid_frames]).astype(int)

    start_time = time.time()
    #---tabulate each bond observation
    #---columns: subject resname, resid, atom name followed by target resname, resid, atom name
    tabulation = np.transpose((subject.resnames[subject_cat],subject.resids[subject_cat],
        subject.names[subject_cat],targets.resnames[target_cat],targets.resids[target_cat],
        targets.names[target_cat],))
    status('stopwatch: %.1fs'%(time.time()-start_time),tag='compute')

    idx,counts = uniquify(tabulation.astype(str))
    bonds_catalog = tabulation[idx]

    start_time = time.time()
    #---preallocate bond counts per frame
    counts_per_frame = np.zeros((len(valid_frames),len(idx)))
    #---hash the bonds over the indices
    bonds_to_idx = {tuple(b):bb for bb,b in enumerate(bonds_catalog)}
    #---frame_lims marks where the observations for each valid frame start and stop in the tabulation
    frame_lims = np.concatenate(([0],np.cumsum(obs_by_frames)))
    for fr,i in enumerate(frame_lims[:-1]):
        status('counting observations per frame',i=fr,looplen=len(valid_frames),
            tag='compute',start=start_time)
        obs_this = tabulation[frame_lims[fr]:frame_lims[fr+1]]
        counts_per_frame[fr][np.array([bonds_to_idx[tuple(o)] for o in obs_this])] += 1
    status('stopwatch: %.1fs'%(time.time()-start_time),tag='compute')
    status('done heavy lifting',tag='compute')
    #---note the size of the outgoing data. we could shrink this by discarding atom names
    status('observation array for cutoff %.1f is %.1fMB'%(
        distance_cutoff,sys.getsizeof(counts_per_frame)/10**6.),tag='note')

    #---package the dataset
    result,attrs = {},{}
    #---everything is indexed by idx
    result['bonds'] = bonds_catalog
    result['observations'] = counts_per_frame
    result['valid_frames'] = valid_frames
    result['nframes'] = np.array(nframes)
    result['resnames'] = resnames_master
    result['subject_residues_resnames'] = subject.residues.resnames
    result['targets_residues_resnames'] = targets.residues.resnames
    result['subject_residues_resids'] = subject.residues.resids
    result['nmols'] = rescounts
    result['times'] = np.array(times)

    #---some basic post-processing common to many of the plots
    #---the bond catalog and the observations are published as globals before the compute loops below
    global bonds,obs
    bonds,obs = bonds_catalog,counts_per_frame
    #---post: generate timewise trajectories for the number of contacts between protein residues and lipids
    #---methodology note: in a basic version of this calculation we simply count all of the bonds between
    #---...any lipid-protein residue pair. this means that being more close to a lipid might result in more
    #---...contacts and hence generates a higher score. hence we have two versions of the calculation. one
    #---...counts the total number of contacts, and the other discards atom information and scores contacts
    #---...with a maximum of one per protein residue-lipid pair. this calculation does both
    #---! need to check for atom-name resolution otherwise this is moot
    resids = result['subject_residues_resids']
    #---NOTE(review): rowspec is not defined in this function and must be available at module scope
    lipid_resnames = np.unique(bonds[:,rowspec.index('target_resname')])
    resname_combos = [(r,np.array([r])) for r in lipid_resnames]+[('all lipids',np.array(lipid_resnames))]
    #---compute loop
    looper = [{'resid':resid,'resname_set':resname_set}
        for resid in resids for resname_name,resname_set in resname_combos]
    compute_function = count_reduced_contact
    incoming = basic_compute_loop(compute_function,looper,run_parallel=run_parallel)
    #---package this as a list of resid/resname pairs and the counts for them
    result['pairs_resid_resname'] = np.array([(resid,resname_name)
        for resid in resids for resname_name,resname_set in resname_combos]).astype(str)
    result['counts_resid_resname'] = np.array(incoming)
    #---reduce the data for the modified count described above
    #---columns 0,1,3,4 keep the resnames and resids and drop both atom names
    global bonds_red
    bonds_red = bonds[:,np.array([0,1,3,4])]
    compute_function = count_reduced_contact_reduced
    incoming = basic_compute_loop(compute_function,looper,run_parallel=run_parallel)
    result['counts_resid_resname_singleton'] = np.array(incoming)

    status('compute job lasted %.1fmin'%((time.time()-start_job_time)/60.),tag='time')
    return result,attrs
Ejemplo n.º 5
0
def hydrogen_bonding(grofile, trajfile, **kwargs):
    """
    Generic hydrogen bonding code.
    Revamped on 2017.4.28 to generate a more uniform data structure.

    Parameters
    ----------
    grofile, trajfile
        Passed directly to `MDAnalysis.Universe`.
    **kwargs
        sn : required; used to look up metadata and in error messages.
        workspace : required; `vars`, `meta` and `raw` are read from it.
        calc : required; `calc['specs']` must contain `distance_cutoff` and `angle_cutoff`
            and may contain `protein` to include protein residues as participants.
        debug : optional, default False. Runs a single frame and then exits.
        run_parallel : optional, default True. Selects the joblib loop over the serial loop.

    Returns
    -------
    (result, attrs)
        `result` is a dictionary with the keys `bonds`, `observations`, `valid_frames`,
        `nframes`, `resnames` and `nmols`. `attrs` is an empty dictionary.

    Raises
    ------
    Exception
        If `amx/amx` cannot be imported, if no protein ITP can be found when proteins are
        requested, if a donor bond spans two residue names, or if the donor selections contain
        redundant residues or more than one match for a bond within a single residue.

    Side effects
    ------------
    Assigns `hydrogen_bond_ref`, `all_donor_coords`, `all_acceptor_coords`, `all_h_coords` and
    `vecs` as attributes of the `codes.hbonds_framewise` module before the frame loop.
    """

    #---unpack
    sn = kwargs['sn']
    work = kwargs['workspace']
    calc = kwargs['calc']
    debug = kwargs.get('debug', False)
    run_parallel = kwargs.get('run_parallel', True)

    #---prototyping an external module import to replace the sometimes tedious addition of
    #---... many metadata to the meta dictionary in your YAML files.
    protein_itp_loader = work.vars.get('protein_itp_loader', None)
    #---default ITP paths are set for each simulation in the metadata
    if protein_itp_loader is None:

        def protein_itp_loader(sn, **kwargs):
            itp = work.meta.get(sn, {}).get('protein_itp', None)
            if not itp:
                raise Exception(
                    'cannot find protein_itp in meta dictionary for %s ' % sn +
                    'note that you can also use the protein_itp_loader functionality to get the ITP file'
                )
            #---hand back the path that was found so the loader behaves like a custom one
            return itp
    #---custom ITP loader specified as an alternate_module
    else:

        protein_itp_loader = alternate_module(**protein_itp_loader)

    #---settings
    distance_cutoff, angle_cutoff = [
        calc['specs'][i] for i in ['distance_cutoff', 'angle_cutoff']
    ]
    #---cutoff for inferring hydrogens from a one-time distance search
    distance_h_cutoff = distance_cutoff

    #---prepare universe
    uni = MDAnalysis.Universe(grofile, trajfile)
    nframes = len(uni.trajectory)
    lenscale = 10.
    start_job_time = time.time()

    #---save topology for later
    #---the unique residue names are restored to their order of first appearance
    _, idx, counts = np.unique(uni.residues.resnames,
                               return_index=True,
                               return_counts=True)
    resnames = uni.residues.resnames[np.sort(idx)]
    resnames_master = np.array(resnames)
    rescounts = counts[np.argsort(idx)]

    import makeface
    #---get an automacs landscape with a little help from the user
    #---only ordinary errors are relabeled so that KeyboardInterrupt and SystemExit still propagate
    try:
        mod = makeface.import_remote('amx/amx')
    except Exception:
        raise Exception(
            'please clone a copy of automacs next to omni in `amx`. '
            'you must also run `make setup all` from that directory to get force field files.'
        )
    mod['state'].force_field = 'charmm'
    Landscape = mod['Landscape']
    land = Landscape(cwd='amx/')
    #---use the landscape to get hydrogen bond donors and acceptors for lipids
    hydrogen_bond_ref = {}
    #---this section relies on correct definitions from the Landscape
    targets = land.objects_by_category('lipid')
    #---METHODOLOGY NOTE: we catalog all hydrogen bonding opportunities ONLY BY NAME
    #---loop over lipid targets and scan them for hydrogen bond opportunities
    for resname in targets:
        #---each lipid ITP has an identical molecule with the same (residue) name
        itp = mod['GMXTopology'](land.objects[resname]['fn'])
        #---donor names come from a double-regex match over bonds
        donor_names = itp.get_bonds_by_regex(molname=resname,
                                             patterns=['^H', '^(N|O|S)'])
        #---acceptor names have a single regex
        #---!!! check that this is the correct definition
        acceptor_names = [
            i['atom'] for i in itp.molecules[resname]['atoms']
            if re.match('^(N|O|S)', i['atom'])
        ]
        hydrogen_bond_ref[resname] = {
            'acceptors': acceptor_names,
            'donors': donor_names
        }
    #---include any proteins as participants in the bonding
    if kwargs['calc']['specs'].get('protein', False):
        #---get the protein ITP from metadata
        #---note that the loader is called even when the metadata already holds protein_itp
        itp_fn = work.meta[sn].get('protein_itp',
                                   protein_itp_loader(sn, work=work))
        if not itp_fn:
            raise Exception('add protein_itp to the meta for %s' % sn)
        #---NOTE(review): eval executes the metadata string as python. this is unsafe if the
        #---...metadata is not trusted and it raises for a plain path. confirm the intended format
        itp_fn_evaluated = eval(itp_fn)
        #---get the sims spot path systematically
        if os.path.isfile(itp_fn_evaluated):
            itp_fn_abs = itp_fn_evaluated
        #---if path is relative then we consult the spots
        else:
            rootdir = work.raw.spots[(work.raw.spotname_lookup(sn),
                                      'structure')]['rootdir']
            sn_dir = os.path.join(rootdir, sn)
            #---user supplies step folder and path to the reference structure
            itp_fn_abs = os.path.join(sn_dir, itp_fn)
        protein_itp = mod['GMXTopology'](itp_fn_abs)
        for molname in protein_itp.molecules:
            #---mimic the procedure above for lipids
            #---donor names come from a double-regex match over bonds
            donor_resnames_names = protein_itp.get_bonds_by_regex(
                molname=molname,
                patterns=['^H', '^(N|O|S)'],
                include_resname=True)
            #---organize hydrogen bonds by residue name
            #---each entry leads with the pair of resnames. reading them directly avoids indexing
            #---...the result of zip, which fails on python 3 and on an empty donor list
            resnames_all = list(
                set([i for entry in donor_resnames_names for i in entry[0]]))
            for resname_focus in resnames_all:
                donor_list = []
                #---loop over resnames within the protein
                for resnames, names in donor_resnames_names:
                    if resnames[0] != resnames[1]:
                        raise Exception('invalid hydrogen bond spec %s,%s' %
                                        (resnames, names))
                    elif resnames[0] == resname_focus:
                        donor_list.append(names)
                    else:
                        continue
                #---acceptor names have a single regex
                #---!!! check that this is the correct definition
                acceptor_names = list(
                    set([
                        i['atom']
                        for i in protein_itp.molecules[molname]['atoms']
                        if re.match('^(N|O|S)', i['atom'])
                        and i['resname'] == resname_focus
                    ]))
                #---protein entries are keyed by (molecule name, residue name) while lipids use the resname
                hydrogen_bond_ref[(molname, resname_focus)] = {
                    'acceptors': acceptor_names,
                    'donors': donor_list
                }

    #---developing a new method for selecting the atoms correctly
    #---we need to get all possible donors into a big selection after which case the
    #---...hbonds.hbonder_framewise does the rest and the plotting codes are decent at picking out
    #---...inter-residue bonds and identities
    #---consider the customer: hbonds_framewise
    #---...needs a list of donors, hydrogens, and acceptors
    #---...since the donors and hydrogens must be related by bonds there is some redundancy in the
    #---...donor list which redundancy hbonds_framewise accounts for
    #---construct the donors list
    #---...start with the list of all hydrogens (somewhat counterintuitive)
    #---...consult the hydrogen_bond_ref and loop over all residues and then grow a list of hydrogen indices
    #---...for each hydrogen find the associated heavy atom and add both indices to separate lists
    #---...net result is two lists of indices over the hydrogens and donors which constitute a bond

    #---get the heavy atom side of the donors
    #---w selects which member of each donor pair supplies the atom name for the two selections
    donors_heavy, donors_h = [
        uni.select_atoms(' or '.join([
            '(resname %s and (%s))' %
            (resname if type(resname) in str_types else resname[1],
             ' or '.join(['name %s' % i[w] for i in v['donors']]))
            for resname, v in hydrogen_bond_ref.items() if v['donors']
        ])) for w in range(2)
    ]
    acceptors_heavy = uni.select_atoms(' or '.join([
        '(resname %s and (%s))' %
        (resname if type(resname) in str_types else resname[1], ' or '.join(
            ['name %s' % i for i in v['acceptors']]))
        for resname, v in hydrogen_bond_ref.items() if v['acceptors']
    ]))

    #---check non-redundant residues
    if not len(donors_heavy.residues) == len(np.unique(donors_heavy.resids)):
        raise Exception('residue redundancy in the donor heavy list')
    if not len(donors_h.residues) == len(np.unique(donors_h.resids)):
        raise Exception('residue redundancy in the donor hydrogen list')
    #---constructing the donors side selection to preserve the bond relation
    donors_reindex = []
    for refkey, details in hydrogen_bond_ref.items():
        #---protein residues have the protein molecule name alongside
        resname = refkey if type(refkey) in str_types else refkey[1]
        for heavy, light in details['donors']:
            inds_heavy = np.where(
                np.all((donors_heavy.resnames == resname, donors_heavy.names
                        == heavy),
                       axis=0))[0]
            inds_light = np.where(
                np.all((donors_h.resnames == resname, donors_h.names == light),
                       axis=0))[0]
            #---loop over resids and for each resid that has them, we add the indices to the list
            #---! these descending loops are clumsy but they should be fast and they make definitions precise
            for resid in np.unique(
                    np.concatenate((donors_heavy[inds_heavy].resids,
                                    donors_h[inds_light].resids))):
                inds_heavy = np.where(
                    np.all(
                        (donors_heavy.resnames == resname, donors_heavy.names
                         == heavy, donors_heavy.resnums == resid),
                        axis=0))[0]
                inds_light = np.where(
                    np.all((donors_h.resnames == resname, donors_h.names
                            == light, donors_h.resnums == resid),
                           axis=0))[0]
                if len(inds_heavy) > 1 or len(inds_light) > 1:
                    raise Exception(
                        'serious error! one unique hydrogen bond in a single residue'
                    )
                #---only residues which hold both atoms of the bond are recorded
                if len(inds_heavy) == 1 and len(inds_light) == 1:
                    donors_reindex.append((inds_heavy[0], inds_light[0]))
    #---the reindexed donors preserved the bond relation and covers all possible unique hydrogen bonds
    donors_reindex = np.array(donors_reindex)

    #---prepare coordinates for each frame
    st = time.time()
    vecs, all_donor_coords, all_acceptor_coords, all_h_coords = [], [], [], []
    #---purposefully profligate with the memory so this goes quickly
    for fr in range(nframes):
        status('caching coordinates',
               tag='compute',
               i=fr,
               looplen=nframes,
               start=st)
        #---indexing the trajectory moves the universe to this frame
        uni.trajectory[fr]
        vecs.append(uni.dimensions[:3] / lenscale)
        all_donor_coords.append(donors_heavy.positions[donors_reindex[:, 0]] /
                                lenscale)
        all_h_coords.append(donors_h.positions[donors_reindex[:, 1]] /
                            lenscale)
        all_acceptor_coords.append(acceptors_heavy.positions / lenscale)
    status('completed caching in %.1f minutes' % ((time.time() - st) / 60.),
           tag='status')

    #---export variables
    #---the framewise function is called with a frame index and cutoffs only, so the cached
    #---...coordinates are attached to its module here
    from codes import hbonds_framewise
    hbonds_framewise.hydrogen_bond_ref = hydrogen_bond_ref
    hbonds_framewise.all_donor_coords = all_donor_coords
    hbonds_framewise.all_acceptor_coords = all_acceptor_coords
    hbonds_framewise.all_h_coords = all_h_coords
    hbonds_framewise.vecs = vecs

    #---debug
    if debug:
        #---NOTE(review): hbonds, donors_side, donors_inds and acceptors_side are not defined in
        #---...this function. confirm they exist at module scope or this branch raises a NameError
        hbonds.donors_side = donors_side
        hbonds.donors_inds = donors_inds
        hbonds.acceptors_side = acceptors_side
        fr = 686  #---careful debugging at this frame
        incoming = hbonds_framewise.hbonder_framewise(
            fr, distance_cutoff=distance_cutoff, angle_cutoff=angle_cutoff)
        #---the sys module has no `quit` so we leave via exit
        sys.exit()

    start = time.time()
    out_args = {
        'distance_cutoff': distance_cutoff,
        'angle_cutoff': angle_cutoff
    }
    if run_parallel:
        incoming = Parallel(n_jobs=8, verbose=10 if debug else 0)(
            delayed(hbonds_framewise.hbonder_framewise, has_shareable_memory)(
                fr, **out_args) for fr in framelooper(nframes, start=start))
    else:
        incoming = []
        for fr in framelooper(nframes):
            incoming.append(hbonds_framewise.hbonder_framewise(fr, **out_args))

    #---get valid frames, namely those which report at least one donor index
    valid_frames = np.where([len(i['donors']) > 0 for i in incoming])[0]
    #---concatenate the donor/acceptor indices across all frames
    #---the framewise donor indices point into donors_reindex, whose two columns map back to the
    #---...heavy atom selection and the hydrogen selection respectively
    donor_cat, donor_cat_h = [
        np.concatenate(
            [donors_reindex.T[j][incoming[i]['donors']]
             for i in valid_frames]).astype(int) for j in range(2)
    ]
    acceptor_cat = np.concatenate(
        [incoming[i]['acceptors'] for i in valid_frames]).astype(int)
    obs_by_frames = np.array(
        [len(incoming[i]['acceptors']) for i in valid_frames]).astype(int)

    start_time = time.time()
    #---tabulate each bond observation
    status('sluggish sequence because there are {:,} bond observations'.format(
        len(donor_cat)),
           tag='warning')
    status('tabulating all distinct hydrogen bonds', tag='compute')
    tabulation = np.transpose((
        donors_heavy.resnames[donor_cat],
        donors_heavy.resids[donor_cat],
        donors_heavy.names[donor_cat],
        acceptors_heavy.resnames[acceptor_cat],
        acceptors_heavy.resids[acceptor_cat],
        acceptors_heavy.names[acceptor_cat],
        #---include the hydrogen identity here in the tabulation (note this might make things larger?)
        #---also note that the hydrogen atom name should be enough because we already have the donor resid
        donors_h.names[donor_cat_h],
    ))
    status('stopwatch: %.1fs' % (time.time() - start_time), tag='compute')

    #---reduce tabulation by discarding all SOL-SOL bonds
    #---...note that this is necessary because we have 33M observations and almost all of them are "unique"
    #---...precisely because so many of them involve water
    #---actually, instead of discarding, let us change all waters to a single residue
    tabulation_explicit = tabulation
    tabulation = np.array(tabulation_explicit)
    #---columns 0 and 3 hold the resnames and the column after each holds the matching resid
    for p in [0, 3]:
        sols = np.where(tabulation[:, p] == 'SOL')[0]
        tabulation[sols, p + 1] = '1'

    start_time = time.time()
    status('unique-ifying the tabulated bonds (estimated %ds)' %
           (len(donor_cat) * 1.3 * 10**-6),
           tag='compute')
    status(
        'note: with 32GB memory, 33M observations works fine, but 46M hits the swap',
        tag='warning')
    #---note that unique is getting "axis" in np 1.13 but at some point on or before 1.12 they added some
    #---...kind of a safety check on the following trick for unique rows, which check returns an error
    #---...message: "TypeError: Cannot change data-type for object array." which is solved by forcing
    #---...the object to a string type. note that this method requires void and not a blank string, which
    #---...some examples will use. this changed must have happened in the <1 week since we wrote
    #---...the hydrogen bonds code and tested it again on the factory
    #---uniquify the enormous list of all possible hydrogen bonds
    tabulation_reform = tabulation.astype(str)
    #---each row is viewed as one opaque void item so that np.unique compares whole rows
    tabulation_unique = np.ascontiguousarray(tabulation_reform).view(
        np.dtype(
            (np.void,
             tabulation_reform.dtype.itemsize * tabulation_reform.shape[1])))
    tabulation_view_unique, idx, counts = np.unique(tabulation_unique,
                                                    return_index=True,
                                                    return_counts=True)
    bonds = tabulation[idx]
    status('stopwatch: %.1fs' % (time.time() - start_time), tag='compute')

    start_time = time.time()
    #---preallocate bond counts per frame
    counts_per_frame = np.zeros((len(valid_frames), len(idx)))
    #---hash the bonds over the indices
    bonds_to_idx = {tuple(b): bb for bb, b in enumerate(bonds)}
    #---frame_lims marks where the observations for each valid frame start and stop in the tabulation
    frame_lims = np.concatenate(([0], np.cumsum(obs_by_frames)))
    for fr, i in enumerate(frame_lims[:-1]):
        status('counting observations',
               i=fr,
               looplen=len(valid_frames),
               tag='compute',
               start=start_time)
        obs = tabulation[frame_lims[fr]:frame_lims[fr + 1]]
        counts_per_frame[fr][np.array([bonds_to_idx[tuple(o)]
                                       for o in obs])] += 1
    status('stopwatch: %.1fs' % (time.time() - start_time), tag='compute')
    status('done heavy lifting', tag='compute')

    #---package the dataset
    result, attrs = {}, {}
    #---everything is indexed by idx
    result['bonds'] = bonds
    result['observations'] = counts_per_frame
    result['valid_frames'] = valid_frames
    result['nframes'] = np.array(nframes)
    result['resnames'] = resnames_master
    result['nmols'] = rescounts
    status('compute job lasted %.1fmin' %
           ((time.time() - start_job_time) / 60.),
           tag='time')
    return result, attrs
Ejemplo n.º 6
0
def salt_bridges(grofile, trajfile, **kwargs):
    """
    Identify salt bridges. Mimics the beginning of the hydrogen bond calculation.

    No intervening hydrogen is required here: the same heavy (N/O/S) lipid atom
    names are used for both the "donor" and the "acceptor" side, and the
    frame-by-frame search is delegated to `hbonds.salt_bridges_framewise`.

    Arguments:
        grofile, trajfile: passed directly to `MDAnalysis.Universe`.
        sn (kwarg, required): simulation name, used to read `work.meta[sn]`.
        workspace (kwarg, required): object whose `meta[sn]` provides
            `cations` (or `cation`) as a name or a list of names.
        calc (kwarg, required): must contain `calc['specs']['distance_cutoff']`.
        debug (kwarg): if true, compute one frame, open ipdb, then exit.
        run_parallel (kwarg): if true (default) frames are run through joblib.

    Returns:
        (result, attrs) where `result` has the keys `bonds`, `observations`,
        `bonds_salt`, `valid_frames`, `nframes`, `resnames` and `nmols`, and
        `attrs` is an empty dictionary.

    Raises:
        Exception: if automacs cannot be loaded from `amx/amx` or if no cation
            names are found in the metadata for `sn`.
    """

    #---unpack
    sn = kwargs['sn']
    work = kwargs['workspace']
    calc = kwargs['calc']
    debug = kwargs.get('debug', False)
    run_parallel = kwargs.get('run_parallel', True)

    #---settings. distance cutoff is larger for salt bridges than hydrogen bonds
    distance_cutoff = calc['specs']['distance_cutoff']

    #---prepare universe
    uni = MDAnalysis.Universe(grofile, trajfile)
    nframes = len(uni.trajectory)
    #---all cached coordinates and box vectors are divided by this factor
    lenscale = 10.
    start_job_time = time.time()

    #---save topology for later: residue names in order of first appearance with their counts
    _, idx, counts = np.unique(uni.residues.resnames,
                               return_index=True,
                               return_counts=True)
    resnames = uni.residues.resnames[np.sort(idx)]
    rescounts = counts[np.argsort(idx)]

    import makeface
    #---get an automacs landscape
    #---! DEV. needs a clone and make to work
    try:
        mod = makeface.import_remote('amx/amx')
    #---only ordinary errors are translated to the hint; interrupts and exits pass through
    except Exception:
        raise Exception(
            'please clone a copy of automacs next to omni in `amx`')
    mod['state'].force_field = 'charmm'
    land = mod['Landscape'](cwd='amx/')

    #---use the landscape to collect the heavy atoms of each lipid
    #---NOTE: the point of this calculation is that cations bridge two negative heavy atoms.
    #---...there are very few positive heavy charges in lipids (only exception is DOPE ethanolamine)
    #---...so donors and acceptors are both taken to be the N/O/S atoms
    heavy_atom = re.compile('^(N|O|S)')
    hydrogen_bond_ref = {}
    for resname in land.objects_by_category('lipid'):
        mol = land.itps[land.objects[resname]['fn']][resname]
        #---forbid water here. the atom *name* must be compared: comparing the atom record
        #---...itself against 'OW' is always unequal and therefore excludes nothing
        heavy_names = [
            atom['atom'] for atom in mol['atoms']
            if heavy_atom.match(atom['atom']) and atom['atom'] != 'OW'
        ]
        hydrogen_bond_ref[resname] = {
            'acceptors': heavy_names,
            'donors': list(heavy_names)
        }

    #---assemble the sorted, unique atom names on each side
    #---both lists come from the same rule above so the two selections below coincide
    donors_names = sorted(
        set(name for ref in hydrogen_bond_ref.values()
            for name in ref['donors']))
    acceptors_names = sorted(
        set(name for ref in hydrogen_bond_ref.values()
            for name in ref['acceptors']))

    #---generate atom groups. no intervening hydrogen is required for this calculation
    donors_side = uni.select_atoms(' or '.join(
        ['name %s' % i for i in donors_names]))
    acceptors_side = uni.select_atoms(' or '.join(
        ['name %s' % i for i in acceptors_names]))

    #---extend to include salt bridges
    #---some systems have two types of cations
    cation_names = work.meta[sn].get('cations',
                                     work.meta[sn].get('cation', None))
    if not cation_names:
        raise Exception('add "cations" to the meta dictionary for %s' % sn)
    if not isinstance(cation_names, list):
        cation_names = [cation_names]
    cations_side = uni.select_atoms(' or '.join(
        ['name %s' % i for i in cation_names]))

    #---prepare coordinates for each frame
    st = time.time()
    vecs, all_donor_coords, all_acceptor_coords = [], [], []
    all_h_coords, all_cation_coords = [], []
    #---purposefully profligate with the memory so this goes quickly
    for fr in range(nframes):
        status('caching coordinates',
               tag='compute',
               i=fr,
               looplen=nframes,
               start=st)
        uni.trajectory[fr]
        vecs.append(uni.dimensions[:3] / lenscale)
        all_donor_coords.append(donors_side.positions / lenscale)
        #---there are no hydrogens in this calculation so the donor positions fill this slot
        all_h_coords.append(donors_side.positions / lenscale)
        all_acceptor_coords.append(acceptors_side.positions / lenscale)
        all_cation_coords.append(cations_side.positions / lenscale)
    status('completed caching in %.1f minutes' % ((time.time() - st) / 60.),
           tag='status')

    #---export variables as attributes of the hbonds module, which holds the framewise function
    from codes import hbonds
    hbonds.hydrogen_bond_ref = hydrogen_bond_ref
    hbonds.all_donor_coords = all_donor_coords
    hbonds.all_acceptor_coords = all_acceptor_coords
    hbonds.all_h_coords = all_h_coords
    hbonds.all_cation_coords = all_cation_coords
    hbonds.vecs = vecs

    #---extra exports for development
    hbonds.acceptors_resids = acceptors_side.resids
    hbonds.acceptors_resnames = acceptors_side.resnames
    hbonds.donors_resids = donors_side.resids
    hbonds.donors_resnames = donors_side.resnames

    #---debug: compute a single frame, inspect it interactively, then stop
    if debug:
        fr = 36
        incoming_salt = hbonds.salt_bridges_framewise(
            fr, distance_cutoff=distance_cutoff)
        import ipdb
        ipdb.set_trace()
        #---the sys module has no quit function so we use exit
        sys.exit()

    start = time.time()
    out_args = {'distance_cutoff': distance_cutoff}
    if run_parallel:
        #! use require='sharedmem' instead of delayed(func,has_shareable_memory) for late era joblib
        incoming_salt = Parallel(
            n_jobs=4, verbose=10 if debug else 0, require='sharedmem')(
                delayed(hbonds.salt_bridges_framewise)(fr, **out_args)
                for fr in framelooper(nframes, start=start))
    else:
        incoming_salt = [
            hbonds.salt_bridges_framewise(fr, **out_args)
            for fr in framelooper(nframes)
        ]

    #---extension to salt bridges. tabulate each salt
    #---only frames with at least one observation are retained
    valid_frames_salt = np.array(
        [ii for ii, i in enumerate(incoming_salt) if len(i) > 0])
    obs_by_frames_salt = np.array(
        [len(i) for i in incoming_salt if len(i) > 0]).astype(int)
    #---some simulations have no salt bridges
    if len(valid_frames_salt) == 0:
        bonds_salt, counts_per_frame_salt = np.array([]), np.array([])
    else:
        salt_cat = np.concatenate(
            [incoming_salt[i] for i in valid_frames_salt])
        status('tabulating all distinct salt bridges', tag='compute')
        status('sluggish sequence because there are {:,} bond observations'.
               format(len(salt_cat)),
               tag='warning')

        #---column 0 of each observation indexes acceptors_side and column 2 indexes donors_side
        #---column 1 (the cation) is discarded in the tabulation because the data are too big
        tabulation_salt = np.transpose((
            acceptors_side.resnames[salt_cat[:, 0]],
            acceptors_side.resids[salt_cat[:, 0]],
            acceptors_side.names[salt_cat[:, 0]],
            donors_side.resnames[salt_cat[:, 2]],
            donors_side.resids[salt_cat[:, 2]],
            donors_side.names[salt_cat[:, 2]],
        ))
        #---send the salt bridges to the tabulator
        bonds_salt, counts_per_frame_salt = tabulator(tabulation_salt,
                                                      valid_frames_salt,
                                                      obs_by_frames_salt)

    #---package the dataset
    result, attrs = {}, {}
    #---everything is indexed by idx
    result['bonds'] = bonds_salt
    result['observations'] = counts_per_frame_salt
    result['bonds_salt'] = bonds_salt
    result['valid_frames'] = valid_frames_salt
    result['nframes'] = np.array(nframes)
    result['resnames'] = resnames
    result['nmols'] = rescounts
    status('compute job lasted %.1fmin' %
           ((time.time() - start_job_time) / 60.),
           tag='time')
    return result, attrs
Ejemplo n.º 7
0
				rotation=0,ha="center",va="center",color='k',
				fontsize=fsbase-4)
			tb.set_path_effects([path_effects.Stroke(linewidth=4,foreground='w'),
				path_effects.Normal()])
			counter += 1

	#---saving the snapshot tag here so we can keep track of the fix to exemplar_lipid above, fixed on v4
	picturesave('fig.head_angle_detail',work.plotdir,backup=False,version=True,
		meta={'tag':tag_head_angle,'exemplar_rank':exemplar_rank},extras=patches)
	plt.close()

###---CODE REORGY

import makeface
#---get an automacs landscape with a little help from the user
#---only ordinary errors are translated into the setup hint below
#---...so that keyboard interrupts and exit requests are not masked by it
try: mod = makeface.import_remote('amx/amx')
except Exception: raise Exception('please clone a copy of automacs next to omni in `amx`. '
	'you must also run `make setup all` from that directory to get force field files.')
#---the force field is set on the automacs state before any of its classes are used
mod['state'].force_field = 'charmm'
GMXStructure = mod['GMXStructure']

def mapback(seq):
	"""
	Build a reverse lookup from each item in a sequence to its position.
	If an item occurs more than once, the last position is kept.
	"""
	#---this function is a candidate for omni/base/tools.py
	positions = np.arange(len(seq))
	return {item:position for position,item in zip(positions,seq)}

#---custom water coloring
custom_water_coloring = """
set vdw_thick 0.4
set inner_cutoff 2.0
set outer_cutoff 5.0
Ejemplo n.º 8
0
def contacts(grofile,trajfile,**kwargs):

	"""
	Identify, catalog, and count contacts in a simulation.
	Note that this code was developed to mimic the data structures used by hydrogen bonding and salt bridging
	codes, and stores the various contacts up to a very high level of specificity (i.e. the specific residue
	and atom names).

	Arguments:
		grofile, trajfile: passed directly to `MDAnalysis.Universe`.
		sn (kwarg, required): simulation name.
		workspace (kwarg, required): object whose `vars` must contain `force_field`.
		calc (kwarg, required): `calc['specs']` must contain `cutoff` and may contain `subject`
			(an atom selection string, default 'protein') and `object` ('lipid' by default, or 'protein').
		debug (kwarg): if true, compute one frame, open ipdb, then exit.
		run_parallel (kwarg): if true (default) the compute loops run in parallel.

	Returns:
		(result,attrs) where `result` holds the bond catalog, the per-frame observation counts, topology
		summaries, frame times, and the per-residue contact counts. `attrs` is an empty dictionary.

	Raises:
		Exception: if automacs cannot be loaded from `amx/amx`, if `force_field` is missing from the
			workspace variables, or if the `object` flag is neither 'lipid' nor 'protein'.

	Side effects:
		Sets the module-level names `vecs`, `coords_subj`, `coords_targ`, `bonds`, `obs` and `bonds_red`,
		presumably so that the framewise and counting functions can read them (confirm in those functions).
	"""

	#---unpack
	sn = kwargs['sn']
	work = kwargs['workspace']
	calc = kwargs['calc']
	debug = kwargs.get('debug',False)
	run_parallel = kwargs.get('run_parallel',True)

	#---settings
	#---cached coordinates and box vectors are divided by this factor
	lenscale = 10.0
	#---distance cutoff stays in angstroms until the compute function
	distance_cutoff = calc['specs']['cutoff']
	subject_selection = calc['specs'].get('subject','protein')
	object_flag = calc['specs'].get('object','lipid')

	#---prepare universe
	uni = MDAnalysis.Universe(grofile,trajfile)
	nframes = len(uni.trajectory)
	start_job_time = time.time()

	#---save topology for later: residue names in order of first appearance with their counts
	_,idx,counts = np.unique(uni.residues.resnames,return_index=True,return_counts=True)
	resnames = uni.residues.resnames[np.sort(idx)]
	resnames_master = np.array(resnames)
	rescounts = counts[np.argsort(idx)]

	import makeface
	#---get an automacs landscape with a little help from the user
	#---only ordinary errors are translated to the hint. interrupts and exits pass through
	try: mod = makeface.import_remote('amx/amx')
	except Exception: raise Exception('please clone a copy of automacs next to omni in `amx`. '
		'you must also run `make setup all` from that directory to get force field files.')
	ff_name = work.vars.get('force_field',None)
	if not ff_name: raise Exception('we must be very careful with the residue naming. '
		'you must add `force_field` to the `variables` dictionary in your metadata to continue.')
	mod['state'].force_field = 'charmm'
	Landscape = mod['Landscape']
	land = Landscape(cwd='amx/',ff=ff_name)

	#---get the subject of the calculation, the thing we wish to study the contacts of
	#---...typically the protein
	#---! need to add resid redundancy checks possibly
	subject = uni.select_atoms(subject_selection)
	#---get the residue names of the objects
	#---objects from the landscape returns resnames
	if object_flag=='lipid': target_resnames = land.objects_by_category('lipid')
	elif object_flag=='protein': target_resnames = np.unique(uni.residues.resnames)
	else: raise Exception('not set up for object %s'%object_flag)
	#---explicitly ignore hydrogen contacts here
	targets = uni.select_atoms('(%s) and not name H*'%
		' or '.join(['resname %s'%i for i in target_resnames]))

	#---prepare coordinates for each frame
	st = time.time()
	global vecs,coords_subj,coords_targ
	vecs,coords_subj,coords_targ,times = [],[],[],[]
	#---purposefully profligate with the memory so this goes quickly
	for fr in range(nframes):
		status('caching coordinates',tag='compute',i=fr,looplen=nframes,start=st)
		uni.trajectory[fr]
		times.append(uni.trajectory.time)
		vecs.append(uni.dimensions[:3]/lenscale)
		coords_subj.append(subject.positions/lenscale)
		coords_targ.append(targets.positions/lenscale)
	status('completed caching in %.1f minutes'%((time.time()-st)/60.),tag='status')

	#---debug: compute a single frame, inspect it interactively, then stop
	compute_function = contacts_framewise
	if debug:
		fr = 50
		incoming = compute_function(fr,distance_cutoff=distance_cutoff)
		import ipdb;ipdb.set_trace()
		#---the sys module has no quit function so we use exit
		sys.exit()

	#---compute loop
	start = time.time()
	out_args = {'distance_cutoff':distance_cutoff}
	if run_parallel:
		incoming = Parallel(n_jobs=8,verbose=10 if debug else 0)(
			delayed(compute_function,has_shareable_memory)(fr,**out_args)
			for fr in framelooper(nframes,start=start))
	else:
		incoming = [compute_function(fr,**out_args) for fr in framelooper(nframes)]

	#---chompdown
	#---get valid frames, namely those with at least one observation
	valid_frames = np.where([len(i['subjects'])>0 for i in incoming])[0]
	obs_by_frames = np.array([len(incoming[i]['subjects']) for i in valid_frames]).astype(int)
	#---concatenate the subject/target indices across all frames
	subject_cat = np.concatenate([incoming[i]['subjects'] for i in valid_frames]).astype(int)
	target_cat = np.concatenate([incoming[i]['targets'] for i in valid_frames]).astype(int)
	start_time = time.time()
	#---tabulate each bond observation as (resname,resid,name) for the subject then the target
	tabulation = np.transpose((subject.resnames[subject_cat],subject.resids[subject_cat],
		subject.names[subject_cat],targets.resnames[target_cat],targets.resids[target_cat],
		targets.names[target_cat],))
	status('stopwatch: %.1fs'%(time.time()-start_time),tag='compute')

	#---reduce the observations to a catalog of distinct bonds
	idx,_ = uniquify(tabulation.astype(str))
	bonds_catalog = tabulation[idx]

	start_time = time.time()
	#---preallocate bond counts per frame
	counts_per_frame = np.zeros((len(valid_frames),len(idx)))
	#---hash the bonds over the indices
	bonds_to_idx = {tuple(b):bb for bb,b in enumerate(bonds_catalog)}
	#---consecutive entries of frame_lims bracket the observations that belong to each valid frame
	frame_lims = np.concatenate(([0],np.cumsum(obs_by_frames)))
	for fr in range(len(frame_lims)-1):
		status('counting observations per frame',i=fr,looplen=len(valid_frames),
			tag='compute',start=start_time)
		obs_this = tabulation[frame_lims[fr]:frame_lims[fr+1]]
		counts_per_frame[fr][np.array([bonds_to_idx[tuple(o)] for o in obs_this])] += 1
	status('stopwatch: %.1fs'%(time.time()-start_time),tag='compute')
	status('done heavy lifting',tag='compute')
	#---note the size of the outgoing data. we could shrink this by discarding atom names
	status('observation array for cutoff %.1f is %.1fMB'%(
		distance_cutoff,sys.getsizeof(counts_per_frame)/10**6.),tag='note')

	#---package the dataset
	result,attrs = {},{}
	#---everything is indexed by idx
	result['bonds'] = bonds_catalog
	result['observations'] = counts_per_frame
	result['valid_frames'] = valid_frames
	result['nframes'] = np.array(nframes)
	result['resnames'] = resnames_master
	result['subject_residues_resnames'] = subject.residues.resnames
	result['targets_residues_resnames'] = targets.residues.resnames
	result['subject_residues_resids'] = subject.residues.resids
	result['nmols'] = rescounts
	result['times'] = np.array(times)

	#---some basic post-processing common to many of the plots
	global bonds,obs
	bonds,obs = bonds_catalog,counts_per_frame
	#---post: generate timewise trajectories for the number of contacts between protein residues and lipids
	#---methodology note: in a basic version of this calculation we simply count all of the bonds between
	#---...any lipid-protein residue pair. this means that being more close to a lipid might result in more
	#---...contacts and hence generates a higher score. hence we have two versions of the calculation. one
	#---...counts the total number of contacts, and the other discards atom information and scores contacts
	#---...with a maximum of one per protein residue-lipid pair. this calculation does both
	#---! need to check for atom-name resolution otherwise this is moot
	resids = result['subject_residues_resids']
	lipid_resnames = np.unique(bonds[:,rowspec.index('target_resname')])
	#---each target resname on its own, followed by one combination that holds all of them
	resname_combos = [(r,np.array([r])) for r in lipid_resnames]+[('all lipids',np.array(lipid_resnames))]
	#---compute loop
	looper = [{'resid':resid,'resname_set':resname_set}
		for resid in resids for resname_name,resname_set in resname_combos]
	compute_function = count_reduced_contact
	incoming = basic_compute_loop(compute_function,looper,run_parallel=run_parallel)
	#---package this as a list of resid/resname pairs and the counts for them
	#---the pairs are generated in the same order as the looper so they line up with the counts
	result['pairs_resid_resname'] = np.array([(resid,resname_name)
		for resid in resids for resname_name,resname_set in resname_combos]).astype(str)
	result['counts_resid_resname'] = np.array(incoming)
	#---reduce the data for the modified count described above by dropping the atom name columns
	global bonds_red
	bonds_red = bonds[:,np.array([0,1,3,4])]
	compute_function = count_reduced_contact_reduced
	incoming = basic_compute_loop(compute_function,looper,run_parallel=run_parallel)
	result['counts_resid_resname_singleton'] = np.array(incoming)

	status('compute job lasted %.1fmin'%((time.time()-start_job_time)/60.),tag='time')
	return result,attrs
Ejemplo n.º 9
0
def hydrogen_bonding(grofile,trajfile,**kwargs):

	"""
	Generic hydrogen bonding code.
	Revamped on 2017.4.28 to generate a more uniform data structure.
	"""

	#---unpack
	sn = kwargs['sn']
	work = kwargs['workspace']
	calc = kwargs['calc']
	debug = kwargs.get('debug',False)
	run_parallel = kwargs.get('run_parallel',True)

	#---prototyping an external module import to replace the sometimes tedious addition of 
	#---... many metadata to the meta dictionary in your YAML files.
	protein_itp_loader = work.vars.get('protein_itp_loader',None)
	#---default ITP paths are set for each simulation in the metadata
	if protein_itp_loader==None: 
		def protein_itp_loader(sn,**kwargs):
			itp = work.meta.get(sn,{}).get('protein_itp',None)
			if not itp: raise Exception('cannot find protein_itp in meta dictionary for %s '%sn+
				'note that you can also use the protein_itp_loader functionality to get the ITP file')
	#---custom ITP loader specified as an alternate_module
	else: protein_itp_loader = alternate_module(**protein_itp_loader)

	#---settings
	distance_cutoff,angle_cutoff = [calc['specs'][i] for i in ['distance_cutoff','angle_cutoff']]
	#---cutoff for inferring hydrogens from a one-time distance search
	distance_h_cutoff = distance_cutoff

	#---prepare universe	
	uni = MDAnalysis.Universe(grofile,trajfile)
	nframes = len(uni.trajectory)
	lenscale = 10.
	start_job_time = time.time()

	#---save topology for later
	_,idx,counts = np.unique(uni.residues.resnames,return_index=True,return_counts=True)
	resnames = uni.residues.resnames[np.sort(idx)]
	resnames_master = np.array(resnames)
	rescounts = counts[np.argsort(idx)]

	import makeface
	#---get an automacs landscape with a little help from the user
	try: mod = makeface.import_remote('amx/amx')
	except: raise Exception('please clone a copy of automacs next to omni in `amx`. '
		'you must also run `make setup all` from that directory to get force field files.')
	mod['state'].force_field = 'charmm'
	Landscape = mod['Landscape']
	land = Landscape(cwd='amx/')
	#---use the landscape to get hydrogen bond donors and acceptors for lipids
	hydrogen_bond_ref = {}
	#---this section relies on correct definitions from the Landscape
	targets = land.objects_by_category('lipid')
	#---METHODOLOGY NOTE: we catalog all hydrogen bonding opportunities ONLY BY NAME
	#---loop over lipid targets and scan them for hydrogen bond opportunities
	for resname in targets:
		#---each lipid ITP has an identical molecule with the same (residue) name
		itp = mod['GMXTopology'](land.objects[resname]['fn'])
		#---donor names come from a double-regex match over bonds
		donor_names = itp.get_bonds_by_regex(molname=resname,patterns=['^H','^(N|O|S)'])
		#---acceptor names have a single regex
		#---!!! check that this is the correct definition
		acceptor_names = [i['atom'] for i in 
			itp.molecules[resname]['atoms'] if re.match('^(N|O|S)',i['atom'])]
		hydrogen_bond_ref[resname] = {'acceptors':acceptor_names,'donors':donor_names}
	#---include any proteins as participants in the bonding
	if kwargs['calc']['specs'].get('protein',False):
		#---get the protein ITP from metadata
		itp_fn = work.meta[sn].get('protein_itp',protein_itp_loader(sn,work=work))
		if not itp_fn: raise Exception('add protein_itp to the meta for %s'%sn)
		#---get the sims spot path systematically
		if os.path.isfile(eval(itp_fn)): itp_fn_abs = eval(itp_fn)
		#---if path is relative then we consult the spots
		else:
			rootdir = work.raw.spots[(work.raw.spotname_lookup(sn),'structure')]['rootdir']
			sn_dir = os.path.join(rootdir,sn)
			#---user supplies step folder and path to the reference structure
			itp_fn_abs = os.path.join(sn_dir,itp_fn)
		protein_itp = mod['GMXTopology'](itp_fn_abs)
		for molname in protein_itp.molecules:
			#---mimic the procedure above for lipids
			#---donor names come from a double-regex match over bonds
			donor_resnames_names = protein_itp.get_bonds_by_regex(molname=molname,
				patterns=['^H','^(N|O|S)'],include_resname=True)
			#---organize hydrogen bonds by residue name
			resnames_all = list(set([i for j in zip(*donor_resnames_names)[0] for i in j]))
			for resname_focus in resnames_all:
				donor_list = []
				#---loop over resnames within the protein
				for resnames,names in donor_resnames_names:
					if resnames[0]!=resnames[1]: 
						raise Exception('invalid hydrogen bond spec %s,%s'%(resnames,names))
					elif resnames[0]==resname_focus: donor_list.append(names)
					else: continue
				#---acceptor names have a single regex
				#---!!! check that this is the correct definition
				acceptor_names = list(set([i['atom'] for i in 
					protein_itp.molecules[molname]['atoms'] if re.match('^(N|O|S)',i['atom'])
					and i['resname']==resname_focus]))
				hydrogen_bond_ref[(molname,resname_focus)] = {
					'acceptors':acceptor_names,'donors':donor_list}

	"""
	developing a new method for selecting the atoms correctly
	we need to get all possible donors into a big selection after which case the hbonds.hbonder_framewise
		does the rest and the plotting codes are decent at picking out inter-residue bonds and identities 
	consider the customer: hbonds_framewise 
		needs a list of donors, hydrogens, and acceptors
		since the donors and hydrogens must be related by bonds
			there is some redundancy in the donor list
			which redundancy hbonds_framewise accounts for
	construct the donors list
		start with the list of all hydrogens (somewhat counterintuitive)
		consult the hydrogen_bond_ref and loop over all residues and then grow a list of hydrogen indices
		for each hydrogen find the associated heavy atom and and add both indices to separate lists
		net result is two lists of indices over the hydrogens and donors which constitute a bond
	"""

	#---get the heavy atom side of the donors
	donors_heavy,donors_h = [uni.select_atoms(' or '.join(['(resname %s and (%s))'%(
		resname if type(resname) in str_types else resname[1],' or '.join(['name %s'%i[w] 
		for i in v['donors']])) for resname,v in hydrogen_bond_ref.items() 
		if v['donors']])) for w in range(2)]
	acceptors_heavy = uni.select_atoms(' or '.join(['(resname %s and (%s))'%(
		resname if type(resname) in str_types else resname[1],' or '.join(['name %s'%i 
		for i in v['acceptors']])) for resname,v in hydrogen_bond_ref.items() 
		if v['acceptors']]))

	#---check non-redundany residues
	if not len(donors_heavy.residues)==len(np.unique(donors_heavy.resids)):
		raise Exception('residue redundancy in the donor heavy list')
	if not len(donors_h.residues)==len(np.unique(donors_h.resids)):
		raise Exception('residue redundancy in the donor hydrogen list')
	#---constructing the donors side selection to preserve the bond relation
	donors_reindex = []
	for refkey,details in hydrogen_bond_ref.items():
		#---protein residues have the protein molecule name alongside
		resname = refkey if type(refkey) in str_types else refkey[1]
		for heavy,light in details['donors']:
			inds_heavy = np.where(np.all((
				donors_heavy.resnames==resname,donors_heavy.names==heavy),axis=0))[0]
			inds_light = np.where(np.all((
				donors_h.resnames==resname,donors_h.names==light),axis=0))[0]
			#---loop over resids and for each resid that has them, we add the indices to the list
			#---! these descending loops are clumsy but they should be fast and they make definitions precise
			for resid in np.unique(np.concatenate((donors_heavy[inds_heavy].resids,
				donors_h[inds_light].resids))):
				inds_heavy = np.where(np.all((
					donors_heavy.resnames==resname,donors_heavy.names==heavy,donors_heavy.resnums==resid
					),axis=0))[0]
				inds_light = np.where(np.all((
					donors_h.resnames==resname,donors_h.names==light,donors_h.resnums==resid),axis=0))[0]
				if len(inds_heavy)>1 or len(inds_light)>1: 
					raise Exception('serious error! one unique hydrogen bond in a single residue')
				if len(inds_heavy)==1 and len(inds_light)==1:
					donors_reindex.append((inds_heavy[0],inds_light[0]))
	#---the reindexed donors preserved the bond relation and covers all possible unique hydrogen bonds
	donors_reindex = np.array(donors_reindex)

	#---prepare coordinates for each frame
	st = time.time()
	vecs,all_donor_coords,all_acceptor_coords,all_h_coords = [],[],[],[]
	#---purposefully profligate with the memory so this goes quickly
	for fr in range(nframes):
		status('caching coordinates',tag='compute',i=fr,looplen=nframes,start=st)	
		uni.trajectory[fr]
		vecs.append(uni.dimensions[:3]/lenscale)
		all_donor_coords.append(donors_heavy.positions[donors_reindex[:,0]]/lenscale)
		all_h_coords.append(donors_h.positions[donors_reindex[:,1]]/lenscale)
		all_acceptor_coords.append(acceptors_heavy.positions/lenscale)
	status('completed caching in %.1f minutes'%((time.time()-st)/60.),tag='status')

	#---export variables
	from codes import hbonds_framewise
	hbonds_framewise.hydrogen_bond_ref = hydrogen_bond_ref
	hbonds_framewise.all_donor_coords = all_donor_coords
	hbonds_framewise.all_acceptor_coords = all_acceptor_coords
	hbonds_framewise.all_h_coords = all_h_coords
	hbonds_framewise.vecs = vecs
		
	#---debug
	if debug:
		hbonds.donors_side = donors_side
		hbonds.donors_inds = donors_inds
		hbonds.donors_inds = donors_inds
		hbonds.acceptors_side = acceptors_side
		fr = 686 #---careful debugging at this frame
		incoming = hbonds_framewise.hbonder_framewise(fr,distance_cutoff=distance_cutoff,angle_cutoff=angle_cutoff)
		sys.quit()

	start = time.time()
	out_args = {'distance_cutoff':distance_cutoff,'angle_cutoff':angle_cutoff}
	if run_parallel:
		incoming = Parallel(n_jobs=8,verbose=10 if debug else 0)(
			delayed(hbonds_framewise.hbonder_framewise,has_shareable_memory)(fr,**out_args) 
			for fr in framelooper(nframes,start=start))
	else: 
		incoming = []
		for fr in framelooper(nframes):
			incoming.append(hbonds_framewise.hbonder_framewise(fr,**out_args))

	#---get valid frames
	valid_frames = np.where([len(i['donors'])>0 for i in incoming])[0]
	#---concatenate the donor/acceptor indices across all frames
	donor_cat,donor_cat_h = [np.concatenate([donors_reindex.T[j][incoming[i]['donors']] 
		for i in valid_frames]).astype(int) for j in range(2)]
	acceptor_cat = np.concatenate([incoming[i]['acceptors'] for i in valid_frames]).astype(int)
	obs_by_frames = np.array([len(incoming[i]['acceptors']) for i in valid_frames]).astype(int)

	start_time = time.time()
	#---tabulate each bond observation
	status('sluggish sequence because there are {:,} bond observations'.format(len(donor_cat)),tag='warning')
	status('tabulating all distinct hydrogen bonds',tag='compute')
	tabulation = np.transpose((donors_heavy.resnames[donor_cat],donors_heavy.resids[donor_cat],
		donors_heavy.names[donor_cat],acceptors_heavy.resnames[acceptor_cat],
		acceptors_heavy.resids[acceptor_cat],acceptors_heavy.names[acceptor_cat],
		#---include the hydrogen identity here in the tabulation (note this might make things larger?)
		#---also note that the hydrogen atom name should be enough because we already have the donor resid
		donors_h.names[donor_cat_h],))
	status('stopwatch: %.1fs'%(time.time()-start_time),tag='compute')

	#---reduce tabulation by discarding all SOL-SOL bonds
	#---...note that this is necessary because we have 33M observations and almost all of them are "unique"
	#---...precisely because so many of them involve water
	#---actually, instead of discarding, let us change all waters to a single residue
	tabulation_explicit = tabulation
	tabulation = np.array(tabulation_explicit)
	for p in [0,3]:
		sols = np.where(tabulation[:,p]=='SOL')[0]
		tabulation[(sols,(np.ones((len(sols)))*(p+1)).astype(int))] = '1'

	start_time = time.time()
	status('unique-ifying the tabulated bonds (estimated %ds)'%(len(donor_cat)*1.3*10**-6),tag='compute')
	status('note: with 32GB memory, 33M observations works fine, but 46M hits the swap',tag='warning')
	#---note that unique is getting "axis" in np 1.13 but at some point on or before 1.12 they added some 
	#---...kind of a safety check on the following trick for unique rows, which check returns an error
	#---...message: "TypeError: Cannot change data-type for object array." which is solved by forcing 
	#---...the object to a string type. note that this method requires void and not a blank string, which
	#---...some examples will use. this change must have happened in the <1 week since we wrote 
	#---...the hydrogen bonds code and tested it again on the factory
	#---uniquify the enormous list of all possible hydrogen bonds
	tabulation_reform = tabulation.astype(str)
	tabulation_unique = np.ascontiguousarray(tabulation_reform).view(
		np.dtype((np.void,tabulation_reform.dtype.itemsize*tabulation_reform.shape[1])))
	tabulation_view_unique,idx,counts = np.unique(tabulation_unique,return_index=True,return_counts=True)
	bonds = tabulation[idx]
	status('stopwatch: %.1fs'%(time.time()-start_time),tag='compute')

	start_time = time.time()
	#---preallocate bond counts per frame
	counts_per_frame = np.zeros((len(valid_frames),len(idx)))
	#---hash the bonds: map each unique bond (as a tuple) to its index in the unique list
	bonds_to_idx = dict([(tuple(b),bb) for bb,b in enumerate(bonds)])
	frame_lims = np.concatenate(([0],np.cumsum(obs_by_frames)))
	for fr,i in enumerate(frame_lims[:-1]):
		status('counting observations',i=fr,looplen=len(valid_frames),
			tag='compute',start=start_time)
		obs = tabulation[frame_lims[fr]:frame_lims[fr+1]]
		counts_per_frame[fr][np.array([bonds_to_idx[tuple(o)] for o in obs])] += 1
	status('stopwatch: %.1fs'%(time.time()-start_time),tag='compute')
	status('done heavy lifting',tag='compute')

	#---package the dataset
	result,attrs = {},{}
	#---everything is indexed by idx
	result['bonds'] = bonds
	result['observations'] = counts_per_frame
	result['valid_frames'] = valid_frames
	result['nframes'] = np.array(nframes)
	result['resnames'] = resnames_master
	result['nmols'] = rescounts
	status('compute job lasted %.1fmin'%((time.time()-start_job_time)/60.),tag='time')
	return result,attrs