def generate_peak_list(self, pdb_code, peak_pdb_hier, model_id, set_chain=False, renumber=False):
    """
    Turn the atoms of a peak-search PDB hierarchy into a list of peak dictionaries.

    pdb_code      -- passed to Util.gen_db_id to build each peak's "db_id"
    peak_pdb_hier -- hierarchy that is walked model -> chain -> residue group
                     -> atom group -> atom; one dictionary is made per atom
    model_id      -- stored under "model" and passed to Util.gen_unids
    set_chain     -- if truthy, used as the output chain id of every peak;
                     if falsy (default), the original chain ids are preserved
    renumber      -- if truthy, output residue ids are the running 1-based
                     peak count (as a string) instead of the original resseq

    Returns the list of dictionaries; each one starts from self.gen_pdict()
    and has the keys assigned below filled in.
    """
    pput = Util()
    peak_list = []
    for model in peak_pdb_hier.models():
        for chain in model.chains():
            ori_chain = chain.id.strip()
            out_chain = set_chain if set_chain else ori_chain
            for resgroup in chain.residue_groups():
                #resseq belongs to the residue group, so read it once per group
                ori_resid = resgroup.resseq.strip()
                for atomgroup in resgroup.atom_groups():
                    for atom in atomgroup.atoms():
                        awl = atom.fetch_labels()
                        resname = awl.resname.strip()
                        name = awl.name.strip()
                        if renumber:
                            #number peaks consecutively in the order they are found
                            out_resid = str(len(peak_list) + 1)
                        else:
                            out_resid = ori_resid
                        #resat always uses the original chain/resid, not the output ones
                        resat = resname + "_" + ori_chain + ori_resid + "_" + name
                        db_id = pput.gen_db_id(pdb_code, out_chain, out_resid)
                        unat, unal, unrg = pput.gen_unids(awl, model=model_id)
                        pdict = self.gen_pdict()
                        pdict["db_id"] = db_id
                        pdict["model"] = model_id
                        pdict["resid"] = out_resid
                        pdict["chainid"] = out_chain
                        pdict["coord"] = atom.xyz
                        pdict["unat"] = unat
                        pdict["unal"] = unal
                        pdict["unrg"] = unrg
                        pdict["ori_chain"] = ori_chain
                        pdict["ori_resid"] = ori_resid
                        pdict["resat"] = resat
                        peak_list.append(pdict)
    return peak_list
def __init__(self, pdb_code, unid, symmetry, orig_pdb_hier, strip_pdb_hier,
             peak_pdb_hier, struct_data, chainid, resid, coord, bound):
    """
    Set up one peak: attach structure data, copy the standard-setting
    models and build the local maps around the peak.

    Only unid, strip_pdb_hier, struct_data, chainid, resid, coord and bound
    are read from the arguments. pdb_code, symmetry, orig_pdb_hier and
    peak_pdb_hier are accepted but the values stored on the instance come
    from struct_data (symmetry comes from the copied so4 pdb).
    """
    #utility helpers; write_atom is re-exported on the instance
    util = Util()
    self.pput = util
    self.ppctx = CctbxHelpers()
    self.write_atom = util.write_atom

    #references to shared structure data
    sd = struct_data
    self.pdb_code = sd.pdb_code
    self.orig_symmetry = sd.orig_symmetry
    self.orig_pdb_hier = sd.orig_pdb_hier
    self.orig_xrs = sd.orig_xrs
    self.strip_pdb_hier = strip_pdb_hier
    self.peak_pdb_hier = sd.peak_pdb_hier
    self.struct_data = sd

    #peak specific values
    self.chainid = chainid  #single letter string
    self.resid = int(resid)
    self.coord = coord  #tuple of floats
    self.bound = bound
    self.grid_last = int(bound * 4 + 1)
    self.unid = unid

    #private copies of the standard-setting so4/water pdb, hierarchy and xrs
    clone = copy.deepcopy
    self.so4_pdb = clone(sd.std_so4_pdb)
    self.symmetry = self.so4_pdb.crystal_symmetry()
    self.so4_hier = clone(sd.std_so4_hier)
    self.so4_xrs = clone(sd.std_so4_xrs)
    self.wat_pdb = clone(sd.std_wat_pdb)
    self.wat_hier = clone(sd.std_wat_hier)
    self.wat_xrs = clone(sd.std_wat_xrs)

    #local maps (and peak volumes) for both map types
    fofc_data = sd.fofc_map_data
    twofofc_data = sd.twofofc_map_data
    self.local_map_fofc, self.peak_volume_fofc = self.make_local_map(fofc_data)
    self.local_map_2fofc, self.peak_volume_2fofc = self.make_local_map(twofofc_data)

    #derived maps: masked ("shaped") and damped-inverse versions
    self.shaped_map_fofc = self.make_shaped_map(self.local_map_fofc)
    self.shaped_map_2fofc = self.make_shaped_map(self.local_map_2fofc)
    self.inv_map_fofc = self.make_round_map(self.local_map_fofc, 2.0, True)
    self.inv_map_2fofc = self.make_round_map(self.local_map_2fofc, 2.0, True)

    #map values of the original maps at the initial peak position
    self.peak_fofc = self.density_at_point(fofc_data, self.orig_xrs, self.coord)
    self.peak_2fofc = self.density_at_point(twofofc_data, self.orig_xrs, self.coord)
def kde_score(self, master_array, all_peak_db):
    """
    Score master_array with the KDE model and fill in its 'kde', 'lab'
    and 'pick' fields in place.

    master_array -- indexable by field name ('kde', 'lab', 'pick') and
                    iterable by row; each row provides 'unal' and 'kde'
    all_peak_db  -- mapping from a row's 'unal' to a dict with a 'pick' entry

    Nothing is returned; master_array is modified.
    """
    reader = PPio()
    util = PPutil()
    scorer = PPKDE(reader.read_master_dict(), verbose=True)

    master_array['kde'] = scorer.kde_score(master_array)
    master_array['lab'] = scorer.kde_label(master_array)

    for row_index, row in enumerate(master_array):
        stored_pick = all_peak_db[row['unal']]['pick']
        if stored_pick == 0:
            #no pick stored for this peak, derive one from the kde value
            chosen = util.pick_from_prob(row['kde'])
        else:
            chosen = stored_pick
        #NOTE(review): written back for every row, not only when the stored
        #pick was 0 -- confirm this matches the intended behaviour
        master_array['pick'][row_index] = chosen
def basic_features(self, features, peak_object):
    """
    Copy peak heights, volumes and position data from peak_object into
    the features mapping (modified in place, nothing returned).

    features must already contain 'wat_fofc_ref_xrs', 'wat_fofc_coord_out'
    and 'solc'; they are read to compute the "out" and scaled values.
    """
    pput = Util()

    #values taken straight from the peak object: (feature key, attribute)
    direct_copies = (
        ('resid', 'resid'),
        ('chainid', 'chainid'),
        ('coord', 'coord'),
        ('vol_fofc', 'peak_volume_fofc'),
        ('vol_2fofc', 'peak_volume_2fofc'),
        ('fofc_sig_in', 'peak_fofc'),
        ('2fofc_sig_in', 'peak_2fofc'),
    )
    for key, attr in direct_copies:
        features[key] = getattr(peak_object, attr)

    #both local maps are sampled at the same output coordinate
    ref_xrs = features['wat_fofc_ref_xrs']
    coord_out = features['wat_fofc_coord_out']
    sample = peak_object.density_at_point
    features['fofc_sig_out'] = sample(peak_object.local_map_fofc, ref_xrs, coord_out)
    features['2fofc_sig_out'] = sample(peak_object.local_map_2fofc, ref_xrs, coord_out)

    #rescale density level to account for lack of variance in solvent region
    solc = features['solc']
    features['2fofc_sigo_scaled'] = pput.scale_density(features['2fofc_sig_out'], solc)
    features['fofc_sigo_scaled'] = pput.scale_density(features['fofc_sig_out'], solc)

    #distance of the output coordinate from (5, 5, 5)
    #presumably the centre of the local map box -- TODO confirm
    offset = np.subtract(coord_out, (5.0, 5.0, 5.0))
    features['dmove'] = np.linalg.norm(offset)
def __init__(self, master_dictionary, train=False, verbose=False):
    """
    Attach helper objects and load (or default) the population and
    feature lists, then set up the two prior vectors.

    master_dictionary -- only read when train == False; its 'kde' entry
                         must provide 'populations' and 'features'
    train             -- when not equal to False, built-in population and
                         feature lists are used and self.kdedict is not set
    verbose           -- stored and forwarded to Filters
    """
    self.verbose = verbose
    self.ppio = DataIO(phenix_python=False)
    self.pput = Util()
    self.ppfilt = Filters(verbose=verbose)
    self.ppstat = StatFunc()

    #equality test (not truthiness) on purpose: e.g. train=None selects
    #the built-in lists below
    if train == False:
        kde_section = master_dictionary['kde']
        self.kdedict = kde_section
        self.populations = kde_section['populations']
        self.features = kde_section['features']
    else:
        self.populations = ['HOH', 'SO4', 'OTH', 'ML1']
        self.features = ['score', 'cscore']

    #uniform prior over the populations
    n_pop = len(self.populations)
    self.flat_prior = np.ones(n_pop, dtype=np.float32) / n_pop

    #expected value of MLE of alpha coefficients of dirichlet distribution
    #trained against probabilities from +1 smoothed counts from entire PDB
    self.dir_prior = np.array(
        [0.8516044, 0.04814615, 0.05036076, 0.04988869])
def get_contacts(self, s_atom, cpairs, cutoff=6.0):
    """
    Describe every contact of one source atom as a dictionary.

    s_atom -- source atom (needs resname, chain(), model_id, element,
              name, altloc, resseq, xyz)
    cpairs -- iterable of (contact atom, distance, symmetry index)
    cutoff -- accepted for interface compatibility, not used here

    Returns (contacts sorted by ascending distance, unal id of the source).
    A blank model id is replaced by 5 for both source and contact atoms.
    """
    pput = Util()

    def read_labels(atom):
        #whitespace-stripped label fields of one atom, in a fixed order
        return (atom.resname.strip(),
                str(atom.chain().id).strip(),
                atom.model_id.strip(),
                atom.element.strip(),
                atom.name.strip(),
                atom.altloc.strip(),
                atom.resseq.strip(),
                atom.xyz)

    (s_resname, s_chain, s_model_id, s_element,
     s_name, s_altloc, s_resid, s_coord) = read_labels(s_atom)
    s_unat, s_unal, s_unrg = pput.gen_unids(s_atom)
    s_resat = "_".join((s_resname, s_chain + s_resid, s_name))
    if s_model_id == "":
        s_model_id = 5

    contacts = []
    for cpair in cpairs:
        c_atom, distance, c_sym = cpair[0], cpair[1], int(cpair[2])
        (resname, chain, model_id, element,
         name, altloc, resid, coord) = read_labels(c_atom)
        unat, unal, unrg = pput.gen_unids(c_atom)
        resat = "_".join((resname, chain + resid, name))
        if model_id == "":
            model_id = 5

        #classify: contact to the atom itself, within the same model,
        #or between models; only a close self contact is "special"
        special = False
        if s_unat == unat:
            ctype = "self"
            if distance < 1.8:
                special = True
        elif int(model_id) == int(s_model_id):
            ctype = "intra"
        else:
            ctype = "inter"

        #everything goes into a dictionary (some converted to int)
        contacts.append({
            "name": name,
            "chain": chain,
            "element": element,
            "distance": distance,
            "coord": coord,
            "resname": resname,
            "altloc": altloc,
            "resid": int(resid),
            "model": int(model_id),
            "special": special,
            "unat": unat,
            "unal": unal,
            "unrg": unrg,
            "s_name": s_name,
            "s_chain": s_chain,
            "s_element": s_element,
            "s_coord": s_coord,
            "s_resname": s_resname,
            "s_altloc": s_altloc,
            "s_resid": int(s_resid),
            "s_model": int(s_model_id),
            "ctype": ctype,
            "s_unat": s_unat,
            "s_unal": s_unal,
            "s_unrg": s_unrg,
            "s_resat": s_resat,
            "resat": resat,
            "sym": c_sym,
        })

    ordered = sorted(contacts, key=lambda entry: entry['distance'])
    return ordered, s_unal
def __init__(self):
    """Create a Util instance and keep it on the object as self.pput."""
    self.pput = Util()
class CctbxHelpers:
    """
    Class of PProbe functions that dig deep into cctbx
    Watch out for later scipy conflicts
    """

    def __init__(self):
        """Create a Util instance and keep it on the object as self.pput."""
        self.pput = Util()

    def do_cprofile(func):
        """
        Decorator for use inside this class body: run func under cProfile
        and print the statistics after every call, even if func raises.
        """
        #function-scope import so the decorator carries its own dependency
        import functools

        @functools.wraps(func)
        def profiled_func(*args, **kwargs):
            profile = cProfile.Profile()
            profile.enable()
            try:
                return func(*args, **kwargs)
            finally:
                profile.disable()
                profile.print_stats()

        return profiled_func

    def write_local_map(self, input_map_data, filename_base, peak_object):
        """
        Write input_map_data to "<filename_base>.ccp4" in space group P1,
        using the unit cell of peak_object.so4_xrs and a grid running from
        (0, 0, 0) to peak_object.grid_last along every axis.
        """
        #writes out map in std setting
        last_grid = peak_object.grid_last
        write_ccp4_map(file_name=filename_base + ".ccp4",
                       unit_cell=peak_object.so4_xrs.unit_cell(),
                       space_group=sgtbx.space_group_info("P1").group(),
                       gridding_first=(0, 0, 0),
                       gridding_last=(last_grid, last_grid, last_grid),
                       map_data=input_map_data,
                       labels=flex.std_string([
                           "local_map",
                       ]))

    def renumber_residue(self, pdb_hier, num):
        """Set resseq of every residue group in pdb_hier to num (in place)."""
        for model in pdb_hier.models():
            for chain in model.chains():
                for rg in chain.residue_groups():
                    rg.resseq = num

    #@do_cprofile
    def contacts_to_all(self, pdb_hier, symmetry, cutoff=6.0):
        """
        Find all contacts within cutoff for every atom of pdb_hier.

        Returns a dictionary mapping the id returned by get_contacts for
        each source atom to that atom's distance-sorted contact list.
        """
        xrs = pdb_hier.extract_xray_structure(crystal_symmetry=symmetry)
        asu_mappings = xrs.asu_mappings(buffer_thickness=cutoff + 1.0)
        pair_generator = crystal.neighbors_fast_pair_generator(
            asu_mappings, distance_cutoff=cutoff)
        natoms = pdb_hier.atoms().size()
        #distance matrix impractical as we may have multiple contacts to the
        #same atom by symmetry, all of which need to be preserved
        all_cont = {i: [] for i in range(natoms)}
        for pair in pair_generator:
            pi = pair.i_seq
            pj = pair.j_seq
            ps = pair.j_sym
            #truncate to 3 decimals to avoid base2 float issues for uniquifying
            pd = int(np.sqrt(pair.dist_sq) * 1000) / 1000.0
            #need both from and to (across asu)
            all_cont[pi].append((pj, pd, ps))
            all_cont[pj].append((pi, pd, ps))
        #refs to awl objects to pass, keyed by atom index
        awl_db = {}
        for awl in pdb_hier.atoms_with_labels():
            awl_db[awl.i_seq] = awl
        ncont = 0
        all_cont_db = {}
        #items() instead of iteritems(): valid on Python 2 and 3
        for iseq, clist in all_cont.items():
            ncont = ncont + len(clist)
            source_awl = awl_db[iseq]
            #for each source awl, generate list of unique tuples
            #(cont awl, distance, sym)
            cpairs = [(awl_db[c_at[0]], c_at[1], c_at[2])
                      for c_at in set(clist)]
            contacts, s_unique_id = self.get_contacts(source_awl,
                                                      cpairs,
                                                      cutoff=cutoff)
            all_cont_db[s_unique_id] = contacts
        #single-argument print call: same output on Python 2 and 3
        print(" Found %d contact pairs" % ncont)
        return all_cont_db

    #@do_cprofile
    def contacts_to_coord(self, coord, pdb_hier, symmetry, cutoff=6.0):
        """
        Used to use extract map_model, but that caused problems
        This function takes cartesian coordinate, pdb_hier, and symmetry
        and returns sorted list of dictionaries, each a particular contact
        (an empty list when no contact is found).
        Very hacked together, jams pdb strings together:
        create a dummy pdb, put water at the peak site in the original
        coordinate system, then use fast pair generator to get all
        contacts to our peak atom
        """
        dummy_atom = self.pput.write_atom(1, "O", "", "HOH", "ZZ", 9999, "",
                                          coord[0], coord[1], coord[2], 1.0,
                                          35.0, "O", "")
        #hack to add an atom to a pdb
        orig_str = pdb_hier.as_pdb_string(write_scale_records=False,
                                          append_end=False,
                                          interleaved_conf=0,
                                          atoms_reset_serial_first_value=1,
                                          atom_hetatm=True,
                                          sigatm=False,
                                          anisou=False,
                                          siguij=False,
                                          output_break_records=False)
        nmodels = len(pdb_hier.models())
        if nmodels == 1:
            comb = orig_str + dummy_atom
        else:
            #the dummy atom gets a MODEL/ENDMDL section of its own
            newmodel = nmodels + 1
            comb = orig_str + "MODEL %s\n" % newmodel
            comb = comb + dummy_atom
            comb = comb + "ENDMDL\n"
        dummy_pdb = iotbx.pdb.input(source_info=None,
                                    lines=flex.split_lines(comb))
        dummy_hier = dummy_pdb.construct_hierarchy()
        atsel = "chain ZZ and resid 9999"
        dummy_select = dummy_hier.atom_selection_cache().selection(
            string=atsel)
        #NOTE(review): atsel_index is an array; the == tests below rely on
        #the selection matching exactly one atom -- confirm
        atsel_index = np.argwhere(dummy_select.as_numpy_array())
        dummy_xrs = dummy_pdb.xray_structure_simple(crystal_symmetry=symmetry)
        #find neighbors with symmetry -- use pair_finding with asu_mappings
        #doing this for each peak may be a bad idea,
        #use all_contacts version instead if possible
        asu_mappings = dummy_xrs.asu_mappings(buffer_thickness=cutoff)
        pair_generator = crystal.neighbors_fast_pair_generator(
            asu_mappings, distance_cutoff=cutoff)
        peak_vector_list = []
        for pair in pair_generator:
            #we want pairs in both directions to provide an exhaustive
            #list of contacts; store index number, distance and sym to
            #make unique -- we don't care which symop, just how far away
            if pair.i_seq == atsel_index:
                #truncate to 3 decimals to avoid float error
                rdist = int(np.sqrt(pair.dist_sq) * 1000) / 1000.0
                peak_vector_list.append((pair.j_seq, rdist, pair.j_sym))
            if pair.j_seq == atsel_index:
                rdist = int(np.sqrt(pair.dist_sq) * 1000) / 1000.0
                peak_vector_list.append((pair.i_seq, rdist, pair.j_sym))
        unique = set(peak_vector_list)
        dummy_atoms = dummy_hier.atoms()
        #selection is boolean flex array, sizeof no atoms, can be indexed
        #directly; get awl for our dummy atom
        s_awl = dummy_atoms.select(dummy_select)[0].fetch_labels()
        #next, get list of contacts (awl,dist)
        selection = dummy_hier.atom_selection_cache().selection("not all")
        if len(unique) == 0:
            return []
        cont_list = []
        for conti in unique:
            #select exactly one atom, read its labels, then unselect
            selection[conti[0]] = True
            sel_awl = dummy_atoms.select(selection)[0].fetch_labels()
            cont_list.append((sel_awl, conti[1], conti[2]))
            selection[conti[0]] = False
        contacts, s_unal = self.get_contacts(s_awl, cont_list, cutoff=cutoff)
        return contacts

    def merge_hier(self, hier_list, symmetry):
        """
        Merge several hierarchies into one, each becoming its own model
        (numbered from 1 in list order). Hydrogens are removed
        (remove_hd) and atom serials are reset on the result.
        symmetry is accepted but not used.
        """
        pdb2str = lambda hier: hier.as_pdb_string(write_scale_records=False,
                                                  append_end=False,
                                                  interleaved_conf=0,
                                                  atom_hetatm=True,
                                                  sigatm=False,
                                                  anisou=False,
                                                  siguij=False,
                                                  output_break_records=False)
        #collect the pieces and join once instead of growing a string
        parts = []
        for index, hier in enumerate(hier_list):
            parts.append("MODEL %s\n" % str(index + 1))
            parts.append(pdb2str(hier))
            parts.append("ENDMDL\n")
        allpdb = "".join(parts)
        dummy_pdb = iotbx.pdb.input(source_info=None,
                                    lines=flex.split_lines(allpdb))
        dummy_hier = dummy_pdb.construct_hierarchy()
        dummy_hier.remove_hd()
        dummy_hier.atoms_reset_serial()
        return dummy_hier

    def get_contacts(self, s_atom, cpairs, cutoff=6.0):
        """
        Describe every contact of one source atom as a dictionary.

        s_atom -- source atom (needs resname, chain(), model_id, element,
                  name, altloc, resseq, xyz)
        cpairs -- iterable of (contact atom, distance, symmetry index)
        cutoff -- accepted for interface compatibility, not used here

        Returns (contacts sorted by ascending distance, unal id of the
        source). A blank model id is replaced by 5 for both atoms.
        """
        pput = Util()
        s_resname = s_atom.resname.strip()
        s_chain = str(s_atom.chain().id).strip()
        s_model_id = s_atom.model_id.strip()
        s_element = s_atom.element.strip()
        s_name = s_atom.name.strip()
        s_altloc = s_atom.altloc.strip()
        s_resid = s_atom.resseq.strip()
        s_coord = s_atom.xyz
        s_unat, s_unal, s_unrg = pput.gen_unids(s_atom)
        s_resat = s_resname + "_" + s_chain + s_resid + "_" + s_name
        #source defaults do not depend on the contact, so settle them once
        if s_model_id == "":
            s_model_id = 5
        contacts = []
        for cpair in cpairs:
            distance = cpair[1]
            #contact_atom
            c_atom = cpair[0]
            c_sym = int(cpair[2])
            resname = c_atom.resname.strip()
            chain = str(c_atom.chain().id).strip()
            model_id = c_atom.model_id.strip()
            element = c_atom.element.strip()
            name = c_atom.name.strip()
            altloc = c_atom.altloc.strip()
            resid = c_atom.resseq.strip()
            coord = c_atom.xyz
            unat, unal, unrg = pput.gen_unids(c_atom)
            resat = resname + "_" + chain + resid + "_" + name
            if model_id == "":
                model_id = 5
            #classify: contact to the atom itself, within the same model,
            #or between models; only a close self contact is "special"
            special = False
            if s_unat == unat:
                ctype = "self"
                if distance < 1.8:
                    special = True
            elif int(model_id) == int(s_model_id):
                ctype = "intra"
            else:
                ctype = "inter"
            #everything goes into a dictionary (some converted to int)
            contact = {
                "name": name,
                "chain": chain,
                "element": element,
                "distance": distance,
                "coord": coord,
                "resname": resname,
                "altloc": altloc,
                "resid": int(resid),
                "model": int(model_id),
                "special": special,
                "unat": unat,
                "unal": unal,
                "unrg": unrg,
                "s_name": s_name,
                "s_chain": s_chain,
                "s_element": s_element,
                "s_coord": s_coord,
                "s_resname": s_resname,
                "s_altloc": s_altloc,
                "s_resid": int(s_resid),
                "s_model": int(s_model_id),
                "ctype": ctype,
                "s_unat": s_unat,
                "s_unal": s_unal,
                "s_unrg": s_unrg,
                "s_resat": s_resat,
                "resat": resat,
                "sym": c_sym
            }
            contacts.append(contact)
        contacts.sort(key=lambda x: x['distance'])
        return contacts, s_unal
class PeakObj:
    """
    A class for a "peak" object with all associated data for a particular peak
    This class gets instantiated for every peak, and can be a bit demanding
    for cpu and memory

    Constructor takes the following:
        StructData object that contains pdb_code, all pdbs/xrs/maps
        Peak Specific info: chainid,coord,bound, option to specify stripped pdb

    Initialization does the following:
        1)makes local maps on the standard 0.5A grid around the peak
        2)makes versions of these maps (shaped, round, etc.)
    """

    def __init__(self, pdb_code, unid, symmetry, orig_pdb_hier, strip_pdb_hier,
                 peak_pdb_hier, struct_data, chainid, resid, coord, bound):
        """
        Only unid, strip_pdb_hier, struct_data, chainid, resid, coord and
        bound are read from the arguments. pdb_code, symmetry,
        orig_pdb_hier and peak_pdb_hier are accepted but the values stored
        on the instance come from struct_data (symmetry comes from the
        copied so4 pdb).
        """
        #instantiate utility classes
        self.pput = Util()
        self.ppctx = CctbxHelpers()
        #bind some util functions here
        self.write_atom = self.pput.write_atom
        #attach references to structure data for use in this class
        self.pdb_code = struct_data.pdb_code
        self.orig_symmetry = struct_data.orig_symmetry
        self.orig_pdb_hier = struct_data.orig_pdb_hier
        self.orig_xrs = struct_data.orig_xrs
        self.strip_pdb_hier = strip_pdb_hier
        self.peak_pdb_hier = struct_data.peak_pdb_hier
        self.struct_data = struct_data
        self.chainid = chainid  #single letter string
        self.resid = int(resid)
        self.coord = coord  #tuple of floats
        self.bound = bound
        self.grid_last = int(self.bound * 4 + 1)
        self.unid = unid
        #copy pdb,hier,xrs in standard settings
        self.so4_pdb = copy.deepcopy(self.struct_data.std_so4_pdb)
        self.symmetry = self.so4_pdb.crystal_symmetry()
        self.so4_hier = copy.deepcopy(self.struct_data.std_so4_hier)
        self.so4_xrs = copy.deepcopy(self.struct_data.std_so4_xrs)
        self.wat_pdb = copy.deepcopy(self.struct_data.std_wat_pdb)
        self.wat_hier = copy.deepcopy(self.struct_data.std_wat_hier)
        self.wat_xrs = copy.deepcopy(self.struct_data.std_wat_xrs)
        #make local maps
        self.local_map_fofc, self.peak_volume_fofc = self.make_local_map(
            self.struct_data.fofc_map_data)
        self.local_map_2fofc, self.peak_volume_2fofc = self.make_local_map(
            self.struct_data.twofofc_map_data)
        self.shaped_map_fofc = self.make_shaped_map(self.local_map_fofc)
        self.shaped_map_2fofc = self.make_shaped_map(self.local_map_2fofc)
        self.inv_map_fofc = self.make_round_map(self.local_map_fofc, 2.0, True)
        self.inv_map_2fofc = self.make_round_map(self.local_map_2fofc, 2.0,
                                                 True)
        #set peak heights of initial peak
        self.peak_fofc = self.density_at_point(self.struct_data.fofc_map_data,
                                               self.orig_xrs, self.coord)
        self.peak_2fofc = self.density_at_point(
            self.struct_data.twofofc_map_data, self.orig_xrs, self.coord)

    def density_at_point(self, map_data, xrs, point):
        """
        Return the tricubic-interpolated value of map_data at the cartesian
        point, fractionalized with the unit cell of xrs.
        """
        site_frac = xrs.unit_cell().fractionalize(site_cart=point)
        return map_data.tricubic_interpolation(site_frac)

    def make_local_map(self, map_data, volume_radius=2.0):
        """
        Sample map_data on the grid returned by
        self.pput.new_grid(self.coord, self.bound).

        Returns (map values reshaped to grid_last points per axis,
        peak volume), where the peak volume is the number of grid points
        within volume_radius of self.coord whose value is at least the
        solvent-corrected sigma level.
        """
        center = self.coord
        volume = 0.0
        solvent_content = self.struct_data.solvent_content
        #sanity check, avoid overcorrection?
        solvent_content = np.clip(solvent_content, 0.2, 0.8)
        #function to correct for solvent content on sigma values
        # Thanks Tom T!  Assuming near zero variance in solvent region,
        # setting total map variance to 1.0 implies the following:
        # 1.0 = (RMSp^2*vP + RMSs^2*vS)**0.5 = (RMSp^2*vP + 0.0^2*vS)**0.5
        # s.t. 1.0 = (RMSp^2*vP)**0.5 = (RMSp^2*(1-vS))**0.5, or
        # RMSp = 1.0/sqrt(1-Vs), i.e. the true RMS in the protein region
        # is scaled up to account for the lack of variance in the solvent region
        volume_sig_scale = 1.0 / np.sqrt(1.0 - solvent_content)
        new_grid_points = self.pput.new_grid(self.coord, self.bound)
        new_map_values = flex.double()
        #the unit cell does not change between grid points, fetch it once
        unit_cell = self.orig_xrs.unit_cell()
        for point in new_grid_points:
            site_frac = unit_cell.fractionalize(site_cart=point)
            value_at_point = map_data.tricubic_interpolation(site_frac)
            new_map_values.append(value_at_point)
            if self.calcbond_lengths(center, point) <= volume_radius:
                #count number of gridpoints above scaled_sigma as a "peak volume"
                if value_at_point >= volume_sig_scale:
                    volume += 1.0
        #map values are a 1d array, now reshape to our new cell on 0.5A grid
        new_map_values.reshape(
            flex.grid(self.grid_last, self.grid_last, self.grid_last))
        return new_map_values.as_double(), volume

    def find_peaks(self, input_map):
        """
        Return the smallest distance between (5.0, 5.0, 5.0) and any grid
        point holding the maximum value of input_map, taking grid indices
        times 0.5 as coordinates.
        """
        map_array = input_map.as_numpy_array()
        max_index = np.array(np.where(map_array == np.amax(map_array)))
        max_coords = 0.5 * max_index
        dist = np.sqrt((max_coords[0] - 5.0)**2 + (max_coords[1] - 5.0)**2 +
                       (max_coords[2] - 5.0)**2)
        return np.amin(dist)

    def make_shaped_map(self, square_map):
        """Return square_map multiplied by struct_data.shaped_mask."""
        #modifies map by a steep falloff function around map center to
        #kill strong density that may cause so4/water to be pulled away
        #from its starting location during refinement
        #typically a problem if next to a heavy metal site
        #convert to flex
        map_mask_double = flex.double(
            self.struct_data.shaped_mask.astype(np.float64))
        map_mask = map_mask_double.as_double()
        map_copy = copy.deepcopy(square_map)
        #scalar multiplication of original map and mask
        shaped_map = map_copy * map_mask
        return shaped_map.as_double()

    def make_round_map(self, square_map, radius=5.0, invert=False):
        """
        Return square_map multiplied by struct_data.round_mask, or, with
        invert=True, a damped inverse of square_map multiplied by that mask.

        radius is accepted but not used; the mask comes from struct_data.
        """
        map_mask_double = flex.double(
            self.struct_data.round_mask.astype(np.float64))
        map_mask = map_mask_double.as_double()
        if not invert:
            map_copy = copy.deepcopy(square_map)
            #scalar multiplication of original map and mask
            round_map = map_copy * map_mask
            return round_map.as_double()
        #values close to zero give very huge and problematic results
        #so here is a function that behaves like an inverse
        #for values above 0.1, but then plateaus off
        inv_map_points = flex.double(square_map.size())
        dampfunc = lambda x: (1.0 / ((1.0 + np.exp(-100 * (x - 0.1)))))
        for index, value in enumerate(square_map):
            if abs(value) > 0.00001:
                dval = dampfunc(abs(value))
                if value > 0.0:
                    inv_map_points[index] = dval / value + 20.0 * (1.0 - dval)
                else:
                    inv_map_points[index] = -(dval / abs(value) + 20.0 *
                                              (1.0 - dval))
            else:
                #fixed plateau for values that are effectively zero
                inv_map_points[index] = 25.0
        inv_map_points.reshape(
            flex.grid(self.grid_last, self.grid_last, self.grid_last))
        return inv_map_points * map_mask

    def calcbond_lengths(self, coord1, coord2):
        """
        Return the euclidean distance between coord1 and coord2.

        np.subtract is used so that plain tuples/lists work as well as
        numpy arrays (tuple - tuple is not defined in Python).
        """
        return np.linalg.norm(np.subtract(coord2, coord1))