def calc_principal_vectors(miller_array, log_out):
    """
    Reference: Aimless program
    Triclinic & Monoclinic: from B_cart
    Orthorhombic: along a*, b*, c*
    Trigonal, Hexagonal, Tetragonal: c* and a*b*-plane
    Cubic: no anisotropy

    return type: list of (vector, label, in-plane or not)
    """
    n_residues = p_vm_calculator(miller_array, 1, 0).best_guess
    aniso_scale_and_b = absolute_scaling.ml_aniso_absolute_scaling(miller_array=miller_array,
                                                                   n_residues=n_residues,
                                                                   n_bases=0)

    if aniso_scale_and_b.eigen_values[0] == 0:
        print >>log_out, "Error! Cannot determine B_cart"
        return

    b_cart = aniso_scale_and_b.b_cart
    print >>log_out, """ML estimate of overall B_cart:
 / %5.2f %5.2f %5.2f \\
 | %11.2f %5.2f |
 \\ %17.2f /
""" % (b_cart[0], b_cart[3], b_cart[4],
       b_cart[1], b_cart[5],
       b_cart[2])

    ev = aniso_scale_and_b.eigen_vectors
    orth_mat = numpy.array(miller_array.unit_cell().orthogonalization_matrix()).reshape(3, 3)

    print >>log_out, "Eigenvalues/vectors:"
    for i, eg in enumerate(aniso_scale_and_b.eigen_values):
        v = ev[3*i:3*(i+1)]
        vs = ", ".join(map(lambda x: "% .4f" % x, v))
        vl = axis_label(v, orth_mat)
        print >>log_out, " %8.3f (%s) %s" % (eg, vs, vl)

    cs = miller_array.space_group().crystal_system()

    if cs == "Cubic":
        print >>log_out, "No anisotropy in this symmetry."
        return []
    elif cs == "Orthorhombic":
        return ([1., 0., 0.], "a*", False), ([0., 1., 0.], "b*", False), ([0., 0., 1.], "c*", False)
    elif cs in ("Triclinic", "Monoclinic"):
        return ((ev[:3], axis_label(ev[:3], orth_mat), False),
                (ev[3:6], axis_label(ev[3:6], orth_mat), False),
                (ev[6:], axis_label(ev[6:], orth_mat), False))
    else:  # "Tetragonal", "Hexagonal", "Trigonal" (Cubic was handled above)
        return ([0., 0., 1.], "c*", False), ([0., 0., 1.], "a*b*", True)
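
# Illustrative sketch (not part of the original module): the eigen-analysis used
# above is just the diagonalization of the symmetric B_cart tensor, whose six
# components follow the (b11, b22, b33, b12, b13, b23) order seen in the print
# statement. A minimal numpy equivalent, assuming that component order:
def _b_cart_eigen_demo(b_cart):
    import numpy
    b11, b22, b33, b12, b13, b23 = b_cart
    B = numpy.array([[b11, b12, b13],
                     [b12, b22, b23],
                     [b13, b23, b33]])
    eigen_values, eigen_vectors = numpy.linalg.eigh(B)  # symmetric => real eigenpairs, ascending
    return eigen_values, eigen_vectors.T                # each row is one principal direction

# Example: plate-like anisotropy with the strong direction along z:
# _b_cart_eigen_demo((10., 10., 30., 0., 0., 0.)) -> eigenvalues [10, 10, 30],
# eigenvectors ~ rows [1,0,0], [0,1,0], [0,0,1]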
def do_clustering(self, nproc=1, b_scale=False, use_normalized=False, html_maker=None):
    self.clusters = {}
    prefix = os.path.join(self.wdir, "cctable")
    assert (b_scale, use_normalized).count(True) <= 1

    if len(self.arrays) < 2:
        print "WARNING: less than two data! can't do cc-based clustering"
        self.clusters[1] = [float("nan"), [0]]
        return

    # Absolute scaling using Wilson-B factor
    if b_scale:
        from mmtbx.scaling.matthews import p_vm_calculator
        from mmtbx.scaling.absolute_scaling import ml_iso_absolute_scaling

        ofs_wilson = open("%s_wilson_scales.dat" % prefix, "w")
        n_residues = p_vm_calculator(self.arrays.values()[0], 1, 0).best_guess
        ofs_wilson.write("# guessed n_residues= %d\n" % n_residues)
        ofs_wilson.write("file wilsonB\n")
        for f in self.arrays:
            arr = self.arrays[f]
            iso_scale_and_b = ml_iso_absolute_scaling(arr, n_residues, 0)
            wilson_b = iso_scale_and_b.b_wilson
            ofs_wilson.write("%s %.3f\n" % (f, wilson_b))
            if wilson_b > 0:  # Ignoring data with B<0? is a bad idea.. but how..?
                tmp = flex.exp(-2. * wilson_b * arr.unit_cell().d_star_sq(arr.indices()) / 4.)
                self.arrays[f] = arr.customized_copy(data=arr.data() * tmp,
                                                     sigmas=arr.sigmas() * tmp)
        ofs_wilson.close()

    elif use_normalized:
        from mmtbx.scaling.absolute_scaling import kernel_normalisation
        for f in self.arrays:
            arr = self.arrays[f]
            normaliser = kernel_normalisation(arr, auto_kernel=True)
            self.arrays[f] = arr.customized_copy(data=arr.data() / normaliser.normalizer_for_miller_array,
                                                 sigmas=arr.sigmas() / normaliser.normalizer_for_miller_array)

    # Prep
    args = []
    for i in xrange(len(self.arrays) - 1):
        for j in xrange(i + 1, len(self.arrays)):
            args.append((i, j))

    # Calc all CC
    if self.use_sfdist:
        worker = lambda x: calc_sfdist(self.arrays.values()[x[0]], self.arrays.values()[x[1]])
    else:
        worker = lambda x: calc_cc(self.arrays.values()[x[0]], self.arrays.values()[x[1]])
    results = easy_mp.pool_map(fixed_func=worker, args=args, processes=nproc)

    # Check NaN and decide which data to remove
    idx_bad = {}
    nans = []
    cc_data_for_html = []
    for (i, j), (cc, nref) in zip(args, results):
        cc_data_for_html.append((i, j, cc, nref))
        if cc == cc: continue  # not NaN
        idx_bad[i] = idx_bad.get(i, 0) + 1
        idx_bad[j] = idx_bad.get(j, 0) + 1
        nans.append([i, j])

    if html_maker is not None:
        html_maker.add_cc_clustering_details(cc_data_for_html)

    idx_bad = idx_bad.items()
    idx_bad.sort(key=lambda x: x[1])
    remove_idxes = set()

    for idx, badcount in reversed(idx_bad):
        if len(filter(lambda x: idx in x, nans)) == 0: continue
        remove_idxes.add(idx)
        nans = filter(lambda x: idx not in x, nans)
        if len(nans) == 0: break

    use_idxes = filter(lambda x: x not in remove_idxes, xrange(len(self.arrays)))

    # Make table: original index (in file list) -> new index (in matrix)
    count = 0
    org2now = collections.OrderedDict()
    for i in xrange(len(self.arrays)):
        if i in remove_idxes: continue
        org2now[i] = count
        count += 1

    if len(remove_idxes) > 0:
        open("%s_notused.lst" % prefix, "w").write("\n".join(
            map(lambda x: self.arrays.keys()[x], remove_idxes)))

    # Make matrix
    mat = numpy.zeros(shape=(len(use_idxes), len(use_idxes)))
    for (i, j), (cc, nref) in zip(args, results):
        if i in remove_idxes or j in remove_idxes: continue
        mat[org2now[j], org2now[i]] = cc

    open("%s.matrix" % prefix, "w").write(" ".join(map(lambda x: "%.4f" % x, mat.flatten())))

    ofs = open("%s.dat" % prefix, "w")
    ofs.write("   i    j     cc nref\n")
    for (i, j), (cc, nref) in zip(args, results):
        ofs.write("%4d %4d %.4f %4d\n" % (i, j, cc, nref))

    open("%s_ana.R" % prefix, "w").write("""\
treeToList2 <- function(htree) { # stolen from $CCP4/share/blend/R/blend0.R
 groups <- list()
 itree <- dim(htree$merge)[1]
 for (i in 1:itree) {
  il <- htree$merge[i,1]
  ir <- htree$merge[i,2]
  if (il < 0) lab1 <- htree$labels[-il]
  if (ir < 0) lab2 <- htree$labels[-ir]
  if (il > 0) lab1 <- groups[[il]]
  if (ir > 0) lab2 <- groups[[ir]]
  lab <- c(lab1,lab2)
  lab <- as.integer(lab)
  groups <- c(groups,list(lab))
 }
 return(groups)
}

cc<-scan("%(prefix)s.matrix")
md<-matrix(1-cc, ncol=%(ncol)d, byrow=TRUE)
hc <- hclust(as.dist(md), method="ward")
pdf("tree.pdf")
plot(hc)
dev.off()
png("tree.png", height=1000, width=1000)
plot(hc)
dev.off()

hc$labels <- c(%(hclabels)s)
groups <- treeToList2(hc)
cat("ClNumber Nds Clheight IDs\\n", file="./CLUSTERS.txt")
for (i in 1:length(groups)) {
 sorted_groups <- sort(groups[[i]])
 linea <- sprintf("%%04d %%4d %%7.3f %%s\\n",
                  i, length(groups[[i]]), hc$height[i], paste(sorted_groups, collapse=" "))
 cat(linea, file="./CLUSTERS.txt", append=TRUE)
}

# reference: http://www.coppelia.io/2014/07/converting-an-r-hclust-object-into-a-d3-js-dendrogram/
library(rjson)
HCtoJSON<-function(hc){
  labels<-hc$labels
  merge<-data.frame(hc$merge)
  for (i in (1:nrow(merge))) {
    if (merge[i,1]<0 & merge[i,2]<0) {eval(parse(text=paste0("node", i, "<-list(name=\\"", i, "\\", children=list(list(name=labels[-merge[i,1]]),list(name=labels[-merge[i,2]])))")))}
    else if (merge[i,1]>0 & merge[i,2]<0) {eval(parse(text=paste0("node", i, "<-list(name=\\"", i, "\\", children=list(node", merge[i,1], ", list(name=labels[-merge[i,2]])))")))}
    else if (merge[i,1]<0 & merge[i,2]>0) {eval(parse(text=paste0("node", i, "<-list(name=\\"", i, "\\", children=list(list(name=labels[-merge[i,1]]), node", merge[i,2],"))")))}
    else if (merge[i,1]>0 & merge[i,2]>0) {eval(parse(text=paste0("node", i, "<-list(name=\\"", i, "\\", children=list(node",merge[i,1] , ", node" , merge[i,2]," ))")))}
  }
  eval(parse(text=paste0("JSON<-toJSON(node",nrow(merge), ")")))
  return(JSON)
}

JSON<-HCtoJSON(hc)
cat(JSON, file="dendro.json")

q(save="yes")
""" % dict(prefix=os.path.basename(prefix),
           ncol=len(self.arrays),
           hclabels=",".join(map(lambda x: "%d" % (x + 1), org2now.keys()))))

    call(cmd="Rscript", arg="%s_ana.R" % os.path.basename(prefix), wdir=self.wdir)

    output = open(os.path.join(self.wdir, "CLUSTERS.txt")).readlines()
    for l in output[1:]:
        sp = l.split()
        clid, clheight, ids = sp[0], sp[2], sp[3:]
        self.clusters[int(clid)] = [float(clheight), map(int, ids)]
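
# Illustrative alternative (not used by this class, which shells out to Rscript):
# the tree the generated R script builds -- hclust with Ward linkage on the 1-cc
# distance -- can also be obtained directly with scipy. The small CC matrix below
# is made up for demonstration.
def _scipy_linkage_from_cc_demo():
    import numpy
    from scipy.cluster import hierarchy
    from scipy.spatial.distance import squareform

    cc_mat = numpy.array([[1.00, 0.95, 0.20],
                          [0.95, 1.00, 0.25],
                          [0.20, 0.25, 1.00]])
    dist = 1. - cc_mat                        # same definition as md <- matrix(1-cc, ...) above
    numpy.fill_diagonal(dist, 0.)
    Z = hierarchy.linkage(squareform(dist, checks=False), method="ward")
    return hierarchy.cut_tree(Z, n_clusters=2).ravel()  # e.g. array([0, 0, 1]): 1,2 together, 3 apart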
def run(hklin, pdbin, wdir, anisotropy_correction=False):
    arrays = iotbx.file_reader.any_file(hklin).file_server.miller_arrays
    i_arrays = filter(lambda x: x.is_xray_intensity_array() and x.anomalous_flag(), arrays)
    f_arrays = filter(lambda x: x.is_xray_amplitude_array() and x.anomalous_flag(), arrays)

    if not i_arrays and not f_arrays:
        print "No anomalous observation data"
        return

    if os.path.exists(wdir):
        print "%s already exists. quitting." % wdir
        return

    os.mkdir(wdir)

    xs = crystal_symmetry_from_any.extract_from(pdbin)

    sh_out = open(os.path.join(wdir, "run_anode.sh"), "w")
    sh_out.write("#!/bin/sh\n\n")
    sh_out.write("shelxc anode <<+ > shelxc.log 2>&1\n")
    sh_out.write("cell %s\n" % format_unit_cell(xs.unit_cell()))
    sh_out.write("spag %s\n" % str(xs.space_group_info()).replace(" ", ""))

    if i_arrays:
        obs_array = i_arrays[0]
        infile = "%s.hkl" % os.path.splitext(os.path.basename(hklin))[0]
        in_opt = "%s" % infile
        print "Using intensity array:", obs_array.info().label_string()
    else:
        obs_array = f_arrays[0]
        infile = "%s_f.hkl" % os.path.splitext(os.path.basename(hklin))[0]
        in_opt = "-f %s" % infile
        print "No intensity arrays. Using amplitude arrays instead:", obs_array.info().label_string()

    sh_out.write("! data from %s : %s\n" % (os.path.abspath(hklin), obs_array.info().label_string()))
    obs_array.crystal_symmetry().show_summary(sh_out, prefix="! ")
    check_symm(obs_array.crystal_symmetry(), xs)

    n_org = obs_array.size()
    obs_array = obs_array.eliminate_sys_absent()
    n_sys_abs = n_org - obs_array.size()
    if n_sys_abs > 0:
        print " %d systematic absences removed." % n_sys_abs

    if anisotropy_correction:
        print "Correcting anisotropy.."
        n_residues = p_vm_calculator(obs_array, 1, 0).best_guess
        abss = ml_aniso_absolute_scaling(obs_array, n_residues=n_residues)
        abss.show()
        tmp = -2. if i_arrays else -1.
        b_cart = map(lambda x: x * tmp, abss.b_cart)
        obs_array = obs_array.apply_debye_waller_factors(b_cart=b_cart)

    sh_out.write("sad %s\n" % in_opt)
    iotbx.shelx.hklf.miller_array_export_as_shelx_hklf(obs_array,
                                                       open(os.path.join(wdir, infile), "w"),
                                                       normalise_if_format_overflow=True)
    sh_out.write("+\n\n")
    sh_out.write('ln -s "%s" anode.pdb\n\n' % os.path.relpath(pdbin, wdir))
    sh_out.write("anode anode\n")
    sh_out.close()

    call(cmd="sh", arg="./run_anode.sh", wdir=wdir)

    pha_file = os.path.join(wdir, "anode.pha")
    if os.path.isfile(pha_file):
        pha2mtz(pha_file, xs, os.path.join(wdir, "anode.pha.mtz"))

    print "Done. See %s/" % wdir

    fa_file = os.path.join(wdir, "anode_fa.hkl")
    if os.path.isfile(fa_file):
        r = iotbx.shelx.hklf.reader(open(fa_file))
        fa_array = r.as_miller_arrays(crystal_symmetry=xs)[0]
        print "\nData stats:"
        print " # Cmpl.o = Anomalous completeness in original data"
        print " # Cmpl.c = Anomalous completeness in shelxc result (rejections)"
        print " # SigAno = <d''/sigma> in shelxc result"
        print " d_max d_min Cmpl.o Cmpl.c SigAno"
        binner = obs_array.setup_binner(n_bins=12)
        for i_bin in binner.range_used():
            d_max_bin, d_min_bin = binner.bin_d_range(i_bin)
            obs_sel = obs_array.resolution_filter(d_max_bin, d_min_bin)
            obs_sel_ano = obs_sel.anomalous_differences()
            fa_sel = fa_array.resolution_filter(d_max_bin, d_min_bin)
            cmplset = obs_sel_ano.complete_set(d_max=d_max_bin, d_min=d_min_bin).select_acentric()
            n_acentric = cmplset.size()
            sigano = flex.mean(fa_sel.data() / fa_sel.sigmas()) if fa_sel.size() else float("nan")
            print " %5.2f %5.2f %6.2f %6.2f %6.2f" % (d_max_bin, d_min_bin,
                                                      100. * obs_sel_ano.size() / n_acentric,
                                                      100. * fa_sel.size() / n_acentric,
                                                      sigano)

    lsa_file = os.path.join(wdir, "anode.lsa")
    if os.path.isfile(lsa_file):
        print ""
        flag = False
        for l in open(lsa_file):
            if "Strongest unique anomalous peaks" in l:
                flag = True
            elif "Reflections written to" in l:
                flag = False
            if flag:
                print l.rstrip()

    res_file = os.path.join(wdir, "anode_fa.res")
    if os.path.isfile(res_file):
        x = iotbx.shelx.cctbx_xray_structure_from(file=open(res_file))
        open(os.path.join(wdir, "anode_fa.pdb"), "w").write(x.as_pdb_file())
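
# Hedged usage sketch (not part of the original module); the file names and the
# __main__ guard are illustrative placeholders only.
if __name__ == "__main__":
    import sys
    hklin, pdbin, wdir = sys.argv[1:4]        # e.g. scaled.mtz refined.pdb anode_out
    run(hklin=hklin, pdbin=pdbin, wdir=wdir, anisotropy_correction=True)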
def do_clustering(self, nproc=1, b_scale=False, use_normalized=False, cluster_method="ward",
                  distance_eqn="sqrt(1-cc)", min_common_refs=3, html_maker=None):
    """
    Using correlation as distance metric (for hierarchical clustering)
    https://stats.stackexchange.com/questions/165194/using-correlation-as-distance-metric-for-hierarchical-clustering
    Correlation "Distances" and Hierarchical Clustering
    http://research.stowers.org/mcm/efg/R/Visualization/cor-cluster/index.htm
    """
    self.clusters = {}
    prefix = os.path.join(self.wdir, "cctable")
    assert (b_scale, use_normalized).count(True) <= 1

    distance_eqns = {"sqrt(1-cc)": lambda x: numpy.sqrt(1. - x),
                     "1-cc": lambda x: 1. - x,
                     "sqrt(1-cc^2)": lambda x: numpy.sqrt(1. - x**2),
                     }
    cc_to_distance = distance_eqns[distance_eqn]  # fails if an unknown option is given
    assert cluster_method in ("single", "complete", "average", "weighted",
                              "centroid", "median", "ward")  # available methods in scipy

    if len(self.arrays) < 2:
        print "WARNING: less than two data! can't do cc-based clustering"
        self.clusters[1] = [float("nan"), [0]]
        return

    # Absolute scaling using Wilson-B factor
    if b_scale:
        from mmtbx.scaling.matthews import p_vm_calculator
        from mmtbx.scaling.absolute_scaling import ml_iso_absolute_scaling

        ofs_wilson = open("%s_wilson_scales.dat" % prefix, "w")
        n_residues = p_vm_calculator(self.arrays.values()[0], 1, 0).best_guess
        ofs_wilson.write("# guessed n_residues= %d\n" % n_residues)
        ofs_wilson.write("file wilsonB\n")
        for f in self.arrays:
            arr = self.arrays[f]
            iso_scale_and_b = ml_iso_absolute_scaling(arr, n_residues, 0)
            wilson_b = iso_scale_and_b.b_wilson
            ofs_wilson.write("%s %.3f\n" % (f, wilson_b))
            if wilson_b > 0:  # Ignoring data with B<0? is a bad idea.. but how..?
                tmp = flex.exp(-2. * wilson_b * arr.unit_cell().d_star_sq(arr.indices()) / 4.)
                self.arrays[f] = arr.customized_copy(data=arr.data() * tmp,
                                                     sigmas=arr.sigmas() * tmp)
        ofs_wilson.close()

    elif use_normalized:
        from mmtbx.scaling.absolute_scaling import kernel_normalisation
        failed = {}
        for f in self.arrays:
            arr = self.arrays[f]
            try:
                normaliser = kernel_normalisation(arr, auto_kernel=True)
                self.arrays[f] = arr.customized_copy(data=arr.data() / normaliser.normalizer_for_miller_array,
                                                     sigmas=arr.sigmas() / normaliser.normalizer_for_miller_array)
            except Exception, e:
                failed.setdefault(e.message, []).append(f)

        if failed:
            msg = ""
            for r in failed:
                msg += " %s\n%s\n" % (r, "\n".join(map(lambda x: "  %s" % x, failed[r])))
            raise Sorry("intensity normalization failed for the following reason(s):\n%s" % msg)
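
# Standalone numeric illustration (not part of the method above): how the three
# supported distance equations behave. For mean-centred vectors scaled to unit
# length the squared Euclidean distance between two datasets is 2*(1-cc), which
# is why sqrt(1-cc) works well as a metric; "1-cc" matches the older R-based code
# path; "sqrt(1-cc^2)" treats anti-correlated data as being close.
def _distance_eqn_demo():
    import numpy
    eqns = (("sqrt(1-cc)",   lambda x: numpy.sqrt(1. - x)),
            ("1-cc",         lambda x: 1. - x),
            ("sqrt(1-cc^2)", lambda x: numpy.sqrt(1. - x**2)))
    for cc in (0.99, 0.9, 0.5, 0.0, -0.5):
        print "cc=%5.2f  " % cc + "  ".join("%s=%.3f" % (name, eqn(cc)) for name, eqn in eqns)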
def do_clustering(self, nproc=1, b_scale=False, use_normalized=False, html_maker=None):
    self.clusters = {}
    prefix = os.path.join(self.wdir, "cctable")
    assert (b_scale, use_normalized).count(True) <= 1

    if len(self.arrays) < 2:
        print "WARNING: less than two data! can't do cc-based clustering"
        self.clusters[1] = [float("nan"), [0]]
        return

    # Absolute scaling using Wilson-B factor
    if b_scale:
        from mmtbx.scaling.matthews import p_vm_calculator
        from mmtbx.scaling.absolute_scaling import ml_iso_absolute_scaling

        ofs_wilson = open("%s_wilson_scales.dat" % prefix, "w")
        n_residues = p_vm_calculator(self.arrays.values()[0], 1, 0).best_guess
        ofs_wilson.write("# guessed n_residues= %d\n" % n_residues)
        ofs_wilson.write("file wilsonB\n")
        for f in self.arrays:
            arr = self.arrays[f]
            iso_scale_and_b = ml_iso_absolute_scaling(arr, n_residues, 0)
            wilson_b = iso_scale_and_b.b_wilson
            ofs_wilson.write("%s %.3f\n" % (f, wilson_b))
            if wilson_b > 0:  # Ignoring data with B<0? is a bad idea.. but how..?
                tmp = flex.exp(-2. * wilson_b * arr.unit_cell().d_star_sq(arr.indices()) / 4.)
                self.arrays[f] = arr.customized_copy(data=arr.data() * tmp,
                                                     sigmas=arr.sigmas() * tmp)
        ofs_wilson.close()

    elif use_normalized:
        from mmtbx.scaling.absolute_scaling import kernel_normalisation
        for f in self.arrays:
            arr = self.arrays[f]
            normaliser = kernel_normalisation(arr, auto_kernel=True)
            self.arrays[f] = arr.customized_copy(data=arr.data() / normaliser.normalizer_for_miller_array,
                                                 sigmas=arr.sigmas() / normaliser.normalizer_for_miller_array)

    # Prep
    args = []
    for i in xrange(len(self.arrays) - 1):
        for j in xrange(i + 1, len(self.arrays)):
            args.append((i, j))

    # Calc all CC
    worker = lambda x: calc_cc(self.arrays.values()[x[0]], self.arrays.values()[x[1]])
    results = easy_mp.pool_map(fixed_func=worker, args=args, processes=nproc)

    # Check NaN and decide which data to remove
    idx_bad = {}
    nans = []
    cc_data_for_html = []
    for (i, j), (cc, nref) in zip(args, results):
        cc_data_for_html.append((i, j, cc, nref))
        if cc == cc: continue  # not NaN
        idx_bad[i] = idx_bad.get(i, 0) + 1
        idx_bad[j] = idx_bad.get(j, 0) + 1
        nans.append([i, j])

    if html_maker is not None:
        html_maker.add_cc_clustering_details(cc_data_for_html)

    idx_bad = idx_bad.items()
    idx_bad.sort(key=lambda x: x[1])
    remove_idxes = set()

    for idx, badcount in reversed(idx_bad):
        if len(filter(lambda x: idx in x, nans)) == 0: continue
        remove_idxes.add(idx)
        nans = filter(lambda x: idx not in x, nans)
        if len(nans) == 0: break

    use_idxes = filter(lambda x: x not in remove_idxes, xrange(len(self.arrays)))

    # Make table: original index (in file list) -> new index (in matrix)
    count = 0
    org2now = collections.OrderedDict()
    for i in xrange(len(self.arrays)):
        if i in remove_idxes: continue
        org2now[i] = count
        count += 1

    if len(remove_idxes) > 0:
        open("%s_notused.lst" % prefix, "w").write("\n".join(
            map(lambda x: self.arrays.keys()[x], remove_idxes)))

    # Make matrix
    mat = numpy.zeros(shape=(len(use_idxes), len(use_idxes)))
    for (i, j), (cc, nref) in zip(args, results):
        if i in remove_idxes or j in remove_idxes: continue
        mat[org2now[j], org2now[i]] = cc

    open("%s.matrix" % prefix, "w").write(" ".join(map(lambda x: "%.4f" % x, mat.flatten())))

    ofs = open("%s.dat" % prefix, "w")
    ofs.write("   i    j     cc nref\n")
    for (i, j), (cc, nref) in zip(args, results):
        ofs.write("%4d %4d %.4f %4d\n" % (i, j, cc, nref))

    open("%s_ana.R" % prefix, "w").write("""\
treeToList2 <- function(htree) { # stolen from $CCP4/share/blend/R/blend0.R
 groups <- list()
 itree <- dim(htree$merge)[1]
 for (i in 1:itree) {
  il <- htree$merge[i,1]
  ir <- htree$merge[i,2]
  if (il < 0) lab1 <- htree$labels[-il]
  if (ir < 0) lab2 <- htree$labels[-ir]
  if (il > 0) lab1 <- groups[[il]]
  if (ir > 0) lab2 <- groups[[ir]]
  lab <- c(lab1,lab2)
  lab <- as.integer(lab)
  groups <- c(groups,list(lab))
 }
 return(groups)
}

cc<-scan("%(prefix)s.matrix")
md<-matrix(1-cc, ncol=%(ncol)d, byrow=TRUE)
hc <- hclust(as.dist(md), method="ward")
pdf("tree.pdf")
plot(hc)
dev.off()
png("tree.png", height=1000, width=1000)
plot(hc)
dev.off()

hc$labels <- c(%(hclabels)s)
groups <- treeToList2(hc)
cat("ClNumber Nds Clheight IDs\\n", file="./CLUSTERS.txt")
for (i in 1:length(groups)) {
 sorted_groups <- sort(groups[[i]])
 linea <- sprintf("%%04d %%4d %%7.3f %%s\\n",
                  i, length(groups[[i]]), hc$height[i], paste(sorted_groups, collapse=" "))
 cat(linea, file="./CLUSTERS.txt", append=TRUE)
}

# reference: http://www.coppelia.io/2014/07/converting-an-r-hclust-object-into-a-d3-js-dendrogram/
library(rjson)
HCtoJSON<-function(hc){
  labels<-hc$labels
  merge<-data.frame(hc$merge)
  for (i in (1:nrow(merge))) {
    if (merge[i,1]<0 & merge[i,2]<0) {eval(parse(text=paste0("node", i, "<-list(name=\\"", i, "\\", children=list(list(name=labels[-merge[i,1]]),list(name=labels[-merge[i,2]])))")))}
    else if (merge[i,1]>0 & merge[i,2]<0) {eval(parse(text=paste0("node", i, "<-list(name=\\"", i, "\\", children=list(node", merge[i,1], ", list(name=labels[-merge[i,2]])))")))}
    else if (merge[i,1]<0 & merge[i,2]>0) {eval(parse(text=paste0("node", i, "<-list(name=\\"", i, "\\", children=list(list(name=labels[-merge[i,1]]), node", merge[i,2],"))")))}
    else if (merge[i,1]>0 & merge[i,2]>0) {eval(parse(text=paste0("node", i, "<-list(name=\\"", i, "\\", children=list(node",merge[i,1] , ", node" , merge[i,2]," ))")))}
  }
  eval(parse(text=paste0("JSON<-toJSON(node",nrow(merge), ")")))
  return(JSON)
}

JSON<-HCtoJSON(hc)
cat(JSON, file="dendro.json")

q(save="yes")
""" % dict(prefix=os.path.basename(prefix),
           ncol=len(self.arrays),
           hclabels=",".join(map(lambda x: "%d" % (x + 1), org2now.keys()))))

    call(cmd="Rscript", arg="%s_ana.R" % os.path.basename(prefix), wdir=self.wdir)

    output = open(os.path.join(self.wdir, "CLUSTERS.txt")).readlines()
    for l in output[1:]:
        sp = l.split()
        clid, clheight, ids = sp[0], sp[2], sp[3:]
        self.clusters[int(clid)] = [float(clheight), map(int, ids)]
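
# Minimal sketch of consuming the result (assumed attribute layout; not part of the
# original class): after do_clustering(), self.clusters maps cluster ID to
# [height, [1-based dataset IDs]], where the IDs index the original file list in
# self.arrays (the same numbering written as hc$labels above).
def _clusters_below_height_demo(cc_clustering, height_cutoff):
    files = cc_clustering.arrays.keys()       # OrderedDict assumed: file names in input order
    picked = {}
    for clid, (height, ids) in cc_clustering.clusters.items():
        if height != height:                  # NaN height (degenerate <2 datasets case)
            continue
        if height <= height_cutoff:
            picked[clid] = [files[i - 1] for i in ids]
    return picked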