def cache_matrix(filename, nodes, shift=0, overwrite=False):
    """Return the matrix for *nodes*, reusing the copy cached in *filename*.

    If *filename* exists and *overwrite* is false, the array stored under the
    key ``"matrix"`` is loaded and returned. Otherwise the matrix is computed
    with ``calc_matrix(nodes, shift)``, the trajectory file (``trr_fn``) of
    every node is registered as a dependency of *filename*, and the result is
    saved together with the node names before being returned.

    Args:
        filename: Path of the ``.npz`` cache file.
        nodes: Iterable of node objects providing ``name`` and ``trr_fn``.
        shift: Passed through unchanged to ``calc_matrix``.
        overwrite: When true, ignore an existing cache file and recompute.

    Returns:
        The cached or freshly calculated matrix.
    """
    if path.exists(filename) and not overwrite:
        # np.load returns a lazily-read NpzFile that keeps the archive open;
        # the context manager releases the handle once the array is read.
        with np.load(filename) as npz_file:
            return npz_file["matrix"]

    mat = calc_matrix(nodes, shift)
    for n in nodes:
        register_file_dependency(filename, n.trr_fn)

    # NOTE(review): np.savez appends ".npz" when the name lacks it, so the
    # existence check above presumably relies on filename ending in ".npz".
    np.savez(filename, matrix=mat, node_names=[n.name for n in nodes])
    return mat
def cache_matrix(filename, nodes, shift=0, overwrite=False, fast=False):
    """Return the matrix for *nodes*, reusing the copy cached in *filename*.

    If *filename* exists and *overwrite* is false, the array stored under the
    key ``"matrix"`` is loaded and returned. Otherwise the matrix is computed
    with ``calc_matrix(nodes, shift, fast)``, the elapsed wall-clock time is
    printed, the trajectory file (``trr_fn``) of every node is registered as
    a dependency of *filename*, and the result is saved together with the
    node names before being returned.

    Args:
        filename: Path of the ``.npz`` cache file.
        nodes: Iterable of node objects providing ``name`` and ``trr_fn``.
        shift: Passed through unchanged to ``calc_matrix``.
        overwrite: When true, ignore an existing cache file and recompute.
        fast: Passed through unchanged to ``calc_matrix``.

    Returns:
        The cached or freshly calculated matrix.
    """
    if path.exists(filename) and not overwrite:
        # np.load returns a lazily-read NpzFile that keeps the archive open;
        # the context manager releases the handle once the array is read.
        with np.load(filename) as npz_file:
            return npz_file["matrix"]

    t1 = time.time()
    mat = calc_matrix(nodes, shift, fast)
    t2 = time.time()
    # The format operator must be applied inside the parentheses: applied to
    # the result of print(...) it only works with the Python 2 print
    # statement and raises TypeError once print is a function.
    print("Matrix calculation took %f seconds." % (t2 - t1))

    for n in nodes:
        register_file_dependency(filename, n.trr_fn)

    # NOTE(review): np.savez appends ".npz" when the name lacks it, so the
    # existence check above presumably relies on filename ending in ".npz".
    np.savez(filename, matrix=mat, node_names=[n.name for n in nodes])
    return mat
def main():
    """Extract one trajectory per cluster from the typical frames of its nodes.

    Reads the chi matrix, node names and cluster count from
    ``pool.chi_mat_fn``. For every node, the clusters whose chi value exceeds
    ``options.node_threshold`` are selected, and every frame whose weight
    exceeds ``options.frame_threshold * 2 * mean(frameweights)`` is copied
    verbatim into ``cluster<N>.trr`` of each selected cluster. Unless
    ``options.write_sol`` is set, each cluster trajectory is then rewritten
    through ``trjconv`` using the index group ``MOI``. Finally the file
    dependencies are registered, the frame counts are verified and a summary
    is printed.

    Raises:
        RuntimeError: If ``trjconv`` exits with a non-zero status.
    """
    options = options_desc.parse_args(sys.argv)[0]
    zgf_cleanup.main()

    pool = Pool()
    # Read everything inside the context manager so the archive is closed.
    with np.load(pool.chi_mat_fn) as npz_file:
        chi_matrix = npz_file['matrix']
        node_names = npz_file['node_names']
        # Stored as a 0-d array; convert once to a plain int.
        n_clusters = int(npz_file['n_clusters'])
    active_nodes = [Node(nn) for nn in node_names]

    # create and open dest_files, initialize counters for statistics
    dest_filenames = [
        pool.analysis_dir + "cluster%d.trr" % (c + 1)
        for c in range(n_clusters)
    ]
    dest_frame_counters = np.zeros(n_clusters)
    dest_files = [open(fn, "wb") for fn in dest_filenames]
    try:
        # For each active node...
        for (node_idx, n) in enumerate(active_nodes):
            # ... find the clusters to which it belongs (might be more than
            # one). flatnonzero yields scalar indices, which are valid list
            # indices, instead of the 1-element arrays argwhere produces.
            belonging_clusters = np.flatnonzero(
                chi_matrix[node_idx] > options.node_threshold)

            # ... and find all typical frames of this node.
            # TODO not an optimal solution... discuss
            # per default, we take every frame with above average weight
            # NOTE(review): assumes n.frameweights is one-dimensional.
            frame_threshold = (
                options.frame_threshold * 2 * np.mean(n.frameweights))
            typical_frame_nums = np.flatnonzero(
                n.frameweights > frame_threshold)

            # Go through the node's trajectory ...
            trr_in = TrrFile(n.trr_fn)
            try:
                curr_frame = trr_in.first_frame
                # A dedicated name keeps the node index from being shadowed.
                for frame_num in typical_frame_nums:
                    # ... stop at each typical frame ...
                    while frame_num != curr_frame.number:
                        curr_frame = curr_frame.next()
                    # ... and copy it into the dest_file of each belonging
                    # cluster.
                    for c in belonging_clusters:
                        dest_files[c].write(curr_frame.raw_data)
                        dest_frame_counters[c] += 1
            finally:
                trr_in.close()  # close source file even on error
    finally:
        # close dest_files even if extraction failed half-way
        for f in dest_files:
            f.close()
    del dest_files

    # desolvate cluster-trajectories 'in-place'
    if not options.write_sol:
        for dest_fn in dest_filenames:
            # NOTE(review): mktemp only reserves a name and is race-prone;
            # kept because the name is consumed by the rename right away.
            tmp_fn = mktemp(suffix='.trr', dir=pool.analysis_dir)
            os.rename(dest_fn, tmp_fn)  # works as both files are in same dir
            cmd = ["trjconv", "-f", tmp_fn, "-o", dest_fn, "-n", pool.ndx_fn]
            p = Popen(cmd, stdin=PIPE)
            p.communicate(input=b"MOI\n")
            returncode = p.wait()
            # Explicit check instead of assert: with -O an assert is removed
            # and the only remaining copy would be deleted after a failure.
            if returncode != 0:
                raise RuntimeError(
                    "trjconv failed with exit code %d; the original "
                    "trajectory was kept as %s" % (returncode, tmp_fn))
            os.remove(tmp_fn)

    # register dependencies
    for fn in dest_filenames:
        register_file_dependency(fn, pool.chi_mat_fn)

    # check number of written frames
    sys.stdout.write("Checking lenghts of written trajectories... ")
    for (fn, expected_frames) in zip(dest_filenames, dest_frame_counters):
        f = TrrFile(fn)
        try:
            written_frames = f.count_frames()
        finally:
            f.close()
        assert written_frames == expected_frames
    print("done.")

    # output statistics
    print("\n### Extraction summary ###\nnode threshold: %1.1f, frame threshold: %1.1f" % (
        options.node_threshold, options.frame_threshold))
    print("Cluster trajectories were written to %s:" % pool.analysis_dir)
    for (c, n_frames) in enumerate(dest_frame_counters):
        print("cluster%d.trr [%d frames] from node(s):" % (c + 1, n_frames))
        print(list(np.flatnonzero(chi_matrix[:, c] > options.node_threshold)))
def main():
    """Cluster the active nodes with PCCA+ and store the chi and Q_c matrices.

    Steps, in order:

    1. Select the ``isa_partition`` nodes (optionally skipping nodes in state
       ``mdrun-failed``), lock them and abort if any lacks ``weight_direct``.
    2. Obtain the S matrix via ``cache_matrix`` and symmetrize it, which also
       yields corrected node weights that are saved on the nodes.
    3. Compute and sort the eigenvalues, report the spectral gaps and choose
       the number of clusters (largest eigenvalue-weighted gap, or user input
       unless ``options.auto_cluster`` is set).
    4. Run ``cluster_by_isa`` and, if ``options.optimize_chi`` is set, refine
       the rotation matrix with ``opt_soft`` and shift/scale the rows of
       light-weight nodes.
    5. Print numerical checks, store the chi and Q_c matrices, register file
       dependencies and optionally print a per-cluster summary.
    """
    options = options_desc.parse_args(sys.argv)[0]
    zgf_cleanup.main()

    pool = Pool()
    active_nodes = pool.where("isa_partition")
    if options.ignore_failed:
        active_nodes = pool.where("isa_partition and not state=='mdrun-failed'")

    # Acquire the locks outside of the assert: asserts are stripped under
    # "python -O", which would silently skip the locking altogether.
    locked_nodes = active_nodes.multilock()
    assert len(active_nodes) == len(locked_nodes)  # make sure we lock ALL nodes

    if active_nodes.where("'weight_direct' not in obs"):
        active_nodes.unlock()
        sys.exit("Matrix calculation not possible: Not all of the nodes have been reweighted.")

    print("\n### Getting S matrix ...")
    s_matrix = cache_matrix(pool.s_mat_fn, active_nodes, overwrite=options.overwrite_mat, fast=options.fast_mat)
    register_file_dependency(pool.s_mat_fn, pool.filename)

    node_weights = np.array([node.obs.weight_direct for node in active_nodes])

    print("\n### Symmetrizing S matrix ...")
    (corr_s_matrix, corr_node_weights) = symmetrize(s_matrix, node_weights, correct_weights=True, error=float(options.error))

    # store intermediate results
    register_file_dependency(pool.s_corr_mat_fn, pool.s_mat_fn)
    np.savez(pool.s_corr_mat_fn, matrix=corr_s_matrix, node_names=[n.name for n in active_nodes])

    if options.export_matlab:
        savemat(pool.analysis_dir+"node_weights.mat", {"node_weights":node_weights, "node_weights_corrected":corr_node_weights})
        savemat(pool.analysis_dir+"s_mats.mat", {"s_matrix":s_matrix, "s_matrix_corrected":corr_s_matrix})

    for (n, cw) in zip(active_nodes, corr_node_weights):
        n.obs.weight_corrected = cw

    print("\n### Node weights after symmetrization of S matrix:")
    for n in active_nodes:
        print("%s: initial weight: %f, corrected weight: %f, weight change: %f" % (n.name, n.obs.weight_direct, n.obs.weight_corrected, abs(n.obs.weight_direct - n.obs.weight_corrected)))
        n.save()

    active_nodes.unlock()

    # calculate and sort eigenvalues in descending order
    (eigvalues, eigvectors) = np.linalg.eig(corr_s_matrix)
    argsorted_eigvalues = np.argsort(-eigvalues)
    eigvalues = eigvalues[argsorted_eigvalues]
    eigvectors = eigvectors[:, argsorted_eigvalues]

    # gap between each eigenvalue and its successor; the last one has none
    gaps = np.abs(eigvalues[1:]-eigvalues[:-1])
    gaps = np.append(gaps, 0.0)
    wgaps = gaps*eigvalues

    print("\n### Sorted eigenvalues of symmetrized S matrix:")
    for (idx, ev, gap, wgap) in zip(range(1, len(eigvalues)+1), eigvalues, gaps, wgaps):
        print("EV%04d: %f, gap to next: %f, EV-weighted gap to next: %f" % (idx, ev, gap, wgap))
    n_clusters = np.argmax(wgaps)+1
    print("\n### Maximum gap %f after top %d eigenvalues." % (np.max(gaps), n_clusters))
    print("### Maximum EV-weighted gap %f after top %d eigenvalues." % (np.max(wgaps), np.argmax(wgaps)+1))
    sys.stdout.flush()
    if not options.auto_cluster:
        n_clusters = userinput("Please enter the number of clusters for PCCA+", "int", "x>0")
    print("### Using %d clusters for PCCA+ ..." % n_clusters)

    if options.export_matlab:
        savemat(pool.analysis_dir+"evs.mat", {"evs":eigvectors})

    # orthogonalize and normalize eigenvectors
    eigvectors = orthogonalize(eigvalues, eigvectors, corr_node_weights)

    # perform PCCA+
    # First two return-values "c_f" and "indicator" are not needed
    (chi_matrix, rot_matrix) = cluster_by_isa(eigvectors, n_clusters)[2:]

    if options.optimize_chi:
        print("\n### Optimizing chi matrix ...")

        outliers = 5  # percent of the mean corrected node weight
        mean_weight = np.mean(corr_node_weights)
        threshold = mean_weight/100*outliers
        print("Light-weight node threshold (%d%% of mean corrected node weight): %.4f." % (outliers, threshold))

        # accumulate nodes for optimization
        edges = np.where(np.max(chi_matrix, axis=1) > 0.9999)[0] # edges of simplex
        heavies = np.where(corr_node_weights > threshold)[0] # heavy-weight nodes
        filtered_eigvectors = eigvectors[np.union1d(edges, heavies)]

        # perform the actual optimization
        rot_matrix = opt_soft(filtered_eigvectors, rot_matrix, n_clusters)
        chi_matrix = np.dot(eigvectors[:, :n_clusters], rot_matrix)

        # deal with light-weight nodes: shift and scale
        for i in np.where(corr_node_weights <= threshold)[0]:
            if i in edges:
                print("Column %d belongs to (potentially dangerous) light-weight node, but its node is a simplex edge." % (i+1))
                continue
            print("Column %d is shifted and scaled." % (i+1))
            col_min = np.min(chi_matrix[i, :])
            chi_matrix[i, :] -= col_min
            chi_matrix[i, :] /= 1-(n_clusters*col_min)

    qc_matrix = np.dot(np.dot(np.linalg.inv(rot_matrix), np.diag(eigvalues[:n_clusters])), rot_matrix) - np.eye(n_clusters)
    cluster_weights = rot_matrix[0]

    print("\n### Matrix numerics check")
    print("-- Q_c matrix row sums --")
    print(np.sum(qc_matrix, axis=1))
    print("-- cluster weights: first column of rot_matrix --")
    print(cluster_weights)
    print("-- cluster weights: numpy.dot(node_weights, chi_matrix) --")
    print(np.dot(corr_node_weights, chi_matrix))
    print("-- chi matrix column max values --")
    print(np.max(chi_matrix, axis=0))
    print("-- chi matrix row sums --")
    print(np.sum(chi_matrix, axis=1))

    # store final results
    np.savez(pool.chi_mat_fn, matrix=chi_matrix, n_clusters=n_clusters, node_names=[n.name for n in active_nodes])
    np.savez(pool.qc_mat_fn, matrix=qc_matrix, n_clusters=n_clusters, node_names=[n.name for n in active_nodes], weights=cluster_weights)

    if options.export_matlab:
        savemat(pool.analysis_dir+"chi_mat.mat", {"chi_matrix":chi_matrix})
        savemat(pool.analysis_dir+"qc_mat.mat", {"qc_matrix":qc_matrix, "weights":cluster_weights})

    register_file_dependency(pool.chi_mat_fn, pool.s_corr_mat_fn)
    register_file_dependency(pool.qc_mat_fn, pool.s_corr_mat_fn)
    for fn in (pool.s_mat_fn, pool.s_corr_mat_fn):
        register_file_dependency(pool.chi_mat_fn, fn)
        register_file_dependency(pool.qc_mat_fn, fn)

    # touch analysis directory (triggering update in zgf_browser)
    atime = mtime = time.time()
    os.utime(pool.analysis_dir, (atime, mtime))

    # show summary
    if options.summary:
        print("\n### Preparing cluster summary ...")
        chi_threshold = 1E-3
        from pprint import pformat

        for i in range(n_clusters):
            # flatnonzero yields scalar indices rather than the 1-element
            # arrays produced by argwhere, so they are valid sequence indices.
            involved_nodes = [active_nodes[ni] for ni in np.flatnonzero(chi_matrix[:, i] > chi_threshold)]
            max_chi_node = active_nodes[np.argmax(chi_matrix[:, i])]
            c_max = []

            for c in pool.converter:
                coord_range = pool.coord_range(c)
                scale = c.plot_scale
                bin_edges = scale(np.linspace(np.min(coord_range), np.max(coord_range), num=50))
                hist_cluster = np.zeros(bin_edges.size-1)

                for (n, chi) in zip(active_nodes, chi_matrix[:, i]):
                    samples = scale(n.trajectory.getcoord(c))
                    # NOTE(review): "normed" is deprecated in newer NumPy in
                    # favour of "density"; kept to preserve the results.
                    hist_node = np.histogram(samples, bins=bin_edges, weights=n.frameweights, normed=True)[0]
                    hist_cluster += n.obs.weight_corrected * hist_node * chi

                # left edge of the most populated bin; reuses bin_edges
                # instead of recomputing the identical scaled linspace
                c_max.append(bin_edges[np.argmax(hist_cluster)])

            msg = "### Cluster %d (weight=%.4f, #involved nodes=%d, representative='%s'):" % (i+1, cluster_weights[i], len(involved_nodes), max_chi_node.name)
            print("\n"+msg)
            print("-- internal coordinates --")
            print("%s" % pformat(["%.2f" % cm for cm in c_max]))
            print("-- involved nodes --")
            print("%s" % pformat([n.name for n in involved_nodes]))
            print("-"*len(msg))
def main():
    """Extract one trajectory per cluster from the typical frames of its nodes.

    Reads the chi matrix, node names and cluster count from
    ``pool.chi_mat_fn``. For every node, the clusters whose chi value exceeds
    ``options.node_threshold`` are selected, and every frame whose weight
    exceeds ``options.frame_threshold * 2 * mean(frameweights)`` is copied
    verbatim into ``cluster<N>.trr`` of each selected cluster. Unless
    ``options.write_sol`` is set, each cluster trajectory is then rewritten
    through ``trjconv`` using the index group ``MOI``. Finally the file
    dependencies are registered, the frame counts are verified and a summary
    is printed.

    Raises:
        RuntimeError: If ``trjconv`` exits with a non-zero status.
    """
    options = options_desc.parse_args(sys.argv)[0]
    zgf_cleanup.main()

    pool = Pool()
    # Read everything inside the context manager so the archive is closed.
    with np.load(pool.chi_mat_fn) as npz_file:
        chi_matrix = npz_file['matrix']
        node_names = npz_file['node_names']
        # Stored as a 0-d array; convert once to a plain int.
        n_clusters = int(npz_file['n_clusters'])
    active_nodes = [Node(nn) for nn in node_names]

    # create and open dest_files, initialize counters for statistics
    dest_filenames = [pool.analysis_dir+"cluster%d.trr"%(c+1) for c in range(n_clusters)]
    dest_frame_counters = np.zeros(n_clusters)
    dest_files = [open(fn, "wb") for fn in dest_filenames]
    try:
        # For each active node...
        for (node_idx, n) in enumerate(active_nodes):
            # ... find the clusters to which it belongs (might be more than
            # one). flatnonzero yields scalar indices, which are valid list
            # indices, instead of the 1-element arrays argwhere produces.
            belonging_clusters = np.flatnonzero(chi_matrix[node_idx] > options.node_threshold)

            # ... and find all typical frames of this node.
            # TODO not an optimal solution... discuss
            # per default, we take every frame with above average weight
            # NOTE(review): assumes n.frameweights is one-dimensional.
            frame_threshold = options.frame_threshold*2*np.mean(n.frameweights)
            typical_frame_nums = np.flatnonzero(n.frameweights > frame_threshold)

            # Go through the node's trajectory ...
            trr_in = TrrFile(n.trr_fn)
            try:
                curr_frame = trr_in.first_frame
                # A dedicated name keeps the node index from being shadowed.
                for frame_num in typical_frame_nums:
                    # ... stop at each typical frame ...
                    while frame_num != curr_frame.number:
                        curr_frame = curr_frame.next()
                    # ... and copy it into the dest_file of each belonging
                    # cluster.
                    for c in belonging_clusters:
                        dest_files[c].write(curr_frame.raw_data)
                        dest_frame_counters[c] += 1
            finally:
                trr_in.close()  # close source file even on error
    finally:
        # close dest_files even if extraction failed half-way
        for f in dest_files:
            f.close()
    del dest_files

    # desolvate cluster-trajectories 'in-place'
    if not options.write_sol:
        for dest_fn in dest_filenames:
            # NOTE(review): mktemp only reserves a name and is race-prone;
            # kept because the name is consumed by the rename right away.
            tmp_fn = mktemp(suffix='.trr', dir=pool.analysis_dir)
            os.rename(dest_fn, tmp_fn)  # works as both files are in same dir
            cmd = ["trjconv", "-f", tmp_fn, "-o", dest_fn, "-n", pool.ndx_fn]
            p = Popen(cmd, stdin=PIPE)
            p.communicate(input=b"MOI\n")
            returncode = p.wait()
            # Explicit check instead of assert: with -O an assert is removed
            # and the only remaining copy would be deleted after a failure.
            if returncode != 0:
                raise RuntimeError(
                    "trjconv failed with exit code %d; the original "
                    "trajectory was kept as %s" % (returncode, tmp_fn))
            os.remove(tmp_fn)

    # register dependencies
    for fn in dest_filenames:
        register_file_dependency(fn, pool.chi_mat_fn)

    # check number of written frames
    sys.stdout.write("Checking lenghts of written trajectories... ")
    for (fn, expected_frames) in zip(dest_filenames, dest_frame_counters):
        f = TrrFile(fn)
        try:
            written_frames = f.count_frames()
        finally:
            f.close()
        assert written_frames == expected_frames
    print("done.")

    # output statistics
    print("\n### Extraction summary ###\nnode threshold: %1.1f, frame threshold: %1.1f"%(options.node_threshold, options.frame_threshold))
    print("Cluster trajectories were written to %s:"%pool.analysis_dir)
    for (c, n_frames) in enumerate(dest_frame_counters):
        print("cluster%d.trr [%d frames] from node(s):"%(c+1, n_frames))
        print(list(np.flatnonzero(chi_matrix[:,c] > options.node_threshold)))
def main():
    """Cluster the active nodes with PCCA+ and store the chi and Q_c matrices.

    Steps, in order:

    1. Select and lock the ``isa_partition`` nodes; abort if any of them
       lacks ``weight_direct``.
    2. Obtain the S matrix and the K matrix (shifted by ``options.lag_time``)
       via ``cache_matrix`` and symmetrize both; the corrected node weights
       are saved on the nodes.
    3. Compute and sort the eigenvalues of the symmetrized S matrix, report
       the spectral gaps and choose the number of clusters (largest
       eigenvalue-weighted gap, or user input unless ``options.auto_cluster``
       is set).
    4. Run ``cluster_by_isa``, print numerical checks, store the chi and Q_c
       matrices and register the file dependencies.
    5. Run ``zgf_cleanup.main()`` once more.
    """
    options = options_desc.parse_args(sys.argv)[0]
    zgf_cleanup.main()

    pool = Pool()
    active_nodes = pool.where("isa_partition")

    # Acquire the locks outside of the assert: asserts are stripped under
    # "python -O", which would silently skip the locking altogether.
    locked_nodes = active_nodes.multilock()
    assert len(active_nodes) == len(locked_nodes)  # make sure we lock ALL nodes

    if active_nodes.where("'weight_direct' not in obs"):
        active_nodes.unlock()
        sys.exit("Matrix calculation not possible: Not all of the nodes have been reweighted.")

    print("\n### Getting S matrix ...")
    s_matrix = cache_matrix(pool.s_mat_fn, active_nodes, overwrite=options.overwrite_mat)
    register_file_dependency(pool.s_mat_fn, pool.filename)

    print("\n### Getting K matrix ...")
    k_matrix = cache_matrix(pool.k_mat_fn, active_nodes, shift=options.lag_time, overwrite=options.overwrite_mat)
    register_file_dependency(pool.k_mat_fn, pool.filename)

    node_weights = np.array([node.obs.weight_direct for node in active_nodes])

    print("\n### Symmetrizing S matrix ...")
    (corr_s_matrix, corr_node_weights) = symmetrize(s_matrix, node_weights, correct_weights=True, error=float(options.error))

    print("\n### Symmetrizing K matrix ...")
    # NOTE(review): this rebinds corr_node_weights to the weights returned by
    # the K-matrix symmetrization; confirm that they are meant to replace the
    # S-matrix-corrected weights used below.
    (corr_k_matrix, corr_node_weights) = symmetrize(k_matrix, corr_node_weights)

    # store intermediate results
    register_file_dependency(pool.s_corr_mat_fn, pool.s_mat_fn)
    register_file_dependency(pool.k_corr_mat_fn, pool.k_mat_fn)
    np.savez(pool.s_corr_mat_fn, matrix=corr_s_matrix, node_names=[n.name for n in active_nodes])
    np.savez(pool.k_corr_mat_fn, matrix=corr_k_matrix, node_names=[n.name for n in active_nodes])

    if options.export_matlab:
        savemat(pool.analysis_dir+"node_weights.mat", {"node_weights":node_weights, "node_weights_corrected":corr_node_weights})
        savemat(pool.analysis_dir+"s_mats.mat", {"s_matrix":s_matrix, "s_matrix_corrected":corr_s_matrix})
        savemat(pool.analysis_dir+"k_mats.mat", {"k_matrix":k_matrix, "k_matrix_corrected":corr_k_matrix})

    for (n, cw) in zip(active_nodes, corr_node_weights):
        n.obs.weight_corrected = cw

    print("\n### Node weights after symmetrization of S matrix:")
    for n in active_nodes:
        print("%s: initial weight: %f, corrected weight: %f, weight change: %f" % (n.name, n.obs.weight_direct, n.obs.weight_corrected, abs(n.obs.weight_direct - n.obs.weight_corrected)))
        n.save()

    active_nodes.unlock()

    # calculate and sort eigenvalues in descending order
    (eigvalues, eigvectors) = np.linalg.eig(corr_s_matrix)
    argsorted_eigvalues = np.argsort(-eigvalues)
    eigvalues = eigvalues[argsorted_eigvalues]
    eigvectors = eigvectors[:, argsorted_eigvalues]

    # gap between each eigenvalue and its successor; the last one has none
    gaps = np.abs(eigvalues[1:]-eigvalues[:-1])
    gaps = np.append(gaps, 0.0)
    wgaps = gaps*eigvalues

    print("\n### Sorted eigenvalues of symmetrized S matrix:")
    for (idx, ev, gap, wgap) in zip(range(1, len(eigvalues)+1), eigvalues, gaps, wgaps):
        print("EV%04d: %f, gap to next: %f, EV-weighted gap to next: %f" % (idx, ev, gap, wgap))
    n_clusters = np.argmax(wgaps)+1
    print("\n### Maximum gap %f after top %d eigenvalues." % (np.max(gaps), n_clusters))
    print("### Maximum EV-weighted gap %f after top %d eigenvalues." % (np.max(wgaps), np.argmax(wgaps)+1))
    sys.stdout.flush()
    if not options.auto_cluster:
        n_clusters = userinput("Please enter the number of clusters for PCCA+", "int", "x>0")
    print("### Using %d clusters for PCCA+ ..." % n_clusters)

    print("eigenvectors")
    print(eigvectors[:, :n_clusters])

    if options.export_matlab:
        savemat(pool.analysis_dir+"evs.mat", {"evs":eigvectors})

    # orthogonalize and normalize eigenvectors
    eigvectors = orthogonalize(eigvalues, eigvectors, corr_node_weights)

    # perform PCCA+
    # First two return-values "c_f" and "indicator" are not needed
    (chi_matrix, rot_matrix) = cluster_by_isa(eigvectors, n_clusters)[2:]

    #TODO at the moment, K-matrix is not used
    #xi = [] # calculate eigenvalues of Q_c, xi
    #for eigvec in np.transpose(eigvectors)[: n_clusters]:
    #    num = np.dot( np.dot( np.transpose(eigvec), corr_k_matrix ), eigvec )
    #    denom = np.dot( np.dot( np.transpose(eigvec), corr_s_matrix ), eigvec )
    #    xi.append(num/denom-1)
    #print np.diag(xi) #TODO what does this tell us? Marcus-check

    qc_matrix = np.dot(np.dot(np.linalg.inv(rot_matrix), np.diag(eigvalues[:n_clusters])), rot_matrix) - np.eye(n_clusters)
    cluster_weights = rot_matrix[0]

    print("Q_c matrix:")
    print(qc_matrix)
    print("Q_c matrix row sums:")
    print(np.sum(qc_matrix, axis=1))
    print("cluster weights (calculated twice for checking):")
    print(cluster_weights)
    print(np.dot(corr_node_weights, chi_matrix))
    print("chi matrix column sums:")
    print(np.sum(chi_matrix, axis=0))
    print("chi matrix row sums:")
    print(np.sum(chi_matrix, axis=1))

    # store final results
    np.savez(pool.chi_mat_fn, matrix=chi_matrix, n_clusters=n_clusters, node_names=[n.name for n in active_nodes])
    np.savez(pool.qc_mat_fn, matrix=qc_matrix, n_clusters=n_clusters, node_names=[n.name for n in active_nodes], weights=cluster_weights)

    if options.export_matlab:
        savemat(pool.analysis_dir+"chi_mat.mat", {"chi_matrix":chi_matrix})
        savemat(pool.analysis_dir+"qc_mat.mat", {"qc_matrix":qc_matrix, "weights":cluster_weights})

    register_file_dependency(pool.chi_mat_fn, pool.s_corr_mat_fn)
    register_file_dependency(pool.qc_mat_fn, pool.s_corr_mat_fn)
    for fn in (pool.s_mat_fn, pool.s_corr_mat_fn, pool.k_mat_fn, pool.k_corr_mat_fn):
        register_file_dependency(pool.chi_mat_fn, fn)
        register_file_dependency(pool.qc_mat_fn, fn)

    zgf_cleanup.main()