Code Example #1
def cache_matrix(filename, nodes, shift=0, overwrite=False):
	if(path.exists(filename) and not overwrite):
		return(np.load(filename)["matrix"])
	mat = calc_matrix(nodes, shift)
	for n in nodes:
		register_file_dependency(filename, n.trr_fn)
	np.savez(filename, matrix=mat, node_names=[n.name for n in nodes])
	return(mat)
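
A minimal round-trip of the caching pattern above can be run in isolation. calc_matrix, register_file_dependency, and the node objects are ZIBMolPy internals, so the stubs below are assumptions made purely for illustration:

import numpy as np
from os import path

def calc_matrix(nodes, shift=0):        # stub; the real implementation lives in ZIBMolPy
	return np.eye(len(nodes))

def register_file_dependency(fn, dep):  # stub; dependency tracking is skipped here
	pass

class FakeNode(object):                 # hypothetical stand-in for a pool node
	def __init__(self, name):
		self.name = name
		self.trr_fn = name + ".trr"

nodes = [FakeNode("node%04d" % i) for i in range(3)]
mat1 = cache_matrix("demo_mat.npz", nodes)                  # computes and writes the .npz
mat2 = cache_matrix("demo_mat.npz", nodes)                  # loads from the cache instead
assert np.array_equal(mat1, mat2)
mat3 = cache_matrix("demo_mat.npz", nodes, overwrite=True)  # forces recomputation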
Code Example #2
File: zgf_analyze.py Project: CMD-at-ZIB/ZIBMolPy
def cache_matrix(filename, nodes, shift=0, overwrite=False, fast=False):
	if(path.exists(filename) and not overwrite):
		return(np.load(filename)["matrix"])
	t1 = time.time()
	mat = calc_matrix(nodes, shift, fast)
	t2 = time.time()
	print("Matrix calculation took %f seconds.")%(t2-t1)
	for n in nodes:
		register_file_dependency(filename, n.trr_fn)
	np.savez(filename, matrix=mat, node_names=[n.name for n in nodes])
	return(mat)
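
Example #2 differs from #1 only in timing the calculation and forwarding a fast flag. If such timing were wanted around more calls, it could be factored into a small decorator; a sketch (the name timed is ours, not ZIBMolPy's):

import time
from functools import wraps

def timed(label):
	def wrap(func):
		@wraps(func)
		def inner(*args, **kwargs):
			t1 = time.time()
			result = func(*args, **kwargs)
			print("%s took %f seconds." % (label, time.time() - t1))
			return result
		return inner
	return wrap

# hypothetical usage: calc_matrix = timed("Matrix calculation")(calc_matrix)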
Code Example #3
def main():
    options = options_desc.parse_args(sys.argv)[0]

    zgf_cleanup.main()

    pool = Pool()
    npz_file = np.load(pool.chi_mat_fn)
    chi_matrix = npz_file['matrix']
    node_names = npz_file['node_names']
    n_clusters = npz_file['n_clusters']
    active_nodes = [Node(nn) for nn in node_names]

    # create and open dest_files, initialize counters for statistics
    dest_filenames = [
        pool.analysis_dir + "cluster%d.trr" % (c + 1)
        for c in range(n_clusters)
    ]
    dest_files = [open(fn, "wb") for fn in dest_filenames]
    dest_frame_counters = np.zeros(n_clusters)

    # For each active node...
    for (i, n) in enumerate(active_nodes):
        # ... find the clusters to which it belongs (might be more than one)...
        belonging_clusters = np.argwhere(
            chi_matrix[i] > options.node_threshold)

        # ... and find all typical frames of this node.
        #TODO not an optimal solution... discuss
        # by default, we take every frame with above-average weight
        frame_threshold = options.frame_threshold * 2 * np.mean(n.frameweights)
        typical_frame_nums = np.argwhere(n.frameweights > frame_threshold)

        # Go through the node's trajectory ...
        trr_in = TrrFile(n.trr_fn)
        curr_frame = trr_in.first_frame
        for f_num in typical_frame_nums:
            # ...stop at each typical frame...
            while (f_num != curr_frame.number):
                curr_frame = curr_frame.next()
            assert (curr_frame.number == f_num)
            #... and copy it into the dest_file of each belonging cluster.
            for c in belonging_clusters:
                dest_files[c].write(curr_frame.raw_data)
                dest_frame_counters[c] += 1
        trr_in.close()  # close source file

    # close dest_files
    for f in dest_files:
        f.close()
    del (dest_files)

    # desolvate cluster-trajectories 'in-place'
    if (not options.write_sol):
        for dest_fn in dest_filenames:
            tmp_fn = mktemp(suffix='.trr', dir=pool.analysis_dir)
            os.rename(dest_fn, tmp_fn)  # works as both files are in same dir
            cmd = ["trjconv", "-f", tmp_fn, "-o", dest_fn, "-n", pool.ndx_fn]
            p = Popen(cmd, stdin=PIPE)
            p.communicate(input="MOI\n")
            assert (p.wait() == 0)
            os.remove(tmp_fn)

    # register dependencies
    for fn in dest_filenames:
        register_file_dependency(fn, pool.chi_mat_fn)

    # check number of written frames
    sys.stdout.write("Checking lenghts of written trajectories... ")
    for i in range(n_clusters):
        f = TrrFile(dest_filenames[i])
        assert (f.count_frames() == dest_frame_counters[i])
        f.close()
    print("done.")

    #output statistics
    print "\n### Extraction summary ###\nnode threshold: %1.1f, frame threshold: %1.1f" % (
        options.node_threshold, options.frame_threshold)
    print "Cluster trajectories were written to %s:" % pool.analysis_dir
    for (c, f) in enumerate(dest_frame_counters):
        print "cluster%d.trr [%d frames] from node(s):" % (c + 1, f)
        print list(np.argwhere(chi_matrix[:, c] > options.node_threshold).flat)
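The two selections driving the loop above are plain numpy thresholding: np.argwhere over a row of the chi matrix picks the clusters a node belongs to, and the same call over the frame weights picks its 'typical' frames. A toy run with invented numbers (the thresholds 0.5 and 1.0 are arbitrary choices, not project defaults):

import numpy as np

chi_matrix = np.array([[0.9, 0.1],    # node 0 sits almost entirely in cluster 0
                       [0.4, 0.6]])   # node 1 leans towards cluster 1
print(np.argwhere(chi_matrix[0] > 0.5))   # [[0]], node 0 belongs only to cluster 0

frameweights = np.array([0.5, 2.0, 1.0, 8.0])
frame_threshold = 1.0 * 2 * np.mean(frameweights)   # twice the mean weight, as in main()
print(np.argwhere(frameweights > frame_threshold))  # [[3]], only frame 3 counts as typical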
Code Example #4
File: zgf_analyze.py Project: CMD-at-ZIB/ZIBMolPy
def main():
	options = options_desc.parse_args(sys.argv)[0]

	zgf_cleanup.main()
	
	pool = Pool()
	active_nodes = pool.where("isa_partition")
	if(options.ignore_failed):
		active_nodes = pool.where("isa_partition and not state=='mdrun-failed'")

	assert(len(active_nodes) == len(active_nodes.multilock())) # make sure we lock ALL nodes

	if active_nodes.where("'weight_direct' not in obs"):
		active_nodes.unlock()
		sys.exit("Matrix calculation not possible: Not all of the nodes have been reweighted.")
	
	print "\n### Getting S matrix ..."
	s_matrix = cache_matrix(pool.s_mat_fn, active_nodes, overwrite=options.overwrite_mat, fast=options.fast_mat)
	register_file_dependency(pool.s_mat_fn, pool.filename)

	node_weights = np.array([node.obs.weight_direct for node in active_nodes])
	
	print "\n### Symmetrizing S matrix ..."
	(corr_s_matrix, corr_node_weights) = symmetrize(s_matrix, node_weights, correct_weights=True, error=float(options.error))

	# store intermediate results
	register_file_dependency(pool.s_corr_mat_fn, pool.s_mat_fn)

	np.savez(pool.s_corr_mat_fn, matrix=corr_s_matrix, node_names=[n.name for n in active_nodes])
	
	if options.export_matlab:
		savemat(pool.analysis_dir+"node_weights.mat", {"node_weights":node_weights, "node_weights_corrected":corr_node_weights})
		savemat(pool.analysis_dir+"s_mats.mat", {"s_matrix":s_matrix, "s_matrix_corrected":corr_s_matrix})

	for (n, cw) in zip(active_nodes, corr_node_weights):
		n.obs.weight_corrected = cw
		
	print "\n### Node weights after symmetrization of S matrix:"
	for n in active_nodes:
		print "%s: initial weight: %f, corrected weight: %f, weight change: %f" % (n.name, n.obs.weight_direct, n.obs.weight_corrected, abs(n.obs.weight_direct - n.obs.weight_corrected))
		n.save()

	active_nodes.unlock()

	# calculate and sort eigenvalues in descending order
	(eigvalues, eigvectors) = np.linalg.eig(corr_s_matrix)
	argsorted_eigvalues = np.argsort(-eigvalues)
	eigvalues = eigvalues[argsorted_eigvalues]
	eigvectors = eigvectors[:, argsorted_eigvalues]
	
	gaps = np.abs(eigvalues[1:]-eigvalues[:-1])
	gaps = np.append(gaps, 0.0)
	wgaps = gaps*eigvalues

	print "\n### Sorted eigenvalues of symmetrized S matrix:"
	for (idx, ev, gap, wgap) in zip(range(1, len(eigvalues)+1), eigvalues, gaps, wgaps):
		print "EV%04d: %f, gap to next: %f, EV-weighted gap to next: %f" % (idx, ev, gap, wgap)
	n_clusters = np.argmax(wgaps)+1
	print "\n### Maximum gap %f after top %d eigenvalues." % (np.max(gaps), n_clusters)
	print "### Maximum EV-weighted gap %f after top %d eigenvalues." % (np.max(wgaps), np.argmax(wgaps)+1)
	sys.stdout.flush()
	if not options.auto_cluster:
		n_clusters = userinput("Please enter the number of clusters for PCCA+", "int", "x>0")
	print "### Using %d clusters for PCCA+ ..."%n_clusters

	if options.export_matlab:
		savemat(pool.analysis_dir+"evs.mat", {"evs":eigvectors})
	
	# orthogonalize and normalize eigenvectors 
	eigvectors = orthogonalize(eigvalues, eigvectors, corr_node_weights)

	# perform PCCA+
	# First two return-values "c_f" and "indicator" are not needed
	(chi_matrix, rot_matrix) = cluster_by_isa(eigvectors, n_clusters)[2:]

	if(options.optimize_chi):
		print "\n### Optimizing chi matrix ..."
		
		outliers = 5
		mean_weight = np.mean(corr_node_weights)
		threshold = mean_weight/100*outliers
		print "Light-weight node threshold (%d%% of mean corrected node weight): %.4f."%(outliers, threshold)

		# accumulate nodes for optimization
		edges = np.where(np.max(chi_matrix, axis=1) > 0.9999)[0] # edges of simplex
		heavies = np.where( corr_node_weights > threshold)[0] # heavy-weight nodes
		filtered_eigvectors = eigvectors[ np.union1d(edges, heavies) ]

		# perform the actual optimization
		rot_matrix = opt_soft(filtered_eigvectors, rot_matrix, n_clusters)

		chi_matrix = np.dot(eigvectors[:,:n_clusters], rot_matrix)
		
		# deal with light-weight nodes: shift and scale
		for i in np.where(corr_node_weights <= threshold)[0]:
			if(i in edges):
				print "Column %d belongs to (potentially dangerous) light-weight node, but its node is a simplex edge."%(i+1)
				continue
			print "Column %d is shifted and scaled."%(i+1)
			col_min = np.min( chi_matrix[i,:] )
			chi_matrix[i,:] -= col_min
			chi_matrix[i,:] /= 1-(n_clusters*col_min)
			
	qc_matrix = np.dot( np.dot( np.linalg.inv(rot_matrix), np.diag(eigvalues[range(n_clusters)]) ), rot_matrix ) - np.eye(n_clusters)
	cluster_weights = rot_matrix[0]
	
	print "\n### Matrix numerics check"
	print "-- Q_c matrix row sums --"
	print np.sum(qc_matrix, axis=1)
	print "-- cluster weights: first column of rot_matrix --"
	print cluster_weights
	print "-- cluster weights: numpy.dot(node_weights, chi_matrix) --"
	print np.dot(corr_node_weights, chi_matrix)
	print "-- chi matrix column max values --"
	print np.max(chi_matrix, axis=0)
	print "-- chi matrix row sums --"
	print np.sum(chi_matrix, axis=1)

	# store final results
	np.savez(pool.chi_mat_fn, matrix=chi_matrix, n_clusters=n_clusters, node_names=[n.name for n in active_nodes])
	np.savez(pool.qc_mat_fn,  matrix=qc_matrix,  n_clusters=n_clusters, node_names=[n.name for n in active_nodes], weights=cluster_weights)

	if options.export_matlab:		
		savemat(pool.analysis_dir+"chi_mat.mat", {"chi_matrix":chi_matrix})
		savemat(pool.analysis_dir+"qc_mat.mat", {"qc_matrix":qc_matrix, "weights":cluster_weights})

	register_file_dependency(pool.chi_mat_fn, pool.s_corr_mat_fn)
	register_file_dependency(pool.qc_mat_fn, pool.s_corr_mat_fn)

	for fn in (pool.s_mat_fn, pool.s_corr_mat_fn):
		register_file_dependency(pool.chi_mat_fn, fn)
		register_file_dependency(pool.qc_mat_fn, fn)

	# touch analysis directory (triggering update in zgf_browser)
	atime = mtime = time.time()
	os.utime(pool.analysis_dir, (atime, mtime))

	# show summary
	if(options.summary):
		print "\n### Preparing cluster summary ..."
		chi_threshold = 1E-3
		from pprint import pformat
	
		for i in range(n_clusters):
			involved_nodes = [active_nodes[ni] for ni in np.argwhere(chi_matrix[:,i] > chi_threshold)]
			max_chi_node = active_nodes[ np.argmax(chi_matrix[:,i]) ]
			c_max = []

			for c in pool.converter:
				coord_range = pool.coord_range(c)
				scale = c.plot_scale
				edges = scale(np.linspace(np.min(coord_range), np.max(coord_range), num=50))
				hist_cluster = np.zeros(edges.size-1)

				for (n, chi) in zip(active_nodes, chi_matrix[:,i]):
					samples = scale( n.trajectory.getcoord(c) )
					hist_node = np.histogram(samples, bins=edges, weights=n.frameweights, normed=True)[0]
					hist_cluster += n.obs.weight_corrected * hist_node * chi

				c_max.append( edges[np.argmax(hist_cluster)] ) # reuse the bin edges computed above

			msg = "### Cluster %d (weight=%.4f, #involved nodes=%d, representative='%s'):"%(i+1, cluster_weights[i], len(involved_nodes), max_chi_node.name)
			print "\n"+msg
			print "-- internal coordinates --"
			print "%s"%pformat(["%.2f"%cm for cm in c_max])
			print "-- involved nodes --"
			print "%s"%pformat([n.name for n in involved_nodes])			
			print "-"*len(msg)
Code Example #5
def main():
	options = options_desc.parse_args(sys.argv)[0]

	zgf_cleanup.main()
	
	pool = Pool()
	active_nodes = pool.where("isa_partition")
	
	assert(len(active_nodes) == len(active_nodes.multilock())) # make sure we lock ALL nodes

	if active_nodes.where("'weight_direct' not in obs"):
		active_nodes.unlock()
		sys.exit("Matrix calculation not possible: Not all of the nodes have been reweighted.")
	
	print "\n### Getting S matrix ..."
	s_matrix = cache_matrix(pool.s_mat_fn, active_nodes, overwrite=options.overwrite_mat)
	register_file_dependency(pool.s_mat_fn, pool.filename)

	print "\n### Getting K matrix ..."
	k_matrix = cache_matrix(pool.k_mat_fn, active_nodes, shift=options.lag_time, overwrite=options.overwrite_mat)
	register_file_dependency(pool.k_mat_fn, pool.filename)	

	node_weights = np.array([node.obs.weight_direct for node in active_nodes])
	
	print "\n### Symmetrizing S matrix ..."
	(corr_s_matrix, corr_node_weights) = symmetrize(s_matrix, node_weights, correct_weights=True, error=float(options.error))
	print "\n### Symmetrizing K matrix ..."
	(corr_k_matrix, corr_node_weights) = symmetrize(k_matrix, corr_node_weights)

	# store intermediate results
	register_file_dependency(pool.s_corr_mat_fn, pool.s_mat_fn)
	register_file_dependency(pool.k_corr_mat_fn, pool.k_mat_fn)
	np.savez(pool.s_corr_mat_fn, matrix=corr_s_matrix, node_names=[n.name for n in active_nodes])
	np.savez(pool.k_corr_mat_fn, matrix=corr_k_matrix, node_names=[n.name for n in active_nodes])
	
	if options.export_matlab:
		savemat(pool.analysis_dir+"node_weights.mat", {"node_weights":node_weights, "node_weights_corrected":corr_node_weights})
		savemat(pool.analysis_dir+"s_mats.mat", {"s_matrix":s_matrix, "s_matrix_corrected":corr_s_matrix})
		savemat(pool.analysis_dir+"k_mats.mat", {"k_matrix":k_matrix, "k_matrix_corrected":corr_k_matrix})
	
	for (n, cw) in zip(active_nodes, corr_node_weights):
		n.obs.weight_corrected = cw
		
	print "\n### Node weights after symmetrization of S matrix:"
	for n in active_nodes:
		print "%s: initial weight: %f, corrected weight: %f, weight change: %f" % (n.name, n.obs.weight_direct, n.obs.weight_corrected, abs(n.obs.weight_direct - n.obs.weight_corrected))
		n.save()

	active_nodes.unlock()

	# calculate and sort eigenvalues in descending order
	(eigvalues, eigvectors) = np.linalg.eig(corr_s_matrix)
	argsorted_eigvalues = np.argsort(-eigvalues)
	eigvalues = eigvalues[argsorted_eigvalues]
	eigvectors = eigvectors[:, argsorted_eigvalues]
	
	gaps = np.abs(eigvalues[1:]-eigvalues[:-1])
	gaps = np.append(gaps, 0.0)
	wgaps = gaps*eigvalues

	print "\n### Sorted eigenvalues of symmetrized S matrix:"
	for (idx, ev, gap, wgap) in zip(range(1, len(eigvalues)+1), eigvalues, gaps, wgaps):
		print "EV%04d: %f, gap to next: %f, EV-weighted gap to next: %f" % (idx, ev, gap, wgap)
	n_clusters = np.argmax(wgaps)+1
	print "\n### Maximum gap %f after top %d eigenvalues." % (np.max(gaps), n_clusters)
	print "### Maximum EV-weighted gap %f after top %d eigenvalues." % (np.max(wgaps), np.argmax(wgaps)+1)
	sys.stdout.flush()
	if not options.auto_cluster:
		n_clusters = userinput("Please enter the number of clusters for PCCA+", "int", "x>0")
	print "### Using %d clusters for PCCA+ ..."%n_clusters

	print "eigenvectors"
	print eigvectors[:, :n_clusters]

	if options.export_matlab:
		savemat(pool.analysis_dir+"evs.mat", {"evs":eigvectors})
	
	# orthogonalize and normalize eigenvectors 
	eigvectors = orthogonalize(eigvalues, eigvectors, corr_node_weights)

	# perform PCCA+
	# First two return-values "c_f" and "indicator" are not needed
	(chi_matrix, rot_matrix) = cluster_by_isa(eigvectors, n_clusters)[2:]
	
	#TODO at the moment, K-matrix is not used
	#xi = [] # calculate eigenvalues of Q_c, xi
	#for eigvec in np.transpose(eigvectors)[: n_clusters]:
	#	num = np.dot( np.dot( np.transpose(eigvec), corr_k_matrix ), eigvec )
	#	denom = np.dot( np.dot( np.transpose(eigvec), corr_s_matrix ), eigvec )
	#	xi.append(num/denom-1)

	#print np.diag(xi) #TODO what does this tell us? Marcus-check

	qc_matrix = np.dot( np.dot( np.linalg.inv(rot_matrix), np.diag(eigvalues[range(n_clusters)]) ), rot_matrix ) - np.eye(n_clusters)

	cluster_weights = rot_matrix[0]

	print "Q_c matrix:"
	print qc_matrix
	print "Q_c matrix row sums:"
	print np.sum(qc_matrix, axis=1)
	print "cluster weights (calculated twice for checking):"
	print cluster_weights
	print np.dot(corr_node_weights, chi_matrix)
	print "chi matrix column sums:"
	print np.sum(chi_matrix, axis=0)
	print "chi matrix row sums:"
	print np.sum(chi_matrix, axis=1)

	# store final results
	np.savez(pool.chi_mat_fn, matrix=chi_matrix, n_clusters=n_clusters, node_names=[n.name for n in active_nodes])
	np.savez(pool.qc_mat_fn,  matrix=qc_matrix,  n_clusters=n_clusters, node_names=[n.name for n in active_nodes], weights=cluster_weights)

	if options.export_matlab:
		savemat(pool.analysis_dir+"chi_mat.mat", {"chi_matrix":chi_matrix})
		savemat(pool.analysis_dir+"qc_mat.mat", {"qc_matrix":qc_matrix, "weights":cluster_weights})

	register_file_dependency(pool.chi_mat_fn, pool.s_corr_mat_fn)
	register_file_dependency(pool.qc_mat_fn, pool.s_corr_mat_fn)
	for fn in (pool.s_mat_fn, pool.s_corr_mat_fn, pool.k_mat_fn, pool.k_corr_mat_fn):
		register_file_dependency(pool.chi_mat_fn, fn)
		register_file_dependency(pool.qc_mat_fn, fn)
		
	zgf_cleanup.main()
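
Two PCCA+ invariants underlie the numerics checks printed above: each row of the chi matrix is a partition of unity (rows sum to 1), and the cluster weights come out the same whether read off the first row of rot_matrix or computed as numpy.dot(node_weights, chi_matrix). A toy check with invented numbers:

import numpy as np

chi_matrix = np.array([[0.9, 0.1],
                       [0.2, 0.8],
                       [0.5, 0.5]])
node_weights = np.array([0.5, 0.3, 0.2])
print(np.sum(chi_matrix, axis=1))        # [1. 1. 1.], a partition of unity
print(np.dot(node_weights, chi_matrix))  # [0.61 0.39], cluster weights summing to 1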