def main(metricCol, lig_resname, nTrajs, filter_val, stride, atomId, saving_frequency, trajectory_name, report_name, topology=None): folders = utilities.get_epoch_folders(".") data = [] minMetric = 1e6 confData = [] for epoch in folders: print("Processing epoch %s" % epoch) for iTraj in range(1, nTrajs): report = np.loadtxt("%s/%s_%d" % (epoch, report_name, iTraj)) if len(report.shape) < 2: report = report[np.newaxis, :] traj_file = glob.glob("%s/%s_%d.*" % (epoch, trajectory_name, iTraj))[0] snapshots = utilities.getSnapshots(traj_file, topology=topology) for i, snapshot in enumerate(itertools.islice(snapshots, 0, None, stride)): report_line = i * stride * saving_frequency data.append(get_coords(snapshot, atomId, lig_resname) + [report[report_line, metricCol]]) confData.append((epoch, iTraj, report_line)) data = np.array(data) minInd = np.argmin(data[:, -1]) minMetric = data[minInd, -1] data[:, -1] -= minMetric if filter_val is not None: data_filter = data.copy() data_filter[data_filter > filter_val] = filter_val namesPDB = utilities.write_PDB_clusters(data_filter, title="cluster_metric.pdb", use_beta=True) else: namesPDB = utilities.write_PDB_clusters(data, title="cluster_metric.pdb", use_beta=True) print("Min value for metric", minMetric, namesPDB[minInd]) with open("conformation_data.dat", "w") as fw: fw.write("PDB name Epoch Trajectory Snapshot COM x y z Metric\n") for j, name in enumerate(namesPDB): info = [name.rjust(8)]+[str(x).rjust(10) for x in confData[j]]+[str(np.round(d, 3)).rjust(7) for d in data[j, :-1]] + [str(np.round(data[j, -1], 2)).rjust(10)] fw.write("{:s} {:s} {:s} {:s} {:s} {:s} {:s} {:s}\n".format(*tuple(info)))
def cleanPreviousSimulation(output_path, allTrajs): """ Clean the unneeded data from a previous simulation :param output_path: Path where the data is stored :type output_path: str :param allTrajs: Path where the discretized trajectories for MSM are stored :type allTrajs: str """ equilibration_folders = glob.glob( os.path.join(output_path, "equilibration*")) for folder in equilibration_folders: try: shutil.rmtree(folder) except OSError as exc: if exc.errno != errno.ENOENT: raise # If another process deleted the folder between the glob and the # actual removal an OSError is raised epochs = utilities.get_epoch_folders(output_path) for epoch in epochs: try: shutil.rmtree(os.path.join(output_path, epoch)) except OSError as exc: if exc.errno != errno.ENOENT: raise try: shutil.rmtree(allTrajs) except OSError as exc: # this folder may not exist, in which case we just carry on pass
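# Hedged sketch (not part of the original module): the cleanup above tolerates folders
# that another process removed first. A minimal, self-contained version of that
# ENOENT-tolerant pattern, with a hypothetical folder name, looks like this.
import errno
import shutil

def remove_if_exists(path):
    """Remove a directory tree, ignoring the error raised when it is already gone."""
    try:
        shutil.rmtree(path)
    except OSError as exc:
        if exc.errno != errno.ENOENT:
            raise

remove_if_exists("old_epoch_folder")  # hypothetical path, used only for illustration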
def main(col_energy, folder, out_report_name, format_out, nProcessors, output_folder, new_report, reportName, trajs_to_select): """ Standardize the energy values of the reports :param col_energy: Column corresponding to the energy in the reports :type col_energy: int :param folder: Path to the simulation :type folder: str :param out_report_name: Name of the output file :type out_report_name: str :param format_out: String with the format of the output :type format_out: str :param nProcessors: Number of processors to use :type nProcessors: int :param output_folder: Path where to store the new reports :type output_folder: str :param new_report: Whether to create new reports :type new_report: bool :param reportName: Name of the report files :type reportName: str :param trajs_to_select: Number of the reports to read, if not all are wanted :type trajs_to_select: set """ # Constants if output_folder is not None: out_report_name = os.path.join(output_folder, out_report_name) outputFilename = "_".join([out_report_name, "%d"]) trajName = "*traj*" if reportName is None: reportName = "report_%d" else: reportName += "_%d" if nProcessors is None: nProcessors = utilities.getCpuCount() nProcessors = max(1, nProcessors) print("Standardizing energy with %d processors" % nProcessors) epochs = utilities.get_epoch_folders(folder) files = [] if not epochs: # path does not contain an adaptive simulation, we'll try to retrieve # trajectories from the specified path files = analysis_utils.process_folder( None, folder, trajName, reportName, os.path.join(folder, outputFilename), None, trajs_to_select) for epoch in epochs: print("Epoch", epoch) files.extend( analysis_utils.process_folder( epoch, folder, trajName, reportName, os.path.join(folder, epoch, outputFilename), None, trajs_to_select)) pool = mp.Pool(nProcessors) results = [ pool.apply_async(process_file, args=(info[1], info[4], format_out, new_report, info[3], col_energy)) for info in files ] pool.close() pool.join() for res in results: res.get()
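# Hedged sketch (standalone illustration, not from the source): the scripts above
# collect apply_async handles and call res.get() after close()/join() so that any
# exception raised inside a worker process is re-raised in the parent instead of
# being silently dropped. Minimal self-contained version of that pattern:
import multiprocessing as mp

def square(x):
    return x * x

if __name__ == "__main__":
    pool = mp.Pool(2)
    results = [pool.apply_async(square, args=(i,)) for i in range(4)]
    pool.close()
    pool.join()
    print([res.get() for res in results])  # get() re-raises any worker exception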
def main(resname, folder, top, out_report_name, format_out, nProcessors, output_folder, new_report): """ Calculate the relative SASA values of the ligand :param resname: Ligand resname :type resname: str :param folder: Path to the simulation :type folder: str :param top: Path to the topology :type top: str :param out_report_name: Name of the output file :type out_report_name: str :param format_out: String with the format of the output :type format_out: str :param nProcessors: Number of processors to use :type nProcessors: int :param output_folder: Path where to store the new reports :type output_folder: str :param new_report: Whether to create new reports :type new_report: bool """ # Constants if output_folder is not None: out_report_name = os.path.join(output_folder, out_report_name) outputFilename = "_".join([out_report_name, "%d"]) trajName = "*traj*" reportName = "*report*_%d" if nProcessors is None: nProcessors = utilities.getCpuCount() nProcessors = max(1, nProcessors) print("Calculating SASA with %d processors" % nProcessors) pool = mp.Pool(nProcessors) epochs = utilities.get_epoch_folders(folder) if top is not None: top_obj = getTopologyObject(top) else: top_obj = None files = [] if not epochs: # path does not contain an adaptive simulation, we'll try to retrieve # trajectories from the specified path files = process_folder(None, folder, trajName, reportName, os.path.join(folder, outputFilename), top_obj) for epoch in epochs: print("Epoch", epoch) files.extend( process_folder(epoch, folder, trajName, reportName, os.path.join(folder, epoch, outputFilename), top_obj)) results = [] for info in files: results.append( pool.apply_async(process_file, args=(info[0], info[2], resname, info[1], info[4], format_out, new_report, info[3]))) pool.close() pool.join() for res in results: res.get()
def main(ligand, clusters_file, conf_folder, topology=None): trajFolder = "allTrajs_nonRepeat" cluster_centers = np.loadtxt(clusters_file) if not os.path.exists("discretized"): os.makedirs("discretized") if not os.path.exists(trajFolder): os.makedirs(trajFolder) stride = 1 clusterCountsThreshold = 0 trajBasename = "coord*" if topology is not None: topology_contents = utilities.getTopologyFile(topology) else: topology_contents = None epoch_folders = utilities.get_epoch_folders(conf_folder) numClusters = cluster_centers.shape[0] coordinates = [[] for cl in range(numClusters)] for it in epoch_folders: files = glob.glob(conf_folder + "%s/extractedCoordinates/coord*" % it) for f in files: traj = os.path.splitext(f)[0].split("_")[-1] shutil.copy(f, trajFolder + "/coord_%s_%s.dat" % (it, traj)) clusteringObject = cluster.Cluster(numClusters, trajFolder, trajBasename, alwaysCluster=False, stride=stride) clusteringObject.clusterTrajectories() clusteringObject.eliminateLowPopulatedClusters(clusterCountsThreshold) for i in range(numClusters): if not os.path.exists("cluster_%d" % i): os.makedirs("cluster_%d/allStructures" % i) dtrajs_files = glob.glob("discretized/*.disctraj") for dtraj in dtrajs_files: print(dtraj) traj = np.loadtxt(dtraj) epoch, traj_num = map(int, os.path.splitext(dtraj)[0].split("_", 3)[1:]) trajPositions = np.loadtxt(trajFolder + "/coord_%d_%d.dat" % (epoch, traj_num)) trajFile = glob.glob( os.path.join(conf_folder + "%d/trajectory_%d*" % (epoch, traj_num)))[0] snapshots = utilities.getSnapshots(trajFile, topology=topology) for nSnap, cluster_num in enumerate(traj): coordinates[int(cluster_num)].append(trajPositions[nSnap]) filename = "cluster_%d/allStructures/conf_%d_%d_%d.pdb" % ( cluster_num, epoch, traj_num, nSnap) if isinstance(snapshots[nSnap], basestring): with open(filename, "w") as fw: fw.write(snapshots[nSnap]) else: utilities.write_mdtraj_object_PDB(snapshots[nSnap], filename, topology_contents) for cl in range(numClusters): np.savetxt("cluster_%d/positions.dat" % cl, coordinates[cl])
def main(top_path): sim_folder = os.path.abspath(os.path.join(top_path, os.path.pardir)) epochs = utilities.get_epoch_folders(sim_folder) top = utilities.Topology(top_path) topology_files = glob.glob(os.path.join(top_path, "topology*.pdb")) topology_files.sort(key=utilities.getTrajNum) top.setTopologies(topology_files) for epoch in epochs: top.readMappingFromDisk(os.path.join(sim_folder, epoch), int(epoch)) top.writeTopologyObject()
def main(residues, folder, top, out_report_name, format_out, nProcessors, output_folder, new_report, trajs_to_select): """ Calculate the distances between pairs of atoms :param residues: Pairs of atoms to calculate distances :type residues: list :param folder: Path to the simulation :type folder: str :param top: Path to the topology :type top: str :param out_report_name: Name of the output file :type out_report_name: str :param format_out: String with the format of the output :type format_out: str :param nProcessors: Number of processors to use :type nProcessors: int :param output_folder: Path where to store the new reports :type output_folder: str :param new_report: Whether to create new reports :type new_report: bool :param trajs_to_select: Number of the reports to read, if not all are wanted :type trajs_to_select: set """ # Constants if output_folder is not None: out_report_name = os.path.join(output_folder, out_report_name) outputFilename = "_".join([out_report_name, "%d"]) trajName = "*traj*" reportName = "*report*_%d" distances_label = "\t".join(residues) residues = parse_selection(residues) if nProcessors is None: nProcessors = utilities.getCpuCount() nProcessors = max(1, nProcessors) print("Calculating distances with %d processors" % nProcessors) epochs = utilities.get_epoch_folders(folder) if top is not None: top_obj = utilities.getTopologyObject(top) else: top_obj = None files = [] if not epochs: # path does not contain an adaptive simulation, we'll try to retrieve # trajectories from the specified path files = analysis_utils.process_folder(None, folder, trajName, reportName, os.path.join(folder, outputFilename), top_obj, trajs_to_select) for epoch in epochs: print("Epoch", epoch) files.extend(analysis_utils.process_folder(epoch, folder, trajName, reportName, os.path.join(folder, epoch, outputFilename), top_obj, trajs_to_select)) print("Starting to process files!") pool = mp.Pool(nProcessors) results = [pool.apply_async(process_file, args=(info[0], info[2], residues, info[1], info[4], format_out, new_report, info[3], distances_label)) for info in files] pool.close() pool.join() for res in results: res.get()
def main(trajectory_name, path, n_processors, imaging): epochs = utilities.get_epoch_folders(path) to_process = [] pool = mp.Pool(n_processors) trajectory_glob = trajectory_name + "_*" for epoch in epochs: with open(os.path.join(epoch, "topologyMapping.txt")) as f: top_map = f.read().rstrip().split(":") for traj in glob.glob(os.path.join(path, epoch, trajectory_glob)): traj_num = utilities.getTrajNum(traj) to_process.append( (top_map[traj_num - 1], traj, epoch, traj_num, imaging)) pool.map(process_traj, to_process) pool.close() pool.join()
def main(num_clusters, output_folder, ligand_resname, atom_ids, folder_name=".", topology=None): extractCoords.main(folder_name, lig_resname=ligand_resname, non_Repeat=True, atom_Ids=atom_ids) trajectoryFolder = "allTrajs" trajectoryBasename = "traj*" stride = 1 clusterCountsThreshold = 0 folders = utilities.get_epoch_folders(folder_name) folders.sort(key=int) if os.path.exists("discretized"): # If there is a previous clustering, remove to cluster again shutil.rmtree("discretized") clusteringObject = cluster.Cluster(num_clusters, trajectoryFolder, trajectoryBasename, alwaysCluster=False, stride=stride) clusteringObject.clusterTrajectories() clusteringObject.eliminateLowPopulatedClusters(clusterCountsThreshold) clusterCenters = clusteringObject.clusterCenters centersInfo = get_centers_info(trajectoryFolder, trajectoryBasename, num_clusters, clusterCenters) COMArray = [centersInfo[i]['center'][:3] for i in range(num_clusters)] if output_folder is not None: outputFolder = os.path.join(output_folder, "") if not os.path.exists(outputFolder): os.makedirs(outputFolder) else: outputFolder = "" writePDB( COMArray, outputFolder + "clusters_%d_KMeans_allSnapshots.pdb" % num_clusters) writeInitialStructures(centersInfo, outputFolder + "initial_%d.pdb", topology=topology)
def main(sim_path, n_trajs, trajectory_name, plot_name, residues_selected): # since we remove the water molecules, any topology file will be fine info1, info2 = parse_selection(residues_selected) cache_file = "distances.npy" if not os.path.exists(cache_file): global_traj = None trajectory_name = "_%d".join(os.path.splitext(trajectory_name)) epochs = utilities.get_epoch_folders(sim_path) for epoch in epochs: with open(os.path.join(sim_path, epoch, "topologyMapping.txt")) as f: top_map = f.read().rstrip().split(":") for i in range(1, n_trajs + 1): print("Processing epoch", epoch, "trajectory", i) trajectory = md.load(os.path.join(epoch, trajectory_name % i), top=os.path.join( sim_path, "topologies", "topology_%s.pdb" % top_map[i - 1])) if global_traj is None: global_traj = trajectory.remove_solvent() atom1 = global_traj.top.select( "resname '%s' and residue %s and name %s" % info1) atom2 = global_traj.top.select( "resname '%s' and residue %s and name %s" % info2) if atom1.size == 0 or atom2.size == 0: raise ValueError( "Nothing found under current selection") else: global_traj += trajectory.remove_solvent() distance = 10 * md.compute_distances(global_traj, [atom1.tolist() + atom2.tolist()]) np.save(cache_file, distance) else: distance = np.load(cache_file) f1, ax1 = plt.subplots(1, 1) ax1.plot(distance, 'x-') ax1.set_ylabel(r"Distance %s ($\AA$)" % residues_selected) if plot_name is not None: f1.savefig(plot_name) plt.show()
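# Hedged sketch (file paths and the atom selection are placeholders, not from the
# source): mdtraj's compute_distances returns values in nanometers, which is why the
# function above multiplies by 10 to report the distance in angstroms.
import mdtraj as md

traj = md.load("trajectory.xtc", top="topology.pdb")  # placeholder input files
pair = [[traj.top.select("residue 10 and name CA")[0],
         traj.top.select("residue 50 and name CA")[0]]]
dist_ang = 10 * md.compute_distances(traj, pair)  # array of shape (n_frames, 1), in angstroms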
def main(metricCol, lig_resname, nTrajs, stride, atomId, saving_frequency): folders = utilities.get_epoch_folders(".") box_center = None templateLine = "HETATM%s H BOX Z 501 %s%s%s 0.75%s H \n" for epoch in folders: print("Processing epoch %s" % epoch) data = [] confData = [] maxEpoch = -1 maxEpochCoords = None for iTraj in range(1, nTrajs): report = np.loadtxt("%s/report_%d" % (epoch, iTraj)) if len(report.shape) < 2: report = report[np.newaxis, :] maxTrajIndex = np.argmax(report[:, metricCol]) snapshots = utilities.getSnapshots("%s/trajectory_%d.pdb" % (epoch, iTraj)) for i, snapshot in enumerate(itertools.islice(snapshots, 0, None, stride)): report_line = i * stride * saving_frequency data.append(get_coords(snapshot, atomId, lig_resname) + [report[report_line, metricCol]]) confData.append((epoch, iTraj, report_line)) if report[maxTrajIndex, metricCol] > maxEpoch: maxEpoch = report[maxTrajIndex, metricCol] maxEpochCoords = get_coords(snapshots[maxTrajIndex], atomId, lig_resname) if box_center is None and iTraj == 1: box_center = data[0][:3] data = np.array(data) minInd = np.argmin(data[:, -1]) minMetric = data[minInd, -1] data[:, -1] -= minMetric utilities.write_PDB_clusters(data, title="epoch_%s.pdb" % epoch, use_beta=True) print("Max value for metric", maxEpoch, maxEpochCoords) with open("epoch_%s.pdb" % epoch, "a") as fa: fa.write("TER\n") serial = ("%d" % data.shape[0]).rjust(5) x = ("%.3f" % box_center[0]).rjust(8) y = ("%.3f" % box_center[1]).rjust(8) z = ("%.3f" % box_center[2]).rjust(8) g = ("%.2f" % 0).rjust(6) fa.write(templateLine % (serial, x, y, z, g)) box_center = maxEpochCoords
def cleanPreviousSimulation(output_path): """ Clean the unneeded data from a previous simulation :param output_path: Path where the data is stored :type output_path: str """ equilibration_folders = glob.glob(os.path.join(output_path, "equilibration*")) for folder in equilibration_folders: try: shutil.rmtree(folder) except OSError as exc: if exc.errno != errno.ENOENT: raise # If another process deleted the folder between the glob and the # actual removal an OSError is raised epochs = utilities.get_epoch_folders(output_path) for epoch in epochs: try: shutil.rmtree(os.path.join(output_path, epoch)) except OSError as exc: if exc.errno != errno.ENOENT: raise
def createPlot(reportName, column1, column2, stepsPerRun, printWithLines, paletteModifier, trajs_range=None, label_x=None, label_y=None, label_colorbar=None, fig_size=(6, 6), simulation_path=".", skip_first_step=False, skip_steps=None, y_top=None, y_bottom=None, x_left=None, x_right=None): """ Create an interactive plot of the simulation data :param reportName: Name of the files containing the simulation data :type reportName: str :param column1: Column to plot in the X axis :type column1: int :param column2: Column to plot in the Y axis :type column2: int :param stepsPerRun: Number of steps per epoch :type stepsPerRun: int :param printWithLines: Whether to join the points with lines :type printWithLines: bool :param paletteModifier: Whether to use the epoch as color or a column :type paletteModifier: int :param trajs_range: Range of trajectories to plot :type trajs_range: str :param label_x: Label of the x-axis :type label_x: str :param label_y: Label of the y-axis :type label_y: str :param label_colorbar: Label of the colorbar :type label_colorbar: str :param fig_size: Size of the plot figure (default (6in, 6in)) :type fig_size: tuple :param simulation_path: Path to the simulation data :type simulation_path: str :param skip_first_step: Whether to avoid plotting the first point in each report :type skip_first_step: bool :param skip_steps: Number of steps to skip in the plot :type skip_steps: int :param y_bottom: Bottom limit of the y axis :type y_bottom: float :param y_top: Top limit of the y axis :type y_top: float :param x_left: Left limit of the x axis :type x_left: float :param x_right: Right limit of the x axis :type x_right: float """ epochs = utilities.get_epoch_folders(simulation_path) numberOfEpochs = int(len(epochs)) if numberOfEpochs == 0: raise ValueError("No simulation found in specified path ", os.path.abspath(simulation_path)) cmap_name = "viridis" dictionary = { 'reportName': reportName, 'col2': column2, 'numberOfEpochs': numberOfEpochs, 'col1': column1, 'withLines': printWithLines, 'color': paletteModifier } annotations = [] artists = [] trajectory_range = set() if trajs_range is not None: start, end = map(int, trajs_range.split(":")) trajectory_range = set(range(start, end + 1)) cmin = 1e10 cmax = -1e10 data_dict = {} max_report = 0 min_report = 1e10 for epoch in epochs: ep = int(epoch) reports = utilities.getReportList( os.path.join(simulation_path, epoch, reportName + "*")) if not reports: raise ValueError( "Could not find any reports with the given name!!") for report in reports: report_num = utilities.getReportNum(report) max_report = max(max_report, report_num) min_report = min(min_report, report_num) if trajs_range is not None and report_num not in trajectory_range: continue data = utilities.loadtxtfile(report) if skip_steps is not None: if data.shape[0] <= skip_steps: continue data = data[skip_steps:] elif skip_first_step: data = data[1:] if paletteModifier is not None and paletteModifier != -1: cmin = min(cmin, data[:, paletteModifier].min()) cmax = max(cmax, data[:, paletteModifier].max()) data_dict[(ep, report_num)] = data fig, ax = plt.subplots(figsize=fig_size) ticks = None if paletteModifier == -1: cmin = min_report cmax = max_report if paletteModifier is None: cmin = int(epochs[0]) cmax = int(epochs[-1]) ticks = range(cmin, cmax + 1) sm = plt.cm.ScalarMappable(cmap=plt.get_cmap(cmap_name), norm=plt.Normalize(vmin=cmin, vmax=cmax)) sm.set_array([]) dictionary['cmap'] = sm if paletteModifier != -1: cbar = plt.colorbar(sm, ticks=ticks) cbar.ax.zorder = -1 offset = 0 if skip_steps is not None: offset = skip_steps elif skip_first_step: # if we skip the first step there is a point that is not shown but we # should count it either way offset = 1 for el in data_dict: addLine(data_dict[el], el[1], el[0], stepsPerRun, dictionary, artists) annotations.append([ "Epoch: %d\nTrajectory: %d\nModel: %d" % (el[0], el[1], i + 1 + offset) for i in range(len(data_dict[el])) ]) if label_x is not None: plt.xlabel(label_x) if label_y is not None: plt.ylabel(label_y) if paletteModifier is None: cbar.set_label("Epoch") if label_colorbar is not None: cbar.set_label(label_colorbar) ax.set_ylim(bottom=y_bottom, top=y_top) ax.set_xlim(left=x_left, right=x_right) annot = ax.annotate("", xy=(0, 0), xytext=(20, 20), textcoords="offset points", bbox=dict(boxstyle="round", fc="w"), arrowprops=dict(arrowstyle="->")) annot.set_visible(False) def modify_color(color): color_offset = 0.5 color = list(color) for i in range(3): color[i] = min(color[i] + color_offset, 1) return tuple(color) def update_annot(ind, color, pos, index): """Update the information box of the selected point""" annot.xy = pos annot.set_text(annotations[index][int(ind["ind"][0])]) annot.get_bbox_patch().set_facecolor(modify_color(color)) annot.get_bbox_patch().set_alpha(0.8) annot.zorder = 10 def locate_event(event): for j, el in enumerate(artists): found, info = el.contains(event) if found: return j, found, info, el return 0, False, None, None def extract_data(obj_plot, ind): try: x, y = obj_plot.get_data() x = x[ind["ind"][0]] y = y[ind["ind"][0]] return (x, y) except AttributeError: return obj_plot.get_offsets()[ind["ind"][0]] def extract_color(obj_plot, ind): try: return obj_plot.get_markerfacecolor() except AttributeError: return obj_plot.get_facecolor()[ind["ind"][0]] def hover(event): """Action to perform when hovering the mouse on a point""" vis = annot.get_visible() if event.inaxes == ax: index, cont, ind, obj = locate_event(event) if cont: update_annot(ind, extract_color(obj, ind), extract_data(obj, ind), index) annot.set_visible(True) fig.canvas.draw_idle() else: if vis: annot.set_visible(False) fig.canvas.draw_idle() # Respond to mouse motion fig.canvas.mpl_connect("motion_notify_event", hover)
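# Hedged sketch (standalone, synthetic data): the hover machinery above follows the
# usual matplotlib pattern of a single hidden annotation that is repositioned and
# shown whenever the cursor lands on an artist. Minimal self-contained version:
import matplotlib.pyplot as plt
import numpy as np

fig, ax = plt.subplots()
sc = ax.scatter(np.random.rand(20), np.random.rand(20))
annot = ax.annotate("", xy=(0, 0), xytext=(15, 15), textcoords="offset points",
                    bbox=dict(boxstyle="round", fc="w"))
annot.set_visible(False)

def hover(event):
    if event.inaxes == ax:
        found, info = sc.contains(event)
        if found:
            idx = info["ind"][0]
            annot.xy = sc.get_offsets()[idx]
            annot.set_text("point %d" % idx)
            annot.set_visible(True)
        else:
            annot.set_visible(False)
        fig.canvas.draw_idle()

fig.canvas.mpl_connect("motion_notify_event", hover)
plt.show()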
import os import numpy as np import scipy.optimize as optim from AdaptivePELE.utilities import utilities import matplotlib.pyplot as plt plt.style.use("ggplot") def reward_new(x, rews): return -(x * rews).sum() def reward(x, rews): return -(x[:, np.newaxis] * rews).sum() folders = utilities.get_epoch_folders(".") for folder in folders[::-1]: if os.path.exists(folder + "/clustering/object.pkl"): cl_object = utilities.readClusteringObject(folder + "/clustering/object.pkl") break # first_cluster = 0 trajToDivide = 144 * 2 rewardsEvol = [] weightsEvol = [] weightsEvol_new = [] weights = None weights_new = None metricInd = 4 labels = ["TE", "RMSD", "BE", "SASA"] plots = True
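# Hedged sketch (assumed usage, not shown in this excerpt): the reward function defined
# above would typically be handed to scipy.optimize to search for metric weights; the
# bounds and the sum-to-one constraint below are illustrative assumptions, and the data
# is synthetic.
import numpy as np
import scipy.optimize as optim

rews = np.random.rand(4, 10)   # synthetic per-metric rewards, one row per metric
x0 = np.ones(4) / 4            # start from uniform weights
res = optim.minimize(reward, x0, args=(rews,),
                     bounds=[(0, 1)] * 4,
                     constraints=[{"type": "eq", "fun": lambda x: x.sum() - 1}],
                     method="SLSQP")
print(res.x)                   # optimized weights for the four metrics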
def main(controlFile, trajName, reportName, folder, top, outputFilename, nProcessors, output_folder, format_str, new_report, trajs_to_select): """ Calculate the corrected rmsd values of conformation taking into account molecule symmetries :param controlFile: Control file :type controlFile: str :param folder: Path the simulation :type folder: str :param top: Path to the topology :type top: str :param outputFilename: Name of the output file :type outputFilename: str :param nProcessors: Number of processors to use :type nProcessors: int :param output_folder: Path where to store the new reports :type output_folder: str :param format_str: String with the format of the report :type format_str: str :param new_report: Whether to write rmsd to a new report file :type new_report: bool """ if trajName is None: trajName = "*traj*" else: trajName += "_*" if reportName is None: reportName = "report_%d" else: reportName += "_%d" if output_folder is not None: outputFilename = os.path.join(output_folder, outputFilename) outputFilename += "_%d" if nProcessors is None: nProcessors = utilities.getCpuCount() nProcessors = max(1, nProcessors) print("Calculating RMSDs with %d processors" % nProcessors) epochs = utilities.get_epoch_folders(folder) if top is not None: top_obj = utilities.getTopologyObject(top) else: top_obj = None resname, nativeFilename, symmetries, rmsdColInReport = readControlFile(controlFile) nativePDB = atomset.PDB() nativePDB.initialise(nativeFilename, resname=resname) files = [] if not epochs: # path does not contain an adaptive simulation, we'll try to retrieve # trajectories from the specified path files = analysis_utils.process_folder(None, folder, trajName, reportName, os.path.join(folder, outputFilename), top_obj, trajs_to_select) for epoch in epochs: print("Epoch", epoch) files.extend(analysis_utils.process_folder(epoch, folder, trajName, reportName, os.path.join(folder, epoch, outputFilename), top_obj, trajs_to_select)) pool = mp.Pool(nProcessors) results = [pool.apply_async(calculate_rmsd_traj, args=(nativePDB, resname, symmetries, rmsdColInReport, info[0], info[1], info[2], info[3], info[4], format_str, new_report)) for info in files] pool.close() pool.join() for res in results: res.get()
def main(num_clusters, criteria1, criteria2, ligand_resname, output_folder="ClusterCentroids", atom_ids="", cpus=2, topology=None, report="report_", traj="trajectory_", use_pdb=False, png=False, CA=0, sidechains=0, restart="all"): # Create multiprocess pool if cpus > 1: pool = mp.Pool(cpus) else: pool = mp.Pool(1) # Extract COM ligand for each snapshot if not glob.glob("allTrajs/traj*"): extractCoords.main(lig_resname=ligand_resname, non_Repeat=True, atom_Ids=atom_ids, nProcessors=cpus, parallelize=True, topology=topology, protein_CA=CA, sidechains=sidechains) print("Clusterize trajectories by RMSD of COM") trajectoryFolder = "allTrajs" trajectoryBasename = "*traj*" stride = 1 clusterCountsThreshold = 0 folders = utilities.get_epoch_folders(".") folders.sort(key=int) if not restart: clusteringObject = cluster.Cluster(num_clusters, trajectoryFolder, trajectoryBasename, alwaysCluster=True, stride=stride) clusteringObject.clusterTrajectories() clusteringObject.eliminateLowPopulatedClusters(clusterCountsThreshold) clusterCenters = clusteringObject.clusterCenters np.savetxt("clustercenters.dat", clusterCenters) dtrajs = clusteringObject.dtrajs print("Extract metrics for each snapshot") min_metric_trajs = {} epochs = [folder for folder in glob.glob("./*/") if folder.isdigit()] reports = simulationToCsv.gather_reports() fields = simulationToCsv.retrieve_fields(reports[0]) df = simulationToCsv.init_df(fields) df = simulationToCsv.fill_data(reports, df, pool) print("Update data with metrics and clusters") df.index = range(df.shape[0]) df["Cluster"] = [None] * df.shape[0] input_list = [[ df, Traj, d ] for d, Traj in zip(dtrajs, clusteringObject.trajFilenames)] results = pool.map(save_to_df, input_list) for data in results: for df_tmp in data: df.update(df_tmp) df.to_csv("Simulation.csv", index=False) if restart: df = pd.read_csv("Simulation.csv") clusterCenters = utilities.loadtxtfile("clustercenters.dat") print(clusterCenters) centersInfo = get_centers_info(trajectoryFolder, trajectoryBasename, num_clusters, clusterCenters) COMArray = [centersInfo[i]['center'] for i in range(num_clusters)] print("Retrieve clusters and metric") fields1 = [] fields2 = [] print(centersInfo) for cluster_num in centersInfo: epoch_num, traj_num, snap_num = map( int, centersInfo[cluster_num]['structure']) field1, crit1_name = get_metric(criteria1, epoch_num, traj_num, snap_num, report) field2, crit2_name = get_metric(criteria2, epoch_num, traj_num, snap_num, report) fields1.append(field1) fields2.append(field2) if output_folder is not None: outputFolder = os.path.join(output_folder, "") if not os.path.exists(outputFolder): os.makedirs(outputFolder) else: outputFolder = "" print("Output structures") writePDB( COMArray, outputFolder + "clusters_%d_KMeans_allSnapshots.pdb" % num_clusters) writeInitialStructures(fields1, fields2, crit1_name, crit2_name, centersInfo, outputFolder + "cluster_{}_{}_{}_{}_{}.pdb", traj, topology=topology, use_pdb=use_pdb) plotClusters(fields1, fields2, crit1_name, crit2_name, outputFolder, png=png) assesClusterConvergence(df, num_clusters, traj, topology) return
def main(sim_path, n_trajs, trajectory_name, plot_name, residues_selected): # since we remove the water molecules, any topology file will be fine ref = md.load(os.path.join(sim_path, "topologies", "topology_0.pdb")) ref.remove_solvent(inplace=True) labels = [] selections = [] for res in ref.top.residues: if res.is_protein and (residues_selected is None or res.resSeq in residues_selected): if residues_selected is not None: residues_selected.remove(res.resSeq) labels.append("%s%d" % (res.code, res.resSeq)) selections.append( ref.top.select("protein and symbol != 'H' and residue %d" % res.resSeq)) if residues_selected is not None and len(residues_selected): raise ValueError("Residues %s not found in protein!" % ", ".join(sorted([str(x) for x in residues_selected]))) if not os.path.exists("rmsf.npy"): avg_xyz = None global_traj = None trajectory_name = "_%d".join(os.path.splitext(trajectory_name)) epochs = utilities.get_epoch_folders(sim_path) n_epochs = len(epochs) for epoch in epochs: with open(os.path.join(sim_path, epoch, "topologyMapping.txt")) as f: top_map = f.read().rstrip().split(":") for i in range(1, n_trajs + 1): print("Processing epoch", epoch, "trajectory", i) trajectory = md.load(os.path.join(epoch, trajectory_name % i), top=os.path.join( sim_path, "topologies", "topology_%s.pdb" % top_map[i - 1])) if global_traj is None: avg_xyz = np.mean(trajectory.xyz, axis=0) global_traj = trajectory.remove_solvent() else: avg_xyz += np.mean(trajectory.xyz, axis=0) global_traj += trajectory.remove_solvent() avg_xyz /= (n_epochs * n_trajs) rmsfs = [] for i, ind in enumerate(selections): temp = 10 * np.sqrt(3 * np.mean( (global_traj.xyz[:, ind, :] - avg_xyz[ind, :])**2, axis=(1, 2))) rmsfs.append(np.mean(temp)) np.save("rmsf.npy", rmsfs) else: rmsfs = np.load("rmsf.npy") f1, ax1 = plt.subplots(1, 1) # get axis size in inches width = ax1.get_window_extent().transformed( f1.dpi_scale_trans.inverted()).width # font size is assumed to be 12pt and 1pt is 1/72in font = 12 * 1 / 72.0 # if there are less labels that the max that would fit, show them all n_ticks = max(1, len(labels) // int(np.floor(width / font))) print(width, font, n_ticks, len(labels), width / font) x_vals = np.array(range(len(labels))) ax1.plot(rmsfs, 'x-') ax1.set_xticks(x_vals[::n_ticks]) ax1.set_xticklabels(labels[::n_ticks]) ax1.set_ylabel(r"RMSF ($\AA$)") ax1.tick_params(axis='x', rotation=90, labelsize=10) if plot_name is not None: f1.savefig(plot_name) plt.show()
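# Hedged sketch (synthetic coordinates): the per-residue RMSF computed above is the
# frame-averaged root-mean-square deviation of the selected heavy atoms from their mean
# position, converted from nanometers to angstroms with the factor of 10.
import numpy as np

xyz = np.random.rand(100, 8, 3)       # (frames, atoms of one residue, xyz) in nm
avg_xyz = xyz.mean(axis=0)            # mean position of those atoms
per_frame = 10 * np.sqrt(3 * np.mean((xyz - avg_xyz) ** 2, axis=(1, 2)))
rmsf = per_frame.mean()               # single RMSF value for the residue, in angstroms
print(rmsf)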
def main(folder_name=".", atom_Ids="", lig_resname="", numtotalSteps=0, enforceSequential_run=0, writeLigandTrajectory=True, setNumber=0, protein_CA=0, non_Repeat=False, nProcessors=None, parallelize=True, topology=None, sidechains=False, sidechain_folder=".", cm=False, use_extra_atoms=False, CM_mode="p-lig", calc_dihedrals=False, dihedrals_projection=False): params = ParamsHandler(folder_name, atom_Ids, lig_resname, numtotalSteps, enforceSequential_run, writeLigandTrajectory, setNumber, protein_CA, non_Repeat, nProcessors, parallelize, topology, sidechains, sidechain_folder, cm, use_extra_atoms, CM_mode, calc_dihedrals, dihedrals_projection) constants = Constants() if params.topology is not None: params.topology = utilities.getTopologyObject(params.topology) params.lig_resname = parseResname(params.atomIds, params.lig_resname, params.contact_map, params.cm_mode, params.dihedrals) folderWithTrajs = params.folder_name makeGatheredTrajsFolder(constants) if params.enforceSequential_run: folders = ["."] else: folders = utilities.get_epoch_folders(folderWithTrajs) if len(folders) == 0: folders = ["."] # if multiprocess is not available, turn off parallelization params.parallelize &= PARALELLIZATION if params.parallelize: if params.nProcessors is None: params.nProcessors = utilities.getCpuCount() params.nProcessors = max(1, params.nProcessors) print("Running extractCoords with %d cores" % (params.nProcessors)) pool = mp.Pool(params.nProcessors) else: pool = None params.sidechains = extractSidechainIndexes( params, pool=pool) if params.sidechains else [] for folder_it in folders: pathFolder = os.path.join(folderWithTrajs, folder_it) print("Extracting coords from folder %s" % folder_it) ligand_trajs_folder = os.path.join(pathFolder, constants.ligandTrajectoryFolder) if params.writeLigandTrajectory and not os.path.exists( ligand_trajs_folder): os.makedirs(ligand_trajs_folder) writeFilenamesExtractedCoordinates(pathFolder, params, constants, pool=pool) if not params.non_Repeat: print("Repeating snapshots from folder %s" % folder_it) repeatExtractedSnapshotsInFolder(pathFolder, constants, params.numtotalSteps, pool=None) print("Gathering trajs in %s" % constants.gatherTrajsFolder) gatherTrajs(constants, folder_it, params.setNumber, params.non_Repeat)
def main(num_clusters, criteria1, criteria2, output_folder, ligand_resname, atom_ids, cpus=2, topology=None, report="report_", traj="trajectory_", use_pdb=False): if not glob.glob("*/extractedCoordinates/coord_*"): extractCoords.main(lig_resname=ligand_resname, non_Repeat=True, atom_Ids=atom_ids, nProcessors=cpus, parallelize=False, topology=topology, use_pdb=use_pdb) trajectoryFolder = "allTrajs" trajectoryBasename = "*traj*" stride = 1 clusterCountsThreshold = 0 folders = utilities.get_epoch_folders(".") folders.sort(key=int) clusteringObject = cluster.Cluster(num_clusters, trajectoryFolder, trajectoryBasename, alwaysCluster=True, stride=stride) clusteringObject.clusterTrajectories() clusteringObject.eliminateLowPopulatedClusters(clusterCountsThreshold) clusterCenters = clusteringObject.clusterCenters centersInfo = get_centers_info(trajectoryFolder, trajectoryBasename, num_clusters, clusterCenters) COMArray = [centersInfo[i]['center'] for i in xrange(num_clusters)] fields1 = [] fields2 = [] for cluster_num in centersInfo: epoch_num, traj_num, snap_num = map( int, centersInfo[cluster_num]['structure']) field1, crit1_name = get_metric(criteria1, epoch_num, traj_num, snap_num, report) field2, crit2_name = get_metric(criteria2, epoch_num, traj_num, snap_num, report) fields1.append(field1) fields2.append(field2) if output_folder is not None: outputFolder = os.path.join(output_folder, "") if not os.path.exists(outputFolder): os.makedirs(outputFolder) else: outputFolder = "" writePDB( COMArray, outputFolder + "clusters_%d_KMeans_allSnapshots.pdb" % num_clusters) writeInitialStructures(fields1, fields2, crit1_name, crit2_name, centersInfo, outputFolder + "cluster_{}_{}_{}_{}_{}.pdb", traj, topology=topology, use_pdb=use_pdb) plotClusters(fields1, fields2, crit1_name, crit2_name, outputFolder)
def main(nTICs, numClusters, ligand_resname, lag, nTraj, n_steps, out_path=None, stride_conformations=1, atomId="", repeat=False, plotTICA=False, topology=None): # Constants definition trajectoryFolder = "tica_projected_trajs" trajectoryBasename = "tica_traj*" stride = 1 clusterCountsThreshold = 0 clustersCentersFolder = "clustersCenters" ticaObject = "tica.pkl" if out_path is None: folderPath = "" curr_folder = "." else: folderPath = out_path curr_folder = out_path folders = utilities.get_epoch_folders(curr_folder) folders.sort(key=int) if not os.path.exists( os.path.join(folderPath, "0/repeatedExtractedCoordinates/")) or repeat: # Extract ligand and alpha carbons coordinates extractCoords.main(folder_name=curr_folder, lig_resname=ligand_resname, numtotalSteps=n_steps, protein_CA=False, non_Repeat=False, sidechains=True, sidechain_folder="../output_clustering/initial*", enforceSequential_run=0, nProcessors=1) tica = make_TICA_decomposition(ticaObject, folders, folderPath, lag) # Select the desired number of independent components from the full # decomposition projected = tica.get_output(dimensions=list(range(nTICs))) write_TICA_trajs(trajectoryFolder, projected, trajectoryBasename, folders, nTraj) clusteringObject = cluster_TICA_space(numClusters, trajectoryFolder, trajectoryBasename, stride, clusterCountsThreshold) trajsUniq, projectedUniq = projectTICATrajs(folders, folderPath, ligand_resname, atomId, stride_conformations, nTICs, tica, topology=topology) clusterCenters = clusteringObject.clusterCenters dtrajs = clusteringObject.assignNewTrajectories(projectedUniq) centersInfo = find_representative_strucutures(folders, numClusters, nTraj, clusterCenters, projectedUniq, dtrajs) writeCentersInfo(centersInfo, folderPath, ligand_resname, nTICs, numClusters, trajsUniq, clustersCentersFolder, nTraj, topology=topology) if plotTICA: make_TICA_plot(nTICs, projected)
def main(n_clusters, output_folder, SASAColumn, norm_energy, num_bins, percentile, plots, atom_Ids, folder_name, traj_basename, cluster_energy, ligand_resname="", topology=None): energyColumn = 3 if output_folder is not None: outputFolder = os.path.join(output_folder, "") if not os.path.exists(outputFolder): os.makedirs(outputFolder) else: outputFolder = "" extractCoords.main(folder_name, lig_resname=ligand_resname, non_Repeat=True, atom_Ids=atom_Ids) epochFolders = utilities.get_epoch_folders(folder_name) points = [] for epoch in epochFolders: report_files = glob.glob(os.path.join(epoch, "*report*")) report_files.sort(key=lambda x: int(x[x.rfind("_") + 1:])) for report_name in report_files: traj_num = int(report_name[report_name.rfind("_") + 1:]) coordinates = np.loadtxt( os.path.join( folder_name, "%s/extractedCoordinates/coord_%d.dat" % (epoch, traj_num))) report = np.loadtxt(report_name) if len(report.shape) < 2: points.append([ report[energyColumn], report[SASAColumn], int(epoch), traj_num, 0 ] + coordinates[1:].tolist()) else: epoch_line = np.array([int(epoch)] * report.shape[0]) traj_line = np.array([traj_num] * report.shape[0]) snapshot_line = np.array(range(report.shape[0])) points.extend( np.hstack( (report[:, (energyColumn, SASAColumn)], epoch_line[:, np.newaxis], traj_line[:, np.newaxis], snapshot_line[:, np.newaxis], coordinates[:, 1:]))) points = np.array(points) points = points[points[:, 1].argsort()] minSASA = points[0, 1] maxSASA = points[-1, 1] left_bins = np.linspace(minSASA, maxSASA, num=num_bins, endpoint=False) indices = np.searchsorted(points[:, 1], left_bins) thresholds = np.array([ np.percentile(points[i:j, 0], percentile) for i, j in zip(indices[:-1], indices[1:]) ]) new_points = [] occupation = [] for ij, (i, j) in enumerate(zip(indices[:-1], indices[1:])): found = np.where(points[i:j, 0] < thresholds[ij])[0] occupation.append(len(found)) if len(found) == 1: new_points.append(points[found + i]) elif len(found) > 1: new_points.extend(points[found + i]) points = np.array(new_points) if norm_energy: energyMin = points.min(axis=0)[0] points[:, 0] -= energyMin energyMax = points.max(axis=0)[0] points[:, 0] /= energyMax if cluster_energy: print("Clustering using energy and SASA") kmeans = KMeans(n_clusters=n_clusters).fit(points[:, :2]) title = "clusters_%d_energy_SASA.pdb" else: print("Clustering using ligand coordinates") kmeans = KMeans(n_clusters=n_clusters).fit(points[:, 5:8]) title = "clusters_%d_energy_SASA_coords.pdb" centers_energy = [] centers_coords = [] if topology is not None: topology_contents = utilities.getTopologyFile(topology) else: topology_contents = None for i, center in enumerate(kmeans.cluster_centers_): if cluster_energy: dist = np.linalg.norm((points[:, :2] - center), axis=1) else: dist = np.linalg.norm((points[:, 5:8] - center), axis=1) epoch, traj, snapshot = points[dist.argmin(), 2:5] centers_energy.append(points[dist.argmin(), :2]) centers_coords.append(points[dist.argmin(), 5:8]) traj_file = glob.glob("%d/%s_%d*" % (epoch, traj_basename, traj))[0] conf = utilities.getSnapshots(traj_file, topology=topology)[int(snapshot)] if isinstance(conf, basestring): with open(os.path.join(outputFolder, "initial_%d.pdb" % i), "w") as fw: fw.write(conf) else: utilities.write_mdtraj_object_PDB( conf, os.path.join(outputFolder, "initial_%d.pdb" % i), topology_contents) centers_energy = np.array(centers_energy) centers_coords = np.array(centers_coords) writePDB(centers_coords, os.path.join(outputFolder, title % n_clusters)) if plots: plt.scatter(points[:, 1], points[:, 0], c=kmeans.labels_, alpha=0.5) plt.scatter(centers_energy[:, 1], centers_energy[:, 0], c=list(range(n_clusters)), marker='x', s=56, zorder=1) plt.xlabel("SASA") if norm_energy: plt.ylabel("Energy (normalized)") plt.savefig( os.path.join(outputFolder, "clusters_energy_normalized.png")) else: plt.ylabel("Energy (kcal/mol)") plt.savefig( os.path.join(outputFolder, "clusters_no_normalized.png")) plt.show()
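# Hedged sketch (synthetic data): the clustering step above is plain scikit-learn KMeans
# on either the (energy, SASA) columns or the ligand COM coordinates, and representatives
# are picked as the points closest to each centroid. Minimal standalone version:
import numpy as np
from sklearn.cluster import KMeans

points = np.random.rand(500, 2)   # stand-in for the (energy, SASA) columns
kmeans = KMeans(n_clusters=4).fit(points)
closest = [np.linalg.norm(points - c, axis=1).argmin() for c in kmeans.cluster_centers_]
print(closest)                    # indices of the representative points, one per cluster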