예제 #1
0
def main(num_clusters, output_folder, ligand_resname, atom_ids, traj_folder):
    """Cluster ligand COM coordinates from a simulation with KMeans.

    Extracts ligand coordinates from ``traj_folder``, clusters them,
    writes a PDB with all cluster centers and the representative initial
    structures.

    Parameters
    ----------
    num_clusters : int
        Number of KMeans clusters to generate.
    output_folder : str or None
        Subfolder of ``traj_folder`` for the output files (created if
        missing); if None, files are written to the working directory.
    ligand_resname : str
        Residue name of the ligand whose coordinates are extracted.
    atom_ids : str or list
        Atom ids passed through to the coordinate extraction.
    traj_folder : str
        Folder containing the simulation trajectories.

    Returns
    -------
    tuple
        (folder with extracted trajectories, name of the discretized folder).
    """
    extractCoords.main(folder_name=traj_folder,
                       lig_resname=ligand_resname,
                       non_Repeat=True,
                       atom_Ids=atom_ids)
    trajectoryFolder = "allTrajs"
    trajectoryBasename = "*trajectory*"
    stride = 1
    clusterCountsThreshold = 0

    clusteringObject = cluster.Cluster(num_clusters,
                                       trajectoryFolder,
                                       trajectoryBasename,
                                       alwaysCluster=False,
                                       stride=stride)
    clusteringObject.clusterTrajectories()
    clusteringObject.eliminateLowPopulatedClusters(clusterCountsThreshold)
    clusterCenters = clusteringObject.clusterCenters

    centersInfo = get_centers_info(trajectoryFolder, trajectoryBasename,
                                   num_clusters, clusterCenters)
    # range, not xrange: xrange was removed in Python 3 and the sibling
    # functions in this file already use range/print() (Python 3 style)
    COMArray = [centersInfo[i]['center'] for i in range(num_clusters)]
    if output_folder is not None:
        outputFolder = os.path.join(traj_folder, output_folder)
        if not os.path.exists(outputFolder):
            os.makedirs(outputFolder)
    else:
        outputFolder = ""
    writePDB(
        COMArray,
        os.path.join(outputFolder,
                     "clusters_%d_KMeans_allSnapshots.pdb" % num_clusters))
    writeInitialStructures(centersInfo, outputFolder, traj_folder)
    return trajectoryFolder, "discretized"
예제 #2
0
def main(num_clusters,
         output_folder,
         ligand_resname,
         atom_ids,
         folder_name=".",
         topology=None):
    """Cluster ligand COM coordinates with KMeans and write the results.

    Removes any previous ``discretized`` folder so the clustering is
    redone from scratch, then writes a PDB with the cluster centers and
    one initial structure per cluster.

    Parameters
    ----------
    num_clusters : int
        Number of KMeans clusters to generate.
    output_folder : str or None
        Folder for the output files (created if missing); if None, files
        are written to the working directory.
    ligand_resname : str
        Residue name of the ligand whose coordinates are extracted.
    atom_ids : str or list
        Atom ids passed through to the coordinate extraction.
    folder_name : str
        Folder containing the simulation output (default: cwd).
    topology : str or None
        Topology file needed for non-PDB trajectories.
    """
    extractCoords.main(folder_name,
                       lig_resname=ligand_resname,
                       non_Repeat=True,
                       atom_Ids=atom_ids)
    trajectoryFolder = "allTrajs"
    trajectoryBasename = "traj*"
    stride = 1
    clusterCountsThreshold = 0

    # NOTE(review): the original also built a sorted list of epoch folders
    # here, but never used it — removed as dead code.

    if os.path.exists("discretized"):
        # If there is a previous clustering, remove to cluster again
        shutil.rmtree("discretized")
    clusteringObject = cluster.Cluster(num_clusters,
                                       trajectoryFolder,
                                       trajectoryBasename,
                                       alwaysCluster=False,
                                       stride=stride)
    clusteringObject.clusterTrajectories()
    clusteringObject.eliminateLowPopulatedClusters(clusterCountsThreshold)
    clusterCenters = clusteringObject.clusterCenters

    centersInfo = get_centers_info(trajectoryFolder, trajectoryBasename,
                                   num_clusters, clusterCenters)
    # keep only x, y, z of each center (the 4th component is not a coordinate)
    COMArray = [centersInfo[i]['center'][:3] for i in range(num_clusters)]
    if output_folder is not None:
        # join with "" guarantees a trailing separator for the string
        # concatenations below
        outputFolder = os.path.join(output_folder, "")
        if not os.path.exists(outputFolder):
            os.makedirs(outputFolder)
    else:
        outputFolder = ""
    writePDB(
        COMArray,
        outputFolder + "clusters_%d_KMeans_allSnapshots.pdb" % num_clusters)
    writeInitialStructures(centersInfo,
                           outputFolder + "initial_%d.pdb",
                           topology=topology)
def main(num_clusters,
         criteria1,
         criteria2,
         ligand_resname,
         output_folder="ClusterCentroids",
         atom_ids="",
         cpus=2,
         topology=None,
         report="report_",
         traj="trajectory_",
         use_pdb=False,
         png=False,
         CA=0,
         sidechains=0,
         restart="all"):
    """Cluster ligand COM coordinates, annotate each cluster center with two
    report metrics and write representative structures plus plots.

    Parameters
    ----------
    num_clusters : int
        Number of KMeans clusters to generate.
    criteria1, criteria2 : metric selectors passed to ``get_metric`` for
        each cluster-center snapshot.
    ligand_resname : str
        Residue name of the ligand whose coordinates are extracted.
    output_folder : str or None
        Folder for the output files; if None, the working directory is used.
    cpus : int
        Size of the worker pool (minimum 1).
    restart : truthy/falsy
        If truthy, reuse ``Simulation.csv`` and ``clustercenters.dat``
        from a previous run instead of re-clustering.
    """
    # Create multiprocess pool (at least one worker)
    pool = mp.Pool(cpus) if cpus > 1 else mp.Pool(1)
    # Extract COM ligand for each snapshot, unless already extracted
    if not glob.glob("allTrajs/traj*"):
        extractCoords.main(lig_resname=ligand_resname,
                           non_Repeat=True,
                           atom_Ids=atom_ids,
                           nProcessors=cpus,
                           parallelize=True,
                           topology=topology,
                           protein_CA=CA,
                           sidechains=sidechains)

    print("Clusterize trajectories by RMSD of COM")
    trajectoryFolder = "allTrajs"
    trajectoryBasename = "*traj*"
    stride = 1
    clusterCountsThreshold = 0
    folders = utilities.get_epoch_folders(".")
    folders.sort(key=int)
    if not restart:
        clusteringObject = cluster.Cluster(num_clusters,
                                           trajectoryFolder,
                                           trajectoryBasename,
                                           alwaysCluster=True,
                                           stride=stride)
        clusteringObject.clusterTrajectories()
        clusteringObject.eliminateLowPopulatedClusters(clusterCountsThreshold)
        clusterCenters = clusteringObject.clusterCenters
        # persist centers so a later run can restart without re-clustering
        np.savetxt("clustercenters.dat", clusterCenters)
        dtrajs = clusteringObject.dtrajs

        print("Extract metrics for each snapshot")
        # NOTE(review): the original built unused locals here
        # (min_metric_trajs and an `epochs` list whose isdigit() filter
        # could never match glob results like "./0/") — removed as dead code.
        reports = simulationToCsv.gather_reports()
        fields = simulationToCsv.retrieve_fields(reports[0])
        df = simulationToCsv.init_df(fields)
        df = simulationToCsv.fill_data(reports, df, pool)

        print("Update data with metrics and clusters")
        df.index = range(df.shape[0])
        df["Cluster"] = [None] * df.shape[0]
        input_list = [[df, Traj, d]
                      for d, Traj in zip(dtrajs,
                                         clusteringObject.trajFilenames)]
        results = pool.map(save_to_df, input_list)
        for data in results:
            for df_tmp in data:
                df.update(df_tmp)
        df.to_csv("Simulation.csv", index=False)
    else:
        # Restart: reuse the products of a previous clustering run
        df = pd.read_csv("Simulation.csv")
        clusterCenters = utilities.loadtxtfile("clustercenters.dat")
        print(clusterCenters)
    # the pool is no longer needed; the original leaked it
    pool.close()
    pool.join()
    centersInfo = get_centers_info(trajectoryFolder, trajectoryBasename,
                                   num_clusters, clusterCenters)
    COMArray = [centersInfo[i]['center'] for i in range(num_clusters)]

    print("Retrieve clusters and metric")
    fields1 = []
    fields2 = []
    print(centersInfo)
    for cluster_num in centersInfo:
        epoch_num, traj_num, snap_num = map(
            int, centersInfo[cluster_num]['structure'])
        field1, crit1_name = get_metric(criteria1, epoch_num, traj_num,
                                        snap_num, report)
        field2, crit2_name = get_metric(criteria2, epoch_num, traj_num,
                                        snap_num, report)
        fields1.append(field1)
        fields2.append(field2)

    if output_folder is not None:
        # join with "" guarantees a trailing separator for the string
        # concatenations below
        outputFolder = os.path.join(output_folder, "")
        if not os.path.exists(outputFolder):
            os.makedirs(outputFolder)
    else:
        outputFolder = ""
    print("Output structures")
    writePDB(
        COMArray,
        outputFolder + "clusters_%d_KMeans_allSnapshots.pdb" % num_clusters)
    writeInitialStructures(fields1,
                           fields2,
                           crit1_name,
                           crit2_name,
                           centersInfo,
                           outputFolder + "cluster_{}_{}_{}_{}_{}.pdb",
                           traj,
                           topology=topology,
                           use_pdb=use_pdb)
    plotClusters(fields1,
                 fields2,
                 crit1_name,
                 crit2_name,
                 outputFolder,
                 png=png)
    assesClusterConvergence(df, num_clusters, traj, topology)
    return
예제 #4
0
def main(nTICs,
         numClusters,
         ligand_resname,
         lag,
         nTraj,
         n_steps,
         out_path=None,
         stride_conformations=1,
         atomId="",
         repeat=False,
         plotTICA=False,
         topology=None):
    """Run a TICA decomposition of the simulation, cluster the projected
    trajectories and write the cluster-center structures.

    Parameters
    ----------
    nTICs : int
        Number of independent components kept from the decomposition.
    numClusters : int
        Number of clusters in TICA space.
    ligand_resname : str
        Residue name of the ligand whose coordinates are extracted.
    lag : int
        Lag time for the TICA estimation.
    nTraj : int
        Number of trajectories per epoch.
    n_steps : int
        Total number of steps used when repeating coordinate extraction.
    out_path : str or None
        Folder with the simulation data; cwd if None.
    stride_conformations : int
        Stride used when projecting the conformations onto TICA space.
    repeat : bool
        Force re-extraction of the coordinates even if already present.
    plotTICA : bool
        If True, produce a plot of the TICA projections.
    topology : str or None
        Topology file needed for non-PDB trajectories.
    """
    # Constants definition
    trajectoryFolder = "tica_projected_trajs"
    trajectoryBasename = "tica_traj*"
    stride = 1
    clusterCountsThreshold = 0
    clustersCentersFolder = "clustersCenters"
    ticaObject = "tica.pkl"

    if out_path is None:
        folderPath = ""
        curr_folder = "."
    else:
        folderPath = out_path
        curr_folder = out_path

    folders = utilities.get_epoch_folders(curr_folder)
    folders.sort(key=int)

    if not os.path.exists(
            os.path.join(folderPath,
                         "0/repeatedExtractedCoordinates/")) or repeat:
        # Extract ligand and alpha carbons coordinates
        extractCoords.main(folder_name=curr_folder,
                           lig_resname=ligand_resname,
                           numtotalSteps=n_steps,
                           protein_CA=False,
                           non_Repeat=False,
                           sidechains=True,
                           sidechain_folder="../output_clustering/initial*",
                           enforceSequential_run=0,
                           nProcessors=1)

    tica = make_TICA_decomposition(ticaObject, folders, folderPath, lag)

    # Select the desired number of independent components from the full
    # decomposition
    projected = tica.get_output(dimensions=list(range(nTICs)))
    write_TICA_trajs(trajectoryFolder, projected, trajectoryBasename, folders,
                     nTraj)
    clusteringObject = cluster_TICA_space(numClusters, trajectoryFolder,
                                          trajectoryBasename, stride,
                                          clusterCountsThreshold)
    # Fixed NameError: the original passed the undefined name
    # `stride_conformation` instead of the `stride_conformations` parameter
    trajsUniq, projectedUniq = projectTICATrajs(folders,
                                                folderPath,
                                                ligand_resname,
                                                atomId,
                                                stride_conformations,
                                                nTICs,
                                                tica,
                                                topology=topology)

    clusterCenters = clusteringObject.clusterCenters
    dtrajs = clusteringObject.assignNewTrajectories(projectedUniq)
    centersInfo = find_representative_strucutures(folders, numClusters, nTraj,
                                                  clusterCenters,
                                                  projectedUniq, dtrajs)
    writeCentersInfo(centersInfo,
                     folderPath,
                     ligand_resname,
                     nTICs,
                     numClusters,
                     trajsUniq,
                     clustersCentersFolder,
                     nTraj,
                     topology=topology)
    if plotTICA:
        make_TICA_plot(nTICs, projected)
예제 #5
0
def main(n_clusters,
         output_folder,
         SASAColumn,
         norm_energy,
         num_bins,
         percentile,
         plots,
         atom_Ids,
         folder_name,
         traj_basename,
         cluster_energy,
         topology=None,
         ligand_resname=None):
    """Filter snapshots by an energy percentile per SASA bin and cluster the
    survivors with KMeans (either in energy/SASA space or in ligand
    coordinate space), writing the representative structures.

    Parameters
    ----------
    n_clusters : int
        Number of KMeans clusters to generate.
    output_folder : str or None
        Folder for the output files; cwd if None.
    SASAColumn : int
        Column of the report files holding the SASA values.
    norm_energy : bool
        If True, shift/scale energies to [0, 1] before clustering.
    num_bins : int
        Number of SASA bins used for the percentile filtering.
    percentile : float
        Energy percentile kept within each SASA bin.
    plots : bool
        If True, produce a scatter plot of the clustering.
    cluster_energy : bool
        Cluster on (energy, SASA) if True, else on ligand coordinates.
    topology : str or None
        Topology file needed for non-PDB trajectories.
    ligand_resname : str or None
        Residue name of the ligand; added as a parameter because the
        original referenced an undefined name here (NameError).
    """
    energyColumn = 3

    if output_folder is not None:
        outputFolder = os.path.join(output_folder, "")
        if not os.path.exists(outputFolder):
            os.makedirs(outputFolder)
    else:
        outputFolder = ""

    extractCoords.main(folder_name,
                       lig_resname=ligand_resname,
                       non_Repeat=True,
                       atom_Ids=atom_Ids)

    epochFolders = utilities.get_epoch_folders(folder_name)
    points = []
    for epoch in epochFolders:
        report_files = glob.glob(os.path.join(epoch, "*report*"))
        report_files.sort(key=lambda x: int(x[x.rfind("_") + 1:]))
        for report_name in report_files:
            traj_num = int(report_name[report_name.rfind("_") + 1:])
            coordinates = np.loadtxt(
                os.path.join(
                    folder_name, "%s/extractedCoordinates/coord_%d.dat" %
                    (epoch, traj_num)))
            report = np.loadtxt(report_name)
            if len(report.shape) < 2:
                # single-snapshot report: loadtxt returns a 1-D array
                points.append([
                    report[energyColumn], report[SASAColumn],
                    int(epoch), traj_num, 0
                ] + coordinates[1:].tolist())
            else:
                epoch_line = np.array([int(epoch)] * report.shape[0])
                traj_line = np.array([traj_num] * report.shape[0])
                snapshot_line = np.array(range(report.shape[0]))
                points.extend(
                    np.hstack(
                        (report[:, (energyColumn, SASAColumn)],
                         epoch_line[:, np.newaxis], traj_line[:, np.newaxis],
                         snapshot_line[:, np.newaxis], coordinates[:, 1:])))
    # sort all snapshots by SASA so bins are contiguous index ranges
    points = np.array(points)
    points = points[points[:, 1].argsort()]
    minSASA = points[0, 1]
    maxSASA = points[-1, 1]
    left_bins = np.linspace(minSASA, maxSASA, num=num_bins, endpoint=False)
    indices = np.searchsorted(points[:, 1], left_bins)
    # per-bin energy threshold at the requested percentile
    thresholds = np.array([
        np.percentile(points[i:j, 0], percentile)
        for i, j in zip(indices[:-1], indices[1:])
    ])

    # keep only snapshots strictly below their bin's energy threshold
    new_points = []
    occupation = []
    for ij, (i, j) in enumerate(zip(indices[:-1], indices[1:])):
        found = np.where(points[i:j, 0] < thresholds[ij])[0]
        occupation.append(len(found))
        if len(found) == 1:
            new_points.append(points[found + i])
        elif len(found) > 1:
            new_points.extend(points[found + i])

    points = np.array(new_points)
    if norm_energy:
        # shift to zero minimum, then scale by the (shifted) maximum
        energyMin = points.min(axis=0)[0]
        points[:, 0] -= energyMin
        energyMax = points.max(axis=0)[0]
        points[:, 0] /= energyMax

    if cluster_energy:
        print("Clustering using energy and SASA")
        kmeans = KMeans(n_clusters=n_clusters).fit(points[:, :2])
        title = "clusters_%d_energy_SASA.pdb"
    else:
        print("Clustering using ligand coordinates")
        kmeans = KMeans(n_clusters=n_clusters).fit(points[:, 5:8])
        title = "clusters_%d_energy_SASA_coords.pdb"
    centers_energy = []
    centers_coords = []
    if topology is not None:
        topology_contents = utilities.getTopologyFile(topology)
    else:
        # Fixed: the original re-assigned `topology = None` here (a no-op),
        # leaving topology_contents undefined for the non-PDB branch below
        topology_contents = None
    for i, center in enumerate(kmeans.cluster_centers_):
        # pick the snapshot closest to the cluster center
        if cluster_energy:
            dist = np.linalg.norm((points[:, :2] - center), axis=1)
        else:
            dist = np.linalg.norm((points[:, 5:8] - center), axis=1)
        epoch, traj, snapshot = points[dist.argmin(), 2:5]
        centers_energy.append(points[dist.argmin(), :2])
        centers_coords.append(points[dist.argmin(), 5:8])
        traj_file = glob.glob("%d/%s_%d*" % (epoch, traj_basename, traj))[0]
        conf = utilities.getSnapshots(traj_file,
                                      topology=topology)[int(snapshot)]
        # str, not basestring: basestring was removed in Python 3 and the
        # rest of this function already uses Python-3 constructs
        if isinstance(conf, str):
            with open(os.path.join(outputFolder, "initial_%d.pdb" % i),
                      "w") as fw:
                fw.write(conf)
        else:
            utilities.write_mdtraj_object_PDB(
                conf, os.path.join(outputFolder, "initial_%d.pdb" % i),
                topology_contents)
    centers_energy = np.array(centers_energy)
    centers_coords = np.array(centers_coords)
    writePDB(centers_coords, os.path.join(outputFolder, title % n_clusters))
    if plots:
        plt.scatter(points[:, 1], points[:, 0], c=kmeans.labels_, alpha=0.5)
        plt.scatter(centers_energy[:, 1],
                    centers_energy[:, 0],
                    c=list(range(n_clusters)),
                    marker='x',
                    s=56,
                    zorder=1)
        plt.xlabel("SASA")
        if norm_energy:
            plt.ylabel("Energy (normalized)")
            plt.savefig(
                os.path.join(outputFolder, "clusters_energy_normalized.png"))
        else:
            plt.ylabel("Energy (kcal/mol)")
            plt.savefig(
                os.path.join(outputFolder, "clusters_no_normalized.png"))
        plt.show()
def main(num_clusters,
         criteria1,
         criteria2,
         output_folder,
         ligand_resname,
         atom_ids,
         cpus=2,
         topology=None,
         report="report_",
         traj="trajectory_",
         use_pdb=False):
    """Cluster ligand COM coordinates with KMeans, extract two report
    metrics per cluster center and write representative structures and a
    clusters plot.

    Parameters
    ----------
    num_clusters : int
        Number of KMeans clusters to generate.
    criteria1, criteria2 : metric selectors passed to ``get_metric`` for
        each cluster-center snapshot.
    output_folder : str or None
        Folder for the output files; cwd if None.
    ligand_resname : str
        Residue name of the ligand whose coordinates are extracted.
    cpus : int
        Number of processors used by the coordinate extraction.
    topology : str or None
        Topology file needed for non-PDB trajectories.
    """
    # Extract coordinates only if not already done in a previous run
    if not glob.glob("*/extractedCoordinates/coord_*"):
        extractCoords.main(lig_resname=ligand_resname,
                           non_Repeat=True,
                           atom_Ids=atom_ids,
                           nProcessors=cpus,
                           parallelize=False,
                           topology=topology,
                           use_pdb=use_pdb)
    trajectoryFolder = "allTrajs"
    trajectoryBasename = "*traj*"
    stride = 1
    clusterCountsThreshold = 0
    folders = utilities.get_epoch_folders(".")
    folders.sort(key=int)

    clusteringObject = cluster.Cluster(num_clusters,
                                       trajectoryFolder,
                                       trajectoryBasename,
                                       alwaysCluster=True,
                                       stride=stride)
    clusteringObject.clusterTrajectories()
    clusteringObject.eliminateLowPopulatedClusters(clusterCountsThreshold)
    clusterCenters = clusteringObject.clusterCenters
    centersInfo = get_centers_info(trajectoryFolder, trajectoryBasename,
                                   num_clusters, clusterCenters)
    # range, not xrange: xrange was removed in Python 3 and this function
    # already uses Python-3 constructs elsewhere
    COMArray = [centersInfo[i]['center'] for i in range(num_clusters)]

    fields1 = []
    fields2 = []
    for cluster_num in centersInfo:
        epoch_num, traj_num, snap_num = map(
            int, centersInfo[cluster_num]['structure'])
        field1, crit1_name = get_metric(criteria1, epoch_num, traj_num,
                                        snap_num, report)
        field2, crit2_name = get_metric(criteria2, epoch_num, traj_num,
                                        snap_num, report)
        fields1.append(field1)
        fields2.append(field2)

    if output_folder is not None:
        # join with "" guarantees a trailing separator for the string
        # concatenations below
        outputFolder = os.path.join(output_folder, "")
        if not os.path.exists(outputFolder):
            os.makedirs(outputFolder)
    else:
        outputFolder = ""
    writePDB(
        COMArray,
        outputFolder + "clusters_%d_KMeans_allSnapshots.pdb" % num_clusters)
    writeInitialStructures(fields1,
                           fields2,
                           crit1_name,
                           crit2_name,
                           centersInfo,
                           outputFolder + "cluster_{}_{}_{}_{}_{}.pdb",
                           traj,
                           topology=topology,
                           use_pdb=use_pdb)
    plotClusters(fields1, fields2, crit1_name, crit2_name, outputFolder)