Python perform_DBSCAN Examples, clustering.perform_DBSCAN Python Examples

Example #1

0

Show file

        eRMSDs = get_eRMSDs(r1, r2, inputfile, traj_file, top_file, num_confs)
    if parallel:
        out = parallelize.fire_multiprocess(traj_file,
                                            top_file,
                                            get_eRMSDs,
                                            num_confs,
                                            n_cpus,
                                            r2,
                                            inputfile,
                                            traj_file,
                                            top_file,
                                            matrix=True)
        eRMSDs = np.sum((i for i in out), axis=0)
    #eRMSDs = pickle.load(open('tmp_eRMSDs', 'rb'))

    #the eRMSD matrix is actually only half a matrix
    for ni, i in enumerate(eRMSDs):
        for nj, j in enumerate(i):
            eRMSDs[nj][ni] = j
            if ni == nj:
                eRMSDs[ni][nj] = 0

    #since calculating the eRMSDs are so time-consuming to calculate we're gonna pickle it to iterate the DBSCAN later.
    with open("tmp_eRMSDs", "wb") as file:
        pickle.dump(eRMSDs, file)

    ###############################################################################################################
    #Next, we're going to perform a DBSCAN on that matrix of eRMSDs to find clusters of similar structures
    perform_DBSCAN(eRMSDs, num_confs, traj_file, inputfile)

    ##############################################################################################################

Example #2

0

Show file

File: eRMSD.py Project: sulcgroup/oxdna_analysis_tools

        eRMSDs = get_eRMSDs(r1, r2, inputfile, traj_file, top_file, num_confs)
    if parallel:
        out = parallelize_lorenzo_onefile.fire_multiprocess(traj_file,
                                                            top_file,
                                                            get_eRMSDs,
                                                            num_confs,
                                                            n_cpus,
                                                            r2,
                                                            inputfile,
                                                            traj_file,
                                                            top_file,
                                                            matrix=True)
        eRMSDs = np.sum((i for i in out), axis=0)
    #eRMSDs = pickle.load(open('tmp_eRMSDs', 'rb'))

    #the eRMSD matrix is actually only half a matrix
    for ni, i in enumerate(eRMSDs):
        for nj, j in enumerate(i):
            eRMSDs[nj][ni] = j
            if ni == nj:
                eRMSDs[ni][nj] = 0

    #since calculating the eRMSDs are so time-consuming to calculate we're gonna pickle it to iterate the DBSCAN later.
    with open("tmp_eRMSDs", "wb") as file:
        pickle.dump(eRMSDs, file)

    ###############################################################################################################
    #Next, we're going to perform a DBSCAN on that matrix of eRMSDs to find clusters of similar structures
    perform_DBSCAN(eRMSDs, num_confs, traj_file, inputfile, "precomputed")

    ##############################################################################################################

Example #3

0

Show file

    
    #Create an oxView overlay showing the first SUM components
    SUM = 1
    print("INFO: Change the number of eigenvalues to sum and display by modifying the SUM variable in the script.  Current value: {}".format(SUM), file=stderr)
    weighted_sum = np.zeros_like(evectors[0])
    for i in range(0, SUM): #how many eigenvalues do you want?
        weighted_sum += evalues[i]*evectors[i]

    prep_pos_for_json = lambda conf: list(
                            list(p) for p in conf
                        )
    with catch_warnings(): #this produces an annoying warning about casting complex values to real values that is not relevant
        simplefilter("ignore")
        output_vectors = weighted_sum.reshape(int(weighted_sum.shape[0]/3), 3).astype(float)
    with open(outfile, "w+") as file:
        file.write(dumps({
            "pca" : prep_pos_for_json(output_vectors)
        }))

    #If we're running clustering, feed the linear terms into the clusterer
    if cluster:
        print("INFO: Mapping configurations to component space...", file=stderr)

        #If you want to cluster on only some of the components, uncomment this
        #out = out[:,0:3]

        from clustering import perform_DBSCAN
        labs = perform_DBSCAN(out, num_confs, traj_file, inputfile, "euclidean")

Example #4

0

Show file

            if outfile.split(".")[1] != "json":
                raise Exception
            f = outfile.split(".")[0] + str(i) + "." + outfile.split(".")[1]
        except:
            print(
                "ERROR: oxView overlays must have a '.json' extension.  No overlays will be produced",
                file=stderr)
            break
        out = np.sqrt(evalues[i]) * evectors[i]

        with catch_warnings(
        ):  #this produces an annoying warning about casting complex values to real values that is not relevant
            simplefilter("ignore")
            output_vectors = out.reshape(int(out.shape[0] / 3),
                                         3).astype(float)

        with open(f, "w+") as file:
            file.write(dumps({"pca": prep_pos_for_json(output_vectors)}))

    #If we're running clustering, feed the linear terms into the clusterer
    if cluster:
        print("INFO: Mapping configurations to component space...",
              file=stderr)

        #If you want to cluster on only some of the components, uncomment this
        #out = out[:,0:3]

        from clustering import perform_DBSCAN
        labs = perform_DBSCAN(coordinates, num_confs, traj_file, inputfile,
                              "euclidean")

Example #5

0

Show file

File: distance.py Project: liphlab/oxdna_analysis_tools

    if lineplt == True:
        if hist == True:
            #clear the histogram plot
            plt.clf()
            #if making two plots, automatically append the plot type to the output file name
            out = outfile[:outfile.find(".")] + "_traj" + outfile[outfile.
                                                                  find("."):]
        else:
            out = outfile
        graph_count = 0
        for traj_set in distances:
            for dist_list in traj_set:
                a = plt.plot(dist_list, alpha=0.5, label=names[graph_count])
                graph_count += 1
        plt.xlabel("Simulation Steps")
        plt.ylabel("Distance (nm)")
        plt.legend()
        #plt.show()
        print("INFO: Writing trajectory plot to file {}".format(out),
              file=stderr)
        plt.savefig("{}".format(out))

    if cluster == True:
        if not all([x == trajectories[0] for x in trajectories]):
            print("ERROR: Clustering can only be run on a single trajectory",
                  file=stderr)
            exit(1)
        from clustering import perform_DBSCAN

        labs = perform_DBSCAN(distances[0].T, len(distances[0][0]),
                              trajectories[0], input_files[0], "euclidean")

Example #6

0

Show file

def main():
    parser = argparse.ArgumentParser(
        prog=path.basename(__file__),
        description=
        "Calculates a principal component analysis of nucleotide deviations over a trajectory"
    )
    parser.add_argument('inputfile',
                        type=str,
                        nargs=1,
                        help="The inputfile used to run the simulation")
    parser.add_argument('trajectory',
                        type=str,
                        nargs=1,
                        help='the trajectory file you wish to analyze')
    parser.add_argument(
        'meanfile',
        type=str,
        nargs=1,
        help='The mean structure .json file from compute_mean.py')
    parser.add_argument(
        'outfile',
        type=str,
        nargs=1,
        help='the name of the .json file where the PCA will be written')
    parser.add_argument('-p',
                        metavar='num_cpus',
                        nargs=1,
                        type=int,
                        dest='parallel',
                        help="(optional) How many cores to use")
    parser.add_argument(
        '-c',
        metavar='cluster',
        dest='cluster',
        action='store_const',
        const=True,
        default=False,
        help="Run the clusterer on each configuration's position in PCA space?"
    )
    args = parser.parse_args()

    check_dependencies(["python", "numpy", "Bio"])

    traj_file = args.trajectory[0]
    inputfile = args.inputfile[0]
    mean_file = args.meanfile[0]
    outfile = args.outfile[0]
    parallel = args.parallel
    if parallel:
        n_cpus = args.parallel[0]
    #-c makes it run the clusterer on the output
    cluster = args.cluster
    top_file = get_input_parameter(inputfile, "topology")
    if "RNA" in get_input_parameter(inputfile, "interaction_type"):
        environ["OXRNA"] = "1"
    else:
        environ["OXRNA"] = "0"
    import UTILS.base  #this needs to be imported after the model type is set

    num_confs = cal_confs(traj_file)

    if mean_file.split(".")[-1] == "json":
        with open(mean_file) as file:
            align_conf = load(file)['g_mean']

    elif mean_file.split(".")[-1] == "dat":
        fetch_np = lambda conf: np.array([n.cm_pos for n in conf._nucleotides])
        with LorenzoReader2(mean_file, top_file) as reader:
            s = reader._get_system()
            align_conf = fetch_np(s)

    cms = np.mean(align_conf,
                  axis=0)  #all structures must have the same center of mass
    align_conf -= cms

    #Compute the deviations
    if not parallel:
        r = LorenzoReader2(traj_file, top_file)
        deviations_matrix = get_pca(r, align_conf, num_confs)

    if parallel:
        out = parallelize_lorenzo_onefile.fire_multiprocess(
            traj_file, top_file, get_pca, num_confs, n_cpus, align_conf)
        deviations_matrix = np.concatenate([i for i in out])

    #now that we have the deviations matrix we're gonna get the covariance and PCA it
    #note that in the future we might want a switch for covariance vs correlation matrix because correlation (cov/stdev so all diagonals are 1) is better for really floppy structures
    pca = PCA(n_components=3)
    pca.fit(deviations_matrix)
    transformed = pca.transform(deviations_matrix)

    #THIS IS AS FAR AS I GOT

    import matplotlib.pyplot as plt
    print("INFO: Saving scree plot to scree.png", file=stderr)
    plt.scatter(range(0, len(evalues)), evalues, s=25)
    plt.xlabel("component")
    plt.ylabel("eigenvalue")
    plt.savefig("scree.png")

    print(
        "INFO: Creating coordinate plot from first three eigenvectors.  Saving to coordinates.png",
        file=stderr)
    #if you want to weight the components by their eigenvectors
    #mul = np.einsum('ij,i->ij',evectors[0:3], evalues[0:3])
    mul = evectors

    #reconstruct configurations in component space
    out = np.dot(deviations_matrix, mul).astype(float)

    #make a quick plot from the first three components
    from mpl_toolkits.mplot3d import Axes3D
    fig = plt.figure()
    ax = fig.gca(projection='3d')
    ax.scatter(out[:, 0], out[:, 1], out[:, 2], c='g', s=25)
    plt.savefig("coordinates.png")

    #Create an oxView overlay showing the first SUM components
    SUM = 1
    print(
        "INFO: Change the number of eigenvalues to sum and display by modifying the SUM variable in the script.  Current value: {}"
        .format(SUM),
        file=stderr)
    weighted_sum = np.zeros_like(evectors[0])
    for i in range(0, SUM):  #how many eigenvalues do you want?
        weighted_sum += evalues[i] * evectors[i]

    prep_pos_for_json = lambda conf: list(list(p) for p in conf)
    with catch_warnings(
    ):  #this produces an annoying warning about casting complex values to real values that is not relevant
        simplefilter("ignore")
        output_vectors = weighted_sum.reshape(int(weighted_sum.shape[0] / 3),
                                              3).astype(float)
    with open(outfile, "w+") as file:
        file.write(dumps({"pca": prep_pos_for_json(output_vectors)}))

    #If we're running clustering, feed the linear terms into the clusterer
    if cluster:
        print("INFO: Mapping configurations to component space...",
              file=stderr)

        #If you want to cluster on only some of the components, uncomment this
        #out = out[:,0:3]

        from clustering import perform_DBSCAN
        labs = perform_DBSCAN(out, num_confs, traj_file, inputfile,
                              "euclidean", 12, 8)