Example #1
    def test_batch_load(self):
        clusters =((Cluster(16,[16]), "data/training_clustering_1.bin"),
                   (Cluster(4,[4,5,6,7,8]), "data/training_clustering_2.bin"),
                   (Cluster(0,[0,1,2,3]), "data/training_clustering_3.bin"),
                   (Cluster(9,[9,10,11,12,13,14,15]), "data/training_clustering_4.bin"))
        
        # Creates 4 clusterings of 1 cluster
        filenames = []
        for cluster, filename in clusters:
            Clustering([cluster]).save_to_disk(filename)
            filenames.append(filename)
        
        # Then loads them and extracts their elements
        elements = []
        for filename in filenames:
            elements.extend(Clustering.load_from_disk(filename).get_all_clustered_elements())
        
        elements_batch = []
        clusterings_batch = Clustering.load_all_from_directory("data/")
        for clustering, filename in clusterings_batch:
            elements_batch.extend(clustering.get_all_clustered_elements())

        # And cleans up the generated files
        for filename in filenames:
            os.remove(filename)
            
        numpy.testing.assert_equal(sorted(elements), range(17))
        numpy.testing.assert_equal(sorted(elements_batch), range(17))
Example #2
 def test_get_all_clustered_elements(self):
     clusters =(
               Cluster(16,[16]),
               Cluster(4,[4,5,6,7,8]),
               Cluster(0,[0,1,2,3]),
               Cluster(9,[9,10,11,12,13,14,15])
               )
     clusterization = Clustering(clusters)
     self.assertItemsEqual(sorted( clusterization.get_all_clustered_elements()), range(17))
Example #3
 def test_remove_noise(self):
     clusters =(
               Cluster(16,[16]),
               Cluster(4,[4,5,6,7,8]),
               Cluster(0,[0,1,2,3]),
               Cluster(9,[9,10,11,12,13,14,15])
               )
     clusterization = Clustering(clusters)
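     # eliminate_noise(5) is expected to drop every cluster with fewer than 5
     # elements (here the 1-element and the 4-element clusters), leaving 2.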
     clusterization.eliminate_noise(5)
     self.assertEqual(len(clusterization.clusters), 2)
Example #4
 def test_remove_cluster(self):
     clusters =(
               Cluster(16,[16]),
               Cluster(4,[4,5,6,7,8]),
               Cluster(0,[0,1,2,3]),
               Cluster(9,[9,10,11,12,13,14,15])
               )
     clustering = Clustering(clusters)
     c = Cluster(0,[0,1,2,3])
     clustering.eliminate_cluster(c)
     self.assertEqual(len(clustering.clusters), 3)
Example #5
 def test_to_dic(self):
     clustering =Clustering([Cluster(16,[16]),
                              Cluster(9,[9,10,11,12,13,14,15]),
                              Cluster(0,[0,1,2,3]),
                              Cluster(4,[4,5,6,7,8])])
     self.assertDictEqual(clustering.to_dic(),
                          {'clusters': [{'prototype': 9, 'elements': '9:15', 'id': 'cluster_1'}, 
                                        {'prototype': 4, 'elements': '4:8', 'id': 'cluster_3'}, 
                                        {'prototype': 0, 'elements': '0:3', 'id': 'cluster_2'}, 
                                        {'prototype': 16, 'elements': '16', 'id': 'cluster_0'}], 
                           'total_number_of_elements': 17, 
                           'number_of_clusters': 4})
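Note (not part of the original test): Clustering.from_dic, which several of the later examples use, rebuilds a Clustering from this same dictionary layout, so to_dic and from_dic form a round trip. A minimal sketch, assuming from_dic accepts the layout shown above:

clustering_dic = {'clusters': [{'prototype': 16, 'elements': '16', 'id': 'cluster_0'},
                               {'prototype': 0, 'elements': '0:3', 'id': 'cluster_1'}],
                  'total_number_of_elements': 5,
                  'number_of_clusters': 2}
rebuilt = Clustering.from_dic(clustering_dic)
print len(rebuilt.get_all_clustered_elements())  # expected to print 5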
Example #6
 def test_get_percent_population_of_cluster(self):
     clusters =(
               Cluster(16,[16]),
               Cluster(4,[4,5,6,7,8]),
               Cluster(0,[0,1,2,3]),
               Cluster(9,[9,10,11,12,13,14,15])
               )
     clusterization = Clustering(clusters)
       
     total = 0
     for i in range(4):
         total = total + clusterization.get_population_percent_of_cluster(i)
     self.assertAlmostEqual(total, 100., 2)
Example #7
 def test_load_and_save_to_disk(self):
     clusters =(Cluster(16,[16]),
                Cluster(4,[4,5,6,7,8]),
                Cluster(0,[0,1,2,3]),
                Cluster(9,[9,10,11,12,13,14,15]))
     
     clustering = Clustering(clusters)
     before_saving_elements = clustering.get_all_clustered_elements()
     clustering.save_to_disk("data/saved_clustering_for_test")
     loaded_clustering = Clustering.load_from_disk("data/saved_clustering_for_test")
     after_saving_elements = loaded_clustering.get_all_clustered_elements()
     self.assertItemsEqual(before_saving_elements, after_saving_elements)
     os.remove("data/saved_clustering_for_test")
Example #8
 def test_get_percent_of_n_clusters(self):
     clusters =(
               Cluster(16,[16]),
               Cluster(4,[4,5,6,7,8]),
               Cluster(0,[0,1,2,3]),
               Cluster(9,[9,10,11,12,13,14,15])
               )
     clusterization = Clustering(clusters)
       
     percents = clusterization.get_population_percent_of_n_bigger_clusters(3)
     expected_percents = [41.1764705882,29.4117647059,23.5294117647]
     for i in range(3):
         self.assertAlmostEqual(percents[i], expected_percents[i], 1)
Example #9
 def test_cluster_is_inside(self):
     clusters =(
               Cluster(16,[16]),
               Cluster(4,[4,5,6,7,8]),
               Cluster(0,[0,1,2,3]),
               Cluster(9,[9,10,11,12,13,14,15])
               )
     not_in_cluster= Cluster(17,[17,16])
     in_cluster = Cluster(0,[0,1,2,3])
     clusterization = Clustering(clusters)
     self.assertEqual(clusterization.cluster_index(not_in_cluster),-1)
     self.assertEqual(clusterization.cluster_index(in_cluster),2)
     self.assertEqual(clusterization.cluster_is_inside(not_in_cluster),False)
     self.assertEqual(clusterization.cluster_is_inside(in_cluster),True)
Example #10
 def test_number_of_clusters_needed_to_get_this_percent_of_elems(self):
     clusters =(
               Cluster(16,[16]),
               Cluster(4,[4,5,6,7,8]),
               Cluster(0,[0,1,2,3]),
               Cluster(9,[9,10,11,12,13,14,15])
               )
       
     clusterization = Clustering(clusters)
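     # Cluster populations are 7, 5, 4 and 1 out of 17 elements, so the 1, 2,
     # 3 and 4 biggest clusters cover ~41.2%, ~70.6%, ~94.1% and 100%.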
       
     self.assertEqual(clusterization.number_of_clusters_to_get_percent( 71),3)
     self.assertEqual(clusterization.number_of_clusters_to_get_percent( 70),2)
     self.assertEqual(clusterization.number_of_clusters_to_get_percent( 40),1)
     self.assertEqual(clusterization.number_of_clusters_to_get_percent( 42),2)
     self.assertEqual(clusterization.number_of_clusters_to_get_percent( 100),4)
Example #11
def get_best_clustering(results_file):
    """
    Loads and returns the best clustering from a results file.
    """
    results = convert_to_utf8(json.loads(open(results_file).read()))
    best_clustering_id =results["best_clustering"]
    best_clustering_dic = results["selected"][best_clustering_id]
    return Clustering.from_dic(best_clustering_dic["clustering"])
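A minimal usage sketch for the helper above (the path is a placeholder; the "best_clustering" and "selected" keys are the ones the results files expose, as also read in Examples #16 and #18):

best = get_best_clustering("results/results.json")  # placeholder path
print best.to_dic()["number_of_clusters"]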
Example #12
 def test_classify(self):
     tags = ["A","B","C"]
     clusterings = [Clustering([], "this is of type A"),Clustering([], "this is of type B"),Clustering([], "this is of type C"),
                    Clustering([], "this is of type B"),Clustering([], "this is of type S"),Clustering([], "this is of type A"),
                    Clustering([], "this is of type A"),Clustering([], "this is of type C"),Clustering([], "this is of type D")]
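     # Clustering.classify is expected to count, per tag, how many of the
     # clustering description strings above mention that tag; "S" and "D"
     # match no tag and are therefore not counted.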
     counter =  Clustering.classify(tags, clusterings)
     self.assertEqual(counter['A'], 3)
     self.assertEqual(counter['B'], 2)
     self.assertEqual(counter['C'], 2)
Example #13
 def test_gen_class_list(self):
     clusters =(
               Cluster(16,[16]),
               Cluster(4,[4,5,6,7,8]),
               Cluster(0,[0,1,2,3]),
               Cluster(9,[9,10,11,12,13,14,15])
               )
     clusterization = Clustering(clusters)
     class_list = clusterization.gen_class_list()
     expected_class_list = [2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 3]
     self.assertItemsEqual(class_list, expected_class_list)
       
     clusters =(
               Cluster(0,[0,1,2,3]),
               Cluster(9,[9,10,11,12,13,14,15])
               )
     clusterization = Clustering(clusters)
     class_list = clusterization.gen_class_list()
     expected_class_list = [1, 1, 1, 1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0]
     self.assertItemsEqual(class_list, expected_class_list)
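A related sketch (not in the original tests): Example #22 calls gen_class_list with an explicit number_of_elements, which matters when the last elements are unclustered and would otherwise not appear in the class list at all:

partial = Clustering([Cluster(0, [0, 1, 2])])
print partial.gen_class_list(number_of_elements=5)  # presumably [0, 0, 0, -1, -1]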
Example #14
 def testSilhouetteSpecialCase(self):
     clustering = Clustering.from_dic(data.clustering_01)
     mh = MatrixHandler({
         "method": "load",
         "parameters": {
             "path": "data/example_clustering_1_matrix"
         }
     })
     s = SilhouetteCoefficientCalculator()
     matrix =  mh.create_matrix(None)
     print s.evaluate(clustering, matrix)
Example #15
    def get_best_clustering(self, parameters):
        best_clustering = None
        ##############################
        # Do the actual clustering
        ##############################
        clustering_results = None

        ##############################
        # Load the clustering
        ##############################
        if parameters["clustering"]["generation"]["method"] == "load":
            best_clustering = {"clustering":Clustering.from_dic(parameters["clustering"]["generation"])}

        ##############################
        # Or generate it
        ##############################
        elif parameters["clustering"]["generation"]["method"] == "generate":
            clustering_results = ClusteringProtocol(self.timer, self.observer).run(parameters, self.matrixHandler,
                                                                                                self.workspaceHandler,
                                                                                                self.trajectoryHandler)
            best_clustering = None
            abort = False

            if clustering_results is not None:
                best_clustering_id, selected, not_selected, scores = clustering_results  # @UnusedVariable

                #################################
                # Abort if no clusters were found
                #################################
                if best_clustering_id is None:
                    abort = True
                else:
                    best_clustering = selected[best_clustering_id]
            else:
                abort = True

            if abort:
                self.notify("SHUTDOWN", "Unproductive clustering search. Relax evaluation constraints.")
                print "[FATAL Driver:get_best_clustering] Unproductive clustering search. Exiting..."
                exit()

        return best_clustering, clustering_results
Example #16
        for j in range(0,N):
            if (i,j) in cluster.percents:
                data[str(i)].append( cluster.percents[(i,j)])
            else:
                data[str(i)].append(0)

    return data



if __name__ == '__main__':
    results = convert_to_utf8(json.loads(open(sys.argv[1]).read()))
    best_clustering_id =results["best_clustering"]
    best_clustering_dic = results["selected"][best_clustering_id]
    num_clusters = best_clustering_dic["clustering"]["number_of_clusters"]
    clustering = Clustering.from_dic(best_clustering_dic["clustering"])
    file_frames = int(sys.argv[2])

    # generate a map element -> interpolation
    index_to_interpolation = {}
    acc = 0
    for i in range(0, file_frames-1):
        for j in range(i+1, file_frames):
            for k in range(20):
                index_to_interpolation[acc] = (i,j)
                acc += 1


    for cluster in clustering.clusters:
        colors = iter(cm.rainbow(np.linspace(0, 1, N)))
        theta = radar_factory(N, frame='polygon')
Example #17
 
 #--------------------------------  
 # Prepare the clustering for this drug/protein pair
 #--------------------------------
 ## Load template and modify its contents for this case
 CLUSTERING_PATH = os.path.join(RESULTS_PATH,"%s_%s_clustering"%(options.drug, options.protein))
 MAX_CLUSTERS = 10
 SCRIPT_PATH = os.path.join(RESULTS_PATH,"clustering.json")
 OUT_FILE = os.path.join(RESULTS_PATH, "clustering.out")
 script = load_dic_in_json(options.template)
 script["global"]["workspace"]["base"] = CLUSTERING_PATH
 script["data"]["files"].append(FILTERED_PDB_FILE)
 script["clustering"]["evaluation"]["maximum_clusters"] = MAX_CLUSTERS
 save_dic_in_json(script, SCRIPT_PATH)
 os.system("python -m pyproct.main %s > %s"%(SCRIPT_PATH, OUT_FILE))
 best_clustering = Clustering.from_dic(get_best_clustering(CLUSTERING_PATH)["clustering"])
  
 #--------------------------------
 # Now calculate the values
 #--------------------------------
 results = {}
 for cluster in best_clustering.clusters:
     energies = metrics[1][cluster.all_elements]
     distances = metrics[0][cluster.all_elements]
     results[cluster.id] = {}
     results[cluster.id]["max_energy"] = numpy.max(energies)
     results[cluster.id]["min_energy"] = numpy.min(energies)
     results[cluster.id]["mean_energy"] = numpy.mean(energies)
     results[cluster.id]["mean_distance"] = numpy.mean(distances)
     results[cluster.id]["population"] = len(cluster.all_elements)
      
Example #18
        parameters = ProtocolParameters.get_params_from_json(script_str)
        # And adjust the hypothesis-related evaluation settings for this dataset
        parameters["clustering"]["evaluation"]["maximum_noise"] = data.noise[dataset_name]
        parameters["clustering"]["evaluation"]["minimum_cluster_size"] = data.minsize[dataset_name]
        parameters["clustering"]["evaluation"]["minimum_clusters"] = data.num_cluster_ranges[dataset_name][0]
        parameters["clustering"]["evaluation"]["maximum_clusters"] = data.num_cluster_ranges[dataset_name][1]
        print parameters["clustering"]["evaluation"]["minimum_clusters"], parameters["clustering"]["evaluation"]["maximum_clusters"]
        if dataset_name in data.criteria:
            parameters["clustering"]["evaluation"]["evaluation_criteria"] = data.criteria[dataset_name]
        else:
            parameters["clustering"]["evaluation"]["evaluation_criteria"] = data.criteria["default"]
        Driver(Observer()).run(parameters)

    for dataset_name in ['concentric_circles']: #data.all_datasets:
        results_file = os.path.join(os.path.abspath("./tmp/%s"%dataset_name),"results/results.json")
        results = convert_to_utf8(json.loads(open(results_file).read()))
        best = results["best_clustering"]
        clustering = Clustering.from_dic(results["selected"][best]["clustering"])
        vtools.show_2D_dataset_clusters(all_observations[dataset_name],
                                        clustering,
                                        scale = 20,
                                        margin = 20).save("clustering_images/%s.jpg"%dataset_name,
                                                 "JPEG")
        print dataset_name,results["selected"][best]["type"],results["selected"][best]["clustering"]["number_of_clusters"], results["selected"][best]["evaluation"]["Noise level"],#results["selected"][best]["parameters"]
        # look for the best criteria
        criteria_scores = []
        for criteria in results["scores"]:
            criteria_scores.append((results["scores"][criteria][best],criteria))
        print criteria_scores

    print "\nDone"
Example #19
 def test_get_proportional_size_representatives(self):
     clusters = [ClusterMock(range(0,10)),ClusterMock(range(10,50)),ClusterMock(range(50,80)),ClusterMock(range(80,200))]
     clustering = Clustering(clusters)
     rep =  clustering.get_proportional_size_representatives(30, "distance_matrix" )
     self.assertItemsEqual(rep, [0, 0, 10, 10, 11, 12, 13, 14, 50, 50, 51, 52, 53, 80, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96])
Example #20
    fig = plt.figure()
    ax = fig.gca(projection='3d')
    # Plot protein
    pdb = prody.parsePDB(params["data"]["files"][0])
    if options.show_protein:
        pdb_backbone = pdb.select("name CA").getCoordsets()[0] # "backbone not hetero"
        ax.plot(pdb_backbone.T[0], pdb_backbone.T[1], pdb_backbone.T[2])

    # Get geometric centers and plot ligands
    ligand_coords = pdb.select(params["data"]["matrix"]["parameters"]["body_selection"]).getCoordsets()

    # Get clustering
    if options.clustering_to_see is None:
        options.clustering_to_see = results["best_clustering"]
    try:
        clustering = Clustering.from_dic(results["selected"][options.clustering_to_see]["clustering"])
        # Print some info
        print_cluster_info("selected", options.clustering_to_see, results)
    except KeyError:
        clustering = Clustering.from_dic(results["not_selected"][options.clustering_to_see]["clustering"])
        # Print some info
        print_cluster_info("not_selected", options.clustering_to_see, results)

    # Show all clusters
    colors = iter(cm.rainbow(numpy.linspace(0, 1, len(clustering.clusters))))
    for cluster in clustering.clusters:
        centers = []
        for i,element in enumerate(cluster.all_elements):
            if options.stride is None or i%options.stride == 0:
                coords = ligand_coords[element]
                centers.append(coords.mean(0))
Example #21
 def test_get_medoids(self):
     clusters = [ClusterMock(range(0,10)),ClusterMock(range(10,50)),ClusterMock(range(50,80)),ClusterMock(range(80,200))]
     clustering = Clustering(clusters)
     self.assertItemsEqual(clustering.get_medoids("distance_matrix"),[0, 10, 50, 80])
Example #22
 def load_clustering(self, parameters):
     best_clustering = {"clustering":Clustering.from_dic(parameters["clustering"]["generation"]["parameters"])}
     return ( "loaded_clustering", {"loaded_clustering":best_clustering}, {}, None)
#
# RCD_script = copy.deepcopy(template_script)
# RCD_script["global"]["workspace"]["base"] = os.path.join("RDCvsRMSD", "campari", "RDC", "clustering")
# RCD_script["data"]["matrix"]["method"] = "load"
# RCD_script["data"]["matrix"]["parameters"]["path"] = os.path.join("RDCvsRMSD", "campari", "RDC", "matrix")
# RCD_script["data"]["files"].append(os.path.join("RDCvsRMSD", "campari.pdb"))
#
# tools.save_dic_in_json(RCD_script, os.path.join("RDCvsRMSD", "campari", "RDC", "script.json"))
# tools.save_dic_in_json(RMSD_script, os.path.join("RDCvsRMSD", "campari", "RMSD", "script.json"))
#
# os.system("python %s %s "%(PYPROCT, os.path.join("RDCvsRMSD", "campari", "RDC", "script.json")))
# os.system("python %s %s "%(PYPROCT, os.path.join("RDCvsRMSD", "campari", "RMSD", "script.json")))


results = tools.load_dic_in_json(os.path.join("RDCvsRMSD", "campari", "RDC_refined", "clustering","results","results.json"))
RDC_clustering = Clustering.from_dic(results["selected"][results["best_clustering"]]["clustering"]).gen_class_list(number_of_elements = 5926)

results = tools.load_dic_in_json(os.path.join("RDCvsRMSD", "campari", "RMSD_refined", "clustering","results","results.json"))
RMSD_clustering = Clustering.from_dic(results["selected"][results["best_clustering"]]["clustering"]).gen_class_list(number_of_elements = 5926)

results = tools.load_dic_in_json(os.path.join("RDCvsRMSD", "campari", "Dihedral", "clustering","results","results.json"))
Dihedral_clustering = Clustering.from_dic(results["selected"][results["best_clustering"]]["clustering"]).gen_class_list(number_of_elements = 5926)

results = tools.load_dic_in_json(os.path.join("RDCvsRMSD", "campari", "Dihedral", "clustering","results","results.json"))
Dihedral_bad_score = Clustering.from_dic(results["selected"]["clustering_0098"]["clustering"]).gen_class_list(number_of_elements = 5926)

results = tools.load_dic_in_json(os.path.join("RDCvsRMSD", "campari", "Dihedral", "clustering","results","results.json"))
Dihedral_medium_score = Clustering.from_dic(results["selected"]["clustering_0056"]["clustering"]).gen_class_list(number_of_elements = 5926)

results = tools.load_dic_in_json(os.path.join("RDCvsRMSD", "campari", "Dihedral", "clustering","results","results.json"))
Dihedral_fairly_good_score = Clustering.from_dic(results["selected"]["clustering_0212"]["clustering"]).gen_class_list(number_of_elements = 5926)