Code Example #1
# imports added so the snippet is self-contained
import numpy as np
import shogun as sg


def _train_clustering(point_set, distance_name, k):
    # float64 arrays are required by shogun's SWIG typemaps
    labels = np.zeros(len(point_set))
    features = np.zeros((2, len(point_set)))

    for i in range(len(point_set)):
        features[0, i] = point_set[i]['x']
        features[1, i] = point_set[i]['y']
        labels[i] = point_set[i]['label']

    lab = sg.BinaryLabels(labels)      # built but unused: KMeans is unsupervised
    train = sg.RealFeatures(features)

    if distance_name == "EuclideanDistance":
        distance = sg.EuclideanDistance(train, train)
    elif distance_name == "ManhattanMetric":
        distance = sg.ManhattanMetric(train, train)
    elif distance_name == "JensenMetric":
        distance = sg.JensenMetric(train, train)
    else:
        raise ValueError("unsupported distance: %s" % distance_name)

    kmeans = sg.KMeans(k, distance)
    kmeans.train()

    return kmeans
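
A minimal, hypothetical call for the helper above (the point dicts and their values are invented for illustration; the function expects 'x', 'y', and a binary 'label' per point):

points = [
    {'x': 0.0, 'y': 0.1, 'label': -1},
    {'x': 0.2, 'y': 0.0, 'label': -1},
    {'x': 5.0, 'y': 5.1, 'label': 1},
    {'x': 5.2, 'y': 4.9, 'label': 1},
]
kmeans = _train_clustering(points, "EuclideanDistance", 2)
print(kmeans.get_cluster_centers())  # 2 x 2 matrix of centroids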
Code Example #2
File: run_base.py  Project: xinyin1990/ml-perf
def shogunProcess(clustersNumber,
                  dataLessTarget,
                  datasetName,
                  runinfo=None,
                  initialClusters=None):
    import shogun

    outputFile = datasetOutFile(datasetName, SHOGUN_ALGO, runinfo=runinfo)

    if os.path.exists(outputFile):
        print("shogun skipped")
        return

    train_features = shogun.RealFeatures(dataLessTarget.values.astype("float64").transpose())
    # distance metric over feature matrix - Euclidean distance
    distance = shogun.EuclideanDistance(train_features, train_features)

    hierarchical = shogun.Hierarchical(clustersNumber, distance)
    # train() was missing in the original snippet; calling the getters on an
    # untrained model is the likely cause of the crash noted below
    hierarchical.train()

    # TODO Makes the Python process die!!!???!!!
    d = hierarchical.get_merge_distances()
    cp = hierarchical.get_cluster_pairs()

    # flat per-point cluster assignment (`result` was undefined in the original)
    result = hierarchical.get_assignment()

    with open(outputFile, 'w') as csvfile:
        filewriter = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL)

        for index, row in dataLessTarget.iterrows():
            filewriter.writerow([index, int(result[index])])
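
Not part of the original snippet: a short sketch of how the merge structure could be inspected, assuming (as the legacy shogun bindings typically return) that d is a 1-D numpy array of merge distances and cp a 2 x n numpy matrix of the cluster indices merged at each step:

for step in range(len(d)):
    print("merge %d: clusters %d and %d at distance %.4f"
          % (step, cp[0, step], cp[1, step], d[step]))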
Code Example #3
    def run_kmeans_base(self,
                        nb_clusters,
                        src_file,
                        data_without_target,
                        dataset_name,
                        run_number,
                        config_function,
                        run_info=None,
                        nb_iterations=None):
        self._init()
        output_file, centroids_file = self._prepare_files(
            dataset_name, run_info, True)

        train_features = shogun.RealFeatures(
            data_without_target.values.astype("float64").transpose())
        # distance metric over feature matrix - Euclidean distance
        distance = shogun.EuclideanDistance(train_features, train_features)

        # KMeans object created
        kmeans = shogun.KMeans(nb_clusters, distance)
        if config_function is not None:
            config_function(kmeans)

        if nb_iterations is not None:
            kmeans.set_max_iter(nb_iterations)

        centers, result = Shogun._kmeans_process(kmeans)
        ClusteringToolkit._save_clustering(
            Shogun._clustering_to_list(data_without_target, result),
            output_file)
        ClusteringToolkit._save_centroids(Shogun._centroids_to_list(centers),
                                          centroids_file)

        return output_file, {"centroids": centroids_file}
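
Shogun._kmeans_process is not shown in this listing; judging from the training sequence in Code Example #4, a plausible (hypothetical) implementation would be:

    @staticmethod
    def _kmeans_process(kmeans):
        # hypothetical reconstruction, mirroring Code Example #4
        kmeans.train()
        centers = kmeans.get_cluster_centers()
        result = kmeans.apply()
        return centers, result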
Code Example #4
File: run_base.py  Project: xinyin1990/ml-perf
def shogunProcess(clustersNumber,
                  dataLessTarget,
                  datasetName,
                  runinfo=None,
                  initialClusters=None):
    import shogun

    outputFile = datasetOutFile(datasetName, SHOGUN_ALGO, runinfo=runinfo)
    clustersOutputFile = datasetOutFile(datasetName,
                                        centroidFor(SHOGUN_ALGO),
                                        runinfo=runinfo)

    if os.path.exists(outputFile) and os.path.exists(clustersOutputFile):
        print("shogun skipped")
        return

    train_features = shogun.RealFeatures(
        dataLessTarget.values.astype("float64").transpose())
    # distance metric over feature matrix - Euclidean distance
    distance = shogun.EuclideanDistance(train_features, train_features)

    # KMeans object created
    kmeans = shogun.KMeans(clustersNumber, distance)

    if initialClusters is None:
        # set KMeans++ flag
        kmeans.set_use_kmeanspp(True)
    else:
        # set new initial centers
        kmeans.set_initial_centers(
            initialClusters.astype("float64").transpose())

    # KMeans training
    kmeans.train()

    # cluster centers
    centers = kmeans.get_cluster_centers()

    # Labels for data points
    result = kmeans.apply()

    with open(outputFile, 'w') as csvfile:
        filewriter = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL)

        for index, row in dataLessTarget.iterrows():
            filewriter.writerow([index, result[index].item(0)])

    with open(clustersOutputFile, 'w') as clusterFile:
        filewriter = csv.writer(clusterFile, quoting=csv.QUOTE_MINIMAL)

        for row in centers.transpose():
            filewriter.writerow(row.tolist())
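
A standalone sketch (toy data; legacy shogun Python bindings assumed) contrasting the two initialization paths used above:

import numpy as np
import shogun

X = np.random.rand(2, 100)                      # 2 features x 100 samples
feats = shogun.RealFeatures(X)
dist = shogun.EuclideanDistance(feats, feats)

# path 1: KMeans++ seeding
km = shogun.KMeans(3, dist)
km.set_use_kmeanspp(True)
km.train()

# path 2: explicit initial centers, shape (n_features, n_clusters)
km2 = shogun.KMeans(3, dist)
km2.set_initial_centers(np.ascontiguousarray(X[:, :3]))
km2.train()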
Code Example #5
    def run_hierarchical(self,
                         nb_clusters,
                         src_file,
                         data_without_target,
                         dataset_name,
                         run_number,
                         run_info=None):
        output_file, = self._prepare_files(dataset_name, run_info, False)

        train_features = shogun.RealFeatures(
            data_without_target.values.astype("float64").transpose())
        # distance metric over feature matrix - Euclidean distance
        distance = shogun.EuclideanDistance(train_features, train_features)

        hierarchical = shogun.Hierarchical(nb_clusters, distance)

        # The original snippet is truncated here; a plausible completion,
        # mirroring run_kmeans_base above, trains and saves the assignment:
        hierarchical.train()
        result = hierarchical.get_assignment()
        ClusteringToolkit._save_clustering(
            Shogun._clustering_to_list(data_without_target, result),
            output_file)

        return output_file, {}
Code Example #6
File: dm.py  Project: zym-wade/shogun
# imports assumed by the example: numpy, the shogun bindings, and the local
# `data` helper module shipped with the shogun examples
import numpy as np
import shogun as sg

import data

# load data
feature_matrix = data.swissroll()
# create features instance
features = sg.RealFeatures(feature_matrix)

# create Diffusion Maps converter instance
converter = sg.DiffusionMaps()

# set target dimensionality
converter.set_target_dim(2)
# set number of time-steps
converter.set_t(2)
# set width of gaussian kernel
converter.set_width(10.0)

# create euclidean distance instance
distance = sg.EuclideanDistance()
# enable converter instance to use created distance instance
converter.set_distance(distance)

# compute embedding with Diffusion Maps method
embedding = converter.embed(features)

# compute a custom distance matrix
distance_matrix = np.exp(-np.dot(feature_matrix.T, feature_matrix))
# create CustomDistance instance from the precomputed matrix
custom_distance = sg.CustomDistance(distance_matrix)
# construct embedding based on created distance
distance_embedding = converter.embed_distance(custom_distance)
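
Not in the original file: a quick way to eyeball both embeddings (matplotlib assumed; embed() returns a features object, so the matrix is pulled out first):

import matplotlib.pyplot as plt

emb = embedding.get_feature_matrix()              # 2 x n_samples
emb_d = distance_embedding.get_feature_matrix()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
ax1.scatter(emb[0], emb[1], s=5)
ax1.set_title("embedding from features")
ax2.scatter(emb_d[0], emb_d[1], s=5)
ax2.set_title("embedding from custom distance")
plt.show()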