Beispiel #1
0
    def algorithms_available(self):
        """
        List the clustering algorithms that can currently be requested.

        A clustering_algorithms helper is built on the 'parent' data matrix
        with an empty parameter dict, and whatever its
        clustering_algorithms_available() method reports is returned
        unchanged (callers in this class treat it as a dict keyed by
        algorithm name).
        """
        helper = ca.clustering_algorithms(self.dataObj.D['parent'], {})
        return helper.clustering_algorithms_available()
Beispiel #2
0
    def cluster(self,
                source_name,
                algorithm,
                output_name,
                K=None,
                Require_Unique=False,
                random_seed=None,
                **kwargs):
        """
        Run a clustering algorithm on the data matrix named source_name and
        store the resulting solution under output_name. Note that K is
        required for most algorithms.

        Parameters
        ----------
        source_name: string
            the source data matrix name to operate on in clusterclass dataObj
        algorithm: string
            name of the algorithm to use, call algorithms_available() for the
            supported names
        output_name: string
            this is the dict key for interacting with the results of this
            clustering solution in any of the cluster class dictionary
            attributes
        K: int
            number of clusters to create (ignored for algorithms that define K
            during clustering). The stored parameters get K afterwards: either
            the value passed, or the number of clusters produced if K was not
            passed.
        Require_Unique: bool
            If False and you already have an output_name solution, a random
            number is appended to create a unique name (a UserWarning reports
            the new name). If True and a solution by that name exists, a
            warning is issued, the solution is NOT added and the method
            returns without clustering. Default Require_Unique=False
        random_seed: int or random state
            Pass an integer seed, or a state object as returned by the
            module-level random.get_state(), to force the starting point of
            the clustering algorithm. With the default None, the generator is
            re-seeded via random.seed(None) and the resulting state is
            recorded.
        **kwargs
            Additional algorithm parameters; they are forwarded to the
            clustering_algorithms object together with 'random_state'.

        Returns
        -------
        None
            Results are stored in self.labels, self.data_source, self.params,
            self.clusterNumbers, self.algorithms and self.random_state under
            the (possibly altered) output_name.

        Warnings
        --------
        Warns if the number of clusters is different than what was requested,
        typically when an algorithm does not accept K as an argument. Also
        warns when output_name already exists (see Require_Unique).

        Raises
        ------
            ValueError
                if data source is not available by source_name
            ValueError
                if algorithm is not one of the available algorithms

        Examples
        --------
        Cluster using KMeans on parent data

        >>> c = oe.cluster
        >>> c.cluster('parent', 'kmeans', 'kmeans_parent', K=5)

        Form an iteration to build an ensemble using different values for K

        >>> for k in range(2, 12):
        ...     name = 'kmeans_%d' % k
        ...     c.cluster('parent', 'kmeans', name, k)

        """
        # Check that the source exists.
        if source_name not in self.dataObj.D:
            raise ValueError(
                "ERROR: the source you requested for clustering does not exist by that name %s"
                % (source_name))
        ALG_FCN_DICT = self.algorithms_available()

        # **kwargs is always a dict (empty when nothing extra was passed);
        # copy it so the parameters stored for this solution are independent.
        var_params = dict(kwargs)

        # Seed handling. NOTE(review): `random` must expose set_state /
        # get_state / seed (presumably numpy.random -- confirm against the
        # file's imports). A full state object is installed directly; anything
        # set_state rejects with TypeError (an int seed or None) is handed to
        # seed() instead and the resulting state is captured, so `state` is
        # always defined and always describes the generator's starting point.
        try:
            random.set_state(random_seed)
        except TypeError:
            random.seed(random_seed)
            state = random.get_state()
        else:
            state = random_seed

        var_params['random_state'] = state

        # Decide what to do when a solution named output_name already exists,
        # according to Require_Unique.
        if output_name in self.labels:
            if Require_Unique:
                warnings.warn(
                    'The name of the clustering solution is redundant and you required unique, solution will not be added'
                )
                return

            # Draw random suffixes until the name is not already taken.
            while True:
                candidate = "%s_%d" % (output_name, randint(0, 10000))
                if candidate not in self.labels:
                    break
            output_name = candidate
            warnings.warn(
                'For uniqueness, altered output_name to be %s' %
                (output_name), UserWarning)

        ###### BEGIN CLUSTERING BLOCK ######
        if algorithm not in ALG_FCN_DICT:
            raise ValueError(
                "The algorithm you requested does not exist, currently the following are supported %s"
                % (list(ALG_FCN_DICT.keys())))

        # Re-install the recorded state right before clustering so that any
        # random draws made above (e.g. for the unique-name suffix) cannot
        # shift the algorithm's starting point.
        random.set_state(state)
        c = ca.clustering_algorithms(self.dataObj.D[source_name], var_params,
                                     K)
        getattr(c, algorithm)()

        #### FINAL staging: c.out holds the finished assignment and
        #### c.var_params the parameters that get stored with the solution.

        # Check that K is as requested.
        uniqueClusters = np.unique(c.out)
        n_found = len(uniqueClusters)
        if K:
            # Record the requested K even if the algorithm overwrote it.
            c.var_params['K'] = K
            if n_found != K:
                warnings.warn(
                    "Number of unique clusters %d returned does not match number requested %d for solution: %s"
                    % (n_found, K, output_name), UserWarning)
        else:
            c.var_params['K'] = n_found

        self.labels[output_name] = c.out
        self.data_source[output_name] = source_name
        self.params[output_name] = c.var_params
        self.clusterNumbers[output_name] = uniqueClusters
        self.algorithms[output_name] = algorithm
        self.random_state[output_name] = state