def dkm_local_compute_clustering(args, config_file=CONFIG_FILE, **kwargs):
    """
        # Description:
            Assign local data instances to clusters using the centroids
            received from the remote.

        # PREVIOUS PHASE:
            remote_init_centroids (on first run only)
            remote_check_convergence

        # INPUT:

            | name              | type  | default    |
            | ---               | ---   | ---        |
            | config_file       | str   | config.cfg |
            | remote_centroids  | list  | None       |
            | computation_phase | str   | None       |

            Note: the `config_file` argument is overwritten by the value
            resolved from the cache.

        # OUTPUT:
            - computation_phase: "dkm_local_compute_clustering_2" when the
              incoming phase was "dkm_remote_optimization_step", otherwise
              "dkm_local_compute_clustering"
            - remote_centroids: the centroids that were received
            - cluster_labels: result of local.compute_clustering

        # RAISES:
            ValueError if remote_centroids or computation_phase resolves
            to None.
    """
    state, inputs, cache = ut.resolve_args(args)
    config_file = ut.resolve_input('config_file', cache)
    centroids = ut.resolve_input('remote_centroids', inputs)
    incoming_phase = ut.resolve_input('computation_phase', inputs)

    ut.log('LOCAL: computing clustering', state)

    # Both values must come from the remote; fail loudly if either is absent.
    if centroids is None:
        raise ValueError(
            "LOCAL: at local_compute_clustering - remote_centroids not passed correctly"
        )
    if incoming_phase is None:
        raise ValueError(
            "LOCAL: at local_compute_clustering - computation_phase not passed correctly"
        )

    parser = configparser.ConfigParser()
    parser.read(config_file)
    ut.log('Config file is %s, with keys %s' % (config_file, str(dict(parser))), state)

    local_data = np.load(parser['LOCAL']['data_file'])
    labels = local.compute_clustering(local_data, centroids)

    # A different phase name is reported when this run follows an
    # optimization step on the remote.
    next_phase = (
        "dkm_local_compute_clustering_2"
        if incoming_phase == "dkm_remote_optimization_step"
        else "dkm_local_compute_clustering"
    )

    result = ut.default_computation_output(args)
    cache['cluster_labels'] = labels
    cache['remote_centroids'] = centroids
    result['output'] = {
        'computation_phase': next_phase,
        'remote_centroids': centroids,
        'cluster_labels': labels,
    }
    result['cache'] = cache
    return result
def dkm_local_compute_optimizer(args, config_file=CONFIG_FILE, **kwargs):
    """
        # Description:
            Compute local optimizers with local data.

        # PREVIOUS PHASE:
            local_compute_clustering

        # INPUT:

            | name             | type  | default    |
            | ---              | ---   | ---        |
            | config_file      | str   | config.cfg |
            | remote_centroids | list  | None       |
            | cluster_labels   | list  | None       |

            Note: the `config_file` argument is overwritten by the value
            resolved from the cache. remote_centroids and cluster_labels are
            resolved from the inputs and the cache.

        # OUTPUT:
            - local_optimizer: result of local.compute_mean ('lloyd') or
              local.compute_gradient ('gradient')
            - computation_phase: "dkm_local_compute_optimizer"

            Side effect: writes local_optimizer.npy and
            local_cluster_labels.npy into state['outputDirectory'].

        # RAISES:
            ValueError if remote_centroids or cluster_labels resolves to
            None, or if the configured optimization is neither 'lloyd' nor
            'gradient'.
    """
    state, inputs, cache = ut.resolve_args(args)
    config_file = ut.resolve_input('config_file', cache)
    remote_centroids = ut.resolve_input('remote_centroids', inputs, cache)
    cluster_labels = ut.resolve_input('cluster_labels', inputs, cache)
    if remote_centroids is None:
        raise ValueError(
            "LOCAL: at local_compute_optimizer - remote_centroids not passed correctly"
        )
    if cluster_labels is None:
        raise ValueError(
            "LOCAL: at local_compute_optimizer - cluster_labels not passed correctly"
        )

    ut.log('LOCAL: computing optimizers', state)
    config = configparser.ConfigParser()
    config.read(config_file)
    local_config = config['LOCAL']
    data = np.load(local_config['data_file'])
    k = int(local_config['k'])
    # configparser always yields strings; the numeric conversion is deferred
    # to the gradient branch so a malformed value cannot break 'lloyd' runs.
    learning_rate = local_config['learning_rate']
    optimization = local_config['optimization']

    if optimization == 'lloyd':
        local_optimizer = local.compute_mean(data, cluster_labels, k)
    elif optimization == 'gradient':
        # Gradient descent has sites compute gradients locally.
        # The original code indexed cluster_labels with an undefined name
        # `i`; this function handles a single site, so the full label set is
        # passed. NOTE(review): confirm against local.compute_gradient.
        local_optimizer = local.compute_gradient(
            data, cluster_labels, remote_centroids, float(learning_rate)
        )
    else:
        # Without this guard local_optimizer would be unbound below.
        raise ValueError(
            "LOCAL: at local_compute_optimizer - unknown optimization '%s'"
            % optimization
        )

    outdir = state['outputDirectory']
    np.save(os.path.join(outdir, 'local_optimizer.npy'), local_optimizer)
    np.save(os.path.join(outdir, 'local_cluster_labels.npy'), cluster_labels)

    computation_output = dict(
        output=dict(
            local_optimizer=local_optimizer,
            computation_phase="dkm_local_compute_optimizer",
        ),
        state=state,
    )
    return computation_output
def dkm_remote_optimization_step(args, config_file=CONFIG_FILE):
    """
        # Description:
            Use optimizer to take the next step.

        # PREVIOUS PHASE:
            remote_aggregate_optimizer

        # INPUT:

            | name             | type  | default    |
            | ---              | ---   | ---        |
            | config_file      | str   | config.cfg |
            | remote_centroids | list  | None       |
            | remote_optimizer | list  | None       |

            Note: the `config_file` argument is overwritten by the value
            resolved from the cache.

        # OUTPUT:
            - computation_phase: "dkm_remote_optimization_step"
            - remote_centroids: the updated centroids

            The cache is updated with both 'previous_centroids' and
            'remote_centroids'.

        # NEXT PHASE:
            remote_check_convergence

        # RAISES:
            ValueError if remote_centroids or remote_optimizer resolves to
            None, or if the configured optimization is neither 'lloyd' nor
            'gradient'.
    """
    state, inputs, cache = ut.resolve_args(args)
    config_file = ut.resolve_input('config_file', cache)
    remote_centroids = ut.resolve_input('remote_centroids', inputs, cache)
    remote_optimizer = ut.resolve_input('remote_optimizer', inputs, cache)

    # Fail with a descriptive error instead of an opaque TypeError when a
    # required value is missing (same convention as the local phases).
    if remote_centroids is None:
        raise ValueError(
            "REMOTE: at remote_optimization_step - remote_centroids not passed correctly"
        )
    if remote_optimizer is None:
        raise ValueError(
            "REMOTE: at remote_optimization_step - remote_optimizer not passed correctly"
        )

    # Convert the centroids to arrays when the first one is not already one.
    if not isinstance(remote_centroids[0], np.ndarray):
        remote_centroids = [np.array(c) for c in remote_centroids]

    ut.log('REMOTE: Optimization step', args['state'])
    config = configparser.ConfigParser()
    config.read(config_file)
    optimization = config['REMOTE']['optimization']

    if optimization == 'lloyd':
        # Then, update centroids as corresponding to the local mean
        previous_centroids = remote_centroids[:]
        remote_centroids = remote_optimizer[:]
        ut.log("Previous centroids look like %s" % type(previous_centroids[0]), state)
        ut.log("Remote centroids look like %s" % type(remote_centroids[0]), state)
    elif optimization == 'gradient':
        # Then, update centroids according to one step of gradient descent
        remote_centroids, previous_centroids = local.gradient_step(
            remote_optimizer, remote_centroids
        )
    else:
        # Without this guard previous_centroids would be unbound below.
        raise ValueError(
            "REMOTE: at remote_optimization_step - unknown optimization '%s'"
            % optimization
        )

    cache['previous_centroids'] = previous_centroids
    cache['remote_centroids'] = remote_centroids

    computation_output = dict(
        output=dict(
            computation_phase="dkm_remote_optimization_step",
            remote_centroids=remote_centroids,
        ),
        state=state,
        cache=cache,
    )
    return computation_output
def dkm_remote_aggregate_output(args):
    """
        # Description:
            Package the remote centroids as the output of this phase.

        # PREVIOUS PHASE:
            remote_check_convergence

        # INPUT:

            | name             | type  | default |
            | ---              | ---   | ---     |
            | remote_centroids | list  | None    |

            remote_centroids is resolved from the inputs and the cache.

        # OUTPUT:
            - computation_phase: "dkm_remote_aggregate_output"
            - remote_centroids
    """
    state, inputs, cache = ut.resolve_args(args)
    final_centroids = ut.resolve_input('remote_centroids', inputs, cache)

    ut.log('REMOTE: Aggregating input', state)

    payload = {
        'computation_phase': "dkm_remote_aggregate_output",
        'remote_centroids': final_centroids,
    }
    return {'output': payload, 'state': state, 'cache': cache}
def dkm_remote_check_convergence(args, config_file=CONFIG_FILE):
    """
        # Description:
            Compare the current centroids with the previous ones and report
            whether the stopping criterion was met.

        # PREVIOUS PHASE:
            remote_aggregate_optimizer

        # INPUT:

            | name               | type  | default    |
            | ---                | ---   | ---        |
            | config_file        | str   | config.cfg |
            | remote_centroids   | list  | None       |
            | previous_centroids | list  | None       |

            Note: the `config_file` argument is overwritten by the value
            resolved from the cache. Both centroid sets are resolved from
            the inputs and the cache.

        # OUTPUT:
            - computation_phase: "dkm_remote_converged_true" or
              "dkm_remote_converged_false"
            - delta: second value returned by local.check_stopping
            - remote_centroids
    """
    state, inputs, cache = ut.resolve_args(args)
    ut.log('REMOTE: Check convergence', state)

    config_file = ut.resolve_input('config_file', cache)
    current = ut.resolve_input('remote_centroids', inputs, cache)
    previous = ut.resolve_input('previous_centroids', inputs, cache)

    # Anything that is not exactly an ndarray is rebuilt as a list of arrays.
    if type(current) is not np.ndarray:
        current = list(map(np.array, current))
    if type(previous) is not np.ndarray:
        previous = list(map(np.array, previous))

    parser = configparser.ConfigParser()
    parser.read(config_file)
    tolerance = float(parser['REMOTE']['epsilon'])

    converged, delta = local.check_stopping(current, previous, tolerance)
    ut.log(
        'REMOTE: Convergence Delta is %f, Converged is %s' % (delta, converged),
        state)

    # The convergence flag is encoded in the name of the reported phase.
    if converged:
        phase_name = "dkm_remote_converged_true"
    else:
        phase_name = "dkm_remote_converged_false"

    payload = {
        'computation_phase': phase_name,
        'delta': delta,
        'remote_centroids': current,
    }
    return {'output': payload, 'state': state, 'cache': cache}
def dkm_remote_init_centroids(args, config_file=CONFIG_FILE, **kwargs):
    """
        # Description:
            Choose the K remote centroids. Centroids already present in the
            inputs or in the cache are reused; otherwise the centroids sent
            by every site are pooled, shuffled, and the first K are kept.

        # PREVIOUS PHASE:
            local_init_centroids

        # INPUT:

            | name        | type  | default    |
            | ---         | ---   | ---        |
            | config_file | str   | config.cfg |

            Note: the `config_file` argument is overwritten by the value
            resolved from the cache.

        # OUTPUT:
            - work_dir: '.'
            - config_file
            - computation_phase: "dkm_remote_init_centroids"
            - remote_centroids

            The cache is updated with 'config_file' and 'remote_centroids'.
    """
    state, inputs, cache = ut.resolve_args(args)
    ut.log('REMOTE: Initializing centroids', state)

    config_file = ut.resolve_input('config_file', cache)
    parser = configparser.ConfigParser()
    parser.read(config_file)
    ut.log('Config file %s, looks like %s' % (config_file, str(dict(parser))), state)
    k = int(parser['REMOTE']['k'])

    if 'remote_centroids' in inputs:
        chosen = inputs['remote_centroids']
    elif 'remote_centroids' in cache:
        chosen = cache['remote_centroids']
    else:
        # Pool the candidate centroids sent by each site ...
        pooled = []
        for site in inputs:
            ut.log(
                'Local site %s sent inputs with keys %s' %
                (site, str(inputs[site].keys())), state)
            pooled.extend(inputs[site]['local_centroids'])
        # ... and select k random clusters from the pool
        np.random.shuffle(pooled)
        chosen = pooled[:k]

    cache['config_file'] = config_file
    cache['remote_centroids'] = chosen

    payload = {
        'work_dir': '.',
        'config_file': config_file,
        'computation_phase': "dkm_remote_init_centroids",
        'remote_centroids': chosen,
    }
    return {'output': payload, 'state': state, 'cache': cache}
def dkm_local_init_env(args,
                       config_file=CONFIG_FILE,
                       k=DEFAULT_k,
                       optimization=DEFAULT_optimization,
                       shuffle=DEFAULT_shuffle,
                       learning_rate=DEFAULT_learning_rate,
                       **kwargs):
    """
        # Description:
            Initialize the local environment by writing the config file.

        # PREVIOUS PHASE:
            remote_init_env

        # INPUT:

            | name          | type  | default               |
            | ---           | ---   | ---                   |
            | config_file   | str   | CONFIG_FILE           |
            | k             | int   | DEFAULT_k             |
            | optimization  | str   | DEFAULT_optimization  |
            | shuffle       | bool  | DEFAULT_shuffle       |
            | learning_rate | float | DEFAULT_learning_rate |

            The data file path is not an argument: it is resolved from the
            cache entry 'all_windows'.

        # OUTPUT:
            - config_file: path of the config file written into
              state['outputDirectory']
            - computation_phase: "dkm_local_init_env"

            The same path is stored in the cache under 'config_file'.

        # NEXT PHASE:
            local_init_centroids
    """
    state, inputs, cache = ut.resolve_args(args)
    windows_file = ut.resolve_input('all_windows', cache)
    ut.log('LOCAL: Initializing remote environment', state)

    target_path = os.path.join(state['outputDirectory'], config_file)
    cache['config_file'] = target_path

    # Everything the later local phases need goes into the LOCAL section.
    parser = configparser.ConfigParser()
    parser['LOCAL'] = {
        'k': k,
        'optimization': optimization,
        'shuffle': shuffle,
        'data_file': windows_file,
        'learning_rate': learning_rate,
    }
    with open(target_path, 'w') as handle:
        parser.write(handle)

    payload = {
        'config_file': target_path,
        'computation_phase': "dkm_local_init_env",
    }
    return {'output': payload, 'state': state, 'cache': cache}
def dkm_remote_aggregate_optimizer(args, config_file=CONFIG_FILE):
    """
        # Description:
            Aggregate optimizers sent from local nodes.

        # PREVIOUS PHASE:
            local_compute_optimizer

        # INPUT:

            | name        | type  | default    |
            | ---         | ---   | ---        |
            | config_file | str   | config.cfg |

            Note: the `config_file` argument is overwritten by the value
            resolved from the cache. Each site's input must carry a
            'local_optimizer' entry.

        # OUTPUT:
            - remote_optimizer: result of remote.aggregate_sum over the
              sites' optimizers, converted to numpy arrays; for 'lloyd'
              each entry is further divided by the number of sites
            - computation_phase: "dkm_remote_aggregate_optimizer"

            The cache is updated with 'remote_optimizer'.

        # NEXT PHASE:
            remote_optimization_step

        # RAISES:
            ValueError (chained to the underlying error) if the aggregated
            optimizer cannot be converted to arrays or divided.
    """
    state, inputs, cache = ut.resolve_args(args)
    config_file = ut.resolve_input('config_file', cache)
    config = configparser.ConfigParser()
    config.read(config_file)
    optimization = config['REMOTE']['optimization']
    ut.log('REMOTE: Aggregate optimizer', state)

    local_optimizers = [inputs[site]['local_optimizer'] for site in inputs]
    site_count = len(local_optimizers)
    remote_optimizer = remote.aggregate_sum(local_optimizers)

    if not all(isinstance(r, np.ndarray) for r in remote_optimizer):
        try:
            remote_optimizer = [np.array(c) for c in remote_optimizer]
        except Exception as err:
            # Report element types (always computable) rather than len(),
            # which can itself fail on scalars and mask the real error.
            raise ValueError(
                "Could not convert remote optimizer to arrays. "
                "Remote optimizer types are %s"
                % ([type(r).__name__ for r in remote_optimizer])
            ) from err

    if optimization == 'lloyd':
        # for the mean, we need to further divide the number of sites
        try:
            remote_optimizer = [r / site_count for r in remote_optimizer]
        except Exception as err:
            raise ValueError(
                "Could not average remote optimizer over %d sites. "
                "Remote optimizer types are %s"
                % (site_count, [type(r).__name__ for r in remote_optimizer])
            ) from err

    cache['remote_optimizer'] = remote_optimizer
    computation_output = dict(
        output=dict(
            remote_optimizer=remote_optimizer,
            computation_phase="dkm_remote_aggregate_optimizer",
        ),
        state=state,
        cache=cache,
    )
    return computation_output
def dkm_local_init_centroids(args, config_file=CONFIG_FILE, **kwargs):
    """
        # Description:
            Initialize K centroids from own data.

        # PREVIOUS PHASE:
            local_init_env

        # INPUT:

            | name        | type  | default    |
            | ---         | ---   | ---        |
            | config_file | str   | config.cfg |

            Note: the `config_file` argument is overwritten by the value
            resolved from the cache.

        # OUTPUT:
            - config_file
            - local_centroids: result of local.initialize_own_centroids
            - computation_phase: "dkm_local_init_env"

            Side effects: the centroids are saved as 'initial_centroids'
            (np.save appends '.npy') in state['outputDirectory'] and stored
            in the cache under 'local_centroids'.

        # NEXT PHASE:
            remote_init_centroids
    """
    state, inputs, cache = ut.resolve_args(args)
    config_file = ut.resolve_input('config_file', cache)
    ut.log('LOCAL: Initializing centroids', state)

    config = configparser.ConfigParser()
    config.read(config_file)
    data = np.load(config['LOCAL']['data_file'])
    centroids = local.initialize_own_centroids(data, int(config['LOCAL']['k']))

    # Persist the computed centroids. The original passed the string literal
    # 'centroids' here, so the file on disk never held the actual values.
    np.save(os.path.join(state['outputDirectory'], 'initial_centroids'), centroids)
    ut.log('Local centroids looks like %s' % (str(type(centroids))), state)

    # output
    cache['local_centroids'] = centroids
    computation_output = dict(
        output=dict(
            config_file=config_file,
            local_centroids=centroids,
            # NOTE(review): this reports the init_env phase name rather than
            # one specific to init_centroids; left unchanged because
            # downstream phase routing may depend on it. Confirm.
            computation_phase="dkm_local_init_env",
        ),
        state=state,
        cache=cache,
    )
    return computation_output