def vcenterdatacenter_get_clusters(self, label, vcenter, tenantname, xml=False):
    '''
    Makes a REST API call to retrieve the clusters of a vCenter datacenter
    identified by its label, vCenter and tenant.
    '''
    uri = self.vcenterdatacenter_query(label, vcenter, tenantname)

    (s, h) = common.service_json_request(
        self.__ipAddr, self.__port, "GET",
        VcenterDatacenter.URI_DATACENTER_CLUSTERS.format(uri),
        None, None, xml)

    o = common.json_decode(s)

    from cluster import Cluster
    obj = Cluster(self.__ipAddr, self.__port)

    dtlslst = obj.cluster_get_details_list(o['cluster'])

    return dtlslst
def run(self, points, random_seed):
    random.seed(random_seed)

    # Randomly initialize clusters
    self._clusters = []
    initial_centroids = random.sample(points, self._k)
    for i, initial_centroid in enumerate(initial_centroids):
        new_cluster = Cluster(i, initial_centroid)
        self._clusters.append(new_cluster)

    for current_iteration in range(self._num_iterations):
        # Clear all clusters
        for cluster in self._clusters:
            cluster.remove_point()

        # Re-assign all points to their closest cluster
        for point in points:
            distances_to_clusters = {x.id: point.distance_to(x.centroid)
                                     for x in self._clusters}
            closest_cluster_id = sorted(distances_to_clusters.keys(),
                                        key=lambda x: distances_to_clusters[x])[0]
            self._clusters[closest_cluster_id].add_point(point)

        # Recompute centroids and check whether any of them moved
        changes = [cluster.compute_centroid() for cluster in self._clusters]
        if sum(changes) == 0:  # if every entry is False the sum is 0
            break
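# The run() loop above relies on a small Cluster helper with an `id`, a `centroid`,
# and add_point / remove_point / compute_centroid methods, plus points exposing
# distance_to. A minimal sketch of such helpers, assuming remove_point() clears the
# previous assignment and compute_centroid() reports whether the centroid moved
# (names beyond those actually called above are assumptions, not the original code):
import math

class Point:
    def __init__(self, coords):
        self.coords = tuple(coords)

    def distance_to(self, other_coords):
        # Euclidean distance to another coordinate tuple
        return math.sqrt(sum((a - b) ** 2 for a, b in zip(self.coords, other_coords)))

class Cluster:
    def __init__(self, cluster_id, initial_centroid):
        self.id = cluster_id
        self.centroid = tuple(initial_centroid.coords)
        self.points = []

    def add_point(self, point):
        self.points.append(point)

    def remove_point(self):
        # Drop every point assigned in the previous iteration
        self.points = []

    def compute_centroid(self):
        # Recompute the centroid from the assigned points; return True if it moved
        if not self.points:
            return False
        dims = len(self.centroid)
        new_centroid = tuple(sum(p.coords[d] for p in self.points) / len(self.points)
                             for d in range(dims))
        changed = new_centroid != self.centroid
        self.centroid = new_centroid
        return changed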
def cluster_by_kmeans():
    """
    Use KMeans to group similar words.
    """
    import os, sys
    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../')))
    from cluster import Cluster
    from collections import defaultdict
    import random

    clustered_corpus = []
    corpus = _unpickle(config['corpus']['fun2vec'])
    clf = Cluster(config['cluster']['kmeans'])
    m = Model('word2vec')
    cluster = dict(zip(m.vocab, clf.predict(m.vector)))
    del m, clf  # memory friendly

    for i, words in enumerate(corpus, 1):
        centroids = defaultdict(list)
        for word in words:
            label = cluster.get(word)
            centroids[label].append(word)
        # If several words share the same cluster label, randomly keep one and drop the rest.
        clustered_words = [random.choice(v) if k is not None and len(v) >= 2 else v[0]
                           for k, v in centroids.items()]
        if len(clustered_words) >= 2:
            clustered_corpus.append(clustered_words)
        if i < 100:
            print('----------------------------')
            print(words)
            print(clustered_words)
        if i % 10000 == 0:
            _logger.info(f'Finished {i} profiles')

    _pickle(clustered_corpus, config['corpus']['fun2vec_clustered'])
    _logger.info(f"Saved corpus of {len(clustered_corpus)} profiles in {config['corpus']['fun2vec_clustered']}")
def __init__(self, clustername, ra=None, dec=None, diam=None, dist=None,
             pmra=None, pmdec=None, pmradius=5.0, depth=0.5):
    '''
    Constructor
    '''
    self.clustername = clustername
    self.cluster = Cluster(clustername)
    self.coordinates = self.cluster.coordinates
    print(self.cluster)

    def setnotNone(default, clustervalue):
        # Prefer the explicitly passed value; fall back to the catalogue value.
        if default is None:
            if clustervalue is None:
                raise ValueError
            return clustervalue
        else:
            return default

    self.ra = setnotNone(ra, self.cluster['ra'])
    self.dec = setnotNone(dec, self.cluster['dec'])
    self.diam = setnotNone(diam, self.cluster['diam'])
    self.dist = setnotNone(dist, self.cluster['d'])
    self.pmra = setnotNone(pmra, self.cluster['pmra'])
    self.pmdec = setnotNone(pmdec, self.cluster['pmdec'])
    self.pmradius = pmradius
    self.depth = depth
def solve(self, data, init_method=0):
    self.data = data
    #print(
    #    'Solving for %d clusters with %d data points' % (
    #        self.count, len(self.data)))
    if len(self.data) < self.count:
        raise Exception('Fewer data points than expected clusters')
    self.initial_clusters(init_method)

    prior_centroids = []
    # Iterate until the centroids stop moving
    while self.centroids != prior_centroids:
        prior_centroids = self.centroids[:]
        self.centroids = [
            self.centroid(c.get_items()) for c in self.clusters
        ]
        self.clusters = [Cluster(c) for c in self.centroids]
        self.assign_to_clusters()

    if min([len(c.get_items()) for c in self.clusters]) == 0:
        print('Error clustering: empty cluster(s)')
        print([len(c.get_items()) for c in self.clusters])
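# solve() above leans on helpers that are not shown here: initial_clusters(),
# centroid() and assign_to_clusters(). A minimal sketch of what the latter two
# might look like, inferred only from how the loop calls them (the bodies, and the
# assumptions that items are numeric sequences and Cluster has add_item(), are mine):
def centroid(self, items):
    # Mean of the item vectors; a tuple, so centroids can be compared with ==
    dims = len(items[0])
    return tuple(sum(item[d] for item in items) / len(items) for d in range(dims))

def assign_to_clusters(self):
    # Put every data point into the cluster whose centroid is nearest (Euclidean)
    for item in self.data:
        distances = [sum((a - b) ** 2 for a, b in zip(item, c)) ** 0.5
                     for c in self.centroids]
        nearest = distances.index(min(distances))
        self.clusters[nearest].add_item(item)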
def __init__(self):
    super(ClusterStack, self).__init__(Cluster(), 'config_records.py')
def save_avg_result(*option): """ Save results to file :param option: optional inputs can be: save_avg_result(pat_path) or save_avg_result(pat_path, bb_log_path) or save_avg_result(pat_path, bb_log_path, BB_Phase) :return: None """ if len(option) == 1: # only pat_path is assigned result_file = option[0] + os.sep + 'results.log' attrib_avg = Cluster(option[0]).get_cluster_data_by_time([0], [0], False) with open(result_file, 'w') as f: f.write('*' * 110 + '\n') f.write('All nodes average utilization\n') f.write('*' * 110 + '\n') for key in attrib_avg.keys(): f.write('All nodes average {0} utilization: \n {1} \n'.format( key, attrib_avg.get(key).to_string(index=False))) f.write('.' * 75 + '\n') print 'Results have been saved to: {0}'.format(result_file) return elif len(option) == 2: # pat_path and bb_log are assigned result_file = option[0] + os.sep + 'results.log' phase_name = ('BENCHMARK', 'LOAD_TEST', 'POWER_TEST', 'THROUGHPUT_TEST_1', 'VALIDATE_POWER_TEST', 'VALIDATE_THROUGHPUT_TEST_1') with open(result_file, 'w') as f: for phase in phase_name[0:4]: start_stamp, end_stamp = BBParse( option[1]).get_stamp_by_phase(phase) start_time = datetime.fromtimestamp(start_stamp).strftime( '%Y-%m-%d %H:%M:%S') end_time = datetime.fromtimestamp(end_stamp).strftime( '%Y-%m-%d %H:%M:%S') attrib_avg = Cluster(option[0]).get_cluster_avg( start_stamp, end_stamp) f.write('*' * 110 + '\n') f.write( 'All nodes average utilization for phase {0} between {1} and {2}:\n' .format(phase, start_time, end_time)) f.write('*' * 110 + '\n') for key in attrib_avg.keys(): f.write( 'All nodes average {0} utilization: \n {1} \n'.format( key, attrib_avg.get(key).to_string(index=False))) f.write('.' * 75 + '\n') print 'Results have been saved to: {0}'.format(result_file) return elif len(option) == 3: # pat_path, bb_log and phase_name are assigned result_file = option[0] + os.sep + 'results.log' with open(result_file, 'w') as f: start_stamp, end_stamp = BBParse(option[1]).get_stamp_by_phase( option[2]) start_time = datetime.fromtimestamp(start_stamp).strftime( '%Y-%m-%d %H:%M:%S') end_time = datetime.fromtimestamp(end_stamp).strftime( '%Y-%m-%d %H:%M:%S') attrib_avg = Cluster(option[0]).get_cluster_avg( start_stamp, end_stamp) f.write('*' * 110 + '\n') f.write( 'All nodes average utilization for phase {0} between {1} and {2}:\n' .format(option[2], start_time, end_time)) f.write('*' * 110 + '\n') for key in attrib_avg.keys(): f.write('All nodes average {0} utilization: \n {1} \n'.format( key, attrib_avg.get(key).to_string(index=False))) f.write('.' * 75 + '\n') print 'Results have been saved to: {0}'.format(result_file) return else: print 'Usage: save_avg_result(pat_path) or save_avg_result(pat_path, bb_log_path) or ' \ 'save_avg_result(pat_path, bb_log_path, BB_Phase)\n' exit(-1)
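# A short usage sketch following the docstring of save_avg_result(); the two paths
# below are placeholders, and only the phase name 'POWER_TEST' comes from the code above:
pat_path = '/opt/pat/results/run-001'        # hypothetical PAT output directory
bb_log_path = '/opt/bigbench/logs/run-001'   # hypothetical BigBench log directory

save_avg_result(pat_path)                              # whole-run averages
save_avg_result(pat_path, bb_log_path)                 # averages per BigBench phase
save_avg_result(pat_path, bb_log_path, 'POWER_TEST')   # averages for a single phase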
# Javier de Martin Gil @ 2020

from Data_mgmt import Data_mgmt
from neural_model import Neural_Model
import os
import sys

from cluster import Cluster

# Parameters
# [1]: CITY
# [2]: INFLUX DB PASSWORD
# [3]: LOCAL DATA OR QUERY REMOTE DB

cluster = Cluster(city=sys.argv[1])
labels = cluster.do_cluster()

data = Data_mgmt(city=sys.argv[1])
dataset = data.read_dataset()
data.iterate(dataset=dataset, cluster_data=labels)
data.supervised_learning()
data.split_sets(0.8, 0.15, 0.5)

m = Neural_Model()
m.fit_model()
m.test_models_score()

dataToPredict = data.prepare_tomorrow()
m.tomorrow(data=dataToPredict, append_to_db=False)
def __init__(self):
    self.cluster = Cluster()
    self.nodeList = {}
    self.nodeInfo = []
    self.data = []
from log_parser import LogParser
from cluster import Cluster, NodeFactory
from converter import LogsToEventsConverter
from events import EventLoop

N = 4
WIDTH = HEIGHT = 600
TITLE = "Fast Paxos"
LOGFILE = "../../../logs/test.log"

factory = NodeFactory((20, 20), 8)
cluster = Cluster(TITLE, N, (WIDTH, HEIGHT), factory)
loop = EventLoop()

logs = LogParser.parse(LOGFILE)
converter = LogsToEventsConverter(cluster, loop)
converter.convert(logs)

loop.run()
cluster.close()
initial_ownership).topTradingCycles() end_time = datetime.now() print("mttc_allocation =", mttc_allocation) """ register the 'Path' of input data (job.json, stage.json, runtime.json) """ json_dir = "./" """ Run this simulation with initial_ownership """ machines = [Machine(i, core_per_machine[i]) for i in range(0, machine_number)] users = [ User(i, initial_ownership[i], preference_value[i]) for i in range(user_number) ] cluster = Cluster(machines, users, num_core) simulator = Simulator(cluster, preference_value, json_dir, user_number, flag="Initial") cluster.totalJobNumber = 100 simulator.scheduler.scheduler_type = "isolated" simulator.run() """ Run this simulation with Choosy + DS scheduling """ machines = [Machine(i, core_per_machine[i]) for i in range(0, machine_number)] users = [
def __init__(self, config_file=None, cluster=None): # type: (str, Cluster) -> None """ Constructor for Deploy :param config_file: location of the rs-conf.yml file :param cluster: A cluster object to initialize with """ # Read main configuration file with open(config_file, 'r') as config_yaml_file: config_dict = yaml.load(config_yaml_file) self.redstack_version = config_dict['redstack_version'] self.directory_base = config_dict['deployment_directory_base'] self.installation_directory = config_dict['installation_directory'] self.cookbook_directory = config_dict['cookbook_directory'] self.log_path = config_dict['log_path'] self.log_level = config_dict['log_level'] self.stack_name = config_dict['stack_name'] self.auth_version = config_dict['auth_version'] self.image_name = config_dict['image_name'] self.availability_zone = config_dict['availability_zone'] self.region = config_dict['region'] self.openstack_auth_url = config_dict['openstack_auth_url'] self.external_network_id = config_dict['external_network_id'] self.try_existing_network = config_dict['try_existing_network'] self.subnet_cidr = config_dict['subnet_cidr'] self.expose_ui_ssh = config_dict['expose_ui_ssh'] self.subnet_dns_nameservers = config_dict['subnet_dns_nameservers'] self.cacert = config_dict['cacert'] self.ost_username = config_dict['ost_username'] self.ost_password = config_dict['ost_password'] self.ost_project_id = config_dict['ost_project_id'] self.ost_project_name = config_dict['ost_project_name'] self.ost_domain = config_dict['ost_domain'] self.use_existing_openstack = config_dict['use_existing_openstack'] self.key_name = config_dict['key_name'] self.stack_type = config_dict['stack_type'] self.template_name = config_dict['template_file'] self.hdp_major_version = config_dict['hdp_major_version'] self.hdp_version = config_dict['hdp_version'] self.hdp_utils_version = config_dict['hdp_utils'] self.define_custom_repos = config_dict['define_custom_repos'] self.ambari_version = config_dict['ambari_version'] self.ambari_password = config_dict['ambari_password'] self.fqdn_address = config_dict['fqdn_address'] self.kerberos_realm = config_dict['kerberos_realm'] self.kerberos_password = config_dict['kerberos_password'] self.volume_device = config_dict['volume_device'] self.mount_location = config_dict['mount_location'] self.chef_rpm_uri = config_dict['chef_rpm_uri'] self.chef_version = config_dict['chef_version'] self.chef_tries = config_dict['chef_tries'] self.log_chef_to_stdout = config_dict['log_chef_to_stdout'] self.ambari_db_password = config_dict['ambari_db_password'] self.mysql_root_password = config_dict['mysql_root_password'] # To be set when the blueprints are created self.blueprint = None self.host_mapping = None self.stack_definition = None self.utils_definition = None # Set the deploy name and directory based on the current time self.name = "{0}-{1}".format(config_dict["cluster_name"], str(int(time.time()))) self.directory = os.path.join(config_dict['deployment_directory_base'], self.name) # Initialize the cluster object based on whether or not a cluster json file was passed if not cluster: # Read cluster template file template_file = '{0}/conf/templates/{1}'.format( config_dict['installation_directory'], config_dict['template_file']) self.cluster = Cluster( cluster_name=config_dict['cluster_name'], ssh_user=config_dict['ssh_user'], private_key=config_dict['existing_key_location'], key_name=config_dict['key_name'], template_file=template_file, fqdn_address=self.fqdn_address) else: self.cluster = cluster
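# Deploy reads its settings from the rs-conf.yml file passed as config_file. A
# minimal sketch of producing such a file and constructing Deploy from it; the
# values are illustrative and only a subset of keys is shown, whereas the real file
# must define every key the constructor accesses:
import yaml

config = {
    'redstack_version': '1.0.0',
    'deployment_directory_base': '/opt/redstack/deployments',
    'installation_directory': '/opt/redstack',
    'cluster_name': 'demo',
    'ssh_user': 'centos',
    'existing_key_location': '~/.ssh/id_rsa',
    'key_name': 'demo-key',
    'template_file': 'small.json',
    'fqdn_address': 'demo.example.com',
    # ... remaining keys (auth, network, HDP/Ambari versions, passwords, ...) go here
}

with open('rs-conf.yml', 'w') as f:
    yaml.safe_dump(config, f, default_flow_style=False)

# deploy = Deploy(config_file='rs-conf.yml')  # needs the full key set to succeed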
def FoF( galaxy_data, candidate_centers, richness, overdensity, max_velocity=2000 * u.km / u.s, linking_length_factor=0.1, virial_radius=1.5 * u.Mpc / u.littleh, ): """ The Friends-of-Friends algorithm is a clustering algorithm used to identify groups of particles. In this instance, FoF is used to identify clusters of galaxies. FoF uses a linking length, l, whereby galaxies within a distance l from another galaxy are linked directly (as friends) and galaxies within a distance l from its friends are linked indirectly (as friends of friends). This network of particles are considered a cluster. After locating all candidate clusters, overlapping clusters are merged, with preference towards the center with larger N(d) and abs magnitude. A new cluster center is then defined as the brightess galaxy within 0.5 Mpc away from the current center. Finally, a cluster is only initialized if it has met the threshold richness and overdensity. The algorithm is sped up with: - numpy vectorization - grispy nearest neighbor implementation, which uses cell techniques to efficiently locate neighbors. This is preferred as it allows the use of the haversine metric for spherical coordinates. Parameters ---------- galaxy_data: ndarray, shape (n,7) Galaxy data with compulsory properties: ['ra', 'dec', 'z', 'abs mag', 'id', 'LR', 'N'] candidate_centers: ndarray, shape (m,7) Array of candidate centers with compulsory properties: ['ra', 'dec', 'z', 'abs mag', 'id', 'LR', 'N'] max_velocity: float, units [km/s] Default value: 2000 km/s linking_length_factor: float Default value: 0.1 virial_radius: float, units [Mpc/littleh] Default value: 1.5 hMpc richness: integer overdensity: float Returns ------- candidates: list of cluster.Cluster object """ candidates = [] # sep_arr = [] # tracks change in linking length with redshift # tracker identifies galaxies that have been included in another cluster previously to speed up algorithm. # candidate_centers was sorted by N(0.5) before to ensure larger clusters are prioritized tracker = np.ones(len(candidate_centers)) # identify cluster candidates for i, center in enumerate( candidate_centers ): # each row is a candidate center to search around if tracker[i]: velocity_bin = galaxy_data[ abs(redshift_to_velocity(galaxy_data[:, 2], center[2])) <= max_velocity] # select galaxies within max velocity virial_gsp = GriSPy(velocity_bin[:, :2], metric="haversine") # given virial radius is in proper distances, we convert to comoving distance to account for cosmological expansion. 
ang_virial_radius = linear_to_angular_dist( virial_radius, center[2] ).to("rad") # convert proper virial radius to angular separation max_dist = ( ang_virial_radius * cosmo.comoving_transverse_distance(center[2])).to( u.Mpc, u.dimensionless_angles()) # convert to comoving distance max_dist = linear_to_angular_dist( max_dist, center[2] ).value # convert comoving distance to angular separation virial_dist, virial_idx = virial_gsp.bubble_neighbors( np.array([center[:2]]), distance_upper_bound=max_dist ) # center must be a ndarray of (n,2) virial_points = velocity_bin[tuple( virial_idx)] # convert to tuple for deprecation warning if ( len(virial_points) >= 12 ): # reject if <12 galaxies within virial radius (to save time) mean_sep = mean_separation( len(virial_points), center[2], max_dist * u.degree, max_velocity, survey_area=1.7, ) # Mpc linking_length = ( linking_length_factor * mean_sep ) # determine transverse LL from local mean separation # sep_arr.append([linking_length.value, center[2]]) linking_length = linear_to_angular_dist( linking_length, center[2]).value # fix linking length here f_gsp = GriSPy(virial_points[:, :2], metric="haversine") f_dist, f_idx = f_gsp.bubble_neighbors( np.array([center[:2]]), distance_upper_bound=linking_length ) # select galaxies within linking length f_points = virial_points[tuple(f_idx)] member_galaxies = f_points fof_dist, fof_idx = f_gsp.bubble_neighbors( f_points[:, :2], distance_upper_bound=linking_length ) # select all galaxies within 2 linking lengths for idx in fof_idx: fof_points = virial_points[idx] # ensure no repeated points in cluster mask = np.isin( fof_points, member_galaxies, invert=True ) # filter for points not already accounted for vec_mask = np.isin(mask.sum(axis=1), center.shape[0]) fof_points = fof_points[vec_mask].reshape( (-1, center.shape[0])) # points of 2 linking lengths (FoF) if len(fof_points): member_galaxies = np.concatenate( (member_galaxies, fof_points) ) # merge all FoF points within 2 linking lengths if len(member_galaxies) >= richness: # must have >= richness c = Cluster(center, member_galaxies) candidates.append(c) if not i % 100: logging.info(f"{i} " + c.__str__()) # locate centers within member_galaxies (centers of interest) member_gal_id = member_galaxies[:, 4] luminous_gal_id = candidate_centers[:, 4] coi, _, coi_idx = np.intersect1d(member_gal_id, luminous_gal_id, return_indices=True) # update tracker to 0 for these points for i in coi_idx: tracker[i] = 0 # if len(candidates) >= 100: # for quick testing # break # plot_clusters(candidates, flagging=False) # for quick check of clusters # tracks mean separation across redshift # sep_arr = np.array(sep_arr) # plt.plot(sep_arr[:,1], sep_arr[:,0], '.') # plt.show() # perform overlap removal and merger print("Performing overlap removal") candidate_clusters = np.array([ [c.ra, c.dec, c.z, c.gal_id] for c in candidates ]) # get specific attributes from candidate center sample candidates = np.array(candidates) merged_candidates = candidates.copy() gal_id_space = candidate_clusters[:, 3] for center in candidates: # identity overlapping centers (centers lying within virial radius of current cluster) velocity_bin = candidate_clusters[ abs(redshift_to_velocity(candidate_clusters[:, 2], center.z)) <= max_velocity] # select galaxies within max velocity center_gsp = GriSPy(velocity_bin[:, :2], metric="haversine") c_coords = [center.ra, center.dec] max_dist = linear_to_angular_dist( virial_radius, center.z).value # convert virial radius to angular distance c_dist, c_idx = 
center_gsp.bubble_neighbors( np.array([c_coords]), distance_upper_bound=max_dist) # center must be a ndarray of (n,2) c_points = velocity_bin[tuple(c_idx)] # merge each overlapping cluster if len(c_points): for c in c_points: c = candidates[gal_id_space == c[-1]][0] if center.gal_id == c.gal_id: # if same center, ignore continue # modify the cluster's galaxies in merged_candidates array if len(c.galaxies) and len( center.galaxies): # check both clusters are not empty S = setdiff2d( c.galaxies, center.galaxies) # identify overlapping galaxies if len(S): new_c = merged_candidates[gal_id_space == c.gal_id][ 0] # c from merged_candidates new_center = merged_candidates[ gal_id_space == center.gal_id][ 0] # center from merged_candidates c_galaxies, center_galaxies = c.remove_overlap(center) new_c.galaxies = c_galaxies new_center.galaxies = center_galaxies merged_candidates = np.array([ c for c in merged_candidates if c.richness >= richness ]) # select only clusters >= richness if len(merged_candidates) >= len(candidates): logging.warning("No candidates were merged!") bcg_clusters = merged_candidates.copy() # replace candidate center with brightest galaxy in cluster print("Searching for BCGs") merged_candidates = sorted(merged_candidates, key=lambda x: x.N, reverse=True) # sort by N for center in merged_candidates: bcg_space_gal_id = np.array([c.gal_id for c in bcg_clusters]) # identify galaxies within 0.25*virial radius cluster_gsp = GriSPy( center.galaxies[:, :2], metric="haversine") # for galaxies within a cluster c_coords = [center.ra, center.dec] max_dist = 0.25 * (linear_to_angular_dist(virial_radius, center.z).value ) # convert virial radius to angular distance c_dist, c_idx = cluster_gsp.bubble_neighbors( np.array([c_coords]), distance_upper_bound=max_dist) # center must be a ndarray of (n,2) bcg_arr = center.galaxies[tuple(c_idx)] if len(bcg_arr) and len( center.galaxies ): # check for galaxies within 0.25*virial radius mag_sort = bcg_arr[bcg_arr[:, 3].argsort( )] # sort selected galaxies by abs mag (brightness) mask = np.isin( mag_sort[:, 4], bcg_space_gal_id, invert=True ) # filter for galaxies that are not existing centers mag_sort = mag_sort[mask] if len(mag_sort): bcg = mag_sort[0] # brightest cluster galaxy (bcg) # if bcg brighter than current center, replace it as center if (abs(bcg[3]) > abs(center.bcg_absMag)) and (bcg[4] != center.gal_id): new_cluster = Cluster( bcg, center.galaxies) # initialize new center bcg_clusters = np.delete( bcg_clusters, np.where([c.gal_id for c in bcg_clusters] == center.gal_id), ) bcg_clusters = np.concatenate( (bcg_clusters, np.array([new_cluster]))) # add new center to array bcg_clusters = np.array([ c for c in bcg_clusters if c.richness >= richness ]) # select only clusters >= richness final_clusters = [] # N(0.5) and galaxy overdensity print("Selecting appropriate clusters") for center in bcg_clusters: center.N = find_number_count(center, center.galaxies, distance=0.5 * u.Mpc / u.littleh) # find number count N(0.5) center.D = center_overdensity(center, galaxy_data, max_velocity) # find overdensity D # Initialize the cluster only if N(0.5) >= 8 and D >= overdensity if ((center.N >= 8) and (center.richness >= richness) and (center.D >= overdensity)): final_clusters.append(center) return final_clusters
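# As a rough illustration of the linking-length idea described in the FoF docstring
# (galaxies within l are friends, and links are followed transitively), here is a toy
# flat 2D friends-of-friends grouping. It uses plain Euclidean distances and a fixed
# linking length, not the haversine/GriSPy machinery or redshift binning used above:
import numpy as np

def toy_fof(points, linking_length):
    """Group 2D points whose chain of pairwise separations stays within linking_length."""
    unassigned = set(range(len(points)))
    groups = []
    while unassigned:
        seed = unassigned.pop()
        group, frontier = {seed}, [seed]
        while frontier:
            current = frontier.pop()
            dists = np.linalg.norm(points - points[current], axis=1)
            friends = {i for i in np.where(dists <= linking_length)[0] if i in unassigned}
            unassigned -= friends
            group |= friends
            frontier.extend(friends)
        groups.append(sorted(int(i) for i in group))
    return groups

pts = np.array([[0.0, 0.0], [0.4, 0.0], [0.8, 0.1], [5.0, 5.0], [5.3, 5.1]])
print(toy_fof(pts, linking_length=0.5))  # two groups: [0, 1, 2] and [3, 4]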
def perform_recursive_clustering(cluster_collection, startAt): """ Performs a recursive clustering on a list of clusters given via cluster_collection. The recursion is performed according to the Discoverer paper by Cui et al. At first new number of distinct values for each token are calculated in each cluster and if this number is lower than a configurable number, the token is considered a FD. Then the number of subclusters that would be generated is calculated. If these subclusters contain at least one cluster containing more than a configurable amount of messages, the clustering is performed and the token is considered a FD. Then the recursion is performed on each of the new clusters with the next token. """ # Scan for FD token, Phase 1 clusters = cluster_collection.get_all_cluster( )[:] # <-- "[:]" Very very important... otherwise our iterated list will change because of deletions... # Save startAt information over cluster iteration __startAt = startAt for cluster in clusters: if Globals.getConfig().debug: print "Starting processing for next cluster ({0} messages)".format( len(cluster.get_messages())) startAt = __startAt #tokenValue = token.get_token() # Check distinct number of values of token foundFD = False maxTokenIdx = len(cluster.get_messages()[0].get_tokenlist()) while not foundFD and startAt < maxTokenIdx: l = [] #print "Analyzing token %s" % startAt # Check whether this might be a length token if "lengthfield" in set(cluster.get_semantics_for_token(startAt)): # Current token is a length token. Do not treat as FD startAt += 1 continue if not Globals.getConfig().allowAdjacentFDs: if startAt > 0: if "FD" in set(cluster.get_semantics_for_token( startAt - 1)): # We have an adjacent FD print "Two adjacent FDs forbidden by configuration, skipping to next token" continue for message in cluster.get_messages(): l.append(message.get_tokenAt(startAt).get_token()) numOfDistinctValuesForToken = len(set(l)) if Globals.getConfig( ).minDistinctFDValues < numOfDistinctValuesForToken <= Globals.getConfig( ).maxDistinctFDValues: # FD candidate found # Check number of potential clusters sumUp = Counter(l) wouldCluster = False for key in sumUp.keys(): if sumUp.get(key) > Globals.getConfig( ).minimumClusterSize: # Minimum cluster size of at least one cluster wouldCluster = True break if wouldCluster: # Check if adjacent text/text FDs are allowed in text protocols if Globals.getProtocolClassification( ) == Globals.protocolText: if not Globals.getConfig().allowAdjacentTextFDs: if startAt > 0: # Check whether the previous one is a text FD (type text and no semantic numeric) if "FD" in set( cluster.get_semantics_for_token( startAt - 1)): if cluster.get_format( startAt - 1) == Message.typeText and ( cluster.get_format(startAt) == Message.typeText and ("numeric" not in cluster. get_semantics_for_token( startAt - 1))): print "Two adjacent text FDs forbidden by configuration, skipping to next token" continue # Create new cluster if Globals.getConfig().debug: print "Subcluster prerequisites fulfilled. 
Adding FD semantic, splitting cluster and entering recursion" # Senseless here: message.get_tokenAt(startAt).add_semantic("FD") cluster.add_semantic_for_token(startAt, "FD") newCollection = ClusterCollection() for key in sumUp.keys(): messagesWithValue = cluster.get_messages_with_value_at( startAt, key) newCluster = Cluster( messagesWithValue[0].get_tokenrepresentation(), "recursion") newCluster.setSplitpoint("{0}".format(startAt)) newCluster.add_messages(messagesWithValue) newCluster.add_semantic_for_token(startAt, "FD") newCollection.add_cluster(newCluster) if Globals.getConfig().debug: print "{0} sub clusters generated".format( len(sumUp.keys())) # Perform format inference on new cluster collection formatinference.perform_format_inference_for_cluster_collection( newCollection) semanticinference.perform_semantic_inference(newCollection) # Merge clusters with same format while newCollection.mergeClustersWithSameFormat(): pass # Perform needle wunsch # Edit 20120120 - not here #=========================================================== # cluster1 = newCollection.get_random_cluster() # cluster2 = newCollection.get_random_cluster() # format1 = cluster1.get_formats() # format2 = cluster2.get_formats() # needlewunsch.needlewunsch(format1, format2) # #=========================================================== # Perform recursive step perform_recursive_clustering(newCollection, startAt + 1) # Remove old parent cluster cluster_collection.remove_cluster(cluster) cluster_collection.add_clusters( newCollection.get_all_cluster()) foundFD = True else: pass #print "Subclustering prerequisites not fulfilled. Will not sub-cluster" startAt += 1 if Globals.getConfig().debug: print "Recursive clustering analysis for cluster finished"
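# The FD test at the heart of the loop above is simple: count the distinct values a
# token takes across the cluster's messages, keep it as a candidate if that count lies
# between the configured minimum and maximum, and only split if at least one resulting
# subcluster would be large enough. A standalone sketch of that test with made-up
# thresholds (the constant names and values below are illustrative, not the project's Globals):
from collections import Counter

MIN_DISTINCT_FD_VALUES = 1
MAX_DISTINCT_FD_VALUES = 10
MINIMUM_CLUSTER_SIZE = 3

def is_fd_candidate(token_values):
    """token_values: the value of one token position in every message of a cluster."""
    distinct = len(set(token_values))
    if not (MIN_DISTINCT_FD_VALUES < distinct <= MAX_DISTINCT_FD_VALUES):
        return False
    # Splitting on this token must yield at least one sufficiently large subcluster
    counts = Counter(token_values)
    return any(count > MINIMUM_CLUSTER_SIZE for count in counts.values())

values = ['GET', 'GET', 'POST', 'GET', 'PUT', 'GET']
print(is_fd_candidate(values))  # True: 3 distinct values, and the 'GET' subcluster has 4 messages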
import zmq
import time
import threading
import json
import socket

from cluster import Cluster, Node
from pprint import pprint


def stop_all():
    cluster.stop()
    time.sleep(1)
    ctx.term()


def print_status():
    pprint(cluster.status_report(), indent=4)


ctx = zmq.Context(1)
node = Node(5760, 5770, 'node3')
cluster = Cluster(node, ctx)
cluster.start()
def extract(raw_txt, logger):
    c = Cleaner()
    cleaned_text_list = c.clean(raw_txt)
    logger.info('Done cleaning')
    logger.debug(len(cleaned_text_list))
    logger.debug(cleaned_text_list)

    matrix_builder = MatrixBuilder()
    matrix = matrix_builder.build_sim_matrix(cleaned_text_list, logger)
    logger.info('Done building sim matrix')
    logger.debug('Dimensions: {}'.format(matrix.shape))
    logger.debug(matrix)

    g = Grapher()
    pageranks = g.graph(matrix)
    logger.info('Generated graph and got pageranks')
    logger.debug(pageranks)

    # Pick the summary length as a decreasing fraction of the document size
    total_doc_size = len(cleaned_text_list)
    if total_doc_size <= 300:
        summary_length = int(0.4 * total_doc_size)
    elif total_doc_size <= 800:
        summary_length = int(0.2 * total_doc_size)
    elif total_doc_size <= 1500:
        summary_length = int(0.1 * total_doc_size)
    else:
        summary_length = int(0.05 * total_doc_size)

    top_ranked = nlargest(summary_length, pageranks, key=pageranks.get)
    top_ranked.sort()

    cl = Cluster()
    top_ranked = cl.splitIntoParagraph(top_ranked, 7.5)
    logger.debug(top_ranked)

    result = ''
    for paragraph in top_ranked:
        for key in paragraph:
            top_ranked_sentence = cleaned_text_list[key]
            result += '{}. '.format(top_ranked_sentence)
        result += '\n\n'

    try:
        del c
        del cleaned_text_list
        del matrix_builder
        del matrix
        del g
        del pageranks
        del total_doc_size
        del summary_length
        del top_ranked
        del cl
        del raw_txt
    except:
        pass

    return result
def setUp(self):
    print "### setUp ###"
    self.cluster = Cluster()
def set_gadget_snap_single(self, snapnr, path_to_snaphot, verbose=False):
    h = Cluster(None, verbose=verbose)
    h.name = "snap{0}".format(snapnr)
    h.time = snapnr * self.dt
    h.set_gadget_single_halo(snapnr, path_to_snaphot, verbose=verbose)
    setattr(self, h.name, h)
import argparse
import random

import file_reader
from cluster import Cluster
from iris_plant import IrisPlant

parser = argparse.ArgumentParser()
parser.add_argument('k', metavar='k', type=int,
                    help='The number of clusters to be used')
args = parser.parse_args()
k = args.k

clusters = []
plants = file_reader.readFile()
centroids = random.sample(plants, k)

# Initialize the clusters with random centroids
for i in range(0, k):
    cluster = Cluster(i)
    cluster.centroid = centroids[i]
    clusters.append(cluster)

isConverging = False
iterationsLimit = 1000
currentIteration = 0

while (currentIteration < iterationsLimit and not isConverging):
    for cluster in clusters:
        cluster.plants = []
    for plant in plants:
        # The first element represents the closest cluster and the second
        # represents the distance to its centroid
import threading
from random import randrange
import logging

from monitor import send_state_update

logging.basicConfig(format='%(asctime)s - %(levelname)s: %(message)s',
                    datefmt='%H:%M:%S',
                    level=logging.INFO)

from Candidate import Candidate, VoteRequest
from Follower import Follower
from Leader import Leader
from cluster import Cluster, ELECTION_TIMEOUT_MAX

cluster = Cluster()


class TimerThread(threading.Thread):
    def __init__(self, node_id):
        threading.Thread.__init__(self)
        self.node = cluster[node_id]
        self.node_state = Follower(self.node)
        self.election_timeout = float(
            randrange(ELECTION_TIMEOUT_MAX / 2, ELECTION_TIMEOUT_MAX))
        self.election_timer = threading.Timer(self.election_timeout,
                                              self.become_candidate)

    def become_leader(self):
        logging.info(f'{self} become leader and start to send heartbeat ... ')
        send_state_update(self.node_state, self.election_timeout)
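# The TimerThread above registers self.become_candidate as the election-timer
# callback, but the method itself is not shown. A minimal sketch of what that
# follower-to-candidate transition might look like in the usual Raft flow; the
# Candidate constructor arguments and the timer-restart logic here are assumptions:
def become_candidate(self):
    # Election timeout fired without a heartbeat: switch to candidate,
    # report the new state, and arm the timer for the next election round.
    logging.info(f'{self} election timeout, becoming candidate ...')
    self.node_state = Candidate(self.node)
    send_state_update(self.node_state, self.election_timeout)
    self.election_timer = threading.Timer(self.election_timeout,
                                          self.become_candidate)
    self.election_timer.start()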
def dbscan(self, data):
    self.init_params()
    self.data = data

    # Set up the plot
    fig = plt.figure()
    axis_proj = 'rectilinear'
    if self.dim > 2:
        axis_proj = '%dd' % self.dim
    ax = fig.add_subplot(111, projection=axis_proj)

    # Default noise cluster
    noise = Cluster('Noise', self.dim)
    self.clusters.add(noise)

    for point in data:
        if point not in self.visited:
            self.visited.append(point)
            neighbour_pts = self.region_query(point)
            if len(neighbour_pts) < self.min_pts:
                noise.add_point(point)
            else:
                name = 'cluster-%d' % self.cluster_count
                new_cluster = Cluster(name, self.dim)
                self.cluster_count += 1
                self.expand_cluster(new_cluster, point, neighbour_pts)

                if self.dim == 2:
                    ax.scatter(new_cluster.get_X(), new_cluster.get_Y(),
                               c=self.color[self.cluster_count % len(self.color)],
                               marker='o', label=name)
                elif self.dim == 3:
                    ax.scatter(new_cluster.get_X(), new_cluster.get_Y(),
                               new_cluster.get_Z(), marker='o',
                               c=self.color[self.cluster_count % len(self.color)],
                               label=name)
                ax.hold(True)

    if len(noise.get_points()) != 0:
        if self.dim > 2:
            ax.scatter(noise.get_X(), noise.get_Y(), noise.get_Z(),
                       marker='x', label=noise.name)
        else:
            ax.scatter(noise.get_X(), noise.get_Y(), marker='x', label=noise.name)

    print("Number of clusters found: %d" % self.cluster_count)

    ax.hold(False)
    ax.legend(loc='lower left')
    ax.grid(True)
    plt.title(r'DBSCAN Clustering', fontsize=18)
    plt.show()
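# dbscan() delegates the two core DBSCAN steps to region_query (epsilon neighbourhood)
# and expand_cluster (grow a cluster from a core point), which are not shown here.
# A minimal sketch of what they typically do, assuming points are coordinate tuples
# and that self.eps holds the neighbourhood radius (both assumptions); it also glosses
# over re-claiming points already assigned to noise or another cluster:
def region_query(self, point):
    # All points within self.eps of the given point (Euclidean)
    return [
        other for other in self.data
        if sum((a - b) ** 2 for a, b in zip(point, other)) ** 0.5 <= self.eps
    ]

def expand_cluster(self, cluster, point, neighbour_pts):
    # Standard DBSCAN expansion: absorb every point density-reachable from `point`
    cluster.add_point(point)
    seeds = [p for p in neighbour_pts if p != point]
    while seeds:
        current = seeds.pop()
        if current not in self.visited:
            self.visited.append(current)
            current_neighbours = self.region_query(current)
            if len(current_neighbours) >= self.min_pts:
                # `current` is a core point too: its neighbours are reachable as well
                seeds.extend(p for p in current_neighbours
                             if p not in cluster.get_points())
        if current not in cluster.get_points():
            cluster.add_point(current)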
def create_clusters(node_data):
    for node_name, node_tags in node_data:
        c = Cluster(ID=int(node_name))
        c.add_node(node_name, node_tags)
        clusterglobals.clusters_list.append(c)
# Excerpt: this loop runs inside an outer loop over i, filling the commute-time
# distance matrix N from the Laplacian pseudoinverse lPlus and the graph volume vg.
for j in range(len(nodes)):
    e = np.zeros((len(nodes), 1))
    e[i] = 1
    e[j] = -1
    N[i][j] = vg * np.matrix.dot(
        np.matrix.dot(np.matrix.transpose(e), lPlus), e)

# Initialize clusters
clusters = []
prototypes = set()
for i in range(numOfClusters):
    rand = np.random.randint(0, len(nodes) - 1)
    while rand in prototypes:
        rand = np.random.randint(0, len(nodes) - 1)
    prototypes.add(rand)
    clusters.append(Cluster(i, rand))

# Do labeling
labels = np.zeros(len(nodes))
iteration = 0
while True:
    iteration += 1
    if iteration > maxIterations:
        break
    print('Iteration', iteration, '/', maxIterations)
    changed = False

    # Allocation of the observations
    for i in range(len(nodes)):
        min = math.inf
        nearestCluster = None
        for cluster in clusters:
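# lPlus above is the Moore-Penrose pseudoinverse of the graph Laplacian and vg the
# graph volume (sum of degrees); together they give the commute-time distance
# vg * (e_i - e_j)^T L+ (e_i - e_j). A small sketch of computing those quantities
# from an adjacency matrix (variable names mirror the excerpt; the matrix is made up):
import numpy as np

# Toy weighted adjacency matrix for a 4-node graph (illustrative values)
A = np.array([[0, 1, 1, 0],
              [1, 0, 1, 0],
              [1, 1, 0, 1],
              [0, 0, 1, 0]], dtype=float)

degrees = A.sum(axis=1)
L = np.diag(degrees) - A          # graph Laplacian
lPlus = np.linalg.pinv(L)         # Moore-Penrose pseudoinverse
vg = degrees.sum()                # graph volume

# Commute-time distance between nodes 0 and 3
e = np.zeros((len(A), 1))
e[0], e[3] = 1, -1
print(float(vg * (e.T @ lPlus @ e)))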
def ABHclustering(self, constraints, final_n_of_clusters, clusters=None): """ Main hierarhical clustering loop """ self.l.log("Creating transitive ML closure...") stevec = len(clusters) for x in constraints: if 'must-link' in x: #print("omejitev: ", x) kluc1 = self.getClusterID(x['point'][0], clusters) kluc2 = self.getClusterID(x['must-link'][0], clusters) #print(kluc1, " | " , kluc2," | ", stevec, kluc1 == kluc2) if kluc1 != kluc2: tocke = [] tocke.append(clusters[kluc1].points) tocke.append(clusters[kluc2].points) clusters.pop(kluc1) clusters.pop(kluc2) nov = Cluster(stevec) nov.update(kluc1, kluc2, 0, tocke) #TLE DEJ NOT clusters.update({stevec: nov}) stevec += 1 m = stevec self.l.log("Creating distance matrix....") self.distances = {} self.clusters = clusters stevec = 0 for c in self.clusters: print(self.clusters[c].points, self.clusters[c].clusterId) for p in self.clusters[c].points: stevec += 1 print(len(self.clusters), stevec) z = [(clusters[a].clusterId, clusters[b].clusterId) for a in self.clusters for b in self.clusters] for l in z: kljuc1 = str(l[0]) + " " + str(l[1]) kljuc2 = str(l[1]) + " " + str(l[0]) if l[0] != l[1]: if kljuc1 in self.distances: continue elif kljuc2 in self.distances: continue else: if self.linkage == "Ward": c = [] u = [] v = [] for p in self.clusters[l[0]].points: c.append(p.coords) u.append(p.coords) for r in self.clusters[l[1]].points: c.append(r.coords) v.append(r.coords) centroid_uv = np.average(c, axis=0) centroid_u = np.average(u, axis=0) centroid_v = np.average(v, axis=0) dist1 = 0 dist2 = 0 dist3 = 0 for point in c: if self.distance_type == "Cosine": dist1 += spatial.distance.cosine( centroid_uv, point)**2 elif self.distance_type == "Euclidean": dist1 += spatial.distance.euclidean( centroid_uv, point)**2 for point in u: if self.distance_type == "Cosine": dist2 += spatial.distance.cosine( centroid_u, point)**2 elif self.distance_type == "Euclidean": dist2 += spatial.distance.euclidean( centroid_u, point)**2 for point in v: if self.distance_type == "Cosine": dist3 += spatial.distance.cosine( centroid_v, point)**2 elif self.distance_type == "Euclidean": dist3 += spatial.distance.euclidean( centroid_v, point)**2 dist = dist1 - dist2 - dist3 self.distances.update({kljuc1: dist}) elif self.linkage == "Average": u = [(a, b) for a in self.clusters[l[0]].points for b in self.clusters[l[1]].points] dist = self.average_linkage(u) self.distances.update({kljuc1: dist}) else: print("Error creating distance matrix...") exit(1) self.l.log("Finding clusters...") ''' s = sorted(self.distances.items(), key=lambda x: x[1]) for k, v in s: print(k, v) ''' print("st. 
omejitev: ", len(constraints)) #print("clustri: ", self.clusters.keys()) self.Z = np.array([]) #n = len(self.points) #na začetku je vsak primer svoj cluster n = len(self.clusters) idZ = 0 stop_clustering = False while (n != final_n_of_clusters): #print("### ",n," ###") condition = True #clusters_checked = [] while condition: """ if len(clusters_checked) == len(self.clusters): print("Ni mozno nadaljne zruzevanje, ostalo je ",len(self.clusters)," clustrov.") break dist, pair = self.closest_clusters(clusters_checked) if(pair is None): stop_clustering = True break par = list() for el in pair: par.append(el) self.constraints = self.sort_constraints() #ali ima katerakoli tocka iz obeh clustrov ML, jo zdruzi in ponovno poisci najblizja clustra #ML_pair = self.check_must_link(constraints, self.clusters[par[0]].points) if ML_pair == -1: ML_pair = self.check_must_link(constraints, self.clusters[par[1]].points) if ML_pair != -1: par[0] = ML_pair[0] par[1] = ML_pair[1] condition = self.check_cannot_link(constraints, self.clusters[par[0]].points, self.clusters[par[1]].points) if condition: clusters_checked.append([par[0], par[1]]) dist = self.cluster_distance(par[0], par[1]) if stop_clustering: break #print("par: ", par, ", dist: ", round(dist,2), " ", len(self.clusters)) """ key = min(self.distances, key=self.distances.get) kljuc = key par = key.split(' ') par = [int(i) for i in par] dist = self.distances[kljuc] #print(" ->",key, " ", self.check_cannot_link(constraints, self.clusters[par[0]].points, self.clusters[par[1]].points)) if self.check_cannot_link(constraints, self.clusters[par[0]].points, self.clusters[par[1]].points): self.distances[kljuc] = sys.maxsize if dist == sys.maxsize: self.l.log( "ABHC cannot find clusters under those constraints..." ) return self.clusters print(" Cannot link:", par) else: break # print("--------------------") # print(par[0], par[1]) #print(self.distances.keys()) self.distances.pop(kljuc, None) self.izbrisi_razdalje(par[0]) self.izbrisi_razdalje(par[1]) #print(self.distances.keys()) tocke = [] tocke.append(self.clusters[par[0]].points) tocke.append(self.clusters[par[1]].points) #print("tocke: ", len(tocke)) novCluster = Cluster(m + idZ) novCluster.update(par[0], par[1], dist, tocke) novCluster.centroid = novCluster.calculateCentroid() self.clusters.pop(par[0]) self.clusters.pop(par[1]) self.clusters.update({(m + idZ): novCluster}) #print("clustri:") #print(self.clusters.keys()) #print("dodajam razdalje...") #print("NOV:" ,m+idZ) self.dodaj_razdalje(m + idZ) print("par: ", par, "dist: ", '%.08f' % dist) if idZ == 0: self.Z = [par[0], par[1], dist, novCluster.n] else: newrow = [par[0], par[1], dist, novCluster.n] self.Z = np.vstack([self.Z, newrow]) n = len(self.clusters) idZ += 1 #zapomni si primere, kateri so v drugi skupini kot v prejšni iteraciji. self.diff = [] clusters_checked = set() hm = 0 for cluster in self.clusters: val = -1 for point in self.clusters[cluster].points: hm += 1 if val < 0: val = self.prev_dict[point.reference] if val in clusters_checked: self.diff.append(point.reference) else: if val != self.prev_dict[point.reference]: self.diff.append(point.reference) clusters_checked.add(val) self.prev_dict = self.make_dict() print(len(self.diff)) print(sorted(self.diff)) print("stevilo primerov: ", hm) return self.clusters
def runAlgorithm(state: str, populationVar: float, compactnessLvl: str): globalPrecinctDict.clear() globalClusterList.clear() tempClusters.clear() clusterToPrecinctListDict.clear() globalNXGraph = nx.Graph() noValidEdgeCount = 0 precinctsJSONData = {} if state.lower() == myconstants.ARKANSAS or state.lower( ) == myconstants.ARKANSAS_ABBREVIATION: requestedNumDistricts = myconstants.ARKANSAS_NUM_DISTRICTS with open(myconstants.ARKANSAS_NEIGHBOR_FILENAME) as f: precinctsJSONData = json.load(f) #print("arkansas selected") elif state.lower() == myconstants.VIRGINIA or state.lower( ) == myconstants.VIRGINIA_ABBREVIATION: requestedNumDistricts = myconstants.VIRGINIA_NUM_DISTRICTS with open(myconstants.VIRGINIA_NEIGHBOR_FILENAME) as f: precinctsJSONData = json.load(f) #print("virginia selected") else: requestedNumDistricts = myconstants.SOUTHCAROLINA_NUM_DISTRICTS with open(myconstants.SOUTHCAROLINA_NEIGHBOR_FILENAME) as f: precinctsJSONData = json.load(f) #print("south carolina selected") for id in precinctsJSONData: globalPrecinctDict.update({id: Precinct(precinctsJSONData[id])}) globalPrecinctDict[id].setPrecinctID(id) # load initial cluster objects into list, initialize precinct neighborLists with objects instead of numbers for p in globalPrecinctDict.values(): neighborLst = [] for id in p.getPrecinctNeighbors(): neighborLst.append(globalPrecinctDict[id]) p.setPrecinctNeighbors(set(neighborLst)) globalClusterList.append( Cluster(p)) # create a cluster out of precinct # turn the precinct neighbor list into its corresponding cluster neighbor list for c in globalClusterList: correspondingClusterNeighborList = [] for p in c.getClusterNeighborsList(): for c1 in globalClusterList: if p.getPrecinctID() == c1.getClusterID(): correspondingClusterNeighborList.append(c1) c.setClusterNeighborsList(correspondingClusterNeighborList) # set up job params idealPopulation = calcIdealPopulation() populationVariance = populationVar compactness = compactnessLvl # possible compactness levels: not, somewhat, very, extremely # try with contracted_nodes and nx for c in globalClusterList: globalNXGraph.add_node(c.getClusterID()) edgeList = [] for c in globalClusterList: for n in c.getClusterNeighborsList(): if c.getClusterID() is not n.getClusterID(): edgeList.append(tuple((c.getClusterID(), n.getClusterID()))) edgeList = list(set(tuple(sorted(edge)) for edge in edgeList)) globalNXGraph.add_edges_from(edgeList) for c in globalClusterList: clusterToPrecinctListDict.update( {c.getClusterID(): [c.getClusterID()]}) # UC29: Generate Seed Districting # Merge random clusters until there are {requestedNumDistricts} clusters left #currNumClusters = len(globalClusterList) currNumClusters = len(list(globalNXGraph.nodes)) while (currNumClusters > requestedNumDistricts): randomNode = random.sample(list(globalNXGraph.nodes), 1) randomNode = randomNode[0] if list(globalNXGraph.adj[randomNode]): randomNodeNeighbor = random.sample( list(globalNXGraph.adj[randomNode]), 1) randomNodeNeighbor = randomNodeNeighbor[0] globalNXGraph = nx.contracted_nodes(globalNXGraph, randomNode, randomNodeNeighbor, self_loops=False) precinctList1 = clusterToPrecinctListDict[randomNode] precinctList2 = clusterToPrecinctListDict[randomNodeNeighbor] clusterToPrecinctListDict[ randomNode] = precinctList1 + precinctList2 del clusterToPrecinctListDict[randomNodeNeighbor] else: continue currNumClusters = len(list(globalNXGraph.nodes)) currIterationCount = 1 start_time = time.time() # keep iterating until all clusters are acceptable or until we've hit our 
iteration limit while currIterationCount <= numIterations: #or allClustersAcceptableCheck(): # UC30: Generate a random districting satisfying constraints # Combine the two sub-graphs to form a new sub-graph of simple nodes. #print() #print("ITERATION {}".format(currIterationCount)) #print("Picking two random clusters to merge...") randomNode = random.sample(list(globalNXGraph.nodes), 1) randomNode = randomNode[0] randomNodeNeighbor = random.sample(list(globalNXGraph.adj[randomNode]), 1) randomNodeNeighbor = randomNodeNeighbor[0] #randomCluster = random.sample(globalClusterList, 1) # random.sample returns a list #randomClusterNeighbor = random.sample(randomCluster[0].getClusterNeighborsList(), 1) #print("Merging {} and {}".format(randomCluster[0], randomClusterNeighbor[0])) #print() tempClusters.clear( ) # update tempvar to keep track of currently merged clusters tempClusters.append(randomNode) tempClusters.append(randomNodeNeighbor) #print(tempClusters) combinedCluster = combineClusters(randomNode, randomNodeNeighbor) #print("Precincts: ") #print(stringifyList(combinedCluster.getClusterPrecinctsList())) #print(len(combinedCluster.getClusterPrecinctsList())) #print("Edges: ") #print(combinedCluster.getClusterEdgeList()) #print(len(combinedCluster.getClusterEdgeList())) # UC31: Generate a spanning tree of the combined sub-graph above #print() #print("Generating a spanning tree...") stEdgeList = generateSpanningTree(combinedCluster) #print(stringifyList(stEdgeList)) # UC33: Generate a feasible set of edges in the spanning tree to cut # UC32: Calculate the acceptability of each newly generated sub-graph #validEdgeList = findValidEdges(stEdgeList, idealPopulation, populationVar, compactness) validEdge = findValidEdge(stEdgeList, idealPopulation, populationVar, compactness) #print() #print("Generated valid edge list...") #print(len(validEdgeList)) #print(validEdgeList) #print() #if not validEdgeList: if not validEdge: #print("No valid edges, moving onto next iteration.") noValidEdgeCount = noValidEdgeCount + 1 pass else: # UC34: Cut the edge in the combined sub-graph #print("Choosing a random edge from valid edge list...") #randomValidEdge = random.sample(validEdgeList, 1) # random.sample returns a list #randomValidEdge = randomValidEdge[0] #print("Chosen edge: {}".format(randomValidEdge)) #print() #print("Resulting new clusters from edge cut...") #newlyCreatedClusters = cutEdge(stEdgeList, randomValidEdge, globalNXGraph) newlyCreatedClusters = cutEdge(stEdgeList, validEdge, globalNXGraph) del clusterToPrecinctListDict[tempClusters[0]] del clusterToPrecinctListDict[tempClusters[1]] clusterToPrecinctListDict.update( {newlyCreatedClusters[0][0]: newlyCreatedClusters[0]}) clusterToPrecinctListDict.update( {newlyCreatedClusters[1][0]: newlyCreatedClusters[1]}) # print cluster - population - num precincts ''' print("Merged and Cut Clusters") print(tempClusters[0]) print(tempClusters[1]) print("Results:") print(list(globalNXGraph.nodes)) tempList = [] for id in clusterToPrecinctListDict: tempList.append(len(clusterToPrecinctListDict[id])) print(tempList) tempList.clear() for id in clusterToPrecinctListDict: tempList.append(getClusterTotalPopulation(id)) print(tempList) tempList.clear() for id in clusterToPrecinctListDict: print(list(globalNXGraph.adj[id])) print() ''' currIterationCount = currIterationCount + 1 # increment counter returnPlan = [] for id in clusterToPrecinctListDict: returnPlan.append(clusterToPrecinctListDict[id]) #print("No valid edge times: {}". 
format(noValidEdgeCount)) return returnPlan
    # Excerpt: tail of a simulation-state route handler
    if sim_info['state']['wrf'] != 'completed':
        sim_state = get_simulation_state(sim_info['log_file'])
        sim_info['state'] = sim_state
        sim_info['last_updated'] = to_esmf(datetime.now())
        json.dump(sim_info, open('simulations/' + sim_id + '.json', 'w'))
    return json.dumps(sim_state)


@app.route("/remove_sim/<sim_id>")
def remove_sim(sim_id=None):
    if sim_id is not None:
        if sim_id in simulations:
            del simulations[sim_id]
            os.remove('simulations/' + sim_id + '.json')
            return "OK"
        else:
            return "NotFound"


@app.route("/all_sims")
def get_all_sims():
    return json.dumps(simulations, indent=4, separators=(',', ': '))


if __name__ == '__main__':
    profiles = load_profiles()
    cluster = Cluster(json.load(open('etc/cluster.json')))
    wrfxpy = json.load(open('etc/wrfxpy.json'))
    simulations = load_simulations()
    app.run(debug=True)
def hierarhicalClustering(self, clusters=None): """ Main hierarhical clustering loop """ distanca = 0 self.l.log("Building distance matrix...") n = len(self.points) #na začetku je vsak primer svoj cluster data = [] for c in self.clusters: p = [point.coords for point in self.clusters[c].points] data.append(p[0]) df = pd.DataFrame(data, columns=np.array([a for a in self.attributes])) n_df = (df.values) self.d_matrix = np.zeros(((df.values).shape[0], (df.values).shape[0])) for i in range((df.values).shape[0]): for j in range((df.values).shape[0]): kljuc1 = str(i) + ' ' + str(j) kljuc2 = str(j) + ' ' + str(i) if i != j: if kljuc1 in self.distances: continue elif kljuc2 in self.distances: continue else: if self.linkage == "Ward": l = [] l.append(n_df[i]) l.append(n_df[j]) centroid = np.average(l, axis=0) dist = 0 if self.distance_type == "Cosine": dist += spatial.distance.cosine( centroid, n_df[i])**2 dist += spatial.distance.cosine( centroid, n_df[j])**2 elif self.distance_type == "Euclidean": dist += spatial.distance.euclidean( centroid, n_df[i])**2 dist += spatial.distance.euclidean( centroid, n_df[i])**2 self.distances.update({kljuc1: dist}) elif self.linkage == "Average": if self.distance_type == "Cosine": dist = spatial.distance.cosine( n_df[i], n_df[j]) elif self.distance_type == "Euclidean": dist = spatial.distance.euclidean( n_df[i], n_df[j]) self.distances.update({kljuc1: dist}) else: print("Error creating distance matrix...") exit(1) idZ = 0 m = len(self.points) self.l.log("Finding clusters...") while n > 1: """ dist, pair = self.closest_clusters() par = list() for el in pair: par.append(el) dist = np.amin(self.d_matrix) result = np.where(self.d_matrix == dist) par = list() for el in result[0]: par.append(el) print("--",par) """ key = min(self.distances, key=self.distances.get) par = key.split(' ') par = [int(i) for i in par] dist = self.distances[key] #print("--------------------") #print(par[0], par[1]) self.distances.pop(key, None) self.izbrisi_razdalje(par[0]) self.izbrisi_razdalje(par[1]) #print("5 238" in self.distances) tocke = [] tocke.append(self.clusters[par[0]].points) tocke.append(self.clusters[par[1]].points) #print("tocke: ", len(tocke)) novCluster = Cluster(m + idZ) novCluster.update(par[0], par[1], dist, tocke) novCluster.centroid = novCluster.calculateCentroid() self.clusters.pop(par[0]) self.clusters.pop(par[1]) self.clusters.update({(m + idZ): novCluster}) #print("dodajam razdalje...") self.dodaj_razdalje(m + idZ) """ novCluster = Cluster(par[0]) novCluster.update(par[0], par[1], dist, tocke) novCluster.centroid = novCluster.calculateCentroid() self.clusters.pop(par[0]) self.clusters.pop(par[1]) self.clusters.update({(par[0]): novCluster}) #TODO: preracunaj razdalje v matriki razdalj """ if idZ == 0: self.Z = [par[0], par[1], dist, novCluster.n] else: newrow = [par[0], par[1], dist, novCluster.n] self.Z = np.vstack([self.Z, newrow]) n = len(self.clusters) #self.vseSilhuete.update({idZ: self.metodaSilhuet()}) print("par: ", par, ", dist: ", '%.08f' % dist) #print(idZ, n, m+idZ) idZ += 1 self.l.log("Dendrogram created...") #vrnil naj bi matriko Z, in rezultate metod, ki nam povejo koliko clustrov je #print("Optimalno stevilo clustrov po metodi silhuet: ", len(self.points)-1-max(self.vseSilhuete.items(), key=operator.itemgetter(1))[0]) return self.clusters
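# The Ward branch above scores a merge by how much the within-cluster sum of squared
# distances grows: SSE of the merged cluster minus the SSEs of the two parts. A compact
# standalone illustration of that quantity on toy 2D points (made-up data, Euclidean
# case only, not the project's Cluster machinery):
import numpy as np
from scipy import spatial

def sse(points):
    # Sum of squared distances of the points to their centroid
    centroid = np.average(points, axis=0)
    return sum(spatial.distance.euclidean(centroid, p) ** 2 for p in points)

u = np.array([[0.0, 0.0], [1.0, 0.0]])
v = np.array([[5.0, 5.0], [6.0, 5.0]])

ward_increase = sse(np.vstack([u, v])) - sse(u) - sse(v)
print(ward_increase)  # large, because u and v are far apart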
def new_cluster(x, frac, fac, p, m):
    k = 1
    inv_cov = get_inv_cov(x, frac)
    S = [x.tolist()]
    return Cluster(centroid=x, inv_cov=inv_cov, k=k, S=S)
def CAclustering(self, constraints, final_n_of_clusters, clusters=None): """ Main hierarhical clustering loop """ self.l.log("Creating transitive ML closure...") stevec = len(clusters) for c in clusters: print(clusters[c].clusterId, clusters[c].points) for x in constraints: if 'must-link' in x: print("omejitev: ", x) kluc1 = self.getClusterID(x['point'][0], clusters) kluc2 = self.getClusterID(x['must-link'][0], clusters) print(kluc1, " | ", kluc2, " | ", stevec, kluc1 == kluc2) if kluc1 != kluc2: tocke = [] tocke.append(clusters[kluc1].points) tocke.append(clusters[kluc2].points) clusters.pop(kluc1) clusters.pop(kluc2) nov = Cluster(stevec) nov.update(kluc1, kluc2, 0, tocke) # TLE DEJ NOT clusters.update({stevec: nov}) stevec += 1 m = stevec self.l.log("Creating distance matrix....") self.distances = {} self.clusters = clusters stevec = 0 for c in self.clusters: print(self.clusters[c].points, self.clusters[c].clusterId) for p in self.clusters[c].points: stevec += 1 print(len(self.clusters), stevec) z = [(clusters[a].clusterId, clusters[b].clusterId) for a in self.clusters for b in self.clusters] for l in z: kljuc1 = str(l[0]) + " " + str(l[1]) kljuc2 = str(l[1]) + " " + str(l[0]) if l[0] != l[1]: if kljuc1 in self.distances: continue elif kljuc2 in self.distances: continue else: if self.linkage == "Ward": c = [] u = [] v = [] for p in self.clusters[l[0]].points: c.append(p.coords) u.append(p.coords) for r in self.clusters[l[1]].points: c.append(r.coords) v.append(r.coords) centroid_uv = np.average(c, axis=0) centroid_u = np.average(u, axis=0) centroid_v = np.average(v, axis=0) dist1 = 0 dist2 = 0 dist3 = 0 for point in c: if self.distance_type == "Cosine": dist1 += spatial.distance.cosine( centroid_uv, point)**2 elif self.distance_type == "Euclidean": dist1 += spatial.distance.euclidean( centroid_uv, point)**2 for point in u: if self.distance_type == "Cosine": dist2 += spatial.distance.cosine( centroid_u, point)**2 elif self.distance_type == "Euclidean": dist2 += spatial.distance.euclidean( centroid_u, point)**2 for point in v: if self.distance_type == "Cosine": dist3 += spatial.distance.cosine( centroid_v, point)**2 elif self.distance_type == "Euclidean": dist3 += spatial.distance.euclidean( centroid_v, point)**2 dist = dist1 - dist2 - dist3 self.distances.update({kljuc1: dist}) elif self.linkage == "Average": u = [(a, b) for a in self.clusters[l[0]].points for b in self.clusters[l[1]].points] dist = self.average_linkage(u) self.distances.update({kljuc1: dist}) else: print("Error creating distance matrix...") exit(1) self.l.log("Finding clusters...") ''' s = sorted(self.distances.items(), key=lambda x: x[1]) for k, v in s: print(k, v) ''' print("st. omejitev: ", len(constraints)) # print("clustri: ", self.clusters.keys()) self.Z = np.array([]) # n = len(self.points) #na začetku je vsak primer svoj cluster n = len(self.clusters) idZ = 0 stop_clustering = False while (n != final_n_of_clusters): # print("### ",n," ###") condition = True # clusters_checked = [] while condition: key = min(self.distances, key=self.distances.get) kljuc = key par = key.split(' ') par = [int(i) for i in par] dist = self.distances[kljuc] # print(" ->",key, " ", self.check_cannot_link(constraints, self.clusters[par[0]].points, self.clusters[par[1]].points)) if self.check_cannot_link(constraints, self.clusters[par[0]].points, self.clusters[par[1]].points): self.distances[kljuc] = sys.maxsize if dist == sys.maxsize: self.l.log( "ABHC cannot find clusters under those constraints..." 
) return self.clusters print(" Cannot link:", par) else: break # print("--------------------") # print(par[0], par[1]) # print(self.distances.keys()) self.distances.pop(kljuc, None) self.izbrisi_razdalje(par[0]) self.izbrisi_razdalje(par[1]) # print(self.distances.keys()) tocke = [] tocke.append(self.clusters[par[0]].points) tocke.append(self.clusters[par[1]].points) # print("tocke: ", len(tocke)) novCluster = Cluster(m + idZ) novCluster.update(par[0], par[1], dist, tocke) novCluster.centroid = novCluster.calculateCentroid() self.clusters.pop(par[0]) self.clusters.pop(par[1]) self.clusters.update({(m + idZ): novCluster}) # print("clustri:") # print(self.clusters.keys()) # print("dodajam razdalje...") # print("NOV:" ,m+idZ) self.dodaj_razdalje(m + idZ) print("par: ", par, "dist: ", '%.08f' % dist) if idZ == 0: self.Z = [par[0], par[1], dist, novCluster.n] else: newrow = [par[0], par[1], dist, novCluster.n] self.Z = np.vstack([self.Z, newrow]) n = len(self.clusters) idZ += 1 return self.clusters
sys.path.append(src_path)

from config import Config
from preprocessor import Preprocessor
from points import Points
from cluster import Cluster
from transfer_network import TransferNetwork
from transfer_probability import TransferProbability
from most_popular_route import MostPopularRoute
from figure import Figure

# get points from trajectories
preprocessor = Preprocessor(
    Config.DATASET_ROOT_DIR,
    Config.DATASET_SCALE)
points = preprocessor.get_points()

# use coherence expanded algorithm to form clusters
clusters = Cluster(points).coherence_expanding()
network = TransferNetwork(points, clusters)

# derive transfer probability
tp = TransferProbability(network)
tp.derive()

# search the most popular route
mpr = MostPopularRoute(network)
route = mpr.search(0, 6)
print(route)

figure = Figure()
figure.most_popular_route(points, network, route).show()