Example #1
def main():
    print(
        'Start Scenario 1, k-means clustering with semi-labeled data\nUsing jieba syntactic, jieba semantic features ...'
    )
    test_cluster = cluster.Cluster()
    test_cluster.syntactic_analysis('jieba')
    test_cluster.semantic_analysis()
    test_cluster.write_syntactic_feature('syntactic_jieba.pickle')
    _, result_1_syntactic = test_cluster.evaluate('Syntactic',
                                                  'result_1_syntactic.pickle')
    _, result_1_semantic = test_cluster.evaluate('Semantic',
                                                 'result_1_semantic.pickle')
    _, result_1_both = test_cluster.evaluate('Syntactic_Semantic',
                                             'result_1_both.pickle')

    # print('Start Scenario 2, k-means clustering with semi-labeled data\nUsing ckip syntactic, ckip semantic features ...')
    # test_cluster2 = cluster.Cluster()
    # test_cluster2.syntactic_analysis('ckip')
    # _, result_2_syntactic = test_cluster2.evaluate('Syntactic', 'result_2_syntactic.pickle')
    # _, result_2_semantic = test_cluster2.evaluate('Semantic', 'result_2_semantic.pickle')
    # _, result_2_both = test_cluster2.evaluate('Syntactic_Semantic', 'result_2_both.pickle')

    print(
        'Start Scenario 3, k-means clustering with semi-labeled data\nUsing jieba syntactic, ckip semantic features ...'
    )
    test_cluster3 = cluster.Cluster()
    test_cluster3.syntactic_analysis('jieba')
    test_cluster3.semantic_analysis()
    _, result_3_both = test_cluster3.evaluate('Syntactic_Semantic',
                                              'result_3_both.pickle')
Example #2
def test_features_syn():
	docs = doc.get_docs_nested(get_data_dir(sys.argv[2]))
	max_size = int(sys.argv[3])
	num_combine = int(sys.argv[4])
	min_size = int(sys.argv[5])

	d = collections.defaultdict(list)
	for _doc in docs:
		d[_doc.label].append(_doc)
	pure_clusters = d.values()
	broken_clusters = list()
	for _ in range(10):
		for _cluster in pure_clusters:
			broken_clusters += [_cluster[i:i + max_size] for i in range(0, len(_cluster), max_size)]
		combined_clusters = list()
		while broken_clusters:
			if len(broken_clusters) < num_combine:
				clusters = list(broken_clusters)
			else:
				clusters = random.sample(broken_clusters, num_combine)
			for _cluster in clusters:
				broken_clusters.remove(_cluster)
			combined_clusters.append(utils.flatten(clusters))

		clusters = [cluster.Cluster(combined) for combined in combined_clusters]
		ncluster.test_features(clusters, min_size)
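utils.flatten is project-specific and not shown; as used here it only needs to merge the sampled sub-clusters (lists) into a single list. A minimal sketch of that one-level flatten, stated as an assumption about its behavior:

import itertools

def flatten(list_of_lists):
    # Merge one level of nesting: [[a, b], [c]] -> [a, b, c]
    return list(itertools.chain.from_iterable(list_of_lists))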
Example #3
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters
    Note: the function may not mutate cluster_list
    
    Input: List of clusters, integers number of clusters and number of iterations
    Output: List of clusters whose length is num_clusters
    """

    # position initial clusters at the locations of the clusters with the largest populations
    clusters = sorted(cluster_list, key=lambda c: c.total_population(),
                      reverse=True)[:num_clusters]
    
    for _ in range(num_iterations):
        # initialize num_clusters (i.e. k) empty clusters
        empty_clusters = [cluster.Cluster(set([]), 0, 0, 0, 0) for _ in range(num_clusters)]

        # merge each input cluster into the empty cluster nearest its center
        for jdx in range(len(cluster_list)):
            distance, merge_with = float('inf'), None
            for cluster1 in clusters:
                new_distance = cluster_list[jdx].distance(cluster1)
                if new_distance < distance:
                    distance, merge_with = new_distance, cluster1

            empty_clusters[clusters.index(merge_with)].merge_clusters(cluster_list[jdx])
        clusters = empty_clusters
    return clusters
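This function appears to come from the Rice "Algorithmic Thinking" coursework, where cluster.Cluster is provided. A minimal stand-in illustrating only the interface the function above relies on (field names and the merge rule here are assumptions, not the course implementation):

import math

class Cluster:
    """Hypothetical stand-in for the course's cluster.Cluster."""
    def __init__(self, fips_codes, horiz, vert, population, risk):
        self._fips = set(fips_codes)
        self._horiz, self._vert = horiz, vert
        self._population, self._risk = population, risk

    def total_population(self):
        return self._population

    def horiz_center(self):
        return self._horiz

    def vert_center(self):
        return self._vert

    def distance(self, other):
        # Euclidean distance between cluster centers
        return math.hypot(self._horiz - other._horiz, self._vert - other._vert)

    def merge_clusters(self, other):
        # Population-weighted merge of the two centers
        total = self._population + other._population
        if total > 0:
            self._horiz = (self._horiz * self._population + other._horiz * other._population) / total
            self._vert = (self._vert * self._population + other._vert * other._population) / total
        self._fips |= other._fips
        self._population = total
        return self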
Example #4
    def __init__(self, options):
        self.options = options

        # We initialize the scheduling module
        print "Using the platform configuration file %s" % self.options.file
        rc = py_lat_module.lat_module_init ("verbose",
                                            "1",
                                            "ini_config_file",
                                            self.options.file)
        if (rc != 0):
            print "ERROR: lat_module_init() failed (ret: %d)\n" % rc
        print "Success.\n"

        print "Initializing the AFE scheduler..."
        rc = py_lat_module.lat_device_sched_init ();
        if (rc != 0):
            print "ERROR: lat_device_sched_init() failed (ret: %d)\n" % rc
        print "Success.\n";

        print "Initializing the host scheduler..."
        rc = py_lat_module.lat_host_sched_init ();
        if (rc != 0):
            print "ERROR: lat_device_sched_init() failed (ret: %d)\n" % rc
        print "Success.\n";

        print "Initializing the meta scheduler..."
        rc = py_lat_module.lat_meta_sched_init ();
        if (rc != 0):
            print "ERROR: lat_meta_sched_init() failed (ret: %d)\n" % rc
        print "Success.\n"

        self.py_lat_module = py_lat_module

        # Setup the virtual platform
        self.cluster = cluster.Cluster(options)
Example #5
    def __init__(self, nClusters, nTotalUnits, Ti, Tn, NBin_nEntries, ZF):

        #schedule
        self.wakeQ = SortedDict()
        self.now = 0

        self.ZF = ZF
        self.VERBOSE = op.verboseDirector
        self.nClusters = nClusters
        self.Tn = Tn  # it is used when assigning filters to clusters
        self.nUnitsCluster = nTotalUnits // nClusters  # integer division: units per cluster

        #components
        self.centralMem = simpleMemory.SimpleMemory(self, op.CM_size,
                                                    op.CM_nPorts,
                                                    op.CM_bytesCyclePort)
        self.clusters = []
        self.coordsWindow = {}
        self.clustersProcWindow = {}  # [windowID] -> count of clusters processing this window
        self.filtersPending = {}
        self.clustersReadingWindow = {}
        self.output = []
        for i in range(nClusters):
            self.clusters.append(
                cluster.Cluster(self, i, self.nUnitsCluster, Ti, Tn,
                                NBin_nEntries, op.SB_size_per_cluster,
                                self.cbClusterDoneReading, self.cbClusterDone))
Example #6
def run_example():
    """
    Load a data table, compute a list of clusters and 
    plot a list of clusters

    Set DESKTOP = True/False to use either matplotlib or simplegui
    """
    data_table = load_data_table(DATA_3108_URL)

    singleton_list = []
    for line in data_table:
        singleton_list.append(
            cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                            line[4]))

    cluster_list = sequential_clustering(singleton_list, 15)
    print("Displaying " + str(len(cluster_list)) + " sequential clusters")

    #cluster_list = algos.hierarchical_clustering(singleton_list, 9)
    #print("Displaying", len(cluster_list), "hierarchical clusters")

    #cluster_list = algos.kmeans_clustering(singleton_list, 9, 5)
    #print("Displaying", len(cluster_list), "k-means clusters")

    # draw the clusters using matplotlib or simplegui
    if DESKTOP:
        plot.plot_clusters(data_table, cluster_list, False)
        #plot.plot_clusters(data_table, cluster_list, True)  #add cluster centers
    else:
        alg_clusters_simplegui.PlotClusters(
            data_table,
            cluster_list)  # use toggle in GUI to add cluster centers
Example #7
def main():
    if (len(sys.argv) == 2 and sys.argv[1] == "spec_help"):
        spechelp = spec_help.SpecHelp()
        spechelp.express()
        uu = user_usage.UserUsage()
        uu.log_command("spec_help", '')
        exit()

    if (len(sys.argv) < 3 or len(sys.argv) > 5):
        usage()
        exit()
    cspec_path = sys.argv[1]
    permute_command = sys.argv[2]
    if len(sys.argv) == 3:
        scope = ''
        debug = False
    elif len(sys.argv) == 4:
        if sys.argv[3] == '-debug':
            scope = ''
            debug = True
        else:
            scope = sys.argv[3]
            debug = False
    else:  #len(sys.argv) == 5
        if sys.argv[4] != '-debug':
            usage()
            exit()  # otherwise scope and debug would be unbound below
        else:
            scope = sys.argv[3]
            debug = True

    #real_cluster_system = cluster_system.ClusterSystem()
    if (permute_command == "new_spec"):
        cluster_spec.generate_new_spec(cspec_path)
        uu = user_usage.UserUsage()
        uu.log_command(permute_command, '')
        exit()

    validate_args(permute_command, scope)
    validate_cspec_is_cspec(cspec_path)

    f = open(cspec_path, 'r')
    cspec_lines = f.readlines()
    f.close()
    true_stdout = stdout.Stdout()
    validated, missing_optionals = cluster_spec.validate(
        cspec_lines, true_stdout)
    if not (validated):
        exit()
    cspec = cluster_spec.ClusterSpec(cspec_path, cspec_lines, true_stdout,
                                     missing_optionals, True, debug)
    cluster_runs = cluster_runs_info.ClusterRunsInfo(cspec, true_stdout)
    hp_cluster = cluster.Cluster(cluster_runs, true_stdout)
    pdriver = permutation_driver.PermutationDriver(cspec_lines, cspec_path,
                                                   true_stdout, hp_cluster)
    uu = user_usage.UserUsage()
    uu.log_command(permute_command, scope)

    pdriver.run_command(permute_command, scope)
Example #8
def clusterephem(clustername):
    c = cluster.Cluster(clustername)
    # print(c.coordinatestring)
    coords = ','.join(c.coordinatestring)
    coords = coords.replace(' ', ':')
    mag = c.solarmagnitude
    diam = c['diam'] * 60
    result = '%s,f|O,%s,%.1f,2000,%i' % (clustername, coords, mag, diam)
    return result
Example #9
def gen_random_clusters(num_clusters):
    """
    Return a list of clusters with centers corresponding to points of the unit
    square.
    """
    return [
        cluster.Cluster(set([]), uniform(-1, 1), uniform(-1, 1),
                        randrange(1, 101), randrange(1, 101))
        for _ in range(num_clusters)
    ]
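A possible driver for this helper, assuming uniform and randrange were imported from the random module and that a kmeans_clustering such as the one in Example #3 is in scope:

from random import uniform, randrange

test_clusters = gen_random_clusters(200)         # 200 random singleton clusters
result = kmeans_clustering(test_clusters, 9, 5)  # 9 centers, 5 iterations
print(len(result))                               # -> 9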
Example #10
def kmeans(examples, k, verbose=False):
    """
    :param examples: the samples, of type Example
    :param k: the number of cluster centroids
    :param verbose: whether to print progress each iteration
    :return: the k-means clustering result
    """
    # Get k randomly chosen initial centroids, create cluster for each
    initialCentroids = random.sample(examples, k)
    clusters = []
    # From the k initialCentroids, create k Clusters (each initially holds
    # a single point, its corresponding initialCentroid)
    for e in initialCentroids:
        clusters.append(cluster.Cluster([e]))

    # Iterate until centroids do not change
    converged = False
    numIterations = 0
    while not converged:
        numIterations += 1
        # Create a list containing k distinct empty lists
        newClusters = []
        for i in range(k):
            newClusters.append([])

        # Associate each example with closest centroid
        for e in examples:
            # Find the centroid closest to e
            # Assume e is closest to the first cluster's centroid, then update as we scan the rest
            smallestDistance = e.distance(clusters[0].getCentroid())
            index = 0
            for i in range(1, k):
                distance = e.distance(clusters[i].getCentroid())
                if distance < smallestDistance:
                    smallestDistance = distance
                    index = i
            # Add e to the list of examples for appropriate cluster
            newClusters[index].append(e)

        for c in newClusters:  # Avoid having empty clusters
            if len(c) == 0:
                raise ValueError('Empty Cluster')

        # Update each cluster; check if a centroid has changed
        converged = True
        for i in range(k):
            # Cluster i updates its examples and centroid, returning the distance
            # between the old and new centroids; if it is greater than 0 the centroid
            # moved, so per the k-means algorithm we run another while iteration
            # until converged stays True, i.e. clustering is complete
            if clusters[i].update(newClusters[i]) > 0.0:
                converged = False
        if verbose:
            print('Iteration #' + str(numIterations))
            for c in clusters:
                print(c)
            print('')  # add blank line
    return clusters
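kmeans above needs each example to provide distance() and each Cluster to provide getCentroid()/update(); those classes are defined elsewhere in the source project. A minimal sketch of the example side of that interface (the class and method names here are assumptions):

import math

class Example:
    """Hypothetical point type satisfying the calls made by kmeans."""
    def __init__(self, features):
        self.features = features

    def getFeatures(self):
        return self.features

    def distance(self, other):
        # Euclidean distance between the two feature vectors
        return math.sqrt(sum((a - b) ** 2
                             for a, b in zip(self.features, other.getFeatures())))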
Example #11
def main(argv=sys.argv, json_writer=simple_json_writer):  # pragma: no cover
    try:
        
        global logger
        provider_config, logger, fine = util.provider_config_from_environment()
        
        data_dir = os.getenv('PRO_DATA_DIR', os.getcwd())
        hostnamer = util.Hostnamer(provider_config.get("cyclecloud.hostnames.use_fqdn", True))
        cluster_name = provider_config.get("cyclecloud.cluster.name")
        
        provider = CycleCloudProvider(config=provider_config,
                                      cluster=cluster.Cluster(cluster_name, provider_config, logger),
                                      hostnamer=hostnamer,
                                      json_writer=json_writer,
                                      terminate_requests=JsonStore("terminate_requests.json", data_dir),
                                      templates=JsonStore("templates.json", data_dir, formatted=True),
                                      clock=true_gmt_clock)
        provider.fine = fine

        # every command has the format cmd -f input.json        
        cmd, ignore, input_json_path = argv[1:]

        input_json = util.load_json(input_json_path)
        
        if provider.fine:
            logger.debug("Arguments - %s %s %s", cmd, ignore, json.dumps(input_json))
                
        if cmd == "templates":
            provider.templates()
        elif cmd == "create_machines":
            provider.create_machines(input_json)
        elif cmd in ["status", "create_status", "terminate_status"]:
            if "requests" in input_json:
                # provider.status handles both create_status and deprecated terminate_status calls.
                provider.status(input_json)
            elif cmd == "terminate_status":
                # doesn't pass in a requestId but just a list of machines.
                provider.terminate_status(input_json)
            else:
                # should be impossible
                raise RuntimeError("Unexpected input json for cmd %s" % (input_json, cmd))
        elif cmd == "get_return_requests":
            provider.get_return_requests(input_json)
        elif cmd == "terminate_machines":
            provider.terminate_machines(input_json)
            
    except ImportError as e:
        logger.exception(str(e))

    except Exception as e:
        if logger:
            logger.exception(str(e))
        else:
            import traceback
            traceback.print_exc()
Example #12
 def __init__(self):
     cfg = yaml.safe_load(open("config.yaml"))
     mongourl = cfg['mongourl']
     database = cfg['database']
     collection = cfg['collection']
     collection_new = cfg['collection_new']
     self.dataset_path = cfg['dataset_path']
     self.temp_path = cfg['temp_path']
     self.cluster = cluster.Cluster()
     self.vec = vectorize.Vectors()
     self.db = db_helper.Db(mongourl, database, collection)
     self.db_new = db_helper.Db(mongourl, database, collection_new)
Example #13
 def get_cluster(self, name=None):
     """Return all clusters under this datacenter as a list of
     cluster objects or given cluster as cluster object"""
     objs = []
     for obj in self.mor.hostFolder.childEntity:
         if obj.__class__.__name__ != "vim.ClusterComputeResource":
             continue
         if not name or name == obj.name:
             objs.append(cluster.Cluster(self._server, self, obj))
             if name and name == obj.name:
                 break
     if name and len(objs):
         objs = objs[0]
     return objs
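A hedged usage sketch, assuming dc is an instance of the datacenter wrapper this method belongs to (the cluster name is hypothetical):

all_clusters = dc.get_cluster()                # list of Cluster wrappers
prod = dc.get_cluster(name="prod-cluster-01")  # single Cluster object, not a list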
Example #14
def kmeans(examples, k, verbose=False):
    #Get k randomly chosen initial centroids, create cluster for each
    initialCentroids = random.sample(examples, k)
    clusters = []
    for e in initialCentroids:
        clusters.append(cluster.Cluster([e]))
        
    #Iterate until centroids do not change
    converged = False
    numIterations = 0
    while not converged:
        numIterations += 1
        #Create a list containing k distinct empty lists
        newClusters = []
        for i in range(k):
            newClusters.append([])
            
        #Associate each example with closest centroid
        for e in examples:
            #Find the centroid closest to e
            smallestDistance = e.distance(clusters[0].getCentroid())
            index = 0
            for i in range(1, k):
                distance = e.distance(clusters[i].getCentroid())
                if distance < smallestDistance:
                    smallestDistance = distance
                    index = i
            #Add e to the list of examples for appropriate cluster
            newClusters[index].append(e)
            
        for c in newClusters: #Avoid having empty clusters
            if len(c) == 0:
                raise ValueError('Empty Cluster')
        
        #Update each cluster; check if a centroid has changed
        converged = True
        for i in range(k):
            if clusters[i].update(newClusters[i]) > 0.0:
                converged = False
        if verbose:
            print('Iteration #' + str(numIterations))
            for c in clusters:
                print(c)
            print('') #add blank line
    return clusters
Example #15
    def run(self, max_clusters, distances):
        """
        This method runs the agglomerative clustering algorithm until we reach max clusters
        :param max_clusters: max clusters in list clusters
        :param distances: matrix of calculated distances
        :return: our_cluster: list of sorted clusters (by id)
        """
        for sample in self.samples:
            our_cluster = cluster.Cluster(sample.id, set([sample]))
            self.clusters.append(our_cluster)

        while len(self.clusters) > max_clusters:
            merging = self.min_in_distances(distances)
            self.update_clusters(merging[0], merging[1])
        our_clusters = self.sort_by_id()
        return our_clusters
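The distances parameter is a precomputed pairwise matrix; its construction is not shown. One plausible way to build it, assuming each sample carries a numeric vector in a data attribute (an assumption about the sample type):

import math

def build_distance_matrix(samples):
    # Symmetric matrix of Euclidean distances between all pairs of samples
    n = len(samples)
    distances = [[0.0] * n for _ in range(n)]
    for i in range(n):
        for j in range(i + 1, n):
            d = math.dist(samples[i].data, samples[j].data)
            distances[i][j] = distances[j][i] = d
    return distances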
Example #16
def form_clusters(instances, assignments):
    '''
    Takes a list of instances and assignments and returns
    a list of Cluster objects with mocked centers
    '''
    cluster_map = dict()
    m = MockCenter()
    m.label = None
    for x in range(assignments.max() + 1):
        cluster_map[x] = cluster.Cluster(list(), m, x)
    for instance, assignment in zip(instances, assignments):
        cluster_map[assignment].members.append(instance)
    clusters = [c for c in cluster_map.values() if len(c.members)]
    for _cluster in clusters:
        _cluster.set_label()
    return clusters
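assignments only needs a .max() and element access, which matches a NumPy label array such as the labels produced by scikit-learn's KMeans. A sketch of feeding this helper (assumes the form_clusters above and its MockCenter/cluster dependencies are importable; the data is random filler):

import numpy as np
from sklearn.cluster import KMeans

X = np.random.rand(100, 8)                        # 100 instances, 8 features
assignments = KMeans(n_clusters=5).fit_predict(X)
clusters = form_clusters(list(X), assignments)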
Example #17
    def read_clusters_sorted_by_time(file_name, named_entities_file_name):
        tweets = []
        clusters = {}
        clusters_timeline = []
        named_entities = {}
        file_to_read = open(file_name, "r")

        file_entities = open(named_entities_file_name, 'r')

        for line in file_entities.readlines():
            data = line.split(",", 1)
            named_entities[data[0]] = data[1]

        for line in file_to_read.readlines():
            data = line.split(",")
            tokens = [
                en.SimpleEntity(entity)
                if entity not in named_entities.get(data[2], '') else
                en.NamedEntity(entity) for entity in data[5].split(" ")
            ]
            tweet = tw.Tweet(clst_id=data[0],
                             id=data[2],
                             timestamp_ms=data[3],
                             user_id=data[4],
                             tokens=tokens,
                             content=data[6])
            tweets.append(tweet)
            if not clusters.get(data[0]):
                new_cl = cluster.Cluster(clst_id=data[0],
                                         clst_name=data[1],
                                         created_time=data[3])
                clusters[data[0]] = new_cl
                oldest_valid_time = (new_cl.get_created_time()
                                     - ed.Constants.EPOCH * 8)
                for past_cluster in clusters_timeline[::-1]:
                    if past_cluster.get_created_time() >= oldest_valid_time:
                        new_cl.add_past_neighbour(past_cluster)
                    else:
                        break
                clusters_timeline.append(new_cl)

            clusters[data[0]].append(tweet)

        for clst_id in clusters:
            clusters[clst_id].aggregate_entities()
        return tweets, clusters
Example #18
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters
    Note: the function may not mutate cluster_list

    Input: List of clusters, integers number of clusters and number of iterations
    Output: List of clusters whose length is num_clusters
    """
    # position initial clusters at the location of clusters with largest populations
    sorted_cluster = list(cluster_list)
    sorted_cluster.sort(key=lambda cluster: cluster.total_population())
    centers = list()
    for idx_k in range(1, num_clusters + 1):
        horiz = sorted_cluster[len(sorted_cluster) - idx_k].horiz_center()
        vert = sorted_cluster[len(sorted_cluster) - idx_k].vert_center()
        centers.append((horiz, vert))
    assert len(centers) == num_clusters  # force k centers

    # clustering
    for idx_i in range(num_iterations):
        # initialize k empty clusters
        clusters = list()
        for idx_k in range(num_clusters):
            clusters.append(
                cluster.Cluster(set(), centers[idx_k][0], centers[idx_k][1], 0,
                                0))
        # assigning closest points
        for idx_j in range(len(cluster_list)):
            min_dist = float('inf')
            for idx_k in range(num_clusters):
                # compute the distance
                vert = cluster_list[idx_j].vert_center() - centers[idx_k][1]
                horiz = cluster_list[idx_j].horiz_center() - centers[idx_k][0]
                dist = math.sqrt(vert**2 + horiz**2)
                if dist < min_dist:
                    merge_cluster = clusters[idx_k]
                    min_dist = dist
            merge_cluster.merge_clusters(cluster_list[idx_j])
        # adjusting the cluster centers
        if idx_i < num_iterations - 1:
            for idx_k in range(num_clusters):
                horiz = clusters[idx_k].horiz_center()
                vert = clusters[idx_k].vert_center()
                centers[idx_k] = (horiz, vert)

    return clusters
Example #19
def form_clusters_alt(instances, l_idx):
    '''
    instances - list of clustered things
    l_idx - list of lists of indices into instances
        e.g. [ [1, 3, 5], [0, 2, 4] ]
    '''
    clusters = list()
    m = MockCenter()
    m.label = None
    for l in l_idx:
        _cluster = cluster.Cluster(list(), m)
        for idx in l:
            _cluster.members.append(instances[idx])
        clusters.append(_cluster)
    clusters = [c for c in clusters if len(c.members)]
    for _cluster in clusters:
        _cluster.set_label()
    return clusters
Example #20
    def __enter__(self):
        self.last_unused_port = 12247
        import random
        self.clusterName = ''.join(
            [chr(random.choice(range(ord('a'), ord('z')))) for c in range(8)])
        self.num_hosts = 8
        require_hosts(self.num_hosts)
        self.servers = []
        self.cluster = cluster.Cluster()
        #self.cluster.verbose = True
        self.cluster.log_level = 'DEBUG'
        self.cluster.transport = 'infrc'
        self.cluster.__enter__()

        try:
            self.cluster.start_coordinator(hosts[0])
            # Hack below allows running with an existing coordinator
            #self.cluster.coordinator_host = hosts[0]
            #self.cluster.coordinator_locator = cluster.coord_locator(self.cluster.transport,
            #                                                         self.cluster.coordinator_host)
            syncArgs = ''
            if hasattr(getattr(self, self._testMethodName), 'sync'):
                syncArgs = '--sync'
            for host in hosts[:self.num_hosts]:
                self.servers.append(
                    self.cluster.start_server(host,
                                              args='--clusterName=%s %s' %
                                              (self.clusterName, syncArgs)))
                # Hack below can be used to use different ports for all servers
                #self.servers.append(
                #    self.cluster.start_server(host,
                #        port=self.last_unused_port,
                #        args='--clusterName=%s' % self.clusterName))
                #self.last_unused_port += 1
            self.cluster.ensure_servers()

            self.rc = ramcloud.RAMCloud()
            print('%s ... ' % self.cluster.log_subdir, end='', file=sys.stderr)
            self.rc.set_log_file(
                os.path.join(self.cluster.log_subdir, 'client.log'))
            self.rc.connect(self.cluster.coordinator_locator)
        except:
            self.cluster.__exit__()
            raise
        return self
Example #21
def form_clusters(true_labels, predicted_labels):
	cluster_map = dict()
	class Mock:
		pass
	for x in range(predicted_labels.max() + 1):
		m = Mock()
		m.label = None
		cluster_map[x] = cluster.Cluster(list(), m, x)
	count = 0
	for true, predicted in zip(true_labels, predicted_labels):
		m = Mock()
		m._id = count
		count += 1
		m.label = true
		cluster_map[predicted].members.append(m)
	clusters = [c for c in cluster_map.values() if c.members]
	for c in clusters:
		c.set_label()
	return clusters
Example #22
def em_random_restarts(num_restarts, num_clusters, parameters_df):
    ll = None
    clusters = None

    for i in range(num_restarts):
        # Initialize cluster statistics
        new_clusters = [cl.Cluster() for _ in range(num_clusters)]
        cl.initialize_clusters(new_clusters, parameters_df)

        # Run EM & Get LL Value
        new_ll = em.em(parameters_df, new_clusters)

        # Save Best LL Value and its clusters
        if ll is None or new_ll > ll:
            ll = new_ll
            clusters = new_clusters

    return ll, clusters
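A sketch of a call site, assuming parameters_df is a pandas DataFrame and cl/em are the project's own cluster and EM modules (the CSV file name is hypothetical):

import pandas as pd

parameters_df = pd.read_csv("parameters.csv")  # hypothetical input
best_ll, best_clusters = em_random_restarts(num_restarts=20,
                                            num_clusters=4,
                                            parameters_df=parameters_df)
print("best log-likelihood:", best_ll)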
Example #23
def form_clusters(instances, assignments):
	print("Forming Clusters")
	cluster_map = dict()
	class Mock:
		pass
	for x in range(_number_of_clusters):
		m = Mock()
		m.label = None
		cluster_map[x] = cluster.Cluster(list(), m, x)
	for instance, assignment in zip(instances, assignments):
		m = Mock()
		m._id = instance[0]
		m.label = instance[1]
		cluster_map[assignment].members.append(m)
	clusters = list(cluster_map.values())
	for c in clusters:
		c.set_label()
	print("Done\n")
	return clusters
Example #24
    def __init__(self, **kwargs):
        self.tests = {}
        self.all_solutions = {}
        self.name = 'NO NAME'
        self.cached_solutions = None
        self.skip_count = 0
        self.clusters = {}
        self.cluster_size = 0
        self.dimentions = None
        self.output = None
        self.is_tl_hidden = True
        self.is_rt_hidden = True
        self.is_wa_hidden = True
        self.is_changed = False
        self.sources_path = ''
        self.tests_path = ''

        if 'file' in kwargs:
            data = json.load(kwargs['file'])
            self.clusters = dict(
                (key, [cluster.Cluster(obj=c) for c in data['clusters'][key]])
                for key in data['clusters'])
            self.all_solutions = dict(
                (e['name']['file'], solution.Solution(self, obj=e))
                for e in data['solutions'])
            self.tests = dict(
                (e['name'], testf.Test(obj=e)) for e in data['tests'])
            self.is_tl_hidden = data.get('is_tl_hidden', True)
            self.is_rt_hidden = data.get('is_rt_hidden', True)
            self.is_wa_hidden = data.get('is_wa_hidden', True)
            self.cluster_size = data.get('cluster_size', 0)
            self.name = data.get('name', 'NO NAME')
            self.sources_path = data.get(
                'sources_path',
                os.path.split(list(
                    self.all_solutions.values())[0].filepath)[0])
            self.tests_path = data.get(
                'tests_path',
                os.path.split(list(self.tests.values())[0].name)[0])

        if 'output' in kwargs:
            self.output = kwargs['output']
Example #25
    def clusterize(self, count):
        if self.current_cluster_name(count) in self.clusters:
            self.update_current_cluster_name(count)
            self.is_changed = True
            return

        self.update_current_cluster_name(count)
        labels, centers = clustering.clusterize(self, kmax=count)
        labels = self.sort_labels(labels)
        self.is_changed = True
        cluster_name = self.current_cluster_name()
        self.clusters[cluster_name] = [
            cluster.Cluster(idx=idx,
                            name='cluster {}'.format(idx),
                            description='',
                            center=center.tolist())
            for idx, center in enumerate(centers)
        ]
        for idx, label in enumerate(labels):
            self.clusters[cluster_name][label].elements.append(idx)
Example #26
def kmeans(tweets, k, maxRound, cutoff):
    init = random.sample(tweets, k)  # randomly sample k tweets
    clusters = [cluster.Cluster(t)
                for t in init]  # Use the init set as k separate clusters

    round = 0
    while round < maxRound:
        #print('Round #%s<br>' % round)
        lists = [[] for c in clusters]  # Create an empty list for each cluster
        for t in tweets:
            # Compute distances to each of the cluster
            dist = [
                float(tweet_distance(t, clusters[i].centroid)) /
                min(len(tokenise(t)), len(tokenise(clusters[i].centroid)))
                for i in range(len(clusters))
            ]

            # Find the max, which indicates the most similar cluster
            maxDist = max(dist)
            idx = dist.index(maxDist)

            # If the tweet doesn't fit any cluster (similarity below the cutoff),
            # randomly assign it to a cluster; otherwise, assign it to the most
            # similar cluster
            if maxDist < cutoff:
                lists[random.sample(range(k), 1)[0]].append(t)
            else:
                lists[idx].append(t)

        # Update the clusters
        biggest_shift = 0.0
        for i in range(len(clusters)):
            shift = clusters[i].update(lists[i])
            biggest_shift = max(biggest_shift, shift)

        # If the clusters aren't shifting much (i.e. the similarity between old and new centroids remains high), break and return the results
        if biggest_shift > cutoff:
            break

        round = round + 1

    #print "Done clustering...<br>"
    return clusters
Example #27
    def update(self):
        movements.move_path(self)
        if self.attitude == Attitude["friends"]:
            print("runaway ", self.runaway)
            self.run_away()
            #print(self.runaway)
            if self.runaway >= 100:
                print("change attitude")
                self.cluster.remove_member(self)
                self.cluster = cluster.Cluster(
                    attitude=Attitude["friendly"],
                    starting_positions=[list(self.rect.center)],
                    start_position=self.rect.bottom)
                game.main_game.add_clusters([self.cluster])

                self.change_attitude(Attitude["friendly"])
                self.set_path(movements.happy_dance, default=True)
                self.cluster.add_cluster(happiness=self.happiness,
                                         movement=self.default_movement)
        if self.fadeaway:
            self.fade_away()
Example #28
def gen_random_clusters(num_clusters):
    '''
    Creates a list of clusters where each cluster corresponds to one randomly
    generated point in the square with corners (±1, ±1), then times
    slow_closest_pair and fast_closest_pair on it (averaged over 100 runs).
    '''
    random_data1 = [cluster.Cluster(set(), random.uniform(-1, 1), random.uniform(-1, 1), 0, 0)
                    for _ in range(num_clusters)]
    random_data2 = random_data1[:]

    time1 = time.time()
    for _ in range(100):
        dist = slow_closest_pair(random_data1)
    time2 = time.time()
    slow_time = (time2 - time1)/100

    time1 = time.time()
    for _ in range(100):
        dist = fast_closest_pair(random_data2)
    time2 = time.time()
    fast_time = (time2 - time1)/100

    return (slow_time, fast_time)
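The hand-rolled time.time() loops average 100 runs; the standard-library timeit module expresses the same measurement more compactly, as a possible alternative:

import timeit

slow_time = timeit.timeit(lambda: slow_closest_pair(random_data1), number=100) / 100
fast_time = timeit.timeit(lambda: fast_closest_pair(random_data2), number=100) / 100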
Example #29
File: main.py Project: anbet/udo
    def cluster(self, *args):
        args = list(args)
        if not len(args) or not args[0]:
            print("cluster command requires an action. Valid actions are: ")
            print(" list\n status")
            return
        action = args.pop(0)

        if action == 'list':
            cluster.list()
        else:
            # actions that require a cluster name
            if not len(args) or not args[0]:
                print("cluster name required for {}".format(action))
                return
            cluster_name = args.pop(0)
            cl = cluster.Cluster(cluster_name)
            if action == 'status':
                print("{} status: {}".format(cluster_name, cl.status()))
            elif action == 'create':
                if not cl.create():
                    print("Failed to bring up {} cluster".format(cluster_name))
            else:
                print("Unknown cluster command: {}".format(action))
Example #30
def kmeans_clustering(cluster_list, num_clusters, num_iterations):
    """
    Compute the k-means clustering of a set of clusters.

    The initial points used as the centers are the clusters with largest
    population.

    Input: List of clusters, number of clusters, number of iterations
    Output: List of clusters of length num_clusters
    """
    srtd_clstrs = sorted(cluster_list,
                         key=lambda x: x.total_population(),
                         reverse=True)
    centers = srtd_clstrs[:num_clusters]
    for _ in range(num_iterations):
        clstrs = [
            cluster.Cluster(set([]), cntr.horiz_center(), cntr.vert_center(),
                            0, cntr.averaged_risk()) for cntr in centers
        ]
        for c in cluster_list:
            min_idx = min_pair(c, centers)
            clstrs[min_idx].merge_clusters(c)
        centers = clstrs
    return clstrs
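min_pair is not shown; from its call site it must return the index of the center closest to c. A plausible implementation under that assumption, using the Cluster.distance method seen in Example #3:

def min_pair(clstr, centers):
    # Index of the center whose distance to clstr is smallest
    return min(range(len(centers)),
               key=lambda idx: clstr.distance(centers[idx]))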