def prepare_fn(inst):
    """Attach L-method centroids to ``inst`` under ``'l_method_centroids'``.

    The ``x`` and ``y`` values are pulled from ``inst`` via ``requires``.
    When the enclosing scope's ``name`` is not ``None``, the centroids are
    persisted in ``storage/centroids_<name>_l_method.json`` through a
    ``StorageCache``: a cache that is not new is loaded instead of running
    the pipeline, and a freshly computed result is written back to it.

    Returns:
        Whatever ``inst.set('l_method_centroids', centroids)`` returns.

    Raises:
        Exception: if the pipeline result has no ``'centroids'`` entry.
    """
    x, y = requires(['x', 'y'], inst)

    # Caching is optional. An explicit None sentinel replaces the previous
    # "'cache' in locals()" introspection, which hid the on/off state.
    cache = None
    if name is not None:
        full_name = 'centroids_' + name + '_l_method'
        cache = StorageCache('storage/' + full_name + '.json')

    if cache is not None and not cache.isnew():
        # Cache hit: stored centroids are converted back to an ndarray.
        centroids = np.array(cache.get())
    else:
        # Cache miss (or caching disabled): compute the 'good' centroids.
        result = Pipe() \
            .x(x) \
            .y(y) \
            .pipe(agglomerative_l_method()) \
            .connect(stop())

        if 'centroids' not in result:
            raise Exception('no centroids in pipe')
        centroids = result['centroids']

        if cache is not None:
            # Persist the fresh result so later runs can reuse it.
            cache.update(array_to_list(centroids))
            cache.save()

    return inst.set('l_method_centroids', centroids)
def prepare_fn(inst):
    """Attach DENCLUE centroids and the bandwidth used to ``inst``.

    ``bandwidth``, ``name`` and ``id`` are read from the enclosing scope
    (NOTE(review): ``id`` shadows the builtin and is concatenated to a
    string here, so it is presumably a string suffix - confirm at the
    definition site).

    When ``name`` is not ``None`` the centroids are persisted in
    ``storage/centroids_<name>_denclue_bandwidth_<bandwidth>.json`` through
    a ``StorageCache``: a cache that is not new is loaded instead of
    running ``denclue``, and a freshly computed result is written back.

    Returns:
        ``inst`` after setting ``'denclue_centroids_' + id`` and
        ``'denclue_bandwidth_' + id`` (whatever the chained ``set`` calls
        return).

    Raises:
        Exception: if ``bandwidth`` is falsy.
    """
    if not bandwidth:
        raise Exception('no bandwidth given!')

    x = requires('x', inst)

    # Caching is optional. An explicit None sentinel replaces the previous
    # "'cache' in locals()" introspection, which hid the on/off state.
    cache = None
    if name is not None:
        full_name = 'centroids_' + name + '_denclue_bandwidth_' + str(bandwidth)
        cache = StorageCache('storage/' + full_name + '.json')

    if cache is not None and not cache.isnew():
        # Cache hit: stored centroids are converted back to an ndarray.
        centroids = np.array(cache.get())
    else:
        n_points = len(x)
        if n_points < 200:
            # Small input: use every point.
            sample_size = n_points
        else:
            # 20% of the points, clamped to the range [200, 10000].
            sample_size = max(min(10000, int(n_points * 0.2)), 200)

        # Cache miss (or caching disabled): compute the 'good' centroids.
        centroids = denclue(x, bandwidth, sample_size)

        if cache is not None:
            # Persist the fresh result so later runs can reuse it.
            cache.update(array_to_list(centroids))
            cache.save()

    return inst \
        .set('denclue_centroids_' + id, centroids) \
        .set('denclue_bandwidth_' + id, bandwidth)
def map_fn(inst, idx, total):
    """Attach the ``idx``-th seeding result and its name to ``inst``.

    Seeds are kept in ``seeding/<name>_<seeding name>.json`` through a
    ``StorageCache`` so that every run works with the same seeding: a new
    cache is filled by calling ``seeding_fns[idx]`` on ``inst``, otherwise
    the stored seeds are loaded. ``total`` is accepted but not used.

    Returns:
        ``inst`` after setting ``'y_seed'`` and ``'name'`` (whatever the
        chained ``set`` calls return).
    """
    seeding_name = seeding_names[idx]
    cache = StorageCache('seeding/' + name + '_' + seeding_name + '.json')

    if cache.isnew():
        # Nothing stored yet: compute the seeds and persist them.
        y_seed = seeding_fns[idx](inst)
        cache.update(array_to_list(y_seed))
        cache.save()
    else:
        # Reuse the stored seeds, converted back to an ndarray.
        y_seed = np.array(cache.get())

    seeded = inst.set('y_seed', y_seed)
    return seeded.set('name', seeding_name)
def seed_cache(file):
    """Load the seeds stored in ``seeding/<file>`` as a NumPy array.

    ``file`` is the file name relative to the ``seeding/`` directory; the
    content is read through ``StorageCache.get()``.
    """
    stored = StorageCache('seeding/' + file).get()
    return np.array(stored)
from cache import StorageCache

# Round-trip check: a key written through one StorageCache and saved must be
# visible to a second StorageCache opened on the same file.
file = 'test_cache.json'

cache = StorageCache(file)
# NOTE(review): this relies on get() handing back the cache's own mutable
# data rather than a copy - confirm against StorageCache.
cache.get()['b'] = 10
cache.save()

cache2 = StorageCache(file)
assert cache2.has('b')
def plot(X, **kwargs):
    """Scatter-plot a sequence of two-component points via ``plt.scatter``.

    ``X`` is unzipped into one sequence of first components and one of
    second components, so every point must have exactly two components.
    Extra keyword arguments are forwarded to ``plt.scatter`` unchanged.
    """
    xs, ys = zip(*X)
    plt.scatter(xs, ys, **kwargs)


# Bucket the samples by label, keeping labels in first-seen order.
group = {}
for sample, label in zip(X, Y):
    group.setdefault(label, []).append(sample)

# Draw each class in its own colormap colour. zip() with the range means at
# most dataset.cluster_cnt classes are drawn.
for i, (_label, points) in zip(range(dataset.cluster_cnt), group.items()):
    print('color:', cmap(i))
    print('count:', len(points))
    plot(points, color=cmap(i))

# Overlay the seeded samples in grey: the stored values are paired with
# dataset.X and only samples whose stored value is truthy are kept.
cache = StorageCache('seeding/' + dataset.name + '_some-1-prob-0.1.json')
seed_flags = cache.get()
seeds = [point for flag, point in zip(seed_flags, dataset.X) if flag]
seeds = pca.transform(seeds)
plot(seeds, color='grey')

# Overlay the stored DENCLUE centroids in black, projected with the same pca.
cache = StorageCache('storage/centroids_iris_denclue_bandwidth_0.1082972972972973.json')
centroids = pca.transform(np.array(cache.get()))
plot(centroids, color='black')

plt.show()