def forward(self, embeds, cluster_temp, num_iter=10):
    mu_init, _ = cluster(embeds, self.K, 1, num_iter,
                         cluster_temp=torch.tensor(cluster_temp), init=self.init)
    # self.init = mu_init.clone().detach()
    mu, r = cluster(embeds, self.K, 1, 1,
                    cluster_temp=torch.tensor(cluster_temp),
                    init=mu_init.clone().detach())
    return mu, r
def DBSCAN(X: np.ndarray, r: float, minPts: int):
    pointnum = X.shape[0]
    v = visitlist(pointnum)
    clustersSet = list()
    noise = cluster(-1)
    tree = KDTree(X)
    k = 0
    while v.unvisitednum > 0:
        randid = random.choice(v.unvisitedlist)
        v.visit(randid)
        N = tree.query_ball_point(X[randid], r)
        if len(N) < minPts:
            noise.points.append(randid)
        else:
            clus = cluster(k)
            clus.points.append(randid)
            N.remove(randid)
            while len(N) > 0:
                p = N.pop()
                if p in v.unvisitedlist:
                    v.visit(p)
                    clus.points.append(p)
                    pN = tree.query_ball_point(X[p], r)
                    if len(pN) >= minPts:
                        pN.remove(p)
                        N = N + pN
            clustersSet.append(clus)
            k += 1  # advance the cluster id so each cluster gets its own label
    clustersSet.append(noise)
    return clustersSet
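# The DBSCAN implementation above depends on two small helper classes that are not shown.
# A minimal sketch of what they might look like, with attribute and method names taken from
# how they are used above (the bodies themselves are assumptions, not the original source):


class visitlist:
    """Tracks which point indices have been visited."""

    def __init__(self, count):
        self.unvisitedlist = list(range(count))  # indices not yet visited
        self.visitedlist = []                    # indices already visited
        self.unvisitednum = count                # number of unvisited points

    def visit(self, idx):
        # move idx from the unvisited set to the visited set
        self.unvisitedlist.remove(idx)
        self.visitedlist.append(idx)
        self.unvisitednum -= 1


class cluster:
    """A cluster label plus the indices of the points assigned to it."""

    def __init__(self, label):
        self.label = label  # cluster id; -1 is used for the noise cluster
        self.points = []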
def main():
    from argparse import ArgumentParser
    from time import time
    parser = ArgumentParser()
    parser.add_argument('--file-list', type=str, default='/fastdata/finder/streetview_train.txt',
                        help='path to the streetview training file')
    parser.add_argument('-n', '--n-clusters', type=int, default=100, help='number of clusters')
    parser.add_argument('--max-files', type=int, help='maximum number of files to cluster')
    parser.add_argument('output', type=str, help='output file (e.g. clusters.npy)')
    args = parser.parse_args()
    cluster(args.file_list, args.output, args.n_clusters, args.max_files)
def clean_data(datalist):
    x = datalist.values[::, 0:14]
    # y = datalist.values[::, 14:]
    df = pd.DataFrame(x)
    df = df.replace("?", float('nan'))
    x = df.fillna(0)  # replace "?" with 0
    replace_mas1 = ["notpresent", "yes", "good"]
    replace_mas0 = ["present", "no", "poor"]
    for text in replace_mas1:
        # replace "notpresent", "yes", "good" with 1
        x = x.replace(text, 1)
    for text in replace_mas0:
        x = x.replace(text, 0)
    cluster(x)
def forward(self, x, adj, num_iter=1):
    embeds = self.GCN(x, adj)
    mu_init, _, dist = cluster(embeds, self.K, num_iter,
                               cluster_temp=self.cluster_temp, init=self.init)
    mu, r, dist_2 = cluster(embeds, self.K, 1,
                            cluster_temp=self.cluster_temp,
                            init=mu_init.detach().clone())
    return r, dist
def forward(self, x):
    mu_init, _, _ = cluster(x, self.K, 1, self.num_iter,
                            cluster_temp=self.cluster_temp, init=self.init)
    mu, r, dist = cluster(x, self.K, 1, 1,
                          cluster_temp=self.cluster_temp,
                          init=mu_init.detach().clone())
    return r
def plot(self, data, views=2, show=False):
    """plot clustering"""

    # get plotting tools
    try:
        from spikeplot import plt, cluster
    except ImportError:
        return None

    # init
    views = min(views, int(data.shape[1] / 2))
    fig = plt.figure()
    fig.suptitle('clustering [%s]' % self.clus_type)
    ax = [fig.add_subplot(2, views, v + 1) for v in xrange(views)]
    axg = fig.add_subplot(212)
    ncmp = int(self.labels.max() + 1)
    cdata = dict(zip(xrange(ncmp),
                     [data[self.labels == c] for c in xrange(ncmp)]))

    # plot clustering
    for v in xrange(views):
        cluster(
            cdata,
            data_dim=(2 * v, 2 * v + 1),
            plot_handle=ax[v],
            plot_mean=sp.sqrt(self.sigma_factor),
            xlabel='PC %d' % int(2 * v),
            ylabel='PC %d' % int(2 * v + 1),
            show=False)

    # plot gof
    axg.plot(self._gof, ls='steps')
    for i in xrange(1, len(self.crange)):
        axg.axvline(i * self.repeats - 0.5, c='y', ls='--')
    axg.axvspan(self._winner - 0.5, self._winner + 0.5, fc='gray', alpha=0.2)
    labels = []
    for k in self.crange:
        labels += ['%d' % k]
        labels += ['.'] * (self.repeats - 1)
    axg.set_xticks(sp.arange(len(labels)))
    axg.set_xticklabels(labels)
    axg.set_xlabel('cluster count and repeats')
    axg.set_ylabel(str(self.gof_type).upper())
    axg.set_xlim(-1, len(labels))

    # show?
    if show is True:
        plt.show()

    return True
def __init__(self, page_tree, k_max_depth=2, k_decay=0.5,
             c_eps=1.2, c_d1=1.0, c_d2=1.0, separate_descendants=True):
    """Perform all extraction operations in sequence.

    Parameters
    ----------
    k_max_depth : int
        Parameter to kernel computation
    k_decay : float
        Parameter to kernel computation
    c_eps : float
        Parameter to clustering
    c_d1 : float
        Parameter to clustering
    c_d2 : float
        Parameter to clustering
    separate_descendants : bool
        Parameter to clustering
    """
    self.page_tree = page_tree
    self.kernel = _ker.kernel(page_tree, max_depth=k_max_depth, decay=k_decay)
    self.labels = cluster(
        page_tree, self.kernel, eps=c_eps, d1=c_d1, d2=c_d2,
        separate_descendants=separate_descendants)
    self.items = extract_items(page_tree, self.labels)
    self.tables = [ItemTable(items, extract_item_table(page_tree, items, self.labels))
                   for items in self.items]
    self.table_fragments = [
        ItemTable([page_tree.fragment_index(np.array(root)) for root in item],
                  page_tree.fragment_index(fields))
        for item, fields in self.tables]
def preprocess_y(self, settings):
    ''' Returns preprocessed observables '''
    if self.preprocess_labels is None:
        labels = self.y()[:, settings.label_index]
    else:
        labels = self.preprocess_labels
    Y = []
    # Add a bias to the data by clustering the baseline observables
    # try up to 8 clusters
    for k in range(1, 9):
        # cluster the centroids of each label
        # x = self.y()[:, [settings.baseline_index, 4, 7, 9]].astype(np.float)
        x = self.y()[:, settings.baseline_index].astype(np.float)
        y = self.y()[:, settings.observable_index].astype(np.float)
        classification = cluster(k, x, labels)
        self.classification[k - 1] = classification.copy()
        y = self.y()[:, settings.observable_index].astype(np.float)
        # preprocess the observables
        y[classification == k - 1] = y[classification == k - 1] \
            - np.median(y[classification == k - 1]) \
            + np.median(x[classification == k - 1])
        Y.append(y.copy())
    return Y
def main():
    n_clusters = int(sys.argv[1])
    print suffix + ' ' + str(n_clusters)
    vocabpath = basepath + 'data/vocab' + suffix + '.txt'
    vocab = [word.strip() for word in open(vocabpath).readlines()]
    vocab = vocab[:voc_size]
    matpath = basepath + 'data/F-rels' + str(voc_size) + suffix + '.mat'
    datapath = basepath + 'data/all-BNC-EN.txt'
    print str(n_clusters)
    if load < 1:
        fmatrix = matbuild(datapath, vocab)
        scipy.io.savemat(matpath, {'fmatrix': fmatrix})
    else:
        a = scipy.io.loadmat(matpath)
        fmatrix = a['fmatrix']
    for i in range(numpy.size(fmatrix, 0)):
        fmatrix[i, :] = fmatrix[i, :] / sum(fmatrix[i, :] + eps)
    if load < 2:
        simmatrix = simbuild(fmatrix)
        scipy.io.savemat(basepath + 'data/F-sims' + str(voc_size) + suffix + '.txt.mat',
                         {'simmatrix': simmatrix})
    else:
        a = scipy.io.loadmat(basepath + 'data/F-sims' + str(voc_size) + suffix + '.txt.mat')
        simmatrix = a['simmatrix']
    outputfile = basepath + 'data/optlabels-' + str(n_clusters) + '-' + str(voc_size) + suffix + '.mat'
    if load < 3:
        (labels, score) = cluster(simmatrix, n_clusters, outputfile)
    else:
        A = scipy.io.loadmat(outputfile)
        labels = A['labels']
    outpath = basepath + 'data/output' + str(n_clusters) + '-' + str(voc_size) + suffix + '.txt'
    outwrite(vocab, labels, outpath)
def step(dataset, nn):
    # 1. Do a forward pass
    dataset['y'] = nn.fwd(dataset['x'])
    # 2. Perform clustering
    clusters = cluster(dataset)
    # 3. Calculate the distances between the distributions
    distances = calc_distances(clusters)
    # 4. Displace the center of each cluster using force directed graph
    for c in clusters:
        c.calculate_displacement(clusters)
    # 5. Set the target position of each item in the cluster to the mean of the cluster
    x, y = extract_training_data(clusters)
    # 6. Train the CNN
    for j in range(100):
        nn.train(x, y)
    # 7. Plot the clusters
    plot_clusters(clusters)
    return clusters
def cluster_all_targets(language, N=8):
    logging.info('clustering {}'.format(language))
    data = load(language)
    results = []
    for w in tqdm.tqdm(data.words):
        results.append((w, *cluster(data, w)))
    return results
def do_segmentation(C, M, config, in_bound_idxs=None):
    embedding = embed_beats(C, M, config)
    Cnorm = np.cumsum(embedding ** 2, axis=1) ** 0.5

    if config["hier"]:
        est_idxs = []
        est_labels = []
        for k in range(1, config["num_layers"] + 1):
            est_idx, est_label = cluster(embedding, Cnorm, k)
            est_idxs.append(est_idx)
            est_labels.append(np.asarray(est_label, dtype=np.int))
    else:
        est_idxs, est_labels = cluster(embedding, Cnorm, config["scluster_k"], in_bound_idxs)
        est_labels = np.asarray(est_labels, dtype=np.int)

    return est_idxs, est_labels, Cnorm
def do_segmentation(C, M, config, in_bound_idxs=None):
    embedding = embed_beats(C, M, config)
    Cnorm = np.cumsum(embedding**2, axis=1)**0.5

    if config["hier"]:
        est_idxs = []
        est_labels = []
        for k in range(1, config["num_layers"] + 1):
            est_idx, est_label = cluster(embedding, Cnorm, k)
            est_idxs.append(est_idx)
            est_labels.append(np.asarray(est_label, dtype=np.int))
    else:
        est_idxs, est_labels = cluster(embedding, Cnorm, config["scluster_k"])
        est_labels = np.asarray(est_labels, dtype=np.int)

    return est_idxs, est_labels, Cnorm
def labeladjust(lang, n, lofeatures, feature, df):
    table = cluster(lang, n, lofeatures, df)
    d = [0] * n
    for i in range(0, n):
        d[i] = (i, min(table.loc[table['label'] == i, feature]))
    order = sorted(d, key=lambda x: x[1])
    dic = {}
    for i in range(0, n):
        dic[order[i][0]] = i
    table['label'] = table['label'].replace(dic)
    return table
def getbreaks(lang, n, lofeatures, feature, df):
    table = cluster(lang, n, lofeatures, df)
    minmax = [0] * n
    for i in range(0, n):
        minmax[i] = (min(table.loc[table['label'] == i, feature]),
                     max(table.loc[table['label'] == i, feature]))
    breaks = [0] * (n - 1)
    ordered = sorted(minmax)
    for i in range(0, n - 1):
        breaks[i] = numpy.mean([ordered[i][1], ordered[(i + 1)][0]])
    return breaks
def laplacian_segmentation(matrix):
    # pass in an affinity matrix
    matrix = np.matrix(matrix, dtype=int)
    matrix = np.maximum(matrix, matrix.transpose())
    embedding = decompose(matrix)
    Cnorm = np.cumsum(embedding**2, axis=1)**0.5
    segmentations = []
    for k in range(1, MAX_TYPES):
        segmentations.append(cluster(embedding, Cnorm, k))
    return reindex(segmentations)
def main():
    # get command line arguments
    args = sys.argv
    # m - # of sets of sweetwords
    # n - # of sweetwords in each set
    m = int(args[1])
    n = int(args[2])
    input_file = args[3]
    # store input passwords
    password_list = []
    sweetwords = read_password_file(input_file)
    filename = "password_choice.txt"
    # Identify password for each set of sweetwords
    for row in range(0, m):
        sweetwords_list = []
        sep = ','
        sweetwords_list.append(sweetwords[row].split(sep, n))
        roots = cluster(sweetwords_list, n)
        root_list = []
        for key, val in roots.items():
            # print(key, val)
            length = len(val)
            root_list.append([key, length])
            # root_list.append(key)
        root_list = sorted(root_list, key=lambda x: x[1])
        root_list_processed = []
        # print(root_list)
        if len(root_list) > 5:
            root_list_processed = [root_list[0][0], root_list[1][0],
                                   root_list[-1][0], root_list[-2][0]]
        else:
            for item in root_list:
                root_list_processed.append(item[0])
        print(root_list_processed)
        shuffle(root_list_processed)
        chosen = choose_pass(root_list_processed)
        print("Final Answer", chosen)
        # password_choice = "".join(str(x) for x in passSelect(roots)[0])
        # print("Password Guess #", row, ": ", password_choice)
        password_choice_ind = sweetwords_list[0].index(chosen)
        password_list.append(str(password_choice_ind))
    write_passwordchoice_file(filename, password_list)
def _run_training_epoch(self, **kwargs):
    """
    """
    # predict clusters for each data point
    predictions = self._pred_model.predict(self._pred_ds, steps=self._pred_steps)
    # if test data was included, also predict outputs for those
    if self._test:
        test_preds = self._pred_model.predict(self._test_ds, steps=self._test_steps)
    else:
        test_preds = None

    # run k-means
    y_e, clusters, test_labels = cluster(predictions, self.config["pca_dim"],
                                         self.config["k"], init='k-means++',
                                         kmeans_max_iter=self.config["kmeans_max_iter"],
                                         kmeans_batch_size=self.config["kmeans_batch_size"],
                                         testvecs=test_preds)
    self._old_test_labels = self._test_labels
    self._test_labels = test_labels

    # record the normalized mutual information between these labels and previous
    if self._old_cluster_assignments is not None:
        nmi = normalized_mutual_info_score(y_e, self._old_cluster_assignments,
                                           average_method="arithmetic")
        self._record_scalars(normalized_mutual_information=nmi)
    self._old_cluster_assignments = y_e

    # reset the weights of the output layer
    new_weights = [self._initializer(x.shape) for x in self._output_layer.get_weights()]
    self._output_layer.set_weights(new_weights)

    # do some training
    train_ds, num_steps = stratified_training_dataset(
        self.trainingdata, y_e,
        imshape=self.input_config["imshape"],
        num_channels=self.input_config["num_channels"],
        num_parallel_calls=self.input_config["num_parallel_calls"],
        batch_size=self.input_config["batch_size"],
        mult=self.config["mult"],
        augment=self.augment_config,
        sobel=self.input_config["sobel"],
        single_channel=self.input_config["single_channel"])
    for x, y in train_ds:
        loss = self._training_step(x, y, self._models["full"], self._optimizer)
        self._record_scalars(training_crossentropy=loss)
        self.step += 1
def forward(self, x, adj, num_iter=1):
    # print("INSIDE FORWARD")
    embeds = self.GIN(x, adj)
    # print("EMBEDS\n\n", embeds)
    mu_init, _, _ = cluster(embeds, self.K, 1, num_iter,
                            cluster_temp=self.cluster_temp, init=self.init)
    mu, r, dist = cluster(embeds, self.K, 1, 1,
                          cluster_temp=self.cluster_temp, init=mu_init.detach().clone())
    # print("PRINTING")
    # print("mu\n\n", mu)
    # print("r\n\n", r)
    # print("embeds\n\n", embeds)
    # print("dist\n\n", dist)
    return mu, r, embeds, dist
def recluster(df, cl, clusters, n_clusters):
    lbls = cl.labels_
    mask = np.array([False for i in range(len(lbls))])
    for c in clusters:
        mask |= lbls == c
    subpipe, results = data_pipeline(df[mask])
    # use cosine similarity! NLTK clustering implementation
    # KMeans cluster object as carrier for consistency
    subcl = cluster(results, n_clusters)
    kclusterer = KMeansClusterer(n_clusters, distance=nltk.cluster.util.cosine_distance,
                                 repeats=50)
    assigned_clusters = kclusterer.cluster(results, assign_clusters=True)
    # assign new cluster labels and cluster centroids
    subcl.labels_ = np.array(assigned_clusters)
    subcl.cluster_centers_ = np.array(kclusterer.means())
    return subpipe, subcl, results, df[mask]
def segment_file(filename):
    print('Loading {}'.format(filename))
    y, sr = librosa.load(filename)

    print('Extracting features...')
    Csync, Msync, beat_times = make_beat_sync_features(y=y, sr=sr)

    print('Constructing embedding...')
    embedding = embed_beats(Csync, Msync)
    Cnorm = np.cumsum(embedding**2, axis=1)**0.5

    print('Clustering...')
    segmentations = []
    for k in range(1, MAX_TYPES):
        print('\tk={}'.format(k))
        segmentations.append(cluster(embedding, Cnorm, k, beat_times))
    print('done.')

    return reindex(segmentations)
def main():
    from argparse import ArgumentParser
    from time import time

    parser = ArgumentParser()
    parser.add_argument('-bs', '--batch-size', type=int, default=32, help='batch size')
    parser.add_argument('-lr', '--learning-rate', type=float, default=1e-4, help='learning rate')
    parser.add_argument('-nit', '--number-of-iterations', type=int, default=1000000,
                        help='number of iterations')
    parser.add_argument('--log_device_placement', action='store_true')
    parser.add_argument('--file-list', type=str, default='/fastdata/finder/streetview_train.txt',
                        help='path to the streetview training file')
    parser.add_argument('--file-base-dir', type=str, default='/fastdata/finder/streetview/',
                        help='directory of the training images')
    parser.add_argument('--clusters', type=str, default=None,
                        help='cluster file (computed with cluster.py)')
    parser.add_argument('-n', '--n-clusters', type=int, default=None,
                        help='Number of clusters to be used (if \'--clusters\' is not specified)')
    parser.add_argument('--initial-weights', type=str, help='VGG weights in hdf5 format')
    parser.add_argument('--num-gpus', type=int, default=1,
                        help='How many GPUs should we use? (-1 for all available GPUs)')
    parser.add_argument('train_dir', help='output directory for the model and log files')
    args = parser.parse_args()

    try:
        os.makedirs(args.train_dir)
    except:
        pass

    cluster_file = args.train_dir + 'clusters.npy'
    if args.clusters:
        import shutil
        shutil.copyfile(args.clusters, cluster_file)
        if args.n_clusters is not None:
            print('Warning: clusters and n_clusters both specified! Ignoring n_clusters.')
    elif not os.path.exists(cluster_file):
        print('No cluster file provided, clustering (this might take a while)')
        from cluster import cluster
        cluster(args.file_list, cluster_file, args.n_clusters)
    args.n_clusters = getNCluster(cluster_file)

    files = [os.path.join(args.file_base_dir, l.strip()) for l in open(args.file_list, 'r')]

    # Fire up tensorflow
    import tensorflow as tf

    # Detect the number of GPUs
    if args.num_gpus < 0:
        args.num_gpus = getNumGPU()
        print('Found %d GPUs. Using all of them' % args.num_gpus)

    ## Setup the graph ##
    # Get the data
    gpu_batch_size = (args.batch_size - 1) // args.num_gpus + 1
    total_batch_size = gpu_batch_size * args.num_gpus
    data, gt = glocData(files, cluster_file, batch_size=total_batch_size)

    # Setup the solver
    solver = tf.train.AdamOptimizer(learning_rate=args.learning_rate)

    # Setup the network and loss
    if args.num_gpus != 1:
        # Multi gpu VGG
        split_data = tf.split(0, args.num_gpus, data)
        split_gt = tf.split(0, args.num_gpus, gt)
        all_loss, all_grads = [], []
        vars = None
        for i, (d, g) in enumerate(zip(split_data, split_gt)):
            with tf.device('/gpu:%d' % i) as dev:
                # Define VGG
                vgg = vgg16(d, n_out=args.n_clusters)
                # Share the parameters
                tf.get_variable_scope().reuse_variables()
                # Compute the loss and gradients (per device)
                avg_vgg = tf.reduce_mean(tf.reduce_mean(vgg, 1), 1)
                loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(avg_vgg, g))
                grads_and_vars = solver.compute_gradients(loss)
                # Collect all outputs ...
                all_grads.append([g for g, v in grads_and_vars])
                all_loss.append(loss)
                if vars is None:
                    vars = [v for g, v in grads_and_vars]
                else:
                    assert np.all([v == vv for vv, (g, v) in zip(vars, grads_and_vars)]), \
                        "Variables differ between GPUs"
        # ... and concat or sum them up
        grads = [tf.add_n(g) / len(g) for g in zip(*all_grads)]
        grads_and_vars = list(zip(grads, vars))
        loss = tf.add_n(all_loss) / len(all_loss)
    else:
        vgg = vgg16(data, n_out=args.n_clusters)
        avg_vgg = tf.reduce_mean(tf.reduce_mean(vgg, 1), 1)
        loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(avg_vgg, gt))
        grads_and_vars = solver.compute_gradients(loss)
    solver_step = solver.apply_gradients(grads_and_vars)

    # Create some summaries
    loss_avg = tf.train.ExponentialMovingAverage(0.9, name='avg')
    tf.scalar_summary('loss', loss)
    loss_avg_op = loss_avg.apply([loss])
    tf.scalar_summary('loss(avg)', loss_avg.average(loss))
    with tf.control_dependencies([loss_avg_op]):
        loss = tf.identity(loss)
    for grad, var in grads_and_vars:
        tf.scalar_summary(var.op.name + '/norm', tf.reduce_mean(var * var))
        tf.scalar_summary(var.op.name + '/gradient_norm', tf.reduce_mean(grad * grad))
        tf.scalar_summary(var.op.name + '/gradient_ratio',
                          tf.reduce_mean(grad * grad) / tf.reduce_mean(var * var))
    summary_op = tf.merge_all_summaries()

    # Initialize ops
    saver = tf.train.Saver(tf.all_variables())
    init_op = tf.initialize_all_variables()
    if args.initial_weights is not None:
        from slim import load
        load_op = load.loadH5(args.initial_weights)
    else:
        load_op = tf.no_op()

    tf.get_default_graph().finalize()

    with tf.Session(config=tf.ConfigProto(
            log_device_placement=args.log_device_placement,
            gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.6))) as sess:
        # Initialize stuff
        summary_writer = tf.train.SummaryWriter(args.train_dir, sess.graph)
        global coord
        coord = tf.train.Coordinator()

        import signal

        def stop(*args):
            global coord
            print("Training stopped")
            coord.request_stop()

        old_sigint = signal.signal(signal.SIGINT, stop)

        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        sess.run(init_op)
        sess.run(load_op)

        # Train
        loss_values = []
        for it in range(args.number_of_iterations):
            t0 = time()
            _, loss_value = sess.run([solver_step, loss])
            t1 = time()
            loss_values.append(loss_value)
            if it % 10 == 0:
                print('%8d, loss = %0.2f [%0.2f] (%0.1f im/sec)' %
                      (it, loss_value, np.mean(loss_values), args.batch_size / (t1 - t0)))
                loss_values = loss_values[-20:]
            if it % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, it)
            if it % 1000 == 0:
                saver.save(sess, os.path.join(args.train_dir, 'snap.ckpt'), global_step=it)
            if coord.should_stop():
                break

        saver.save(sess, os.path.join(args.train_dir, 'final.ckpt'))
        coord.join(threads)
        signal.signal(signal.SIGINT, old_sigint)
f.write("\n") f.close() def setModel(raw_data): def getModel(): data =raw_data.iloc[:,:-1] target = raw_data.iloc[:, -1] clf = svm.SVC(C=1, kernel='rbf', gamma=1, decision_function_shape='ovo') # clf = svm.SVC(C=1, kernel='linear', decision_function_shape='ovr') clf.fit(data, target) target_hat = clf.predict(data) acc = accuracy_score(target, target_hat) np.set_printoptions(suppress=True) print(u'预测正确的样本个数:%d,正确率:%.2f%%' % (round(acc * len(data)), 100 * acc)) return clf def dump(model, filename): with open(filename, 'wb') as fp: pickle.dump(model, fp) if __name__ == "__main__": clf = getModel() dump(clf, 'model') data = preprocess() data, n = cluster(data) regression(data, n) setModel(data)
        results[i - 720][0] = deltap
    weights = fit_weights(vals, results)
    return weights


# Train the model
if __name__ == "__main__":
    if len(sys.argv) == 1:
        print "Need csv with training data"
        quit()
    # load dataset
    data = load(sys.argv[1])
    # split dataset into 2, skipping every other element
    # i.e. turn 5s increment into 10s increment
    cluster_data = data[:len(data) / 2][::2]
    train_data = data[len(data) / 2:][::2]
    # cluster the first part of data
    clusters = cluster(cluster_data)
    # fit params using second part of data
    weights = train(train_data, clusters)
    # save weights and clusters for later usage
    pkl.dump(clusters, open("weights/clusters.pkl", "wb"))
    pkl.dump(weights, open("weights/weights.pkl", "wb"))
    means += [statistics.mean(segment)]
means = np.array(means)


def cluster(signal, ref):
    part = mixture.BayesianGaussianMixture(n_components=2, max_iter=300)
    part.fit(signal.reshape(-1, 1))
    target = sorted(part.means_)
    answers = []
    for i in signal:
        if abs(i - target[0]) < abs(i - target[1]):
            answers += [0]
        else:
            answers += [1]
    print("zero center = \t", part.means_[0])
    print("one center = \t", part.means_[1])
    print("reference =\t", ref)
    print("answer =\t", np.array(answers))
    correctness = 0
    for (a, b) in zip(answers, ref):
        if a == b:
            correctness += 1
    print("correctness =", correctness / len(answers) * 100)


n_chunks = int((len(reference) + 19) / 20)
# cluster(means, reference)
for signal, reference in zip(np.array_split(means, n_chunks),
                             np.array_split(np.array(reference), n_chunks)):
    cluster(signal, reference)
parser.add_argument('--mov-dim', '--dm', default=200, type=int)  # fix these
args = parser.parse_args()

# Cluster
is_test = False
if os.path.exists(_labels_fpath(args.k_usrs, args.usr_dim, args.usr_eigenvectors_file)):
    usr_km = KMeansData(
        np.load(_labels_fpath(args.k_usrs, args.usr_dim, args.usr_eigenvectors_file)),
        is_test)
    print('loaded user kmeans')
else:
    usr_km = cluster(args.usr_eigenvectors_file, args.usr_eigenvalues_file,
                     args.usr_dim, args.k_usrs)
    print('built user kmeans')

if os.path.exists(_labels_fpath(args.k_movs, args.mov_dim, args.mov_eigenvectors_file)):
    mov_km = KMeansData(
        np.load(_labels_fpath(args.k_movs, args.mov_dim, args.mov_eigenvectors_file)),
        is_test)
    print('loaded movie kmeans')
else:
    mov_km = cluster(args.mov_eigenvectors_file, args.mov_eigenvalues_file,
                     args.mov_dim, args.k_movs)
    print('built movie kmeans')

# Load trainset for inference
spectral = cluster.SpectralClustering(n_clusters=2, eigen_solver='arpack',
                                      affinity="nearest_neighbors")
# clst = kmeans, dbscan, spectral
data_titles = ['noisy_circles', 'noisy_moons', 'blobs', 'no_structure']


def cluster(clst, title):
    # Create 2x2 figure for each algorithm
    fig, ax = plt.subplots(2, 2, figsize=(16, 4))
    datasets = [noisy_circles, noisy_moons, blobs, no_structure]
    plot_num = 1
    for i, dataset in enumerate(datasets):
        X, y = dataset
        X = StandardScaler().fit_transform(X)
        clst.fit(X)
        y_pred = clst.labels_.astype(np.int)
        plt.subplot(1, 4, plot_num)
        plt.scatter(X[:, 0], X[:, 1], color=colors[y_pred].tolist(), s=10)
        plt.title(data_titles[i])
        plot_num += 1


cluster(spectral, 'Spectral')
cluster(kmeans, 'Kmeans')
cluster(dbscan, 'DBscan')
import scipy
import scipy.io
import sklearn.metrics
import sklearn.cluster
import sys


def cluster(simmatrix, n_clusters, savefile):
    print n_clusters
    n_iters = 50
    opt_score = -1e4
    for i in range(n_iters):
        print('iter ' + str(i) + '\n')
        labels0 = sklearn.cluster.spectral_clustering(simmatrix, n_clusters)
        score0 = sklearn.metrics.silhouette_score(simmatrix, labels0, 'precomputed')
        if score0 > opt_score:
            opt_labels = labels0
            opt_score = score0
    scipy.io.savemat(savefile, {'labels': opt_labels, 'score': opt_score})
    return (opt_labels, opt_score)


n_clusters = int(sys.argv[1])
A = scipy.io.loadmat('/u/metanet/clustering/may-2014-relations/data/F-sims.txt.mat')
a = A['simmatrix']
cluster(a, n_clusters,
        '/u/metanet/clustering/may-2014-relations/data/test' + str(n_clusters) + '.mat')
feature_dir = os.path.join(root_dir, "output", "features")
output_dir = os.getcwd()
# stem = '2344_00_32_40_25'
stem = "V3736-02"
# xml_file = '/Users/dcline/Downloads/TestTopDown/2344_00_32_40_25/output/2344_00_32_40_25.events.xml'
xml_files = [
    "V3736-02_1_2000.events.xml",
    "V3736-02_2001_4000.events.xml",
    "V3736-02_4001_6000.events.xml",
    "V3736-02_6001_8000.events.xml",
    "V3736-02_8001_10000.events.xml",
    "V3736-02_10001_12000.events.xml",
    "V3736-02_12001_14000.events.xml",
    "V3736-02_14001_16000.events.xml",
    "V3736-02_16001_17983.events.xml",
]
width = 1920   # 706
height = 1080  # 362
frame_event_set = []
for f in xml_files:
    fes = utils.parse(os.path.join(root_dir, "output", f))
    frame_event_set.append(fes)
feature_types = ["PVS", "HOG_3", "JET_red", "HOG_8", "JET_blue", "JET_green"]
cluster(feature_types, 64, width, height, frame_event_set, feature_dir, stem, output_dir)
print "Done"
def forward(self, x, num_iter=1):
    embeds = self.encoder(x)
    embeds = embeds.view(-1, self.encoder_features_num)
    mu_init, _, _ = cluster(embeds, self.K, 1, num_iter,
                            cluster_temp=self.cluster_temp, init=self.init)
    mu, r, dist = cluster(embeds, self.K, 1, 1,
                          cluster_temp=self.cluster_temp, init=mu_init.detach().clone())
    return r
#     clusters.append(y)
#
#     X = X[X[:, 0].argsort()]  # SORT X BY FIRST COLUMN
#     return

# df = pd.read_csv('PitchFxExample.csv')
# D = df.iloc[:,3:].values
D1 = np.random.normal(0, 1, [100, 50])
D2 = np.random.normal(20, 1, [100, 50])
D3 = np.random.normal(100, 1, [100, 50])
D = np.concatenate((D1, D2, D3))
r, X = cluster(D)
kmeans = cluster.KMeans(n_clusters=3)
kmeans.fit(X)
labels = kmeans.labels_
# ind = get_clusters(X)
# X_new = get_clusters(X)
#!/usr/bin/env python
import argparse

import h5py
import numpy as np
import sklearn.cluster


def cluster(arguments):
    with h5py.File(arguments.file, 'r') as handle:
        data = np.array(handle['DBSCAN'])
    dbscan = sklearn.cluster.DBSCAN(eps=arguments.e, min_samples=arguments.m)
    dbscan.fit(data)
    with h5py.File('output.h5', 'w') as handle:
        handle['Clusters'] = dbscan.labels_


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', type=float, help='spatial search radius epsilon')
    parser.add_argument('-m', type=int, help='density threshold min_points')
    parser.add_argument('file', type=str, help='file to cluster')
    args = parser.parse_args()
    cluster(args)
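# A possible invocation of the script above; the script name, input file name, and parameter
# values are placeholders, not taken from the original source. The input HDF5 file is expected
# to contain a dataset named 'DBSCAN', and the resulting labels are written to 'output.h5'
# under the key 'Clusters':
#
#     python dbscan_cluster.py -e 0.5 -m 10 points.h5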
    similarity = -1 * np.array([[
        metric(getter(str_a), getter(str_b), **kwargs)
        for str_a in n_samples
    ] for str_b in n_samples])
    print 'Done calculating similarity'
    affprop = sklearn.cluster.AffinityPropagation(affinity="precomputed", verbose=True)
    affprop.fit(similarity)
    cluster_ids = np.unique(affprop.labels_)
    centroids = [
        n_samples[affprop.cluster_centers_indices_[cluster_id]]
        for cluster_id in cluster_ids
    ]
    clusters = [
        n_samples[np.nonzero(affprop.labels_ == cluster_id)]
        for cluster_id in cluster_ids
    ]
    return centroids, clusters


if __name__ == '__main__':
    samples = ['ala', 'aga', 'abba', 'dupa', 'kupa', 'sraka']
    centroids, clusters = cluster(samples, levenshtein_distance)
    for (cluster_id, cluster) in enumerate(clusters):
        cluster_str = ", ".join(cluster)
        print(" - *%s:* %s" % (centroids[cluster_id], cluster_str))
    for i in range(len(points)):
        if scrs[i] == scr:
            scr_points.append(points[i])
            scr_labels.append(labels[i])
    pipeline.fit(scr_points, scr_labels)
    pipelines[scr] = pipeline
    center_labels = {}
    progress = progressbar.ProgressBar()
    for i in progress(range(len(scr_points))):
        center = pipeline.predict([scr_points[i]])[0]
        center_labels[center] = center_labels.get(center, []) + [scr_labels[i]]
    for center in center_labels:
        print("center: %r" % center, end="")
        for label in sorted(set(center_labels[center])):
            print(" %s: %d" % (label, center_labels[center].count(label)), end="")
        print()


if __name__ == '__main__':
    if sys.argv[1:]:
        files = sys.argv[1:]
    else:
        files = elements.collect_files(DATADIR)
    cluster(files)