Example #1
 def forward(self, embeds, cluster_temp, num_iter=10):
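     # The first call presumably runs num_iter soft-clustering iterations just to
     # locate centroids; the second call restarts from those centroids, detached
     # from the graph, and performs a single iteration so that gradients flow only
     # through the final soft assignment.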
     
     mu_init, _ = cluster(embeds, self.K, 1, num_iter, cluster_temp = torch.tensor(cluster_temp), init = self.init)
     #self.init = mu_init.clone().detach()
     mu, r = cluster(embeds, self.K, 1, 1, cluster_temp = torch.tensor(cluster_temp), init = mu_init.clone().detach())
     
     return mu, r
Example #2
def DBSCAN(X: np.ndarray, r: float, minPts: int):
    pointnum = X.shape[0]
    v = visitlist(pointnum)
    clustersSet = list()
    noise = cluster(-1)
    tree = KDTree(X)
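    # the KD-tree answers the fixed-radius neighbourhood queries (query_ball_point) efficiently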
    k = 0

    while v.unvisitednum > 0:
        randid = random.choice(v.unvisitedlist)
        v.visit(randid)
        N = tree.query_ball_point(X[randid], r)
        if len(N) < minPts:
            noise.points.append(randid)
        else:
            clus = cluster(k)
            clus.points.append(randid)
            N.remove(randid)
            while len(N) > 0:
                p = N.pop()
                if p in v.unvisitedlist:
                    v.visit(p)
                    clus.points.append(p)
                    pN = tree.query_ball_point(X[p], r)
                    if len(pN) >= minPts:
                        pN.remove(p)
                        N = N + pN
            clustersSet.append(clus)
            k += 1  # next cluster gets a new id

    clustersSet.append(noise)
    return clustersSet
Example #3
def main():
	from argparse import ArgumentParser
	from time import time
	
	parser = ArgumentParser()
	parser.add_argument('--file-list', type=str, default='/fastdata/finder/streetview_train.txt', help='path to the streetview training file')
	parser.add_argument('-n', '--n-clusters', type=int, default=100, help='number of cluster')
	parser.add_argument('--max-files', type=int, help='maximum number of files to cluster')
	parser.add_argument('output', type=str, help='output file (e.g. clusters.npy)')
	args = parser.parse_args()
	
	cluster(args.file_list, args.output, args.n_clusters, args.max_files)
Example #4
def clean_data(datalist):
    x = datalist.values[::, 0:14]
    #y = datalist.values[::, 14:]
    df = pd.DataFrame(x)
    df = df.replace("?", float('nan'))
    x = df.fillna(0)  # replace "?" (now NaN) with 0
    replace_mas1 = ["notpresent", "yes", "good"]
    replace_mas0 = ["present", "no", "poor"]
    for text in replace_mas1:  # replace "notpresent", "yes", "good" with 1
        x = x.replace(text, 1)
    for text in replace_mas0:
        x = x.replace(text, 0)
    cluster(x)
Example #5
    def forward(self, x, adj, num_iter=1):
        embeds = self.GCN(x, adj)
        mu_init, _, dist = cluster(embeds,
                                   self.K,
                                   num_iter,
                                   cluster_temp=self.cluster_temp,
                                   init=self.init)
        mu, r, dist_2 = cluster(embeds,
                                self.K,
                                1,
                                cluster_temp=self.cluster_temp,
                                init=mu_init.detach().clone())

        return r, dist
Example #6
 def forward(self, x):
     mu_init, _, _ = cluster(x,
                             self.K,
                             1,
                             self.num_iter,
                             cluster_temp=self.cluster_temp,
                             init=self.init)
     mu, r, dist = cluster(x,
                           self.K,
                           1,
                           1,
                           cluster_temp=self.cluster_temp,
                           init=mu_init.detach().clone())
     return r
Example #7
    def plot(self, data, views=2, show=False):
        """plot clustering"""

        # get plotting tools
        try:
            from spikeplot import plt, cluster
        except ImportError:
            return None

        # init
        views = min(views, int(data.shape[1] / 2))
        fig = plt.figure()
        fig.suptitle('clustering [%s]' % self.clus_type)
        ax = [fig.add_subplot(2, views, v + 1) for v in xrange(views)]
        axg = fig.add_subplot(212)
        ncmp = int(self.labels.max() + 1)
        cdata = dict(zip(xrange(ncmp),
                         [data[self.labels == c] for c in xrange(ncmp)]))

        # plot clustering
        for v in xrange(views):
            cluster(
                cdata,
                data_dim=(2 * v, 2 * v + 1),
                plot_handle=ax[v],
                plot_mean=sp.sqrt(self.sigma_factor),
                xlabel='PC %d' % int(2 * v),
                ylabel='PC %d' % int(2 * v + 1),
                show=False)

        # plot gof
        axg.plot(self._gof, ls='steps')
        for i in xrange(1, len(self.crange)):
            axg.axvline(i * self.repeats - 0.5, c='y', ls='--')
        axg.axvspan(self._winner - 0.5, self._winner + 0.5, fc='gray',
                    alpha=0.2)
        labels = []
        for k in self.crange:
            labels += ['%d' % k]
            labels += ['.'] * (self.repeats - 1)
        axg.set_xticks(sp.arange(len(labels)))
        axg.set_xticklabels(labels)
        axg.set_xlabel('cluster count and repeats')
        axg.set_ylabel(str(self.gof_type).upper())
        axg.set_xlim(-1, len(labels))

        # show?
        if show is True:
            plt.show()
        return True
Example #8
    def __init__(self, page_tree, k_max_depth=2, k_decay=0.5,
                 c_eps=1.2, c_d1=1.0, c_d2=1.0, separate_descendants=True):
        """Perform all extraction operations in sequence.

        Parameters
        ----------
        k_max_depth : int
            Parameter to kernel computation
        k_decay : float
            Parameter to kernel computation
        c_eps : float
            Parameter to clustering
        c_d1 : float
            Parameter to clustering
        c_d2 : float
            Parameter to clustering
        separate_descendants : bool
            Parameter to clustering
        """
        self.page_tree = page_tree
        self.kernel = _ker.kernel(page_tree, max_depth=k_max_depth, decay=k_decay)
        self.labels = cluster(
            page_tree, self.kernel, eps=c_eps, d1=c_d1, d2=c_d2,
            separate_descendants=separate_descendants)
        self.items = extract_items(page_tree, self.labels)
        self.tables = [ItemTable(items, extract_item_table(page_tree, items, self.labels))
                       for items in self.items]
        self.table_fragments = [
            ItemTable([page_tree.fragment_index(np.array(root)) for root in item],
                      page_tree.fragment_index(fields))
            for item, fields in self.tables]
Example #9
    def preprocess_y(self,settings):
        '''
        Returns preprocessed observables
        '''
        if self.preprocess_labels == None:
            labels = self.y()[:,settings.label_index]
        else:
            labels = self.preprocess_labels

        Y = []
        # Add a bias to the data by clustering the baseline observables
        # try k = 1..8 clusters
        for k in range(1,9):
            # cluster the centroids of each label
            #x = self.y()[:,[settings.baseline_index,4,7,9]].astype(np.float)
            x = self.y()[:,settings.baseline_index].astype(np.float)
            y = self.y()[:,settings.observable_index].astype(np.float)

            classification = cluster(k,x, labels)
            self.classification[k-1] = classification.copy()

            y = self.y()[:,settings.observable_index].astype(np.float)

            # preprocess the observables
            y[classification == k-1] = y[classification == k-1] \
                                       - np.median(y[classification == k-1]) \
                                       + np.median(x[classification == k-1])

            Y.append(y.copy())
        return Y
Example #10
    def preprocess_y(self, settings):
        '''
        Returns preprocessed observables
        '''
        if self.preprocess_labels == None:
            labels = self.y()[:, settings.label_index]
        else:
            labels = self.preprocess_labels

        Y = []
        # Add a bias to the data by clustering the baseline observables
        # try k = 1..8 clusters
        for k in range(1, 9):
            # cluster the centroids of each label
            #x = self.y()[:,[settings.baseline_index,4,7,9]].astype(np.float)
            x = self.y()[:, settings.baseline_index].astype(np.float)
            y = self.y()[:, settings.observable_index].astype(np.float)

            classification = cluster(k, x, labels)
            self.classification[k - 1] = classification.copy()

            y = self.y()[:, settings.observable_index].astype(np.float)

            # preprocess the observables
            y[classification == k-1] = y[classification == k-1] \
                                       - np.median(y[classification == k-1]) \
                                       + np.median(x[classification == k-1])

            Y.append(y.copy())
        return Y
Example #11
def main():
    n_clusters = int(sys.argv[1])
    print suffix + ' ' + str(n_clusters)
    vocabpath = basepath+'data/vocab'+suffix+'.txt'
    vocab =[word.strip() for word in open(vocabpath).readlines()]
    vocab = vocab[:voc_size]
    matpath = basepath + 'data/F-rels'+str(voc_size)+suffix+'.mat'
    datapath = basepath+'data/all-BNC-EN.txt'
    print str(n_clusters)
    if load<1:
        fmatrix = matbuild(datapath,vocab)
        scipy.io.savemat(matpath,{'fmatrix':fmatrix})
    else:
        a = scipy.io.loadmat(matpath)
        fmatrix = a['fmatrix']
    for i in range(numpy.size(fmatrix,0)):
        fmatrix[i,:] = fmatrix[i,:]/sum(fmatrix[i,:]+eps)
    if load<2:
        simmatrix = simbuild(fmatrix)
        scipy.io.savemat(basepath+'data/F-sims'+str(voc_size)+suffix+'.txt.mat', {'simmatrix': simmatrix})
    else:
        a = scipy.io.loadmat(basepath+'data/F-sims'+str(voc_size)+suffix+'.txt.mat')
        simmatrix = a['simmatrix']
    outputfile = basepath+'data/optlabels-'+str(n_clusters)+'-'+str(voc_size)+suffix+'.mat'
    if load<3:
        (labels,score) = cluster(simmatrix, n_clusters, outputfile)
    else:
        A = scipy.io.loadmat(outputfile)
        labels = A['labels']
    outpath = basepath + 'data/output'+str(n_clusters)+'-'+str(voc_size)+suffix+'.txt'
    outwrite(vocab,labels, outpath)
Example #12
def step(dataset, nn):
    # 1.  Do a forward pass
    dataset['y'] = nn.fwd(dataset['x'])

    # 2. Perform clustering
    clusters = cluster(dataset)

    # 3. Calculate the distances between the distributions
    distances = calc_distances(clusters)

    # 4. Displace the center of each cluster using force directed graph
    for c in clusters:
        c.calculate_displacement(clusters)

    # 5. Set the target position of each item in the cluster to the mean of the cluster
    x, y = extract_training_data(clusters)

    # 6. Train the CNN
    for j in range(100):
        nn.train(x, y)

    # 7. Plot the clusters
    plot_clusters(clusters)

    return clusters
Example #13
def cluster_all_targets(language, N=8):
    logging.info('clustering {}'.format(language))
    data = load(language)
    results = []
    for w in tqdm.tqdm(data.words):
        results.append((w, *cluster(data, w)))
    return results
Example #14
def do_segmentation(C, M, config, in_bound_idxs=None):
    embedding = embed_beats(C, M, config)
    Cnorm = np.cumsum(embedding ** 2, axis=1) ** 0.5
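    # Cnorm[:, k-1] is the L2 norm of the first k embedding dimensions; cluster()
    # presumably normalises the k-dimensional sub-embedding by it before running
    # k-means, as in Laplacian structural segmentation.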

    if config["hier"]:
        est_idxs = []
        est_labels = []
        for k in range(1, config["num_layers"] + 1):
            est_idx, est_label = cluster(embedding, Cnorm, k)
            est_idxs.append(est_idx)
            est_labels.append(np.asarray(est_label, dtype=np.int))

    else:
        est_idxs, est_labels = cluster(embedding, Cnorm, config["scluster_k"], in_bound_idxs)
        est_labels = np.asarray(est_labels, dtype=np.int)

    return est_idxs, est_labels, Cnorm
Example #15
def do_segmentation(C, M, config, in_bound_idxs=None):
    embedding = embed_beats(C, M, config)
    Cnorm = np.cumsum(embedding**2, axis=1)**0.5

    if config["hier"]:
        est_idxs = []
        est_labels = []
        for k in range(1, config["num_layers"] + 1):
            est_idx, est_label = cluster(embedding, Cnorm, k)
            est_idxs.append(est_idx)
            est_labels.append(np.asarray(est_label, dtype=np.int))

    else:
        est_idxs, est_labels = cluster(embedding, Cnorm, config["scluster_k"])
        est_labels = np.asarray(est_labels, dtype=np.int)

    return est_idxs, est_labels, Cnorm
Example #16
def labeladjust(lang, n, lofeatures, feature, df):
    table = cluster(lang, n, lofeatures, df)
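    # relabel clusters so that labels 0, 1, ... follow the ascending order of each
    # cluster's minimum value of `feature`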
    d = [0] * n
    for i in range(0, n):
        d[i] = (i, min(table.loc[table['label'] == i, feature]))
    order = sorted(d, key=lambda x: x[1])
    dic = {}
    for i in range(0, n):
        dic[order[i][0]] = i
    table['label'] = table['label'].replace(dic)
    return (table)
Example #17
def getbreaks(lang, n, lofeatures, feature, df):
    table = cluster(lang, n, lofeatures, df)
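    # the break points are the midpoints between the feature range of one cluster
    # and the range of the next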
    minmax = [0] * n
    for i in range(0, n):
        minmax[i] = (min(table.loc[table['label'] == i, feature]),
                     max(table.loc[table['label'] == i, feature]))
    breaks = [0] * (n - 1)
    ordered = sorted(minmax)
    for i in range(0, n - 1):
        breaks[i] = numpy.mean([ordered[i][1], ordered[(i + 1)][0]])
    return (breaks)
Example #18
def laplacian_segmentation(matrix):  # pass in an affinity matrix
    matrix = np.matrix(matrix, dtype=int)
    matrix = np.maximum(matrix, matrix.transpose())

    embedding = decompose(matrix)
    Cnorm = np.cumsum(embedding**2, axis=1)**0.5

    segmentations = []
    for k in range(1, MAX_TYPES):
        segmentations.append(cluster(embedding, Cnorm, k))

    return reindex(segmentations)
Example #19
def main():
    # get command line arguments
    args = sys.argv

    # m - # of sets of sweetwords
    # n - # of sweetwords in each set
    m = int(args[1])
    n = int(args[2])
    input_file = args[3]

    # store input passwords
    password_list = []
    sweetwords = read_password_file(input_file)

    filename = "password_choice.txt"
    # Identify password for each set of sweetwords

    for row in range(0,m):

        sweetwords_list = []
        sep = ','
        sweetwords_list.append(sweetwords[row].split(sep,n))
        roots = cluster(sweetwords_list, n)
        root_list = []

        for key,val in roots.items():
            # print(key,val)
            length = len(val)
            root_list.append([key,length])

            # root_list.append(key)
        root_list=sorted(root_list,key=lambda x: x[1])

        root_list_processed =[]
        # print(root_list)
        if len(root_list) > 5:
            root_list_processed = [root_list[0][0],root_list[1][0],root_list[-1][0],root_list[-2][0]]
        else:
            for item in root_list:
                root_list_processed.append(item[0])
        print(root_list_processed)
        shuffle(root_list_processed)
        chosen = choose_pass(root_list_processed)
        print("Final Answer", chosen)

        # password_choice = "".join(str(x) for x in passSelect(roots)[0])
        # print("Password Guess #",row, ": ", password_choice)

        password_choice_ind = sweetwords_list[0].index(chosen)
        password_list.append(str(password_choice_ind))

    write_passwordchoice_file(filename, password_list)
Example #20
 def _run_training_epoch(self, **kwargs):
     """
     
     """
     # predict clusters for each data point
     predictions = self._pred_model.predict(self._pred_ds, steps=self._pred_steps)
     
     # if test data was included- also predict outputs for those
     if self._test:
         test_preds = self._pred_model.predict(self._test_ds, steps=self._test_steps)
     else:
         test_preds = None
     
     # run k-means
     y_e, clusters, test_labels = cluster(predictions, 
                             self.config["pca_dim"], 
                             self.config["k"], init='k-means++',
                             kmeans_max_iter=self.config["kmeans_max_iter"],
                             kmeans_batch_size=self.config["kmeans_batch_size"],
                             testvecs=test_preds)
     self._old_test_labels = self._test_labels
     self._test_labels = test_labels
     
     # record the normalized mutual information between these labels and previous
     if self._old_cluster_assignments is not None:
         nmi = normalized_mutual_info_score(y_e, self._old_cluster_assignments,
                                            average_method="arithmetic")
         self._record_scalars(normalized_mutual_information=nmi)
     self._old_cluster_assignments = y_e
     
     
     # reset the weights of the output layer
     new_weights = [self._initializer(x.shape) for x in 
                    self._output_layer.get_weights()]
     self._output_layer.set_weights(new_weights)
     
     # do some training
     train_ds, num_steps = stratified_training_dataset(self.trainingdata, y_e, 
                                 imshape=self.input_config["imshape"],
                                 num_channels=self.input_config["num_channels"],
                                 num_parallel_calls=self.input_config["num_parallel_calls"],
                                 batch_size=self.input_config["batch_size"], 
                                 mult=self.config["mult"],
                                 augment=self.augment_config,
                                 sobel=self.input_config["sobel"],
                                 single_channel=self.input_config["single_channel"])
     
     for x, y in train_ds:
         loss = self._training_step(x, y, self._models["full"], 
                                   self._optimizer)
         self._record_scalars(training_crossentropy=loss)
         self.step += 1
Example #21
 def forward(self, x, adj, num_iter=1):
     # print("INSIDE FORWARD")
     embeds = self.GIN(x, adj)
     # print("EMBEDS\n\n",embeds)
     mu_init, _, _ = cluster(embeds,
                             self.K,
                             1,
                             num_iter,
                             cluster_temp=self.cluster_temp,
                             init=self.init)
     mu, r, dist = cluster(embeds,
                           self.K,
                           1,
                           1,
                           cluster_temp=self.cluster_temp,
                           init=mu_init.detach().clone())
     # print("PRINTING")
     # print("mu\n\n",mu)
     # print("r\n\n",r)
     # print("embeds\n\n",embeds)
     # print("dist\n\n",dist)
     return mu, r, embeds, dist
Example #22
def main():
    from argparse import ArgumentParser
    from time import time

    parser = ArgumentParser()
    parser.add_argument('--file-list',
                        type=str,
                        default='/fastdata/finder/streetview_train.txt',
                        help='path to the streetview training file')
    parser.add_argument('-n',
                        '--n-clusters',
                        type=int,
                        default=100,
                        help='number of clusters')
    parser.add_argument('--max-files',
                        type=int,
                        help='maximum number of files to cluster')
    parser.add_argument('output',
                        type=str,
                        help='output file (e.g. clusters.npy)')
    args = parser.parse_args()

    cluster(args.file_list, args.output, args.n_clusters, args.max_files)
Example #23
def recluster(df, cl, clusters, n_clusters):
    lbls = cl.labels_
    mask = np.array([False for i in range(len(lbls))])
    for c in clusters:
        mask |= lbls==c
    subpipe, results = data_pipeline(df[mask])
    
    ##use cosine similarity! NLTK clustering implementation
    #KMeans cluster object as carrier for consistency
    subcl = cluster(results, n_clusters)
    kclusterer = KMeansClusterer(n_clusters, distance=nltk.cluster.util.cosine_distance, repeats=50)
    assigned_clusters = kclusterer.cluster(results, assign_clusters=True)
    #assign new cluster labels and cluster centroids
    subcl.labels_ = np.array(assigned_clusters)
    subcl.cluster_centers_ = np.array(kclusterer.means())
    
    return subpipe, subcl, results, df[mask]
Example #24
def segment_file(filename):
    print('Loading {}'.format(filename))
    y, sr = librosa.load(filename)

    print('Extracting features...'.format(filename))
    Csync, Msync, beat_times = make_beat_sync_features(y=y, sr=sr)

    print('Constructing embedding...'.format(filename))
    embedding = embed_beats(Csync, Msync)

    Cnorm = np.cumsum(embedding**2, axis=1)**0.5

    print('Clustering...'.format(filename))
    segmentations = []
    for k in range(1, MAX_TYPES):
        print('\tk={}'.format(k))
        segmentations.append(cluster(embedding, Cnorm, k, beat_times))

    print('done.')
    return reindex(segmentations)
Example #25
def main():
	from argparse import ArgumentParser
	from time import time

	parser = ArgumentParser()
	parser.add_argument('-bs', '--batch-size', type=int, default=32, help='batch size')
	parser.add_argument('-lr', '--learning-rate', type=float, default=1e-4, help='learning rate')
	parser.add_argument('-nit','--number-of-iterations', type=int, default=1000000, help='number of iterations')
	parser.add_argument('--log_device_placement', action='store_true')
	parser.add_argument('--file-list', type=str, default='/fastdata/finder/streetview_train.txt', help='path to the streetview training file')
	parser.add_argument('--file-base-dir', type=str, default='/fastdata/finder/streetview/', help='directory of the training images')
	parser.add_argument('--clusters', type=str, default=None, help='cluster file (computed with cluster.py)')
	parser.add_argument('-n', '--n-clusters', type=int, default=None, help='Number of clusters to be used (if \'--clusters\' is not specified)')
	parser.add_argument('--initial-weights', type=str, help='VGG weights in hdf5 format')
	parser.add_argument('--num-gpus', type=int, default=1, help='How many GPUs should we use? (-1 for all available GPUs)')
	parser.add_argument('train_dir', help='output directory for the model and log files')
	args = parser.parse_args()

	try: os.makedirs(args.train_dir)
	except: pass
	cluster_file = args.train_dir+'clusters.npy'

	if args.clusters:
		import shutil
		shutil.copyfile(args.clusters, cluster_file)
		if args.n_clusters is not None:
			print( 'Warning clusters and n_clusters both specified! Ignoring n_clusters.' )
	elif not os.path.exists(cluster_file):
		print( 'No cluster file provided, clustering (this might take a while)' )
		from cluster import cluster
		cluster(args.file_list, cluster_file, args.n_clusters)
	args.n_clusters = getNCluster(cluster_file)

	files = [os.path.join(args.file_base_dir,l.strip()) for l in open(args.file_list,'r')]

	# Fire up tensorflow
	import tensorflow as tf
	
	# Detect the number of GPUs
	if args.num_gpus < 0:
		args.num_gpus = getNumGPU()
		print( 'Found %d GPUs. Using all of them'%args.num_gpus )

	## Setup the graph ##
	# Get the data
	gpu_batch_size = (args.batch_size-1) // args.num_gpus + 1
	total_batch_size = gpu_batch_size * args.num_gpus
	data,gt = glocData(files, cluster_file, batch_size=total_batch_size)
	
	# Setup the solver
	solver = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
	
	# Setup the network and loss
	if args.num_gpus != 1:
		# Multi gpu VGG
		split_data = tf.split(0, args.num_gpus, data)
		split_gt = tf.split(0, args.num_gpus, gt)
		all_loss, all_grads = [], []
		vars = None
		for i,(d,g) in enumerate(zip(split_data,split_gt)):
			with tf.device('/gpu:%d'%i) as dev:
				# Define VGG
				vgg = vgg16(d, n_out=args.n_clusters)
				
				# Share the parameters
				tf.get_variable_scope().reuse_variables()
				
				# Compute the loss and gradients (per device)
				avg_vgg = tf.reduce_mean(tf.reduce_mean(vgg,1),1)
				loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(avg_vgg, g))
				grads_and_vars = solver.compute_gradients(loss)
				
				# Collect all outputs ...
				all_grads.append( [g for g,v in grads_and_vars] )
				all_loss.append( loss )
				if vars is None:
					vars = [v for g,v in grads_and_vars]
				else:
					assert np.all([v==vv for vv,(g,v) in zip(vars,grads_and_vars)]), "Variables differ between GPUs"
		# .. and concat or sum them up
		grads = [tf.add_n(g)/len(g) for g in zip(*all_grads)]
		grads_and_vars = list(zip(grads, vars))
		loss = tf.add_n(all_loss) / len(all_loss)
	else:
		vgg = vgg16(data, n_out=args.n_clusters)
		avg_vgg = tf.reduce_mean(tf.reduce_mean(vgg,1),1)
		loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(avg_vgg, gt))
		grads_and_vars = solver.compute_gradients(loss)
	
	solver_step = solver.apply_gradients(grads_and_vars)
	
	# Create some summaries
	loss_avg = tf.train.ExponentialMovingAverage(0.9, name='avg')
	tf.scalar_summary('loss', loss)
	loss_avg_op = loss_avg.apply([loss])
	tf.scalar_summary('loss(avg)', loss_avg.average(loss))
	with tf.control_dependencies([loss_avg_op]):
		loss = tf.identity(loss)
	
	for grad,var in grads_and_vars:
		tf.scalar_summary(var.op.name+'/norm', tf.reduce_mean(var*var))
		tf.scalar_summary(var.op.name+'/gradient_norm', tf.reduce_mean(grad*grad))
		tf.scalar_summary(var.op.name+'/gradient_ratio', tf.reduce_mean(grad*grad) / tf.reduce_mean(var*var))
	summary_op = tf.merge_all_summaries()
	
	# Initialize ops
	saver = tf.train.Saver(tf.all_variables())
	init_op = tf.initialize_all_variables()
	if args.initial_weights is not None:
		from slim import load
		load_op = load.loadH5(args.initial_weights)
	else:
		load_op = tf.no_op()
	
	tf.get_default_graph().finalize()
	
	with tf.Session(config=tf.ConfigProto(log_device_placement=args.log_device_placement, gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.6))) as sess:
		# Initialize stuff
		summary_writer = tf.train.SummaryWriter(args.train_dir, sess.graph)
		
		global coord
		coord = tf.train.Coordinator()
		import signal
		def stop(*args):
			global coord
			print("Training stopped")
			coord.request_stop()
		old_sigint = signal.signal(signal.SIGINT, stop)

		threads=tf.train.start_queue_runners(sess=sess, coord=coord)
		sess.run(init_op)
		sess.run(load_op)
		
		# Train
		loss_values = []
		for it in range(args.number_of_iterations):
			t0 = time()
			_, loss_value = sess.run([solver_step, loss])
			t1 = time()
			loss_values.append( loss_value )
		
			if it % 10 == 0:
				print('%8d, loss = %0.2f [%0.2f] (%0.1f im/sec)'%(it, loss_value, np.mean(loss_values), args.batch_size / (t1-t0)))
				loss_values = loss_values[-20:]
			if it % 100 == 0:
				summary_str = sess.run(summary_op)
				summary_writer.add_summary(summary_str, it)
			if it % 1000 == 0:
				saver.save(sess, os.path.join(args.train_dir, 'snap.ckpt'), global_step=it)
			if coord.should_stop():
				break
		saver.save(sess, os.path.join(args.train_dir, 'final.ckpt'))
		
		coord.join(threads)
		signal.signal(signal.SIGINT, old_sigint)
Example #26
def main():
    from argparse import ArgumentParser
    from time import time

    parser = ArgumentParser()
    parser.add_argument('-bs',
                        '--batch-size',
                        type=int,
                        default=32,
                        help='batch size')
    parser.add_argument('-lr',
                        '--learning-rate',
                        type=float,
                        default=1e-4,
                        help='learning rate')
    parser.add_argument('-nit',
                        '--number-of-iterations',
                        type=int,
                        default=1000000,
                        help='number of iterations')
    parser.add_argument('--log_device_placement', action='store_true')
    parser.add_argument('--file-list',
                        type=str,
                        default='/fastdata/finder/streetview_train.txt',
                        help='path to the streetview training file')
    parser.add_argument('--file-base-dir',
                        type=str,
                        default='/fastdata/finder/streetview/',
                        help='directory of the training images')
    parser.add_argument('--clusters',
                        type=str,
                        default=None,
                        help='cluster file (computed with cluster.py)')
    parser.add_argument(
        '-n',
        '--n-clusters',
        type=int,
        default=None,
        help=
        'Number of clusters to be used (if \'--clusters\' is not specified)')
    parser.add_argument('--initial-weights',
                        type=str,
                        help='VGG weights in hdf5 format')
    parser.add_argument(
        '--num-gpus',
        type=int,
        default=1,
        help='How many GPUs should we use? (-1 for all available GPUs)')
    parser.add_argument('train_dir',
                        help='output directory for the model and log files')
    args = parser.parse_args()

    try:
        os.makedirs(args.train_dir)
    except:
        pass
    cluster_file = args.train_dir + 'clusters.npy'

    if args.clusters:
        import shutil
        shutil.copyfile(args.clusters, cluster_file)
        if args.n_clusters is not None:
            print(
                'Warning clusters and n_clusters both specified! Ignoring n_clusters.'
            )
    elif not os.path.exists(cluster_file):
        print('No cluster file provided, clustering (this might take a while)')
        from cluster import cluster
        cluster(args.file_list, cluster_file, args.n_clusters)
    args.n_clusters = getNCluster(cluster_file)

    files = [
        os.path.join(args.file_base_dir, l.strip())
        for l in open(args.file_list, 'r')
    ]

    # Fire up tensorflow
    import tensorflow as tf

    # Detect the number of GPUs
    if args.num_gpus < 0:
        args.num_gpus = getNumGPU()
        print('Found %d GPUs. Using all of them' % args.num_gpus)

    ## Setup the graph ##
    # Get the data
    gpu_batch_size = (args.batch_size - 1) // args.num_gpus + 1
    total_batch_size = gpu_batch_size * args.num_gpus
    data, gt = glocData(files, cluster_file, batch_size=total_batch_size)

    # Setup the solver
    solver = tf.train.AdamOptimizer(learning_rate=args.learning_rate)

    # Setup the network and loss
    if args.num_gpus != 1:
        # Multi gpu VGG
        split_data = tf.split(0, args.num_gpus, data)
        split_gt = tf.split(0, args.num_gpus, gt)
        all_loss, all_grads = [], []
        vars = None
        for i, (d, g) in enumerate(zip(split_data, split_gt)):
            with tf.device('/gpu:%d' % i) as dev:
                # Define VGG
                vgg = vgg16(d, n_out=args.n_clusters)

                # Share the parameters
                tf.get_variable_scope().reuse_variables()

                # Compute the loss and gradients (per device)
                avg_vgg = tf.reduce_mean(tf.reduce_mean(vgg, 1), 1)
                loss = tf.reduce_mean(
                    tf.nn.sparse_softmax_cross_entropy_with_logits(avg_vgg, g))
                grads_and_vars = solver.compute_gradients(loss)

                # Collect all outputs ...
                all_grads.append([g for g, v in grads_and_vars])
                all_loss.append(loss)
                if vars is None:
                    vars = [v for g, v in grads_and_vars]
                else:
                    assert np.all([
                        v == vv for vv, (g, v) in zip(vars, grads_and_vars)
                    ]), "Variables differ between GPUs"
        # .. and concat or sum them up
        grads = [tf.add_n(g) / len(g) for g in zip(*all_grads)]
        grads_and_vars = list(zip(grads, vars))
        loss = tf.add_n(all_loss) / len(all_loss)
    else:
        vgg = vgg16(data, n_out=args.n_clusters)
        avg_vgg = tf.reduce_mean(tf.reduce_mean(vgg, 1), 1)
        loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(avg_vgg, gt))
        grads_and_vars = solver.compute_gradients(loss)

    solver_step = solver.apply_gradients(grads_and_vars)

    # Create some summaries
    loss_avg = tf.train.ExponentialMovingAverage(0.9, name='avg')
    tf.scalar_summary('loss', loss)
    loss_avg_op = loss_avg.apply([loss])
    tf.scalar_summary('loss(avg)', loss_avg.average(loss))
    with tf.control_dependencies([loss_avg_op]):
        loss = tf.identity(loss)

    for grad, var in grads_and_vars:
        tf.scalar_summary(var.op.name + '/norm', tf.reduce_mean(var * var))
        tf.scalar_summary(var.op.name + '/gradient_norm',
                          tf.reduce_mean(grad * grad))
        tf.scalar_summary(
            var.op.name + '/gradient_ratio',
            tf.reduce_mean(grad * grad) / tf.reduce_mean(var * var))
    summary_op = tf.merge_all_summaries()

    # Initialize ops
    saver = tf.train.Saver(tf.all_variables())
    init_op = tf.initialize_all_variables()
    if args.initial_weights is not None:
        from slim import load
        load_op = load.loadH5(args.initial_weights)
    else:
        load_op = tf.no_op()

    tf.get_default_graph().finalize()

    with tf.Session(config=tf.ConfigProto(
            log_device_placement=args.log_device_placement,
            gpu_options=tf.GPUOptions(
                per_process_gpu_memory_fraction=0.6))) as sess:
        # Initialize stuff
        summary_writer = tf.train.SummaryWriter(args.train_dir, sess.graph)

        global coord
        coord = tf.train.Coordinator()
        import signal

        def stop(*args):
            global coord
            print("Training stopped")
            coord.request_stop()

        old_sigint = signal.signal(signal.SIGINT, stop)

        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        sess.run(init_op)
        sess.run(load_op)

        # Train
        loss_values = []
        for it in range(args.number_of_iterations):
            t0 = time()
            _, loss_value = sess.run([solver_step, loss])
            t1 = time()
            loss_values.append(loss_value)

            if it % 10 == 0:
                print('%8d, loss = %0.2f [%0.2f] (%0.1f im/sec)' %
                      (it, loss_value, np.mean(loss_values), args.batch_size /
                       (t1 - t0)))
                loss_values = loss_values[-20:]
            if it % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, it)
            if it % 1000 == 0:
                saver.save(sess,
                           os.path.join(args.train_dir, 'snap.ckpt'),
                           global_step=it)
            if coord.should_stop():
                break
        saver.save(sess, os.path.join(args.train_dir, 'final.ckpt'))

        coord.join(threads)
        signal.signal(signal.SIGINT, old_sigint)
Example #27
                f.write("\n")

        f.close()


def setModel(raw_data):
    def getModel():
        data =raw_data.iloc[:,:-1]
        target = raw_data.iloc[:, -1]
        clf = svm.SVC(C=1, kernel='rbf', gamma=1, decision_function_shape='ovo')
        # clf = svm.SVC(C=1, kernel='linear', decision_function_shape='ovr')
        clf.fit(data, target)
        target_hat = clf.predict(data)
        acc = accuracy_score(target, target_hat)
        np.set_printoptions(suppress=True)
        print(u'预测正确的样本个数:%d,正确率:%.2f%%' % (round(acc * len(data)), 100 * acc))  # number of correctly predicted samples and accuracy rate
        return clf

    def dump(model, filename):
        with open(filename, 'wb') as fp:
            pickle.dump(model, fp)

    if __name__ == "__main__":
        clf = getModel()
        dump(clf, 'model')


data = preprocess()
data, n = cluster(data)
regression(data, n)
setModel(data)
Example #28
        results[i-720][0] = deltap

    weights = fit_weights(vals, results)

    return weights


# Train the model
if __name__=="__main__":
    if (len(sys.argv)) == 1:
        print "Need csv with training data"
        quit()

    # load dataset
    data = load(sys.argv[1])

    # split dataset into 2, skipping every other element
    # i.e. turn 5s increment into 10s increment
    cluster_data = data[:len(data)/2][::2]
    train_data = data[len(data)/2:][::2]

    # cluster the first part of data
    clusters = cluster(cluster_data)

    # fit params using second part of data
    weights = train(train_data, clusters)

    # save weights and clusters for later usage
    pkl.dump(clusters, open("weights/clusters.pkl", "wb"))
    pkl.dump(weights,  open("weights/weights.pkl", "wb"))
Example #29
    means += [statistics.mean(segment)]
means = np.array(means)


def cluster(signal, ref):
    part = mixture.BayesianGaussianMixture(n_components=2, max_iter=300)
    part.fit(signal.reshape(-1, 1))
    target = sorted(part.means_)
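    # sort the two fitted component means so that index 0 is the lower level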
    answers = []
    for i in signal:
        if abs(i - target[0]) < abs(i - target[1]):
            answers += [0]
        else:
            answers += [1]
    print("zero center = \t", part.means_[0])
    print("one center = \t", part.means_[1])
    print("reference =\t", ref)
    print("answer =\t", np.array(answers))
    correctness = 0
    for (a, b) in zip(answers, ref):
        if a == b:
            correctness += 1
    print("correctness =", correctness / len(answers) * 100)


n_chunks = int((len(reference) + 19) / 20)
# cluster(means, reference)
for signal, reference in zip(np.array_split(means, n_chunks),
                             np.array_split(np.array(reference), n_chunks)):
    cluster(signal, reference)
Example #30
    parser.add_argument('--mov-dim', '--dm', default=200,
                        type=int)  # fix these
    args = parser.parse_args()

    # Cluster
    is_test = False
    if os.path.exists(
            _labels_fpath(args.k_usrs, args.usr_dim,
                          args.usr_eigenvectors_file)):
        usr_km = KMeansData(
            np.load(
                _labels_fpath(args.k_usrs, args.usr_dim,
                              args.usr_eigenvectors_file)), is_test)
        print('loaded user kmeans')
    else:
        usr_km = cluster(args.usr_eigenvectors_file, args.usr_eigenvalues_file,
                         args.usr_dim, args.k_usrs)
        print('built user kmeans')
    if os.path.exists(
            _labels_fpath(args.k_movs, args.mov_dim,
                          args.mov_eigenvectors_file)):
        mov_km = KMeansData(
            np.load(
                _labels_fpath(args.k_movs, args.mov_dim,
                              args.mov_eigenvectors_file)), is_test)
        print('loaded movie kmeans')
    else:
        mov_km = cluster(args.mov_eigenvectors_file, args.mov_eigenvalues_file,
                         args.mov_dim, args.k_movs)
        print('built movie kmeans')

    # Load trainset for inference
Example #31
spectral = cluster.SpectralClustering(n_clusters=2, eigen_solver='arpack', affinity="nearest_neighbors")


# clst = kmeans, dbscan, spectral

data_titles = ['noisy_circles', 'noisy_moons', 'blobs', 'no_structure']

def cluster(clst, title):
    # Create  2x2 figure for each algorithm
    fig, ax = plt.subplots(2, 2, figsize=(16, 4))
    fig.suptitle(title)  # use the algorithm name passed in as the figure title
    datasets = [noisy_circles, noisy_moons, blobs, no_structure]
    plot_num = 1
    for i, dataset in enumerate(datasets):

        X, y = dataset
        X = StandardScaler().fit_transform(X)
        clst.fit(X)
        y_pred = clst.labels_.astype(np.int)
        plt.subplot(1, 4, plot_num)

        plt.scatter(X[:, 0], X[:, 1], color=colors[y_pred].tolist(), s=10)
        plt.title(data_titles[i])
        plot_num +=1

cluster(spectral,'Spectral')

cluster(kmeans, 'Kmeans')
cluster(dbscan, 'DBscan')


Example #32
import scipy
import scipy.io
import sklearn.metrics
import sklearn.cluster
import sys

def cluster(simmatrix, n_clusters, savefile):
    print n_clusters
    n_iters = 50
    opt_score = -1e4
    for i in range(n_iters):
        print('iter ' + str(i) + '\n')
        labels0 = sklearn.cluster.spectral_clustering(simmatrix, n_clusters=n_clusters)
        score0 = sklearn.metrics.silhouette_score(simmatrix, labels0, metric='precomputed')
        if score0 > opt_score:
            opt_labels = labels0
            opt_score = score0
            scipy.io.savemat(savefile, {'labels': opt_labels, 'score': opt_score})
    return (opt_labels, opt_score)

n_clusters = int(sys.argv[1])
A = scipy.io.loadmat('/u/metanet/clustering/may-2014-relations/data/F-sims.txt.mat')
a= A['simmatrix']
cluster(a, n_clusters, '/u/metanet/clustering/may-2014-relations/data/test'+str(n_clusters)+'.mat')
Example #33
    feature_dir = os.path.join(root_dir, "output", "features")
    output_dir = os.getcwd()
    # stem = '2344_00_32_40_25'
    stem = "V3736-02"
    # xml_file = '/Users/dcline/Downloads/TestTopDown/2344_00_32_40_25/output/2344_00_32_40_25.events.xml'
    xml_files = [
        "V3736-02_1_2000.events.xml",
        "V3736-02_2001_4000.events.xml",
        "V3736-02_4001_6000.events.xml",
        "V3736-02_6001_8000.events.xml",
        "V3736-02_8001_10000.events.xml",
        "V3736-02_10001_12000.events.xml",
        "V3736-02_12001_14000.events.xml",
        "V3736-02_14001_16000.events.xml",
        "V3736-02_16001_17983.events.xml",
    ]

    width = 1920  # 706
    height = 1080  # 362

    frame_event_set = []
    for f in xml_files:
        fes = utils.parse(os.path.join(root_dir, "output", f))
        frame_event_set.append(fes)

    feature_types = ["PVS", "HOG_3", "JET_red", "HOG_8", "JET_blue", "JET_green"]
    cluster(feature_types, 64, width, height, frame_event_set, feature_dir, stem, output_dir)


print "Done"
Example #34
 def forward(self, x, num_iter=1):
     embeds = self.encoder(x)
     embeds = embeds.view(-1, self.encoder_features_num)
     mu_init, _, _ = cluster(embeds, self.K, 1, num_iter, cluster_temp = self.cluster_temp, init = self.init)
     mu, r, dist = cluster(embeds, self.K, 1, 1, cluster_temp = self.cluster_temp, init = mu_init.detach().clone())
     return r
Example #35
# 			clusters.append(y)
			


# 	# X = X[X[:,0].argsort()] #SORT X BY FIRST COLUMN
	
	
# 	return 

# df = pd.read_csv('PitchFxExample.csv')
# D = df.iloc[:,3:].values
D1 = np.random.normal(0,1,[100,50])
D2 = np.random.normal(20,1,[100,50])
D3 = np.random.normal(100,1,[100,50])
D = np.concatenate((D1,D2,D3))


r, X = cluster(D)
kmeans = cluster.KMeans(n_clusters = 3)
kmeans.fit(X)
labels = kmeans.labels_
# ind = get_clusters(X)
# X_new = get_clusters(X)
Example #36
#!/usr/bin/env python

import argparse
import h5py
import numpy as np
import sklearn.cluster


def cluster(arguments):
    with h5py.File(arguments.file, 'r') as handle:
        data = np.array(handle['DBSCAN'])

    dbscan = sklearn.cluster.DBSCAN(eps=arguments.e, min_samples=arguments.m)
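    # eps is the neighbourhood radius, min_samples the density threshold for a core point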
    dbscan.fit(data)

    with h5py.File('output.h5', 'w') as handle:
        handle['Clusters'] = dbscan.labels_


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', type=float, help='spatial search radius epsilon')
    parser.add_argument('-m', type=int, help='density threshold min_points')
    parser.add_argument('file', type=str, help='file to cluster')
    args = parser.parse_args()

    cluster(args)
Example #37
    similarity = -1 * np.array([[
        metric(getter(str_a), getter(str_b), **kwargs) for str_a in n_samples
    ] for str_b in n_samples])
    print 'Done calculating similarity'

    affprop = sklearn.cluster.AffinityPropagation(affinity="precomputed",
                                                  verbose=True)
    affprop.fit(similarity)
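    # affinity propagation chooses exemplars from the precomputed (negated-distance)
    # similarity matrix, so the number of clusters is not fixed in advance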

    cluster_ids = np.unique(affprop.labels_)
    centroids = [
        n_samples[affprop.cluster_centers_indices_[cluster_id]]
        for cluster_id in cluster_ids
    ]
    clusters = [
        n_samples[np.nonzero(affprop.labels_ == cluster_id)]
        for cluster_id in cluster_ids
    ]

    return centroids, clusters


if __name__ == '__main__':
    samples = ['ala', 'aga', 'abba', 'dupa', 'kupa', 'sraka']

    centroids, clusters = cluster(samples, levenshtein_distance)

    for (cluster_id, cluster) in enumerate(clusters):
        cluster_str = ", ".join(cluster)
        print(" - *%s:* %s" % (centroids[cluster_id], cluster_str))
Example #38
        for i in range(len(points)):
            if scrs[i] == scr:
                scr_points.append(points[i])
                scr_labels.append(labels[i])

        pipeline.fit(scr_points, scr_labels)
        pipelines[scr] = pipeline

        center_labels = {}
        progress = progressbar.ProgressBar()
        for i in progress(range(len(scr_points))):
            center = pipeline.predict([scr_points[i]])[0]
            center_labels[center] = center_labels.get(center,
                                                      []) + [scr_labels[i]]

        for center in center_labels:
            print("center: %r" % center, end="")
            for label in sorted(set(center_labels[center])):
                print(" %s: %d" % (label, center_labels[center].count(label)),
                      end="")
            print()


if __name__ == '__main__':
    if sys.argv[1:]:
        files = sys.argv[1:]
    else:
        files = elements.collect_files(DATADIR)

    cluster(files)