def remove(self, filename):
    """Delete a local file if it exists; log instead of raising on failure.

    Args:
        filename: Path of the local file to delete.

    A path that does not exist is silently ignored. A failed deletion is
    logged at error level and swallowed, so clean-up stays best-effort.
    """
    if not os.path.exists(filename):
        return
    try:
        # os.remove avoids spawning an external `rm` process.
        os.remove(filename)
    except OSError as e:
        logging.error(e)
    else:
        logging.debug("delete local file " + filename)
def run_eval_step(self, sess, next_batch):
    """Run one evaluation iteration on the next batch of examples.

    Args:
        sess: Session used both to pull the raw batch and to evaluate the
            model.
        next_batch: Fetchable that yields an (articles, abstracts) pair.

    Returns:
        The result of `sess.run` for a dictionary with the keys
        'summaries', 'loss' and 'global_step', plus 'coverage_loss' when
        `self._coverage` is set.
    """
    articles, abstracts = sess.run(next_batch)

    # Dump every example of the batch for debugging.
    for i, article in enumerate(articles):
        log.debug('eval i={}\n\narticle={}\n\nabstract={}'.format(
            i, repr(article), repr(abstracts[i])))
    log.debug('eval len(articles)={}, len(abstracts)={}'.format(
        len(articles), len(abstracts)))

    feed_dict = self._make_feed_dict(
        batcher.to_batch(articles=articles,
                         abstracts=abstracts,
                         vocab=self._vocab,
                         hps=self._hps,
                         pointer_gen=self._pointer_gen))

    fetches = {
        'summaries': self._summaries,
        'loss': self._loss,
        'global_step': tf.train.get_global_step(),
    }
    if self._coverage:
        fetches['coverage_loss'] = self._coverage_loss
    return sess.run(fetches, feed_dict)
def make_batch_predictions(self, video_id_batch_val, video_batch_val):
    """
    Make predictions for a batch of videos.

    The k nearest training neighbours of every video are looked up, the
    neighbours' labels are summed per class, and that count selects the
    likelihood terms that are combined with the label priors.

    Return:
        Predictions probabilities as a Numpy array (float32), one row per
        video.
    """
    topk_video_ids, topk_labels = find_k_nearest_neighbors(
        video_id_batch_val, video_batch_val, self.train_data_pipeline,
        is_train=False, k=self.k)
    logging.debug('topk_video_ids: {}\ntopk_labels: {}'.format(
        topk_video_ids, topk_labels))

    def posterior(delta):
        # Prior-weighted likelihoods of "label present" vs "label absent",
        # normalised so that the two sum to one.
        present = np.multiply(
            self.labels_prior_prob,
            self.pos_prob_positive[delta, self.range_num_classes])
        absent = np.multiply(
            1.0 - self.labels_prior_prob,
            self.pos_prob_negative[delta, self.range_num_classes])
        return np.true_divide(present, present + absent)

    # Per video: number of neighbours carrying each label.
    neighbour_counts = topk_labels.astype(np.int32).sum(axis=1)
    return np.array([posterior(delta) for delta in neighbour_counts],
                    dtype=np.float32)
def attack(self, imgs, targets):
    """
    Perform the EAD attack on the given instance for the given targets.

    If self.targeted is true, then the targets represents the target labels
    If self.targeted is false, then targets are the original class labels

    The inputs are processed in chunks of `self.batch_size`. A trailing
    partial chunk is zero-padded up to a full batch before it is handed to
    `self.attack_batch`, and only the results for the real rows are kept.

    :param imgs: Numpy array of instances, first axis is the instance index.
    :param targets: Numpy array of labels aligned with `imgs`.
    :return: Numpy array with one attack result per input instance.
    """
    batch_size = self.batch_size
    num_imgs = len(imgs)
    remainder = num_imgs % batch_size

    r = []
    for i in range(0, num_imgs // batch_size):
        begin, end = i * batch_size, (i + 1) * batch_size
        logging.debug("Running EAD attack on instance %s of %s",
                      begin, num_imgs)
        r.extend(self.attack_batch(imgs[begin:end], targets[begin:end]))

    if remainder != 0:
        last_elements = num_imgs - remainder
        logging.debug("Running EAD attack on instance %s of %s",
                      last_elements, num_imgs)
        # Pad the leftover rows to a full batch. The per-instance shape is
        # everything after the leading (instance) axis.
        temp_imgs = np.zeros((batch_size, ) + imgs.shape[1:])
        temp_targets = np.zeros((batch_size, ) + targets.shape[1:])
        temp_imgs[:remainder] = imgs[last_elements:]
        temp_targets[:remainder] = targets[last_elements:]
        temp_data = self.attack_batch(temp_imgs, temp_targets)
        # Drop the results computed for the padding rows.
        r.extend(temp_data[:remainder])
    return np.array(r)
def input_fn(batch_size):
    """Build the input dataset over the enclosing `images` and `labels`.

    The examples are shuffled with a 5000-element buffer, repeated
    indefinitely, grouped into batches of `batch_size` and prefetched.
    """
    debug("input_fn images shape %s" % (images.shape, ))
    debug("input_fn labels shape %s" % (labels.shape, ))

    shuffle_buffer = 5000
    examples = tf.data.Dataset.from_tensor_slices((images, labels))
    batches = examples.shuffle(shuffle_buffer).repeat().batch(batch_size)
    return batches.prefetch(None)
def close(self):
    """Shut the queue down and delete every file still waiting in it.

    A `None` data item and a 'Done' message are posted first, the queue is
    flagged as blocked, and then queued filenames are fetched (2 second
    timeout per fetch) and removed until a falsy item is returned. Finally
    the queue is flagged as closed.
    """
    self.put_data(None)
    self.put_msg('Done')
    self.__blocked = True

    # Drain whatever is left; the None posted above ends the loop.
    filename = self.get_data(timeout=2)
    while filename:
        self.remove(filename)
        filename = self.get_data(timeout=2)

    logging.debug("clean data queue && close")
    self.__closed = True
def attack(self, imgs, targets):
    """
    Perform the L_2 attack on the given instance for the given targets.

    If self.targeted is true, then the targets represents the target labels
    If self.targeted is false, then targets are the original class labels

    The inputs are handed to `self.attack_batch` in consecutive slices of
    at most `self.batch_size` instances.

    :param imgs: Sequence of instances (sliceable, first axis = instance).
    :param targets: Labels aligned with `imgs`.
    :return: Numpy array with one attack result per input instance.
    """
    r = []
    for i in range(0, len(imgs), self.batch_size):
        # Pass the values as logging arguments so the message is formatted,
        # rather than logging the repr of a tuple.
        logging.debug("Running CWL2 attack on instance %s of %s",
                      i, len(imgs))
        adv = self.attack_batch(imgs[i:i + self.batch_size],
                                targets[i:i + self.batch_size])
        r.extend(adv)
    return np.array(r)
def process(self, element):
    """Yield one labelled review for every line of the file at `element`.

    The sentiment label is derived from which of the known sub-directory
    names occurs in the path. A path matching none of them is logged at
    debug level and produces no output.

    Raises:
        ValueError: If the path matches more than one sub-directory name.
    """
    label_by_subdir = {
        constants.SUBDIR_POSITIVE: constants.POSITIVE_SENTIMENT_LABEL,
        constants.SUBDIR_NEGATIVE: constants.NEGATIVE_SENTIMENT_LABEL,
    }
    found_labels = [
        label for subdir, label in label_by_subdir.items()
        if subdir in element
    ]

    if len(found_labels) > 1:
        raise ValueError('Incompatible path: `{}`.'.format(element))
    if not found_labels:
        logging.debug('Label not found for file: `%s`.', element)
        return

    label = found_labels[0]
    with gfile.GFile(element, 'r') as single_file:
        for line in single_file:
            yield {constants.LABELS: label, constants.REVIEW: line}
def step_fn(step_context):
    """Run one training step through `step_context`.

    A raw (articles, abstracts) pair is pulled from the enclosing
    `next_batch` with the context's session, converted to a model batch and
    then run with hooks.

    Returns:
        The result of `run_with_hooks` for a dictionary with the keys
        'train_op', 'summaries', 'loss' and 'global_step', plus
        'coverage_loss' when `self._coverage` is set.
    """
    articles, abstracts = step_context.session.run(next_batch)

    # Dump every example of the batch for debugging.
    for i, article in enumerate(articles):
        log.debug('train i={}\n\narticle={}\n\nabstract={}'.format(
            i, repr(article), repr(abstracts[i])))

    feed_dict = self._make_feed_dict(
        batcher.to_batch(articles=articles,
                         abstracts=abstracts,
                         vocab=self._vocab,
                         hps=self._hps,
                         pointer_gen=self._pointer_gen))

    fetches = {
        'train_op': self._train_op,
        'summaries': self._summaries,
        'loss': self._loss,
        'global_step': tf.train.get_global_step(),
    }
    if self._coverage:
        fetches['coverage_loss'] = self._coverage_loss
    return step_context.run_with_hooks(fetches, feed_dict)
def enqueue(self, sess):
    """Gather downloaded files and feed them to the enqueue op.

    Files are taken from `self._file_queue` until `self.min_cache` of them
    have been collected, or until downloading is over. The collected list
    is then fed to the enqueue op through `sess`.

    Returns:
        1 if downloading was already over on entry (nothing is run),
        0 after the enqueue op has been run.
    """
    # Start from an empty working set. Removing the previously used local
    # files at this point is currently disabled.
    self._curr_files = []

    def downloads_over():
        return (self._finished >= self.num_downloader
                or not self._downloading)

    if downloads_over():
        return 1

    while len(self._curr_files) < self.min_cache:
        next_file = self._file_queue.get_data()
        if next_file is None:
            # Presumably a None entry marks one downloader as finished;
            # it is only counted, never enqueued.
            self._finished += 1
        else:
            logging.debug('Got file {}'.format(next_file))
            self._curr_files.append(next_file)
            if downloads_over():
                break

    fetches = [self._enqueue_op, self._queue_size_op]
    outputs = sess.run(fetches,
                       feed_dict={self._input_files: self._curr_files})
    logging.debug('Output queue size: {}'.format(outputs[1]))
    self._enqueued += self.min_cache
    return 0
def __run_training(model, data_dir, coverage, debug, conf, hps):
    """Run training steps in a loop, logging the loss and writing summaries.

    Checkpoints and summaries go to the 'train' sub-directory of
    `conf.model_dir`, which is created when missing. The loop ends when the
    session asks to stop or the global step exceeds `hps.max_step`.

    Raises:
        Exception: If a training step reports a non-finite loss.
    """
    log.debug("starting run_training")

    checkpoint_dir = os.path.join(conf.model_dir, 'train')
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    with model.build_graph().as_default():
        summary_writer = tf.summary.FileWriterCache.get(checkpoint_dir)
        dataset = etl.dataset(data_dir,
                              hps.batch_size,
                              shuffle=True,
                              repeat=True)
        iterator = dataset.make_one_shot_iterator()
        dataset_init_op = iterator.make_initializer(dataset)
        next_batch = iterator.get_next()

        session_args = dict(checkpoint_dir=checkpoint_dir,
                            debug=debug,
                            conf=conf,
                            local_init_ops=[dataset_init_op])
        with __session(**session_args) as sess:
            global_step = 0
            # repeats until max_step is reached
            while not sess.should_stop() and global_step <= hps.max_step:
                started = time.time()
                results = model.run_train_step(sess, next_batch)
                finished = time.time()

                loss = results['loss']
                if not np.isfinite(loss):
                    raise Exception("Loss is not finite. Stopping.")
                # The step reported by the model drives both the loop
                # condition and the summary index.
                global_step = results['global_step']

                parts = ['train step={}, loss={:.4f}, secs={}'.format(
                    global_step, loss, int(finished - started))]
                if coverage:
                    parts.append(", coverage_loss={:.4f}".format(
                        results['coverage_loss']))
                log.info(''.join(parts))

                # Write this step's summaries for tensorboard.
                summary_writer.add_summary(results['summaries'], global_step)
            log.info('training done')
def attack_batch(self, imgs, labs):
    """
    Run the attack on a batch of instance and labels.

    For each of `self.binary_search_steps` outer steps the optimiser is
    re-initialised, the batch and the current per-instance constant are
    loaded through `self.setup`, and `self.train` is run for up to
    `self.max_iterations` iterations. The lowest-l2 successful result per
    instance is remembered across all outer steps, and the constant is
    adjusted by binary search after each outer step.

    :param imgs: Batch of instances; only the first `self.batch_size`
                 entries are used. (Presumably a Numpy array - confirm.)
    :param labs: Labels for the batch; `np.argmax` of each row is used as
                 the class index.
    :return: Array of the best attack found per instance; instances for
             which no attack succeeded keep their clipped original value.
    """

    def compare(x, y):
        # Decide whether prediction `x` counts as a success against class
        # index `y`. A score vector is first shifted by the confidence
        # margin at position y and reduced to its argmax; a scalar is taken
        # to already be a class index.
        if not isinstance(x, (float, int, np.int64)):
            x = np.copy(x)
            if self.y_target:
                x[y] -= self.confidence
            else:
                x[y] += self.confidence
            x = np.argmax(x)
        if self.y_target:
            return x == y
        else:
            return x != y

    batch_size = self.batch_size

    # Clipped originals: the fallback result when no attack succeeds.
    oimgs = np.clip(imgs, self.clip_min, self.clip_max)

    # re-scale instances to be within range [0, 1]
    imgs = (imgs - self.clip_min) / (self.clip_max - self.clip_min)
    imgs = np.clip(imgs, 0, 1)
    # now convert to [-1, 1]
    imgs = (imgs * 2) - 1
    # convert to tanh-space (the factor keeps arctanh away from +/-1)
    imgs = np.arctanh(imgs * .999999)

    # set the lower and upper bounds accordingly
    lower_bound = np.zeros(batch_size)
    CONST = np.ones(batch_size) * self.initial_const
    upper_bound = np.ones(batch_size) * 1e10

    # placeholders for the best l2, score, and instance attack found so far
    o_bestl2 = [1e10] * batch_size
    o_bestscore = [-1] * batch_size
    o_bestattack = np.copy(oimgs)

    for outer_step in range(self.binary_search_steps):
        # completely reset adam's internal state.
        self.sess.run(self.init)
        batch = imgs[:batch_size]
        batchlab = labs[:batch_size]

        # Best results of this outer step only (the o_* values span all
        # outer steps).
        bestl2 = [1e10] * batch_size
        bestscore = [-1] * batch_size
        logging.debug(" Binary search step %s of %s", outer_step,
                      self.binary_search_steps)

        # The last iteration (if we run many steps) repeat the search once.
        if self.repeat and outer_step == self.binary_search_steps - 1:
            # NOTE(review): this binds CONST to the upper_bound array
            # itself, not to a copy.
            CONST = upper_bound

        # set the variables so that we don't have to send them over again
        self.sess.run(
            self.setup, {
                self.assign_timg: batch,
                self.assign_tlab: batchlab,
                self.assign_const: CONST
            })

        prev = 1e6
        for iteration in range(self.max_iterations):
            # perform the attack
            _, l, l2s, scores, nimg = self.sess.run([
                self.train, self.loss, self.l2dist, self.output, self.newimg
            ])

            # Report progress roughly ten times per outer step.
            if iteration % ((self.max_iterations // 10) or 1) == 0:
                logging.debug((" Iteration {} of {}: loss={:.3g} " +
                               "l2={:.3g} f={:.3g}").format(
                                   iteration, self.max_iterations, l,
                                   np.mean(l2s), np.mean(scores)))

            # check if we should abort search if we're getting nowhere.
            if self.abort_early and \
               iteration % ((self.max_iterations // 10) or 1) == 0:
                if l > prev * .9999:
                    msg = " Failed to make progress; stop early"
                    logging.debug(msg)
                    break
                prev = l

            # adjust the best result found so far
            for e, (l2, sc, ii) in enumerate(zip(l2s, scores, nimg)):
                lab = np.argmax(batchlab[e])
                if l2 < bestl2[e] and compare(sc, lab):
                    bestl2[e] = l2
                    bestscore[e] = np.argmax(sc)
                if l2 < o_bestl2[e] and compare(sc, lab):
                    o_bestl2[e] = l2
                    o_bestscore[e] = np.argmax(sc)
                    o_bestattack[e] = ii

        # adjust the constant as needed
        for e in range(batch_size):
            if compare(bestscore[e], np.argmax(batchlab[e])) and \
               bestscore[e] != -1:
                # success, divide const by two
                upper_bound[e] = min(upper_bound[e], CONST[e])
                if upper_bound[e] < 1e9:
                    CONST[e] = (lower_bound[e] + upper_bound[e]) / 2
            else:
                # failure, either multiply by 10 if no solution found yet
                # or do binary search with the known upper bound
                lower_bound[e] = max(lower_bound[e], CONST[e])
                if upper_bound[e] < 1e9:
                    CONST[e] = (lower_bound[e] + upper_bound[e]) / 2
                else:
                    CONST[e] *= 10
        # An upper bound below 1e9 means at least one success was recorded
        # for that instance.
        logging.debug(" Successfully generated adversarial examples " +
                      "on {} of {} instances.".format(
                          sum(upper_bound < 1e9), batch_size))
        o_bestl2 = np.array(o_bestl2)
        mean = np.mean(np.sqrt(o_bestl2[o_bestl2 < 1e9]))
        logging.debug(" Mean successful distortion: {:.4g}".format(mean))

    # return the best solution found
    logging.info(" Successfully generated adversarial examples " +
                 "on {} of {} instances.".format(
                     sum(upper_bound < 1e9), batch_size))
    o_bestl2 = np.array(o_bestl2)
    mean = np.mean(np.sqrt(o_bestl2[o_bestl2 < 1e9]))
    logging.info(" Mean successful distortion: {:.4g}".format(mean))
    return o_bestattack
def compute_prior_prob(data_pipeline, smooth_para=1.0):
    """
    Compute prior probabilities for future use in ml-knn.

    One pass is made over the data served by `data_pipeline`, accumulating
    the per-label sums and the number of videos seen.

    :param data_pipeline: Pipeline description. Its `reader.num_classes`
        sizes the label accumulator and the pipeline itself is forwarded to
        `get_input_data_tensors`.
    :param smooth_para: Smoothing term; it is added once to every label sum
        and twice to the video count.
    :return: (total number of labels per label, total number of videos
        processed, prior probabilities)
    """
    num_classes = data_pipeline.reader.num_classes

    with tf.Graph().as_default() as graph:
        label_sums = tf.Variable(tf.zeros([num_classes]))
        video_count = tf.Variable(0, dtype=tf.float32)

        # Generate example queue. Traverse the queue to traverse the data
        # set. Only the labels are needed here.
        _, _, video_labels_batch, _ = get_input_data_tensors(
            data_pipeline, num_epochs=1, name_scope='prior_prob_input')

        float_labels = tf.cast(video_labels_batch, tf.float32)
        add_labels_op = label_sums.assign_add(
            tf.reduce_sum(float_labels, axis=0))
        batch_len = tf.cast(tf.shape(video_labels_batch)[0], tf.float32)
        add_videos_op = video_count.assign_add(batch_len)

        # A single op that triggers both accumulations.
        with tf.control_dependencies([add_labels_op, add_videos_op]):
            accumulate_op = tf.no_op()

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

    with tf.Session(graph=graph) as sess:
        sess.run(init_op)

        # Start input enqueue threads.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        try:
            while not coord.should_stop():
                sess.run(accumulate_op)
        except tf.errors.OutOfRangeError:
            logging.info('Done the whole data set.')
        finally:
            # When done, ask the threads to stop.
            coord.request_stop()

        # Wait for threads to finish.
        coord.join(threads)

        sum_labels_val, total_num_videos_val = sess.run(
            [label_sums, video_count])
        sess.close()

    numerator = smooth_para + sum_labels_val
    denominator = smooth_para * 2 + total_num_videos_val
    labels_prior_prob_val = numerator / denominator

    logging.debug('sum_labels_val: {}\n accum_num_videos_val: {}'.format(
        sum_labels_val, total_num_videos_val))
    logging.debug('compute_labels_prob: {}'.format(labels_prior_prob_val))
    return sum_labels_val, total_num_videos_val, labels_prior_prob_val
def transform(self, test_data_pipeline, out_file_location, top_k=20):
    """Run ensemble inference over the test data and write predictions.

    Batches are read from a dedicated test graph. Every batch is fed to
    each pre-trained model in `self.sess_list`, the per-model probabilities
    are averaged, and the result is appended to `out_file_location` in the
    "VideoId,LabelConfidencePairs" format.

    Args:
        test_data_pipeline: Pipeline description forwarded to
            `get_input_data_tensors`.
        out_file_location: Path of the output file (opened with gfile).
        top_k: Forwarded to `format_lines`.
    """
    test_graph = tf.Graph()
    with test_graph.as_default():
        video_id_batch, video_batch, labels_batch, num_frames_batch = (
            get_input_data_tensors(test_data_pipeline,
                                   shuffle=False,
                                   num_epochs=1,
                                   name_scope='test_input'))

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

    # Run test graph to get video batch and feed video batch to
    # pre_trained_graph to get predictions.
    test_sess = tf.Session(graph=test_graph)
    # try/finally (rather than `with test_sess`) releases the session on
    # every exit path without installing it as the default session.
    try:
        with gfile.Open(out_file_location, "w+") as out_file:
            test_sess.run(init_op)

            # Be cautious to not be blocked by queue.
            # Start input enqueue threads.
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=test_sess,
                                                   coord=coord)

            processing_count, num_examples_processed = 0, 0
            out_file.write("VideoId,LabelConfidencePairs\n")

            try:
                while not coord.should_stop():
                    start_time = time.time()
                    video_id_batch_val, video_batch_val = test_sess.run(
                        [video_id_batch, video_batch])
                    logging.debug(
                        'video_id_batch_val: {}\nvideo_batch_val: {}'.format(
                            video_id_batch_val, video_batch_val))

                    batch_predictions_prob_list = []
                    for sess, video_input_batch, pred_prob, phase_train_pl in zip(
                            self.sess_list, self.video_input_batch_list,
                            self.pred_prob_list, self.phase_train_pl_list):
                        # Select the feature columns by the model's input
                        # width: 128 -> last 128 columns, 1024 -> first
                        # 1024 columns, anything else -> all columns.
                        feature_shape = video_input_batch.get_shape()[-1]
                        if feature_shape == 128:
                            _video_batch = video_batch_val[:, -128:]
                        elif feature_shape == 1024:
                            _video_batch = video_batch_val[:, :1024]
                        else:
                            _video_batch = video_batch_val

                        # Merge with update() instead of dict(..., **d):
                        # `**` requires string keys on Python 3.
                        feed_dict = {video_input_batch: _video_batch}
                        feed_dict.update(phase_train_pl)
                        batch_predictions_prob = sess.run(
                            pred_prob, feed_dict=feed_dict)
                        batch_predictions_prob_list.append(
                            batch_predictions_prob)

                    # Average the probabilities over all models.
                    batch_predictions_mean_prob = np.mean(
                        np.stack(batch_predictions_prob_list, axis=0),
                        axis=0)

                    # Write batch predictions to files.
                    for line in format_lines(video_id_batch_val,
                                             batch_predictions_mean_prob,
                                             top_k):
                        out_file.write(line)
                    out_file.flush()

                    now = time.time()
                    processing_count += 1
                    num_examples_processed += video_id_batch_val.shape[0]
                    print('Batch processing step {}, elapsed {} s, processed {} examples in total'.format(
                        processing_count, now - start_time,
                        num_examples_processed))

            except tf.errors.OutOfRangeError:
                logging.info('Done with inference. The predictions were written to {}'.format(out_file_location))
            finally:
                # When done, ask the threads to stop.
                coord.request_stop()

            # Wait for threads to finish.
            coord.join(threads)
    finally:
        test_sess.close()
def fit(self,
        train_data_pipeline,
        start_new_model=False,
        tr_data_fn=None,
        tr_data_paras=None,
        validate_set=None,
        validate_fn=None,
        bootstrap=False,
        init_learning_rate=0.01,
        decay_steps=40000,
        decay_rate=0.95,
        epochs=None,
        l1_reg_rate=None,
        l2_reg_rate=0.01,
        pos_weights=None,
        initial_weights=None,
        initial_biases=None):
    """
    Logistic regression fit function.

    Args:
        train_data_pipeline: A namedtuple consisting of reader, data_pattern, batch_size and num_readers.
        start_new_model: If True, start a new model instead of restoring from existing checkpoints.
        tr_data_fn: a function that transforms input data.
        tr_data_paras: Other parameters should be passed to tr_data_fn. A dictionary.
        validate_set: If not None, check validation loss regularly. Else, ignored.
        validate_fn: The function to check the performance of learned model parameters on validate set.
            Its `__name__` is used to label summaries and log lines.
        bootstrap: If True, sampling training examples with replacement by differential weighting.
        init_learning_rate: Decayed gradient descent parameter.
        decay_steps: Decayed gradient descent parameter.
        decay_rate: Decayed gradient descent parameter.
        epochs: Maximal epochs to use.
        l1_reg_rate: None, not impose l1 regularization.
        l2_reg_rate: l2 regularization rate.
        pos_weights: For imbalanced binary classes. Here, num_pos << num_neg, the weights should be > 1.0.
            If None, treated as 1.0 for all binary classifiers.
        initial_weights: If not None, the weights will be initialized with it.
        initial_biases: If not None, the biases will be initialized with it.
    Returns: None.
    """
    reader = train_data_pipeline.reader
    batch_size = train_data_pipeline.batch_size
    num_classes = reader.num_classes
    feature_names = reader.feature_names
    feature_sizes = reader.feature_sizes
    logging.info(
        'Logistic regression uses {} features with dims {}.'.format(
            feature_names, feature_sizes))

    raw_feature_size = sum(feature_sizes)

    self.train_data_pipeline = train_data_pipeline
    self.raw_feature_size = raw_feature_size
    self.feature_size = raw_feature_size
    self.num_classes = num_classes
    self.batch_size = batch_size
    self.tr_data_fn = tr_data_fn
    self.tr_data_paras = tr_data_paras
    self.bootstrap = bootstrap
    self.init_learning_rate = init_learning_rate
    self.decay_steps = decay_steps
    self.decay_rate = decay_rate
    self.epochs = epochs
    self.l1_reg_rate = l1_reg_rate
    self.l2_reg_rate = l2_reg_rate
    self.pos_weights = pos_weights
    self.initial_weights = initial_weights
    self.initial_biases = initial_biases

    # Check extra data transform function arguments.
    # If transform changes the features size, change it.
    if self.tr_data_fn is not None:
        if self.tr_data_paras is None:
            self.tr_data_paras = dict()
        else:
            if ('reshape' in self.tr_data_paras) and (
                    self.tr_data_paras['reshape'] is True):
                self.feature_size = self.tr_data_paras['size']
                logging.warning(
                    'Data transform changes the features size to {}.'.
                    format(self.feature_size))

        logging.debug('Data transform arguments are {}.'.format(
            self.tr_data_paras))
    else:
        self.tr_data_paras = dict()

    # Without an existing log directory there is nothing to restore from.
    start_new_model = start_new_model or (not tf.gfile.Exists(self.logdir))

    # This is NECESSARY to avoid contaminating default graph.
    # Alternatively, we can define a member graph variable. When building a new graph or
    # restoring a graph, wrap the code into a similar contextmanager.
    self.graph = tf.Graph()
    with self.graph.as_default():
        if start_new_model:
            logging.info('Starting a new model...')
            # Start new model, delete existing checkpoints.
            if tf.gfile.Exists(self.logdir):
                try:
                    tf.gfile.DeleteRecursively(self.logdir)
                except tf.errors.OpError:
                    logging.error('Failed to delete dir {}.'.format(
                        self.logdir))
                else:
                    logging.info(
                        'Succeeded to delete train dir {}.'.format(
                            self.logdir))
            else:
                # Do nothing.
                pass

            # Build graph, namely building a graph and initialize member variables associated with graph.
            self.saver = self._build_graph()
        else:
            self.saver = self._restore_graph()

        # After either building a graph or restoring a graph, graph is CONSTRUCTED successfully.
        # Get collections to be used in training.
        self.global_step = tf.get_collection('global_step')[0]
        self.init_op = tf.get_collection('init_op')[0]
        self.train_op = tf.get_collection('train_op')[0]
        self.summary_op = tf.get_collection('summary_op')[0]
        self.raw_features_batch = tf.get_collection(
            'raw_features_batch')[0]
        self.labels_batch = tf.get_collection('labels_batch')[0]
        self.loss = tf.get_collection('loss')[0]
        self.pred_prob = tf.get_collection('predictions')[0]

    if self._check_graph_initialized():
        logging.info('Succeeded to initialize logistic regression Graph.')
    else:
        logging.error('Failed to initialize logistic regression Graph.')

    # Start or restore training.
    # To avoid summary causing memory usage peak, manually save summaries.
    sv = tf.train.Supervisor(graph=self.graph,
                             init_op=self.init_op,
                             logdir=self.logdir,
                             global_step=self.global_step,
                             summary_op=None,
                             save_model_secs=600,
                             saver=self.saver)

    with sv.managed_session() as sess:
        logging.info("Entering training loop...")
        for step in range(self.max_train_steps):
            if sv.should_stop():
                # Save the final model and break.
                self.saver.save(sess,
                                save_path='{}_{}'.format(
                                    sv.save_path, 'final'))
                break

            if step % 500 == 0:
                if validate_fn is not None:
                    _, summary, train_pred_prob_batch, train_labels_batch, global_step_val = sess.run(
                        [
                            self.train_op, self.summary_op, self.pred_prob,
                            self.labels_batch, self.global_step
                        ])

                    # Evaluate on train data.
                    train_per = validate_fn(
                        predictions=train_pred_prob_batch,
                        labels=train_labels_batch)
                    # __name__ exists on both Python 2 and 3 functions,
                    # unlike the Python-2-only func_name.
                    sv.summary_writer.add_summary(
                        MakeSummary(
                            'train/{}'.format(validate_fn.__name__),
                            train_per), global_step_val)
                    logging.info('Step {}, train {}: {}.'.format(
                        global_step_val, validate_fn.__name__, train_per))
                else:
                    _, summary, global_step_val = sess.run(
                        [self.train_op, self.summary_op, self.global_step])

                # Add train summary.
                sv.summary_computed(sess,
                                    summary,
                                    global_step=global_step_val)

                # Compute validate loss and performance (validate_fn).
                if validate_set is not None:
                    validate_data, validate_labels = validate_set

                    # Compute validation loss.
                    num_validate_videos = validate_data.shape[0]
                    # np.linspace includes its endpoint, so the chunk
                    # boundaries must end at num_validate_videos; otherwise
                    # the chunk sizes used as weights below sum to one more
                    # than the number of videos.
                    split_indices = np.linspace(
                        0,
                        num_validate_videos,
                        num=max(
                            num_validate_videos // (2 * batch_size) + 1,
                            2),
                        dtype=np.int32)

                    validate_loss_vals, predictions = [], []
                    for i in range(len(split_indices) - 1):
                        start_ind = split_indices[i]
                        end_ind = split_indices[i + 1]

                        if validate_fn is not None:
                            ith_validate_loss_val, ith_predictions = sess.run(
                                [self.loss, self.pred_prob],
                                feed_dict={
                                    self.raw_features_batch:
                                    validate_data[start_ind:end_ind],
                                    self.labels_batch:
                                    validate_labels[start_ind:end_ind]
                                })

                            # Weight each chunk loss by its chunk size.
                            validate_loss_vals.append(
                                ith_validate_loss_val *
                                (end_ind - start_ind))
                            predictions.append(ith_predictions)
                        else:
                            ith_validate_loss_val = sess.run(
                                self.loss,
                                feed_dict={
                                    self.raw_features_batch:
                                    validate_data[start_ind:end_ind],
                                    self.labels_batch:
                                    validate_labels[start_ind:end_ind]
                                })

                            validate_loss_vals.append(
                                ith_validate_loss_val *
                                (end_ind - start_ind))

                    validate_loss_val = sum(
                        validate_loss_vals) / num_validate_videos
                    # Add validate summary.
                    sv.summary_writer.add_summary(
                        MakeSummary('validate/xentropy',
                                    validate_loss_val), global_step_val)

                    if validate_fn is not None:
                        validate_per = validate_fn(
                            predictions=np.concatenate(predictions, axis=0),
                            labels=validate_labels)

                        sv.summary_writer.add_summary(
                            MakeSummary(
                                'validate/{}'.format(
                                    validate_fn.__name__), validate_per),
                            global_step_val)
                        logging.info('Step {}, validate {}: {}.'.format(
                            global_step_val, validate_fn.__name__,
                            validate_per))

            elif step % 200 == 0:
                _, summary, global_step_val = sess.run(
                    [self.train_op, self.summary_op, self.global_step])
                sv.summary_computed(sess,
                                    summary,
                                    global_step=global_step_val)
            else:
                sess.run(self.train_op)

        logging.info("Exited training loop.")

    # Session will close automatically when with clause exits.
    # sess.close()
    sv.stop()
def fit(self, max_iter=100, tol=0.01):
    """
    This function works as sk-learn estimator fit.

    Runs `self.kmeans_iter()` up to `max_iter` times. After each iteration
    the centers, the per-cluster mean distance and the overall mean
    distance are stored on the instance. When the number of centers changes
    between iterations the iteration graph is rebuilt.

    :param max_iter: Maximum number of k-means iterations to run.
    :param tol: Percentage not improved one iteration, stop iteration.
    :return: Update current centers and current objective function value (member variables).
    :raises ValueError: If the graph cannot be re-initialized after a rebuild.
    """
    for iter_count in range(max_iter):
        start_time = time.time()
        new_centers, new_mean_dist, new_per_clu_mean_dist = self.kmeans_iter()
        print('The {}-th iteration took {} s.'.format(
            iter_count + 1,
            time.time() - start_time))

        # There are empty centers (clusters) being removed.
        need_rebuild_graph = new_centers.shape[
            0] != self.current_centers.shape[0]

        # Update current centers and mean distance per cluster.
        # Normalize current centers if distance metric is cosine.
        if self.metric == 'cosine':
            # np.inf instead of np.PINF: the alias was removed in NumPy 2.0.
            # The lower clip bound guards against division by a zero norm.
            self.current_centers = new_centers / np.clip(
                np.linalg.norm(new_centers, axis=-1, keepdims=True), 1e-6,
                np.inf)
        else:
            self.current_centers = new_centers
        self.per_clu_mean_dist = new_per_clu_mean_dist

        # Converged, break!
        if not np.isinf(self.mean_dist) and np.abs(
                self.mean_dist - new_mean_dist) / self.mean_dist < tol:
            # Update current objective function value.
            self.mean_dist = new_mean_dist
            logging.info(
                'Done k-means clustering. Final centers have shape {}. Final mean dist is {}.'
                .format(self.current_centers.shape, self.mean_dist))
            break
        else:
            # Update current objective function value.
            self.mean_dist = new_mean_dist

        if need_rebuild_graph:
            # Re-build graph using updated current centers.
            self.build_iter_graph()
            initialize_success = self.check_graph_initialized()
            if initialize_success:
                logging.info(
                    'Succeeded re-initializing a Tensorflow graph to perform k-means.'
                )
            else:
                raise ValueError(
                    'Failed to re-initialize a Tensorflow Graph to perform k-means.'
                )

        logging.debug('new_centers: {}'.format(self.current_centers))
        logging.info('new_centers shape: {}'.format(
            self.current_centers.shape))
        logging.info('New mean point-center distance: {}'.format(
            self.mean_dist))
# NOTE(review): the value returned by this call is not bound to anything in
# the visible code. The train_*/test_* names used below are presumably
# assigned by a statement whose beginning lies outside this view - confirm.
tf.keras.datasets.cifar10.load_data()

# Image geometry and number of label classes used by the reshapes below.
img_width, img_height, img_channels = 32, 32, 3
label_dimensions = 10

# Convert to float32 and divide by 255.
train_images = np.asarray(train_images, dtype=np.float32) / 255
test_images = np.asarray(test_images, dtype=np.float32) / 255

# Centre both splits with the mean computed over the training images only.
train_images_mean = np.mean(train_images, axis=0)
train_images -= train_images_mean
test_images -= train_images_mean

train_images = train_images.reshape((-1, img_width, img_height, img_channels))
test_images = test_images.reshape((-1, img_width, img_height, img_channels))

# Labels are logged before the one-hot conversion below.
debug("shape train_images %s" % (train_images.shape, ))
debug("shape train_labels %s" % (train_labels.shape, ))
debug("shape test_images %s" % (test_images.shape, ))
debug("shape test_labels %s" % (test_labels.shape, ))

# One-hot encode the labels.
train_labels = tf.keras.utils.to_categorical(train_labels, label_dimensions)
test_labels = tf.keras.utils.to_categorical(test_labels, label_dimensions)

# NOTE(review): the float32 cast is immediately followed by an int cast, so
# the labels end up as integer arrays of shape (-1, label_dimensions).
train_labels = train_labels.astype(np.float32)
test_labels = test_labels.astype(np.float32)
train_labels = np.asarray(train_labels).astype('int').reshape(
    (-1, label_dimensions))
test_labels = np.asarray(test_labels).astype('int').reshape(
    (-1, label_dimensions))
import tempfile def fetch_hdfs_data(paths, data_msg_q, retry_times=3, data_dir=None): msg = data_msg_q.get_msg() if data_dir and not os.path.isdir(data_dir): try: data_dir = tempfile.mkdtemp(prefix='data_', suffix='_tmp', dir='./') except Exception, e: logging.error(e) data_dir = './' while True: logging.debug("receive msg: " + msg) if msg == 'reset': for data_path in paths: filename = os.path.split(data_path)[1] filepath = os.path.join(data_dir, filename) if os.path.exists(filepath): logging.debug(filename + " all readly exist in local") data_msg_q.put_data(filepath) logging.debug("reuse local data " + filepath + " done") continue count, ret_code = 0, -1 command = ["hadoop", "fs", "-get", data_path, data_dir] while count < retry_times and ret_code != 0: try: ret_code = subprocess.check_call(command) except subprocess.CalledProcessError, e:
def attack(self, imgs):
    """
    Construct adversarial examples for the given input with an eager-mode
    L_2 optimisation attack.

    The labels are the model's own predictions on `imgs` (one-hot of the
    maximum logit). For each of `self.binary_search_steps` outer steps a
    fresh modifier variable and Adam optimizer are created, the modifier is
    optimised for up to `self.max_iterations` iterations, and the
    per-instance constant is then adjusted by binary search.

    :param imgs: A tensor with the inputs. The batch size is read from its
                 static shape.
    :return: A tensor holding the best adversarial example found per
             instance.
    """
    imgs = tf.cast(imgs, tf.float32)
    preds = self.fn_logits(imgs)
    preds_max = tf.reduce_max(preds, 1, keepdims=True)
    original_predictions = tf.to_float(tf.equal(preds, preds_max))
    labs = tf.stop_gradient(original_predictions)
    # With many binary search steps the last one re-runs the search at the
    # upper bound.
    repeat = self.binary_search_steps >= 10
    shape = tf.shape(imgs)

    # # the variable we're going to optimize over
    # modifier = tfe.Variable(tf.zeros(shape, dtype=tf_dtype))

    def compute_newimage(imgs, modifier):
        # the resulting instance, tanh'd to keep bounded from clip_min
        # to clip_max
        newimg = (tf.tanh(modifier + imgs) + 1) / 2
        newimg = newimg * (self.clip_max - self.clip_min) + self.clip_min
        return newimg

    def get_l2dist(imgs, newimg):
        # distance to the input data
        other = (tf.tanh(imgs) + 1) / 2 * (self.clip_max -
                                           self.clip_min) + self.clip_min
        # Sum over every axis except the leading one.
        sum_axis = list(range(1, len(shape.numpy())))
        l2dist = tf.reduce_sum(tf.square(newimg - other), sum_axis)
        return l2dist

    def loss(timg, tlab, const, modifier):
        newimg = compute_newimage(timg, modifier)
        # prediction BEFORE-SOFTMAX of the model
        if self.sample <= 1:
            output = self.fn_logits(newimg)
        else:
            logging.info(
                "Monte Carlo (MC) on attacks, sample: {}".format(self.sample))
            # Collect one logits tensor per sample, then average them.
            output = []
            for i in range(self.sample):
                logits = self.fn_logits(newimg)
                # The op type can only be inspected in graph mode; eager
                # tensors do not expose `.op`.
                if i == 0 and not tf.executing_eagerly():
                    assert logits.op.type != 'Softmax'
                output.append(logits)
            output = tf.reduce_mean(output, 0)

        # distance to the input data
        l2dist = get_l2dist(timg, newimg)

        # compute the probability of the label class versus the maximum other
        real_target = tf.reduce_sum((tlab) * output, 1)
        other_target = tf.reduce_max((1 - tlab) * output - tlab * 10000, 1)

        zero = tf.constant(0., dtype=tf_dtype)
        if self.y_target:
            # if targeted, optimize for making the other class most likely
            loss1 = tf.maximum(zero,
                               other_target - real_target + self.confidence)
        else:
            # if untargeted, optimize for making this class least likely.
            loss1 = tf.maximum(zero,
                               real_target - other_target + self.confidence)

        # sum up the losses
        loss2 = tf.reduce_sum(l2dist)
        loss1 = tf.reduce_sum(const * loss1)
        loss = loss1 + loss2
        return loss, output

    def grad(imgs, labs, const, modifier):
        # Only the modifier is watched, so gradients are taken with respect
        # to it alone.
        with tf.GradientTape(watch_accessed_variables=False) as tape:
            tape.watch(modifier)
            loss_value, logits = loss(imgs, labs, const, modifier)
            with tape.stop_recording():
                gradients = tape.gradient(loss_value, [modifier])
        return gradients, loss_value, logits

    def compare_multi(x, y):
        # x is a vector of scores: shift entry y by the confidence margin
        # before taking the argmax.
        x_array = tf.unstack(x)
        if self.y_target:
            x_array[y] = x_array[y] - self.confidence
        else:
            x_array[y] = x_array[y] + self.confidence
        x = tf.argmax(tf.stack(x_array))
        if self.y_target:
            return x == y
        else:
            return x != y

    def compare_single(x, y):
        # x is already a class index.
        if self.y_target:
            return x == y
        else:
            return x != y

    # batch_size = tf.shape(imgs)[0]
    batch_size = imgs.get_shape().as_list()[0]

    # re-scale instances to be within range [0, 1]
    imgs = (imgs - self.clip_min) / (self.clip_max - self.clip_min)
    imgs = tf.clip_by_value(imgs, 0, 1)
    # now convert to [-1, 1]
    imgs = (imgs * 2) - 1
    # convert to tanh-space
    imgs = tf.atanh(imgs * .999999)

    # set the lower and upper bounds accordingly
    lower_bound = tfe.Variable(tf.zeros(batch_size), trainable=False)
    const = tfe.Variable(tf.ones(batch_size) * self.initial_const,
                         trainable=False)
    upper_bound = tfe.Variable(tf.ones(batch_size) * 1e10, trainable=False)

    # placeholders for the best l2, score, and instance attack found so far
    o_bestl2 = tfe.Variable(tf.constant(1e10, shape=(batch_size, )),
                            trainable=False)
    o_bestscore = tfe.Variable(tf.constant(-1, shape=(batch_size, )),
                               trainable=False)
    o_bestattack = tfe.Variable(tf.identity(imgs), trainable=False)

    for outer_step in range(self.binary_search_steps):
        # completely reset adam's internal state.
        modifier = tfe.Variable(tf.zeros(shape, dtype=tf_dtype))
        optimizer = tf.train.AdamOptimizer(self.learning_rate)

        bestl2 = tfe.Variable(tf.constant(1e10, shape=(batch_size, )),
                              trainable=False)
        bestscore = tfe.Variable(tf.constant(-1, shape=(batch_size, )),
                                 trainable=False)
        logging.info(" Binary search step %s of %s", outer_step,
                     self.binary_search_steps)

        # The last iteration (if we run many steps) repeat the search once.
        if repeat and outer_step == self.binary_search_steps - 1:
            const = upper_bound

        prev = 1e6
        for iteration in range(self.max_iterations):
            # NOTE(review): memory reporting and a forced collection run on
            # every iteration; this looks like leftover diagnostics.
            import resource, gc
            mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
            logging.info('resource {}'.format(mem))
            gc.collect()
            tf.set_random_seed(np.random.randint(0, 100))

            # perform the attack
            gradients, loss_value, scores = grad(imgs, labs, const, modifier)
            optimizer.apply_gradients(zip(gradients, [modifier]))
            nimg = compute_newimage(imgs, modifier)
            l2s = get_l2dist(imgs, nimg)

            if iteration % ((self.max_iterations // 10) or 1) == 0 and \
               logging.get_verbosity() == logging.DEBUG:
                l2_mean = tf.reduce_mean(l2s).numpy()
                logging.debug(
                    " Iteration {} of {}: loss={:.3g} l2={:.3g}".format(
                        iteration, self.max_iterations, loss_value, l2_mean))

            # check if we should abort search if we're getting nowhere.
            if self.abort_early and \
               iteration % ((self.max_iterations // 10) or 1) == 0:
                if loss_value > prev * .9999:
                    logging.debug(" Failed to make progress; stop early"
                                  )
                    break
                prev = loss_value

            # adjust the best result found so far
            for e, (l2, sc, ii) in enumerate(zip(l2s, scores, nimg)):
                lab = tf.argmax(labs[e])
                comp = compare_multi(sc, lab)
                if l2 < bestl2[e] and comp:
                    bestl2[e].assign(l2)
                    bestscore[e].assign(tf.argmax(sc, output_type=tf.int32))
                if l2 < o_bestl2[e] and comp:
                    o_bestl2[e].assign(l2)
                    o_bestscore[e].assign(tf.argmax(sc, output_type=tf.int32))
                    o_bestattack[e].assign(ii)

        # adjust the constant as needed
        for e in range(batch_size):
            if compare_single(bestscore[e], tf.argmax(labs[e])) and bestscore[e] != -1:
                # success, divide const by two
                upper_bound[e].assign(tf.minimum(upper_bound[e], const[e]))
                if upper_bound[e] < 1e9:
                    const[e].assign((lower_bound[e] + upper_bound[e]) / 2)
            else:
                # failure, either multiply by 10 if no solution found yet
                # or do binary search with the known upper bound
                lower_bound[e].assign(tf.maximum(lower_bound[e], const[e]))
                if upper_bound[e] < 1e9:
                    const[e].assign((lower_bound[e] + upper_bound[e]) / 2)
                else:
                    const[e].assign(const[e]*10)

        if logging.get_verbosity() == logging.DEBUG:
            success = tf.cast(tf.less(upper_bound, 1e9), tf.int32)
            logging.debug(" Successfully generated adversarial examples " +
                          "on {} of {} instances.".format(
                              tf.reduce_sum(success), batch_size))

            mask = tf.less(o_bestl2, 1e9)
            mean = tf.reduce_mean(tf.sqrt(tf.boolean_mask(o_bestl2, mask)))
            logging.debug(" Mean successful distortion: {:.4g}".format(mean.numpy()))

    # return the best solution found
    success = tf.cast(tf.less(upper_bound, 1e9), tf.int32)
    logging.info(" Successfully generated adversarial examples " +
                 "on {} of {} instances.".format(
                     tf.reduce_sum(success), batch_size))

    mask = tf.less(o_bestl2, 1e9)
    mean = tf.reduce_mean(tf.sqrt(tf.boolean_mask(o_bestl2, mask)))
    logging.info(" Mean successful distortion: {:.4g}".format(mean.numpy()))
    return o_bestattack.read_value()
def make_predictions(self, test_data_pipeline, output_file_loc, top_k=20):
    """
    Make predictions for the test data and write them to a file.

    A fresh graph is built that reads batches from `test_data_pipeline` for
    one epoch. Each batch is scored with `self.make_batch_predictions`, turned
    into text lines by `format_lines` and appended to `output_file_loc` after
    a "VideoId,LabelConfidencePairs" header line.

    :param test_data_pipeline: Pipeline passed to `get_input_data_tensors` to
        build the input tensors.
    :param output_file_loc: The file to which predictions should be written to.
        Supports gcloud file.
    :param top_k: See FLAGS.top_k.
    """
    with tf.Graph().as_default() as g:
        # Only the ids and the features are consumed here; the remaining two
        # tensors returned by the input builder are not needed.
        video_id_batch, video_batch, _, _ = get_input_data_tensors(
            test_data_pipeline, num_epochs=1, name_scope='test_input')

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

    # Both context managers release their resource on exit, so the session and
    # the output file must not be closed a second time by hand.
    with tf.Session(graph=g) as sess, \
            gfile.Open(output_file_loc, "w+") as out_file:
        sess.run(init_op)

        # Be cautious to not be blocked by queue.
        # Start input enqueue threads.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        processing_count, num_examples_processed = 0, 0
        out_file.write("VideoId,LabelConfidencePairs\n")

        try:
            while not coord.should_stop():
                start_time = time.time()

                # Fetch one batch as concrete values.
                video_id_batch_val, video_batch_val = sess.run(
                    [video_id_batch, video_batch])
                logging.debug(
                    'video_id_batch_val: {}\nvideo_batch_val: {}'.format(
                        video_id_batch_val, video_batch_val))

                # Pass values instead of tensors.
                batch_predictions_prob = self.make_batch_predictions(
                    video_id_batch_val, video_batch_val)

                # Write batch predictions to files.
                for line in format_lines(video_id_batch_val,
                                         batch_predictions_prob, top_k):
                    out_file.write(line)
                out_file.flush()

                now = time.time()
                processing_count += 1
                num_examples_processed += video_id_batch_val.shape[0]
                print(
                    'Batch processing step {}, elapsed {} seconds, processed {} examples in total'
                    .format(processing_count, now - start_time,
                            num_examples_processed))
        except tf.errors.OutOfRangeError:
            # Raised by the input queue once the single epoch is exhausted.
            logging.info(
                'Done with inference. The predictions were written to {}'.
                format(output_file_loc))
        finally:
            # When done, ask the threads to stop.
            coord.request_stop()
            # Wait for threads to finish.
            coord.join(threads)
def prepare_serialized_examples(self, serialized_examples):
    """
    Parse a batch of serialized examples into features and labels.

    Depending on `self.negative_sampling` and `self.random_selection` the
    returned feature matrix is built in one of four ways:

    * negative sampling: rgb features concatenated with the audio features
      returned by `self.sample_negatively`;
    * `random_selection == 0`, or `random_selection == 1` with more than one
      feature name: all features in `self.feature_names`, concatenated;
    * `random_selection == 1` with a single feature name: that feature, with
      the other modality replaced by zeros;
    * otherwise ("thirds"): rgb and audio, one of which may be zeroed
      depending on a random draw.

    :param serialized_examples: Batch of serialized examples handed to
        `tf.parse_example`.
    :return: A 6-tuple `(video_ids, concatenated_features, labels, ones,
        is_negative, labels_audio)`, where `ones` has one entry per
        serialized example.
    """
    logging.set_verbosity(tf.logging.DEBUG)

    # hardcoded values
    len_features_frames = 1024
    len_features_audio = 128
    name_frames = "mean_rgb"
    name_audio = "mean_audio"

    # set the mapping from the fields to data types in the proto
    num_features = len(self.feature_names)
    assert num_features > 0, "self.feature_names is empty!"
    assert len(self.feature_names) == len(self.feature_sizes), \
        "length of feature_names (={}) != length of feature_sizes (={})".format(
            len(self.feature_names), len(self.feature_sizes))

    feature_map = {
        "video_id": tf.FixedLenFeature([], tf.string),
        "labels": tf.VarLenFeature(tf.int64)
    }
    logging.debug("self.random_selection es " + str(self.random_selection))

    zeros_float = tf.zeros([tf.shape(serialized_examples)[0]])
    # Crude way of creating a vector of False values (x != x is False
    # everywhere). There must be a better way.
    is_negative = tf.not_equal(zeros_float, zeros_float)

    for feature_index in range(num_features):
        feature_map[self.feature_names[feature_index]] = tf.FixedLenFeature(
            [self.feature_sizes[feature_index]], tf.float32)

    features = tf.parse_example(serialized_examples, features=feature_map)
    # Default value of the last returned element; the negative-sampling and
    # "all features" branches below overwrite it.
    labels_audio = tf.sparse_to_indicator(features["labels"],
                                          self.num_classes)

    if self.negative_sampling:
        # The rgb / audio lookups live in this branch only, so configurations
        # with a single feature name do not fail on a missing key before
        # reaching their own branch.
        # Assumes both "mean_rgb" and "mean_audio" are in self.feature_names
        # when negative sampling is enabled -- TODO confirm.
        features_rgb = features[name_frames]
        labels = tf.sparse_to_indicator(features["labels"], self.num_classes)
        labels.set_shape([None, self.num_classes])

        # NOTE(review): this draw is not consumed anywhere below.
        number_neg_sample = tf.random_uniform(
            [],
            minval=0.,
            maxval=1.,
            dtype=tf.float32,
            name="random_number_neg_sample")

        constant = tf.constant(self.percentage_negative)
        batch_size = tf.shape(features_rgb)[0]
        logging.info("-----------------")
        logging.info(batch_size)

        # An example is flagged negative when its uniform [0, 1) draw falls
        # below self.percentage_negative.
        is_negative = tf.random_uniform([batch_size, 1], minval=0, maxval=1)
        is_negative = tf.less(is_negative, constant)

        features_audio_return, labels_audio = self.sample_negatively(
            features, labels, is_negative)
        concatenated_features = tf.concat(
            [features_rgb, features_audio_return], 1)
    else:
        # Normal case, leave as it was.
        # We can use python comparisons because they are checked only when
        # creating the graph. Boolean `or` / `and` are required here: the
        # bitwise `|` / `&` bind tighter than `==` and `>`, which reduced the
        # former expression to `self.random_selection == 0`.
        use_all_features = (
            self.random_selection == 0 or
            (self.random_selection == 1 and num_features > 1))

        if use_all_features:
            # feature_map already holds every entry of self.feature_names.
            features = tf.parse_example(serialized_examples,
                                        features=feature_map)
            labels = tf.sparse_to_indicator(features["labels"],
                                            self.num_classes)
            labels.set_shape([None, self.num_classes])
            labels_audio = labels
            concatenated_features = tf.concat([
                features[feature_name] for feature_name in self.feature_names
            ], 1)
        # Evaluation with only one of the two features
        elif self.random_selection == 1:
            # Parse both modalities so the absent one can be zero-filled with
            # the right shape.
            feature_map[name_frames] = tf.FixedLenFeature(
                [len_features_frames], tf.float32)
            feature_map[name_audio] = tf.FixedLenFeature(
                [len_features_audio], tf.float32)
            features = tf.parse_example(serialized_examples,
                                        features=feature_map)
            labels = tf.sparse_to_indicator(features["labels"],
                                            self.num_classes)
            labels.set_shape([None, self.num_classes])

            # In this point there is only 1 feature_name.
            if self.feature_names[0] == name_frames:
                concatenated_features = tf.concat([
                    features[name_frames],
                    tf.zeros_like(features[name_audio])
                ], 1)
            else:
                concatenated_features = tf.concat([
                    tf.zeros_like(features[name_frames]),
                    features[name_audio]
                ], 1)
        # Training with thirds
        else:
            feature_map[name_frames] = tf.FixedLenFeature(
                [len_features_frames], tf.float32)
            feature_map[name_audio] = tf.FixedLenFeature(
                [len_features_audio], tf.float32)
            features = tf.parse_example(serialized_examples,
                                        features=feature_map)
            labels = tf.sparse_to_indicator(features["labels"],
                                            self.num_classes)
            labels.set_shape([None, self.num_classes])

            # One draw in [0, 3) for the whole batch: below 1 the audio is
            # zeroed, above 2 the rgb is zeroed, otherwise both are kept.
            number = tf.random_uniform([],
                                       minval=0.,
                                       maxval=3.,
                                       dtype=tf.float32,
                                       name="random_number")
            rgb = features[name_frames]
            audio = features[name_audio]
            one = tf.constant(1.)
            two = tf.constant(2.)
            features_audio = tf.cond(
                tf.less(number, one),
                lambda: tf.clip_by_value(audio, 0, 0),
                lambda: audio)
            features_rgb = tf.cond(
                tf.greater(number, two),
                lambda: tf.clip_by_value(rgb, 0, 0),
                lambda: rgb)
            concatenated_features = tf.concat(
                [features_rgb, features_audio], 1, name="concat_features")

    return features["video_id"], concatenated_features, labels, tf.ones(
        [tf.shape(serialized_examples)[0]]), is_negative, labels_audio
def main(unused_argv):
    """
    Train a logistic regression model, optionally initialized from a linear
    classifier fitted with the normal equation.

    All settings are read from FLAGS, among them:

    init_learning_rate: Initial learning rate.
    decay_steps: How many training steps to decay learning rate once.
    decay_rate: How much to decay learning rate.
    l2_reg_rate: l2 regularization rate.
    train_epochs: The maximal epochs to pass all training data.

    A validation sample is drawn once and cached as pickles in
    FLAGS.output_dir so later runs reuse the same sample.

    :param unused_argv: Ignored.
    """
    logging.set_verbosity(logging.INFO)

    output_dir = FLAGS.output_dir
    start_new_model = FLAGS.start_new_model
    init_learning_rate = FLAGS.init_learning_rate
    decay_steps = FLAGS.decay_steps
    decay_rate = FLAGS.decay_rate
    l2_reg_rate = FLAGS.l2_reg_rate
    train_epochs = FLAGS.train_epochs

    model_type, feature_names, feature_sizes = (
        FLAGS.model_type, FLAGS.feature_names, FLAGS.feature_sizes)
    reader = get_reader(model_type, feature_names, feature_sizes)
    train_data_pattern = FLAGS.train_data_pattern
    validate_data_pattern = FLAGS.validate_data_pattern
    batch_size = FLAGS.batch_size
    num_readers = FLAGS.num_readers
    init_with_linear_clf = FLAGS.init_with_linear_clf
    is_bootstrap = FLAGS.is_bootstrap

    # A missing output directory implies a new model. This has to be decided
    # before the validation cache below writes into the directory; checking
    # afterwards can never observe a missing directory.
    if not tf.gfile.Exists(output_dir):
        start_new_model = True
        tf.gfile.MakeDirs(output_dir)

    # Increase num_readers.
    validate_data_pipeline = DataPipeline(reader=reader,
                                          data_pattern=validate_data_pattern,
                                          batch_size=batch_size,
                                          num_readers=num_readers)

    validate_data_path = path_join(output_dir, 'validate_data.pickle')
    validate_labels_path = path_join(output_dir, 'validate_labels.pickle')

    # Reuse the cached sample only when both halves are present.
    # NOTE: the pickles are assumed to have been written by an earlier run of
    # this script; never point output_dir at untrusted files.
    if (tf.gfile.Exists(validate_data_path) and
            tf.gfile.Exists(validate_labels_path)):
        with open(validate_data_path, 'rb') as f:
            validate_data = pickle.load(f)
        with open(validate_labels_path, 'rb') as f:
            validate_labels = pickle.load(f)
    else:
        # Sample validate set for line search in linear classifier or
        # logistic regression early stopping.
        _, validate_data, validate_labels, _ = random_sample(
            0.05,
            mask=(False, True, True, False),
            data_pipeline=validate_data_pipeline)
        with open(validate_data_path, 'wb') as f:
            pickle.dump(validate_data, f)
        with open(validate_labels_path, 'wb') as f:
            pickle.dump(validate_labels, f)

    # Set pos_weights for extremely imbalanced situation in one-vs-all
    # classifiers.
    try:
        # Load sum_labels in training set, numpy float format to compute
        # pos_weights.
        train_sum_labels = load_sum_labels()
        # num_neg / num_pos, assuming neg_weights === 1.0.
        pos_weights = np.sqrt(
            (float(NUM_TRAIN_EXAMPLES) - train_sum_labels) / train_sum_labels)
        logging.info(
            'Computing pos_weights based on sum_labels in train set successfully.'
        )
    except IOError:
        logging.error('Cannot load train sum_labels. Use default value.')
        pos_weights = None
    finally:
        # pos_weights are currently switched off regardless of the outcome
        # above: this always runs and discards the computed value.
        logging.error('Disable pos_weights.')
        # Set it as None to disable pos_weights.
        pos_weights = None

    train_data_pipeline = DataPipeline(reader=reader,
                                       data_pattern=train_data_pattern,
                                       batch_size=batch_size,
                                       num_readers=num_readers)

    # Defaults for resuming an existing model or skipping the linear
    # initialization.
    linear_clf_weights, linear_clf_biases = None, None
    tr_data_fn = None
    tr_data_paras = None

    if start_new_model:
        # Load train data mean and std.
        train_features_mean, train_features_var = load_features_mean_var(
            reader)

        tr_data_fn = standard_scale
        tr_data_paras = {
            'mean': train_features_mean,
            'variance': train_features_var,
            'reshape': False,
            'size': None
        }

        if init_with_linear_clf:
            # ...Start linear classifier...
            # Compute weights and biases of linear classifier using normal
            # equation. Linear search helps little.
            linear_clf = LinearClassifier(
                logdir=path_join(output_dir, 'linear_classifier'))
            linear_clf.fit(data_pipeline=train_data_pipeline,
                           tr_data_fn=tr_data_fn,
                           tr_data_paras=tr_data_paras,
                           l2_regs=[
                               0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1,
                               1.0, 10.0, 100.0, 1000.0
                           ],
                           validate_set=(validate_data, validate_labels),
                           line_search=True)
            linear_clf_weights = linear_clf.weights
            linear_clf_biases = linear_clf.biases

            logging.info(
                'linear classifier weights and biases with shape {}, {}'.
                format(linear_clf_weights.shape, linear_clf_biases.shape))
            logging.debug(
                'linear classifier weights: {} and biases: {}.'.format(
                    linear_clf_weights, linear_clf_biases))
            # ...Exit linear classifier...

    # Run logistic regression.
    log_reg = LogisticRegression(logdir=path_join(output_dir, 'log_reg'))
    log_reg.fit(train_data_pipeline,
                start_new_model=start_new_model,
                tr_data_fn=tr_data_fn,
                tr_data_paras=tr_data_paras,
                validate_set=(validate_data, validate_labels),
                validate_fn=gap_fn,
                bootstrap=is_bootstrap,
                init_learning_rate=init_learning_rate,
                decay_steps=decay_steps,
                decay_rate=decay_rate,
                epochs=train_epochs,
                l2_reg_rate=l2_reg_rate,
                pos_weights=pos_weights,
                initial_weights=linear_clf_weights,
                initial_biases=linear_clf_biases)
def initialize(num_centers_ratio,
               data_pipeline,
               method=None,
               metric='cosine',
               max_iter=100,
               tol=0.005,
               scaling_method=1,
               alpha=1.0,
               p=3):
    """
    This functions initializes representative prototypes (RBF centers) c and
    scaling factors sigma.

    This function will generate one group of centers for all labels as a whole.
    Be cautious with initialize_per_label.

    Args:
        num_centers_ratio: The number of centers to be decided / total number
            of examples that belong to label l, for l = 0, ..., num_classes - 1.
        data_pipeline: A namedtuple consisting of the following elements.
            reader, video-level features reader or frame-level features reader.
            data_pattern, File Glob of data set.
            batch_size, How many examples to handle per time.
            num_readers, How many IO threads to prefetch examples.
        method: The method to decide the centers. None (the default) keeps the
            randomly sampled examples as centers; 'kmeans' refines them with
            k-means. 'online' (online k-means) is recognized but not
            implemented.
        metric: Distance metric, euclidean distance or cosine distance.
        max_iter: The maximal number of iterations clustering to be done.
        tol: The minimal reduction of objective function of clustering to be
            reached to stop iteration.
        scaling_method: There are four choices.
            1, all of them use the same sigma, the p smallest pairs of distances.
            2, average of p nearest centers.
            3, distance to the nearest center that has a different label
               (Not supported!).
            4, mean distance between this center and all of its points.
        alpha: The alpha parameter that should be set heuristically. It works
            like a learning rate. (mu in Zhang)
        p: When scaling_method is 1 or 2, p is needed.
    Returns:
        centers (prototypes) and scaling factors (sigmas).
    Raises:
        ValueError if num_centers_ratio is not between 0.0 (open) and 1.0
            (closed).
        ValueError if metric is not euclidean or cosine.
        ValueError if method is not one of None, kmeans or online.
        NotImplementedError if method is online.
        NotImplementedError if scaling_method is 3 or 5.
        ValueError if scaling method is not 1 - 5.
        ValueError if scaling_method is 1 or 2 and no distance can be averaged
            (fewer than two centers, or p < 1).
    """
    logging.info('Generate a group of centers for all labels. See Schwenker.')

    # Argument checking.
    if (num_centers_ratio <= 0.0) or (num_centers_ratio > 1.0):
        raise ValueError(
            'num_centers_ratio must be larger than 0.0 and no greater than 1.0.'
        )
    logging.info('num_centers_ratio is {}.'.format(num_centers_ratio))

    if metric not in ('euclidean', 'cosine'):
        raise ValueError(
            'Only euclidean and cosine distance are supported, {} passed.'.
            format(metric))
    logging.info(
        'Using {} distance. The larger, the less similar.'.format(metric))

    # Sample features only.
    _, centers, _, _ = random_sample(num_centers_ratio,
                                     mask=(False, True, False, False),
                                     data_pipeline=data_pipeline,
                                     name_scope='sample_centers')
    logging.info('Sampled {} centers totally.'.format(centers.shape[0]))
    logging.debug('Randomly selected centers: {}'.format(centers))

    # Used in scaling method 4. Average distance of each point with its
    # cluster center.
    per_clu_mean_dist = None

    # Perform k-means or online k-means.
    if method is None:
        logging.info(
            'Using randomly selected centers as model prototypes (centers).')
    elif 'online' == method:
        # The former message listed "online" among the supported methods,
        # which contradicted the exception being raised.
        raise NotImplementedError(
            'Online k-means is not implemented. Use None (randomly select '
            'examples) or kmeans.')
    elif 'kmeans' == method:
        logging.info(
            'Using k-means clustering result as model prototypes (centers).')

        # The per-cluster mean distance is only needed by scaling method 4.
        return_mean_clu_dist = (scaling_method == 4)
        kmeans = KMeans(centers,
                        data_pipeline=data_pipeline,
                        metric=metric,
                        return_mean_clu_dist=return_mean_clu_dist)
        kmeans.fit(max_iter=max_iter, tol=tol)

        # Get current centers and update centers.
        centers = kmeans.current_centers
        per_clu_mean_dist = kmeans.per_clu_mean_dist
    else:
        raise ValueError(
            'Only None (randomly select examples), online, kmeans are supported.'
        )

    # Compute scaling factors based on these centers.
    num_centers = centers.shape[0]
    sigmas = None
    if scaling_method == 1:
        # Equation 27.
        pairwise_distances = sci_distance.pdist(centers, metric=metric)
        p = min(p, len(pairwise_distances))
        if p < 1:
            # Without this check np.partition fails with an opaque message
            # (no pairwise distance) or the mean of an empty slice is NaN.
            raise ValueError(
                'scaling_method 1 needs at least two centers and p >= 1.')
        logging.info('Using {} minimal pairwise distances.'.format(p))
        # np.partition second argument begins with 0.
        shared_sigma = alpha * np.mean(
            np.partition(pairwise_distances, p - 1)[:p])
        sigmas = np.array([shared_sigma] * num_centers, dtype=np.float32)
    elif scaling_method == 2:
        # Equation 28.
        p = min(p, num_centers - 1)
        if p < 1:
            # Dividing by p below would otherwise yield NaN / inf sigmas.
            raise ValueError(
                'scaling_method 2 needs at least two centers and p >= 1.')
        logging.info('Using {} minimal distances per center.'.format(p))

        if 'euclidean' == metric:
            dis_fn = sci_distance.euclidean
        else:
            dis_fn = sci_distance.cosine

        sigmas = []
        for c in centers:
            distances = [dis_fn(c, _c) for _c in centers]
            # The distance between c and itself is zero and is in the left
            # partition, hence p + 1 values are summed but divided by p.
            sigmas.append(alpha * np.sum(np.partition(distances, p)[:p + 1]) /
                          float(p))

        sigmas = np.array(sigmas, dtype=np.float32)
    elif scaling_method == 3:
        # Equation 29.
        raise NotImplementedError(
            'Not supported when all labels use the same centers.')
    elif scaling_method == 4:
        # Equation 30.
        if per_clu_mean_dist is None:
            # Centers were not clustered above, so run a single k-means
            # iteration just to obtain the per-cluster mean distances.
            kmeans = KMeans(centers,
                            data_pipeline=data_pipeline,
                            metric=metric,
                            return_mean_clu_dist=True)
            kmeans.fit(max_iter=1, tol=tol)

            centers = kmeans.current_centers
            per_clu_mean_dist = kmeans.per_clu_mean_dist

            logging.info(
                'Compute mean distance per cluster using kmeans or online kmeans.'
            )
        else:
            logging.info(
                'Reuse mean distance per cluster computed in kmeans or online kmeans.'
            )

        sigmas = alpha * per_clu_mean_dist
    elif scaling_method == 5:
        # Equation 31.
        raise NotImplementedError(
            'Only three methods are supported. Please read the documentation.')
    else:
        raise ValueError(
            'Only three methods are supported. Please read the documentation.')

    logging.debug('Scaling factor sigmas: {}'.format(sigmas))

    return centers, sigmas