def revise_sentiment(annot, mapping=None):
    """Revises sentiment annotations.

    Args:
        annot: a sentiment annotation string.
        mapping: a dict mapping stemmed sentiment words to revised labels. If None,
            the module-level _default_mapping is used.

    Returns:
        A list of revised sentiment labels, one per recognized word.
    """
    if mapping is None:
        mapping = _default_mapping
        logging.warn('Use default mapping.')
    words = [_stemmer.stem(x.strip(',.')) for x in annot.lower().split()]
    words = [mapping[x] for x in words if x in mapping]
    return words
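# Illustrative usage sketch (assumptions, not part of the original module): the real
# `_stemmer` is defined elsewhere; an NLTK Porter stemmer stands in for it here, and
# the mapping is a toy example passed explicitly.
if __name__ == '__main__':
    from nltk.stem.porter import PorterStemmer
    _stemmer = PorterStemmer()
    toy_mapping = {'happi': 'positive', 'sad': 'negative'}
    # Tokens are lower-cased, stripped of ',.' and stemmed before lookup:
    print(revise_sentiment('Happy, yet sad.', mapping=toy_mapping))
    # -> ['positive', 'negative']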
def evaluate(results, groundtruths):
    """Evaluates the statement ranking task.

    Args:
        results: a dict mapping from image_id to a dict whose 'distances' entry is a
            list of 15 floats, one distance per candidate statement (smaller means a
            better match).
        groundtruths: a dict mapping from image_id to a dict with 'pos_examples'
            (the positive statements) and 'all_examples' (all candidate statements).

    Returns:
        metrics: a dict mapping from metric name to score, involving:
            accuracy: the ratio of correct top-1 predictions.
            recall_at_3: the average number of positive statements ranked in the top 3.
            rank_min: the average of the minimum rank of the positive statements.
            rank_avg: the average of the mean rank of the positive statements.
            rank_med: the average of the median rank of the positive statements.
    """
    if len(results) != len(groundtruths):
        logging.warn('size of gts: %i, size of res: %i', len(groundtruths),
                     len(results))

    all_accuracy, all_recall_at_3 = [], []
    all_rank_min, all_rank_avg, all_rank_med = [], [], []

    for image_id, result in results.iteritems():
        assert image_id in groundtruths

        distances = result['distances']
        pos_examples = groundtruths[image_id]['pos_examples']
        all_examples = groundtruths[image_id]['all_examples']

        # Convert the sorted order into per-statement ranks: ranking[i] is the rank
        # of the i-th candidate statement.
        distances = np.array(distances)
        ranking_r = distances.argsort()
        ranking = np.array(ranking_r)
        for i, rank in enumerate(ranking_r):
            ranking[rank] = i

        positions = ranking[[
            all_examples.index(example) for example in pos_examples
        ]]
        positions = np.sort(positions)

        all_accuracy.append(positions[0] == 0)
        all_recall_at_3.append(sum([1 for pos in positions if pos < 3]))
        all_rank_min.append(1 + positions[0])
        all_rank_avg.append(1 + np.mean(positions))
        all_rank_med.append(1 + np.median(positions))

    mean_func = lambda x: round(np.array(x).astype(np.float).mean(), 4)
    eval_results = {
        'accuracy': mean_func(all_accuracy),
        'recall_at_3': mean_func(all_recall_at_3),
        'rank_min': mean_func(all_rank_min),
        'rank_avg': mean_func(all_rank_avg),
        'rank_med': mean_func(all_rank_med),
    }
    return eval_results
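# Minimal usage sketch for evaluate() with one image and hypothetical ids/statements.
# Real data has 15 candidate statements per image; three are enough to illustrate the
# metrics. Smaller distance means a better match, so the positives' ranks drive the
# scores.
if __name__ == '__main__':
    toy_groundtruths = {
        'img_0': {
            'pos_examples': ['stmt_a'],
            'all_examples': ['stmt_a', 'stmt_b', 'stmt_c'],
        }
    }
    toy_results = {'img_0': {'distances': [0.1, 0.7, 0.9]}}
    # 'stmt_a' has the smallest distance, so it is ranked first:
    # accuracy=1.0, recall_at_3=1.0, rank_min=rank_avg=rank_med=1.0.
    print(evaluate(toy_results, toy_groundtruths))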
def __init__(self, vocab_file, max_size):
    """Creates a vocab of up to max_size words, reading from the vocab_file.

    If max_size is 0, reads the entire vocab file.

    Args:
        vocab_file: path to the vocab file, which is assumed to contain
            "<word> <frequency>" on each line, sorted with the most frequent word
            first. This code doesn't actually use the frequencies, though.
        max_size: integer. The maximum size of the resulting Vocabulary.
    """
    self._word_to_id = {}
    self._id_to_word = {}
    self._count = 0  # keeps track of the total number of words in the Vocab

    # [UNK], [PAD], [START] and [STOP] get the ids 0, 1, 2, 3.
    for w in [UNKNOWN_TOKEN, PAD_TOKEN, START_DECODING, STOP_DECODING]:
        self._word_to_id[w] = self._count
        self._id_to_word[self._count] = w
        self._count += 1

    # Read the vocab file and add words up to max_size.
    with file_io.FileIO(vocab_file, 'r') as vocab_f:
        for line in vocab_f:
            pieces = line.split()
            if len(pieces) != 2:
                log.warn(
                    'Warning: incorrectly formatted line in vocabulary file: %s\n'
                    % line)
                continue
            w = pieces[0]
            if w in [
                    SENTENCE_START, SENTENCE_END, UNKNOWN_TOKEN, PAD_TOKEN,
                    START_DECODING, STOP_DECODING
            ]:
                raise Exception(
                    '<s>, </s>, [UNK], [PAD], [START] and [STOP] shouldn\'t be in '
                    'the vocab file, but %s is' % w)
            if w in self._word_to_id:
                raise Exception('Duplicated word in vocabulary file: %s' % w)
            self._word_to_id[w] = self._count
            self._id_to_word[self._count] = w
            self._count += 1
            if max_size != 0 and self._count >= max_size:
                log.info(
                    'max_size of vocab was specified as %i; we now have %i words. '
                    'Stopping reading.' % (max_size, self._count))
                break

    log.info(
        'Finished constructing vocabulary of %i total words. Last word added: %s'
        % (self._count, self._id_to_word[self._count - 1]))
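# Hypothetical construction sketch (assumes the enclosing class is named Vocab and
# that the special-token constants are defined as in the module). Each vocab-file
# line is "<word> <frequency>", most frequent first; the frequencies are ignored.
if __name__ == '__main__':
    with open('/tmp/toy_vocab.txt', 'w') as f:
        f.write('the 1000\nof 800\nand 600\n')
    vocab = Vocab('/tmp/toy_vocab.txt', max_size=0)  # max_size=0 reads the whole file
    # ids 0-3 are reserved for [UNK], [PAD], [START], [STOP]; 'the' therefore gets id 4.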
def __run_eval(model, data_dir, coverage, conf, batch_size):
    checkpoint_dir = os.path.join(
        conf.model_dir, 'eval')  # make a subdir of the root dir for eval data
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    # this is where checkpoints of best models are saved
    bestmodel_save_path = os.path.join(checkpoint_dir, 'bestmodel')
    best_loss = None
    step = 0
    seen_steps = set()
    do_eval = True

    with model.build_graph().as_default():
        ds = etl.dataset(data_dir, batch_size)
        iterator = ds.make_initializable_iterator()
        # Create the get_next op once; creating it inside the eval loop would keep
        # adding new ops to the graph on every checkpoint evaluation.
        next_batch = iterator.get_next()
        saver = tf.train.Saver(
            max_to_keep=3)  # we will keep 3 best checkpoints at a time
        summary_writer = tf.summary.FileWriter(checkpoint_dir)
        with tf.Session(config=conf.session_config) as sess:
            # run eval at least once and until all checkpoints are evaluated
            while do_eval:
                # load a new checkpoint from training
                util.load_ckpt(saver, sess, log_root=conf.model_dir)
                running_avg_loss = 0
                # init new epoch
                sess.run(iterator.initializer)
                batch_count = 0
                t0 = time.time()
                try:
                    while True:
                        batch_t0 = time.time()
                        results = model.run_eval_step(sess, next_batch)
                        batch_t1 = time.time()
                        batch_count += 1
                        step = results['global_step']
                        if step in seen_steps:
                            # this checkpoint has already been evaluated, do not save it.
                            do_eval = False
                            running_avg_loss = 9999
                            break
                        loss = results['loss']
                        if not np.isfinite(loss):
                            log.warn('loss is nan. Skip batch {}'.format(batch_count))
                            continue
                        summaries = results['summaries']
                        summary_writer.add_summary(summaries, step)
                        # calculate running avg loss
                        running_avg_loss = calc_running_avg_loss(
                            np.asscalar(loss), running_avg_loss, summary_writer, step)
                        msg = 'eval step={}, batch={}, ra_loss={:.4f}, loss={:.4f}, secs={}'.format(
                            step, batch_count, running_avg_loss, loss,
                            int(batch_t1 - batch_t0))
                        if coverage:
                            coverage_loss = results['coverage_loss']
                            msg += ', coverage_loss={:.4f}'.format(coverage_loss)
                        log.info(msg)
                        # flush the summary writer every so often
                        if batch_count % 10 == 0:
                            summary_writer.flush()
                except tf.errors.OutOfRangeError:
                    seen_steps.add(step)
                    t1 = time.time()
                    mins = int((t1 - t0) / 60)
                    log.info('eval end of epoch, mins={}'.format(mins))
                finally:
                    summary_writer.flush()

                # If running_avg_loss is the best so far, save this checkpoint (early
                # stopping). These checkpoints will appear as
                # bestmodel-<iteration_number> in the eval dir.
                if best_loss is None or running_avg_loss < best_loss:
                    best_loss = running_avg_loss
                    log.info(
                        'eval Found new best model with %.4f running_avg_loss. Saving %s',
                        best_loss, bestmodel_save_path)
                    saver.save(sess,
                               bestmodel_save_path,
                               global_step=step,
                               latest_filename='checkpoint_best')

    log.info('eval done')
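# `calc_running_avg_loss` is defined elsewhere in this codebase; the sketch below is
# only a guess at a typical implementation (exponential moving average with clipping,
# written out as a summary). The decay and clip values are assumptions, not the
# repository's actual constants.
def _calc_running_avg_loss_sketch(loss, running_avg_loss, summary_writer, step,
                                  decay=0.99):
    if running_avg_loss == 0:
        # First batch: start the average at the current loss.
        running_avg_loss = loss
    else:
        running_avg_loss = running_avg_loss * decay + (1 - decay) * loss
    running_avg_loss = min(running_avg_loss, 12)  # clip extreme values
    summary = tf.Summary(value=[
        tf.Summary.Value(tag='running_avg_loss', simple_value=running_avg_loss)
    ])
    summary_writer.add_summary(summary, step)
    return running_avg_loss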
def run(args):
    # collect preprocessed data
    input_files = [args.questions, args.non_questions]
    for p in input_files:
        assert os.path.exists(p)

    if not os.path.exists(args.output_file_name) or args.overwrite:
        questions, non_questions = gather_stack_exchange_from_file(*input_files)
        glove_vocab, glove_vectors, glove_word2index = process_glove(
            glove_file=args.glove_path)
        (word2index, index2word, vectors,
         training_vocab) = set_data_to_new_vocab(questions, non_questions,
                                                 glove_vectors, glove_word2index)
        if args.static_data:
            features, labels = create_static_features(
                questions,
                non_questions,
                word2index,
                size=args.num_samples,
                max_seq_length=args.max_seq_length,
                max_num_questions=args.max_num_questions,
                max_num_elements=args.max_elements,
                randomize_num_questions=args.rand_num_questions)
            data = StaticDataContainer(questions, non_questions, training_vocab,
                                       vectors, word2index, index2word, features,
                                       labels, args.num_samples,
                                       args.mini_num_samples, args.test_fraction,
                                       args.max_seq_length, args.max_elements,
                                       args.description)
            with open(args.output_file_name + '.static.pkl', 'wb+') as container:
                pickle.dump(data, container, protocol=-1)
        if args.gen_data:
            data = GeneratorDataContainer(
                questions,
                non_questions,
                training_vocab,
                vectors,
                word2index,
                index2word,
                args.max_seq_length,
                args.max_elements,
                args.max_num_questions,
                args.description,
                test_size=int(args.test_fraction * 10000),
                randomize_num_questions=args.rand_num_questions)
            with open(args.output_file_name + '.generator.pkl', 'wb+') as container:
                pickle.dump(data, container, protocol=-1)
    else:
        logging.warn(
            "pickle files already exist, set overwrite to true if you want to "
            "refresh data; it's randomly created each time")
        sys.exit()
    return data
parser.add_argument('-z', '--from-scratch', action='store_true')  # noqa
parser.add_argument('--static-data', action='store_true')
parser.add_argument('--gen-data', action='store_true')
parser.add_argument('-c', '--description', type=str, default=None)
args = parser.parse_args()

assert args.num_samples > args.mini_num_samples, \
    'You must select more samples than mini_num_samples'
assert args.static_data or args.gen_data, \
    'Need to choose static or generator type dataset'
if args.static_data:
    assert int(args.num_samples -
               (args.num_samples * args.test_fraction)) > args.mini_num_samples, \
        'Set test fraction lower or set num samples higher'

if os.path.exists(args.output_file_name) and not args.overwrite:
    logging.warn(
        "mini pickle files already exist, set overwrite to true if you want to "
        "refresh data; it's randomly created each time")
    sys.exit()

if args.from_scratch:
    directory_str = os.path.join('.', 'stack_exchange', STACKS[2])
    filename = 'Comments.xml'
    _, _ = gather_stack_exchange_data_from_scratch(directory_str, filename,
                                                   XMLS_DICT, write=True)

data_obj = run(args)
if args.supress_meta:
    data_obj.display_metadata()
def main(unused_argv):
    logging.set_verbosity(logging.INFO)

    start_new_model = FLAGS.start_new_model
    output_dir = FLAGS.output_dir

    init_learning_rate = FLAGS.init_learning_rate
    decay_steps = FLAGS.decay_steps
    decay_rate = FLAGS.decay_rate
    l1_reg_rate = FLAGS.l1_reg_rate
    l2_reg_rate = FLAGS.l2_reg_rate
    is_bootstrap = FLAGS.is_bootstrap
    train_epochs = FLAGS.train_epochs

    model_type, feature_names, feature_sizes = (FLAGS.model_type,
                                                FLAGS.feature_names,
                                                FLAGS.feature_sizes)
    reader = get_reader(model_type, feature_names, feature_sizes)
    train_data_pattern = FLAGS.train_data_pattern
    validate_data_pattern = FLAGS.validate_data_pattern
    batch_size = FLAGS.batch_size
    num_readers = FLAGS.num_readers

    if tf.gfile.Exists(path_join(output_dir, 'validate_data.pickle')):
        with open(path_join(output_dir, 'validate_data.pickle'), 'rb') as f:
            validate_data = pickle.load(f)
        with open(path_join(output_dir, 'validate_labels.pickle'), 'rb') as f:
            validate_labels = pickle.load(f)
    else:
        # Increase num_readers.
        validate_data_pipeline = DataPipeline(reader=reader,
                                              data_pattern=validate_data_pattern,
                                              batch_size=batch_size,
                                              num_readers=num_readers)
        # Sample validate set.
        _, validate_data, validate_labels, _ = random_sample(
            0.05,
            mask=(False, True, True, False),
            data_pipeline=validate_data_pipeline,
            name_scope='sample_validate')
        with open(path_join(output_dir, 'validate_data.pickle'), 'wb') as f:
            pickle.dump(validate_data, f)
        with open(path_join(output_dir, 'validate_labels.pickle'), 'wb') as f:
            pickle.dump(validate_labels, f)

    train_data_pipeline = DataPipeline(reader=reader,
                                       data_pattern=train_data_pattern,
                                       batch_size=batch_size,
                                       num_readers=num_readers)

    model_save_path = path_join(output_dir, 'mlp_fuse')
    if start_new_model and tf.gfile.Exists(model_save_path):
        logging.info('Starting a new model...')
        # Start new model, delete existing checkpoints.
        try:
            tf.gfile.DeleteRecursively(model_save_path)
        except tf.errors.OpError:
            logging.error('Failed to delete dir {}.'.format(model_save_path))
        else:
            logging.info(
                'Succeeded to delete train dir {}.'.format(model_save_path))

    # Set pos_weights for the extremely imbalanced situation in one-vs-all classifiers.
    try:
        # Load sum_labels in the training set (numpy float format) to compute pos_weights.
        train_sum_labels = load_sum_labels()
        # num_neg / num_pos, assuming neg_weights === 1.0.
        pos_weights = np.sqrt(float(NUM_TRAIN_EXAMPLES) / train_sum_labels - 1.0)
        logging.info(
            'Computing pos_weights based on sum_labels in train set successfully.')
    except IOError:
        logging.error('Cannot load train sum_labels. Use default value.')
        pos_weights = None
    finally:
        # NOTE: this finally clause always runs, so pos_weights is deliberately forced
        # back to None and positive weighting is effectively disabled.
        logging.warn('Not to use positive weights.')
        pos_weights = None

    train(train_data_pipeline,
          epochs=train_epochs,
          pos_weights=pos_weights,
          l1_reg_rate=l1_reg_rate,
          l2_reg_rate=l2_reg_rate,
          init_learning_rate=init_learning_rate,
          bootstrap=is_bootstrap,
          validate_set=(validate_data, validate_labels),
          validate_fn=gap_fn,
          logdir=model_save_path)
def fit(self,
        train_data_pipeline,
        start_new_model=False,
        tr_data_fn=None,
        tr_data_paras=None,
        validate_set=None,
        validate_fn=None,
        bootstrap=False,
        init_learning_rate=0.01,
        decay_steps=40000,
        decay_rate=0.95,
        epochs=None,
        l1_reg_rate=None,
        l2_reg_rate=0.01,
        pos_weights=None,
        initial_weights=None,
        initial_biases=None):
    """Logistic regression fit function.

    Args:
        train_data_pipeline: A namedtuple consisting of reader, data_pattern,
            batch_size and num_readers.
        start_new_model: If True, start a new model instead of restoring from
            existing checkpoints.
        tr_data_fn: A function that transforms input data.
        tr_data_paras: A dictionary of other parameters to be passed to tr_data_fn.
        validate_set: If not None, check validation loss regularly. Else, ignored.
        validate_fn: The function used to check the performance of the learned model
            parameters on the validate set.
        bootstrap: If True, sample training examples with replacement by
            differential weighting.
        init_learning_rate: Decayed gradient descent parameter.
        decay_steps: Decayed gradient descent parameter.
        decay_rate: Decayed gradient descent parameter.
        epochs: Maximal number of epochs to use.
        l1_reg_rate: L1 regularization rate. If None, no l1 regularization is imposed.
        l2_reg_rate: L2 regularization rate.
        pos_weights: For imbalanced binary classes. Here, num_pos << num_neg, so the
            weights should be > 1.0. If None, treated as 1.0 for all binary
            classifiers.
        initial_weights: If not None, the weights will be initialized with it.
        initial_biases: If not None, the biases will be initialized with it.

    Returns:
        None.
    """
    reader = train_data_pipeline.reader
    batch_size = train_data_pipeline.batch_size
    num_classes = reader.num_classes
    feature_names = reader.feature_names
    feature_sizes = reader.feature_sizes
    logging.info('Logistic regression uses {} features with dims {}.'.format(
        feature_names, feature_sizes))

    raw_feature_size = sum(feature_sizes)

    self.train_data_pipeline = train_data_pipeline
    self.raw_feature_size = raw_feature_size
    self.feature_size = raw_feature_size
    self.num_classes = num_classes
    self.batch_size = batch_size
    self.tr_data_fn = tr_data_fn
    self.tr_data_paras = tr_data_paras
    self.bootstrap = bootstrap
    self.init_learning_rate = init_learning_rate
    self.decay_steps = decay_steps
    self.decay_rate = decay_rate
    self.epochs = epochs
    self.l1_reg_rate = l1_reg_rate
    self.l2_reg_rate = l2_reg_rate
    self.pos_weights = pos_weights
    self.initial_weights = initial_weights
    self.initial_biases = initial_biases

    # Check extra data transform function arguments.
    # If the transform changes the feature size, update it.
    if self.tr_data_fn is not None:
        if self.tr_data_paras is None:
            self.tr_data_paras = dict()
        else:
            if ('reshape' in self.tr_data_paras) and (
                    self.tr_data_paras['reshape'] is True):
                self.feature_size = self.tr_data_paras['size']
                logging.warn('Data transform changes the features size to {}.'.format(
                    self.feature_size))
            logging.debug('Data transform arguments are {}.'.format(
                self.tr_data_paras))
    else:
        self.tr_data_paras = dict()

    start_new_model = start_new_model or (not tf.gfile.Exists(self.logdir))

    # This is NECESSARY to avoid contaminating the default graph.
    # Alternatively, we can define a member graph variable. When building a new graph
    # or restoring a graph, wrap the code into a similar contextmanager.
    self.graph = tf.Graph()
    with self.graph.as_default():
        if start_new_model:
            logging.info('Starting a new model...')
            # Start new model, delete existing checkpoints.
            if tf.gfile.Exists(self.logdir):
                try:
                    tf.gfile.DeleteRecursively(self.logdir)
                except tf.errors.OpError:
                    logging.error('Failed to delete dir {}.'.format(self.logdir))
                else:
                    logging.info(
                        'Succeeded to delete train dir {}.'.format(self.logdir))
            else:
                # Do nothing.
                pass

            # Build the graph, namely build a graph and initialize member variables
            # associated with the graph.
            self.saver = self._build_graph()
        else:
            self.saver = self._restore_graph()

        # After either building a graph or restoring a graph, the graph is
        # CONSTRUCTED successfully. Get collections to be used in training.
        self.global_step = tf.get_collection('global_step')[0]
        self.init_op = tf.get_collection('init_op')[0]
        self.train_op = tf.get_collection('train_op')[0]
        self.summary_op = tf.get_collection('summary_op')[0]
        self.raw_features_batch = tf.get_collection('raw_features_batch')[0]
        self.labels_batch = tf.get_collection('labels_batch')[0]
        self.loss = tf.get_collection('loss')[0]
        self.pred_prob = tf.get_collection('predictions')[0]

    if self._check_graph_initialized():
        logging.info('Succeeded to initialize logistic regression Graph.')
    else:
        logging.error('Failed to initialize logistic regression Graph.')

    # Start or restore training.
    # To avoid summaries causing a memory usage peak, save summaries manually.
    sv = tf.train.Supervisor(graph=self.graph,
                             init_op=self.init_op,
                             logdir=self.logdir,
                             global_step=self.global_step,
                             summary_op=None,
                             save_model_secs=600,
                             saver=self.saver)

    with sv.managed_session() as sess:
        logging.info('Entering training loop...')
        for step in range(self.max_train_steps):
            if sv.should_stop():
                # Save the final model and break.
                self.saver.save(sess,
                                save_path='{}_{}'.format(sv.save_path, 'final'))
                break

            if step % 500 == 0:
                if validate_fn is not None:
                    _, summary, train_pred_prob_batch, train_labels_batch, global_step_val = sess.run(
                        [
                            self.train_op, self.summary_op, self.pred_prob,
                            self.labels_batch, self.global_step
                        ])
                    # Evaluate on train data.
                    train_per = validate_fn(predictions=train_pred_prob_batch,
                                            labels=train_labels_batch)
                    sv.summary_writer.add_summary(
                        MakeSummary('train/{}'.format(validate_fn.func_name),
                                    train_per), global_step_val)
                    logging.info('Step {}, train {}: {}.'.format(
                        global_step_val, validate_fn.func_name, train_per))
                else:
                    _, summary, global_step_val = sess.run(
                        [self.train_op, self.summary_op, self.global_step])

                # Add train summary.
                sv.summary_computed(sess, summary, global_step=global_step_val)

                # Compute validate loss and performance (validate_fn).
                if validate_set is not None:
                    validate_data, validate_labels = validate_set

                    # Compute validation loss in chunks.
                    num_validate_videos = validate_data.shape[0]
                    split_indices = np.linspace(
                        0,
                        num_validate_videos + 1,
                        num=max(num_validate_videos // (2 * batch_size) + 1, 2),
                        dtype=np.int32)
                    validate_loss_vals, predictions = [], []
                    for i in range(len(split_indices) - 1):
                        start_ind = split_indices[i]
                        end_ind = split_indices[i + 1]

                        if validate_fn is not None:
                            ith_validate_loss_val, ith_predictions = sess.run(
                                [self.loss, self.pred_prob],
                                feed_dict={
                                    self.raw_features_batch:
                                        validate_data[start_ind:end_ind],
                                    self.labels_batch:
                                        validate_labels[start_ind:end_ind]
                                })
                            validate_loss_vals.append(ith_validate_loss_val *
                                                      (end_ind - start_ind))
                            predictions.append(ith_predictions)
                        else:
                            ith_validate_loss_val = sess.run(
                                self.loss,
                                feed_dict={
                                    self.raw_features_batch:
                                        validate_data[start_ind:end_ind],
                                    self.labels_batch:
                                        validate_labels[start_ind:end_ind]
                                })
                            validate_loss_vals.append(ith_validate_loss_val *
                                                      (end_ind - start_ind))

                    validate_loss_val = sum(validate_loss_vals) / num_validate_videos
                    # Add validate summary.
                    sv.summary_writer.add_summary(
                        MakeSummary('validate/xentropy', validate_loss_val),
                        global_step_val)

                    if validate_fn is not None:
                        validate_per = validate_fn(
                            predictions=np.concatenate(predictions, axis=0),
                            labels=validate_labels)
                        sv.summary_writer.add_summary(
                            MakeSummary('validate/{}'.format(validate_fn.func_name),
                                        validate_per), global_step_val)
                        logging.info('Step {}, validate {}: {}.'.format(
                            global_step_val, validate_fn.func_name, validate_per))
            elif step % 200 == 0:
                _, summary, global_step_val = sess.run(
                    [self.train_op, self.summary_op, self.global_step])
                sv.summary_computed(sess, summary, global_step=global_step_val)
            else:
                sess.run(self.train_op)

        logging.info('Exited training loop.')

    # The session will close automatically when the with clause exits.
    # sess.close()
    sv.stop()
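# Concrete illustration of the validation chunking used above: np.linspace carves
# [0, num_validate_videos + 1) into roughly equal slices of about 2 * batch_size
# videos each, and every chunk's loss is weighted by its length before averaging.
import numpy as np

num_validate_videos, batch_size = 10, 2
split_indices = np.linspace(0,
                            num_validate_videos + 1,
                            num=max(num_validate_videos // (2 * batch_size) + 1, 2),
                            dtype=np.int32)
# -> array([ 0,  5, 11], dtype=int32): slices [0:5] and [5:11]; the trailing index
# is clipped by slicing, so the last chunk simply covers the remaining videos.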
def fit(self,
        data_pipeline=None,
        tr_data_fn=None,
        tr_data_paras=None,
        l2_regs=None,
        validate_set=None,
        line_search=True):
    """Computes weights and biases of a linear classifier using the normal equation,
    with line search for the best l2_reg.

    Args:
        data_pipeline: A namedtuple consisting of the following elements.
            reader, a video-level features reader or frame-level features reader.
            data_pattern, file glob of the data set.
            batch_size, how many examples to handle per time.
            num_readers, how many IO threads to prefetch examples.
        tr_data_fn: A function that transforms input data.
        tr_data_paras: A dictionary of other parameters to be passed to tr_data_fn.
        l2_regs: An array; each element represents how much the linear classifier
            weights should be penalized.
        validate_set: (data, labels) with dtype float32. The data set (numpy arrays)
            used to choose the best l2_reg. Sampled from the whole validate set if
            necessary. If line_search is False, this argument is simply ignored.
        line_search: Boolean argument representing whether to do line search over
            l2_regs.

    Returns:
        None. The weights and biases fit on the given data set are stored in
        self.weights and self.biases, where the biases form the last row.
    """
    logging.info('Entering linear classifier ...')

    batch_size = data_pipeline.batch_size
    reader = data_pipeline.reader
    num_classes = reader.num_classes
    feature_names = reader.feature_names
    feature_sizes = reader.feature_sizes
    raw_feature_size = sum(feature_sizes)
    feature_size = raw_feature_size
    logging.info('Linear regression uses {} features with dims {}.'.format(
        feature_names, feature_sizes))

    if line_search:
        # Both l2_regs and validate_set are required.
        if l2_regs is None:
            raise ValueError('There is no l2_regs to do line search.')
        else:
            logging.info('l2_regs is {}.'.format(l2_regs))

        if validate_set is None:
            raise ValueError(
                'There is no validate_set to do line search for l2_reg.')
        else:
            validate_data, validate_labels = validate_set
    else:
        # Simply fit the training set. Make l2_regs have only one element and
        # ignore validate_set.
        if l2_regs is None:
            l2_regs = [0.001]
        elif isinstance(l2_regs, float):
            l2_regs = [l2_regs]
        elif isinstance(l2_regs, list) or isinstance(l2_regs, tuple):
            l2_regs = l2_regs[:1]
        logging.info('No line search, l2_regs is {}.'.format(l2_regs))

        if validate_set is None:
            # Important! To make the graph construction successful.
            validate_data = np.zeros([1, raw_feature_size], dtype=np.float32)
            validate_labels = np.zeros([1, num_classes], dtype=np.float32)
        else:
            validate_data, validate_labels = validate_set

    # Check validate data and labels shape.
    logging.info('validate set: data has shape {}, labels has shape {}.'.format(
        validate_data.shape, validate_labels.shape))
    if (validate_data.shape[-1] != raw_feature_size) or (
            validate_labels.shape[-1] != num_classes):
        raise ValueError('validate set shape does not conform to the training set.')

    # TO BE CAUTIOUS! THE FOLLOWING MAY HAVE TO DEAL WITH FEATURE SIZE CHANGE.
    # Check extra data transform function arguments.
    # If the transform changes the feature size, update it.
    if tr_data_fn is not None:
        if tr_data_paras is None:
            tr_data_paras = {}
        else:
            if ('reshape' in tr_data_paras) and (tr_data_paras['reshape'] is True):
                feature_size = tr_data_paras['size']
                logging.warn('Data transform changes the features size to {}.'.format(
                    feature_size))

    # Method - append an all-one column to X by using block matrix multiplication
    # (the all-one column is treated as a block).
    # Create the graph to traverse all data once.
    with tf.Graph().as_default() as graph:
        global_step = tf.Variable(initial_value=0,
                                  trainable=False,
                                  dtype=tf.int32,
                                  name='global_step')
        global_step_inc_op = tf.assign_add(global_step, 1)

        # X.transpose * X
        norm_equ_1_initializer = tf.placeholder(tf.float32,
                                                shape=[feature_size, feature_size])
        norm_equ_1 = tf.Variable(initial_value=norm_equ_1_initializer,
                                 collections=[],
                                 name='X_Tr_X')

        # X.transpose * Y
        norm_equ_2_initializer = tf.placeholder(tf.float32,
                                                shape=[feature_size, num_classes])
        norm_equ_2 = tf.Variable(initial_value=norm_equ_2_initializer,
                                 collections=[],
                                 name='X_Tr_Y')

        example_count = tf.Variable(initial_value=0.0, name='example_count')
        features_sum = tf.Variable(initial_value=tf.zeros([feature_size]),
                                   name='features_sum')
        labels_sum = tf.Variable(initial_value=tf.zeros([num_classes]),
                                 name='labels_sum')

        id_batch, raw_features_batch, labels_batch, num_frames_batch = (
            get_input_data_tensors(data_pipeline, num_epochs=1, name_scope='input'))
        if tr_data_fn is None:
            transformed_features_batch = tf.identity(raw_features_batch)
        else:
            transformed_features_batch = tr_data_fn(raw_features_batch,
                                                    **tr_data_paras)

        with tf.name_scope('batch_increment'):
            transformed_features_batch_tr = tf.matrix_transpose(
                transformed_features_batch, name='X_Tr')
            float_labels_batch = tf.cast(labels_batch, tf.float32)
            batch_norm_equ_1 = tf.matmul(transformed_features_batch_tr,
                                         transformed_features_batch,
                                         name='batch_norm_equ_1')
            # batch_norm_equ_1 = tf.add_n(tf.map_fn(lambda x: tf.einsum('i,j->ij', x, x),
            #                                       transformed_features_batch_tr),
            #                             name='X_Tr_X')
            batch_norm_equ_2 = tf.matmul(transformed_features_batch_tr,
                                         float_labels_batch,
                                         name='batch_norm_equ_2')
            batch_example_count = tf.cast(tf.shape(transformed_features_batch)[0],
                                          tf.float32,
                                          name='batch_example_count')
            batch_features_sum = tf.reduce_sum(transformed_features_batch,
                                               axis=0,
                                               name='batch_features_sum')
            batch_labels_sum = tf.reduce_sum(float_labels_batch,
                                             axis=0,
                                             name='batch_labels_sum')

        with tf.name_scope('update_ops'):
            update_norm_equ_1_op = tf.assign_add(norm_equ_1, batch_norm_equ_1)
            update_norm_equ_2_op = tf.assign_add(norm_equ_2, batch_norm_equ_2)
            update_example_count = tf.assign_add(example_count, batch_example_count)
            update_features_sum = tf.assign_add(features_sum, batch_features_sum)
            update_labels_sum = tf.assign_add(labels_sum, batch_labels_sum)

        with tf.control_dependencies([
                update_norm_equ_1_op, update_norm_equ_2_op, update_example_count,
                update_features_sum, update_labels_sum, global_step_inc_op
        ]):
            update_equ_non_op = tf.no_op(name='unified_update_op')

        with tf.name_scope('solution'):
            # After all data have been handled, compute the weights.
            l2_reg_ph = tf.placeholder(tf.float32, shape=[])
            l2_reg_term = tf.diag(tf.fill([feature_size], l2_reg_ph), name='l2_reg')
            # X.transpose * X + lambda * Id, where d is the feature dimension.
            norm_equ_1_with_reg = tf.add(norm_equ_1, l2_reg_term)

            # Concat other blocks to form the final norm equation terms.
            final_norm_equ_1_top = tf.concat(
                [norm_equ_1_with_reg, tf.expand_dims(features_sum, 1)], 1)
            final_norm_equ_1_bot = tf.concat(
                [features_sum, tf.expand_dims(example_count, 0)], 0)
            final_norm_equ_1 = tf.concat(
                [final_norm_equ_1_top, tf.expand_dims(final_norm_equ_1_bot, 0)],
                0,
                name='norm_equ_1')
            final_norm_equ_2 = tf.concat(
                [norm_equ_2, tf.expand_dims(labels_sum, 0)], 0, name='norm_equ_2')

            # The last row contains the biases.
            weights_biases = tf.matrix_solve(final_norm_equ_1,
                                             final_norm_equ_2,
                                             name='weights_biases')
            weights = weights_biases[:-1]
            biases = weights_biases[-1]

        with tf.name_scope('validate_loss'):
            validate_x_pl = tf.placeholder(tf.float32,
                                           shape=[None, raw_feature_size],
                                           name='validate_data')
            validate_y_pl = tf.placeholder(tf.float32,
                                           shape=[None, num_classes],
                                           name='validate_labels')

            if tr_data_fn is None:
                validate_x_transformed = tf.identity(validate_x_pl)
            else:
                validate_x_transformed = tr_data_fn(validate_x_pl,
                                                    reuse=True,
                                                    **tr_data_paras)

            predictions = tf.matmul(validate_x_transformed, weights) + biases
            loss = tf.sqrt(tf.reduce_mean(
                tf.squared_difference(predictions, validate_y_pl)),
                           name='rmse')
            # pred_labels = tf.greater_equal(predictions, 0.0, name='pred_labels')

        summary_op = tf.summary.merge_all()
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer(),
                           name='init_glo_loc_var')

    sess = tf.Session(graph=graph)
    # Initialize variables.
    sess.run(init_op)
    sess.run(
        [norm_equ_1.initializer, norm_equ_2.initializer],
        feed_dict={
            norm_equ_1_initializer:
                np.zeros([feature_size, feature_size], dtype=np.float32),
            norm_equ_2_initializer:
                np.zeros([feature_size, num_classes], dtype=np.float32)
        })

    summary_writer = tf.summary.FileWriter(self.logdir, graph=sess.graph)

    # Start input enqueue threads.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    try:
        while not coord.should_stop():
            _, summary, global_step_val = sess.run(
                [update_equ_non_op, summary_op, global_step])
            summary_writer.add_summary(summary, global_step=global_step_val)
    except tf.errors.OutOfRangeError:
        logging.info(
            'Finished normal equation terms computation -- one epoch done.')
    finally:
        # When done, ask the threads to stop.
        coord.request_stop()
        summary_writer.close()

    # Wait for threads to finish.
    coord.join(threads)

    # Do line search.
    best_weights_val, best_biases_val = None, None
    best_l2_reg = 0
    min_loss = np.PINF

    for l2_reg in l2_regs:
        # Compute regularized weights.
        weights_val, biases_val = sess.run([weights, biases],
                                           feed_dict={l2_reg_ph: l2_reg})
        # Compute validation loss in chunks.
        num_validate_videos = validate_data.shape[0]
        split_indices = np.linspace(
            0,
            num_validate_videos + 1,
            num=max(num_validate_videos // batch_size + 1, 2),
            dtype=np.int32)
        loss_vals = []
        for i in range(len(split_indices) - 1):
            start_ind = split_indices[i]
            end_ind = split_indices[i + 1]
            # Avoid re-computing weights and biases (otherwise, feeding l2_reg_ph
            # would be necessary).
            ith_loss_val = sess.run(
                loss,
                feed_dict={
                    validate_x_pl: validate_data[start_ind:end_ind],
                    validate_y_pl: validate_labels[start_ind:end_ind],
                    weights: weights_val,
                    biases: biases_val
                })
            loss_vals.append(ith_loss_val * (end_ind - start_ind))

        validate_loss_val = sum(loss_vals) / num_validate_videos
        logging.info('l2_reg {} leads to rmse loss {}.'.format(
            l2_reg, validate_loss_val))
        if validate_loss_val < min_loss:
            best_weights_val, best_biases_val = weights_val, biases_val
            min_loss = validate_loss_val
            best_l2_reg = l2_reg

    sess.close()

    if (not line_search) and (validate_set is None):
        min_loss = None
    logging.info('The best l2_reg is {} with rmse loss {}.'.format(
        best_l2_reg, min_loss))

    logging.info('Exiting linear classifier ...')

    self.weights = best_weights_val
    self.biases = best_biases_val
    self.rmse = min_loss
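# NumPy sketch of the block normal equation the graph above accumulates and solves
# (illustrative only; toy shapes, the names are not part of the class):
#   [[X^T X + l2*I, X^T 1], [1^T X, n]] [W; b] = [X^T Y; 1^T Y]
import numpy as np

X = np.random.rand(100, 4).astype(np.float32)                # features
Y = np.random.rand(100, 3).astype(np.float32)                # labels
l2_reg, n = 0.01, float(X.shape[0])

xtx = np.dot(X.T, X)                                         # norm_equ_1
xty = np.dot(X.T, Y)                                         # norm_equ_2
f_sum, y_sum = X.sum(axis=0), Y.sum(axis=0)                  # features_sum, labels_sum

lhs = np.vstack([np.hstack([xtx + l2_reg * np.eye(4, dtype=np.float32),
                            f_sum[:, None]]),
                 np.hstack([f_sum, [n]])[None, :]])
rhs = np.vstack([xty, y_sum[None, :]])
weights_biases = np.linalg.solve(lhs, rhs)                   # tf.matrix_solve above
weights, biases = weights_biases[:-1], weights_biases[-1]    # last row = biases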
def _get_data_placeholders(config, split):
    """Returns data placeholders to feed the dataset.

    Args:
        config: an instance of ads_mem_examples_pb2.AdsMemExamples.
        split: the name of the data split, e.g. 'train' or 'valid'.

    Returns:
        data_placeholders: a dict mapping from name to placeholders.
        feed_dict: a dict mapping from placeholder to data.
    """
    # Create placeholders.
    data_placeholders = {
        'image_id': tf.placeholder(tf.string, [None]),
        'img_features': tf.placeholder(tf.float32,
                                       [None, config.feature_dimensions]),
        'roi_features': tf.placeholder(
            tf.float32,
            [None, config.number_of_regions, config.feature_dimensions]),
        'number_of_statements': tf.placeholder(tf.int32, [None]),
        'statement_strings': tf.placeholder(
            tf.int32, [None, config.max_stmts_per_image, config.max_stmt_len]),
        'statement_lengths': tf.placeholder(tf.int32,
                                            [None, config.max_stmts_per_image]),
        'number_of_symbols': tf.placeholder(tf.int32, [None]),
        'symbols': tf.placeholder(tf.int32, [None, config.max_symbols_per_image]),
    }
    if not config.use_single_densecap:
        data_placeholders.update({
            'number_of_densecaps': tf.placeholder(tf.int32, [None]),
            'densecap_strings': tf.placeholder(tf.int32, [
                None, config.max_densecaps_per_image, config.max_densecap_len
            ]),
            'densecap_lengths': tf.placeholder(
                tf.int32, [None, config.max_densecaps_per_image]),
        })
    else:
        data_placeholders.update({
            'number_of_densecaps': tf.placeholder(tf.int32, [None]),
            'densecap_strings': tf.placeholder(tf.int32, [
                None, 1, config.max_densecaps_per_image * config.max_densecap_len
            ]),
            'densecap_lengths': tf.placeholder(tf.int32, [None, 1]),
        })
    if split != 'train':
        data_placeholders.update({
            'eval_statement_strings': tf.placeholder(tf.int32, [
                None, config.number_of_val_stmts_per_image, config.max_stmt_len
            ]),
            'eval_statement_lengths': tf.placeholder(
                tf.int32, [None, config.number_of_val_stmts_per_image]),
        })

    # Load annotations and image features.
    assert tf.gfile.Exists(config.image_feature_path)
    assert tf.gfile.Exists(config.region_feature_path)
    assert tf.gfile.Exists(config.statement_vocab_path)
    assert tf.gfile.Exists(config.statement_annot_path)
    assert tf.gfile.Exists(config.densecap_vocab_path)
    assert tf.gfile.Exists(config.densecap_annot_path)
    assert tf.gfile.Exists(config.symbol_annot_path)
    assert tf.gfile.Exists(config.symbol_cluster_path)

    # Image features.
    start = time.time()
    image_features = np.load(config.image_feature_path).item()
    region_features = np.load(config.region_feature_path).item()
    logging.info('Image features are loaded, cost=%is, img_len=%i, roi_len=%i.',
                 time.time() - start, len(image_features), len(region_features))

    # Action-reason annotations.
    start = time.time()
    stmt_annots = load_action_reason_annots(config.statement_annot_path)
    logging.info('Annotations are loaded, cost=%is, path=%s, len=%i.',
                 time.time() - start, config.statement_annot_path,
                 len(stmt_annots))
    stmt_vocab = load_vocab(config.statement_vocab_path)
    logging.info('Load vocab from %s, vocab_size=%i',
                 config.statement_vocab_path, len(stmt_vocab))

    # Densecap annotations.
    start = time.time()
    dense_annots = load_densecap_annots(config.densecap_annot_path,
                                        config.max_densecaps_per_image)
    logging.info('Dense annotations are loaded, cost=%is, path=%s, len=%i.',
                 time.time() - start, config.densecap_annot_path,
                 len(dense_annots))
    dense_vocab = load_vocab(config.densecap_vocab_path)
    logging.info('Load vocab from %s, vocab_size=%i', config.densecap_vocab_path,
                 len(dense_vocab))

    # Symbol annotations.
    start = time.time()
    symbol_annots = load_raw_annots(config.symbol_annot_path)
    logging.info('Symbol annotations are loaded, cost=%is, path=%s, len=%i.',
                 time.time() - start, config.symbol_annot_path,
                 len(symbol_annots))
    word_to_id, id_to_symbol = load_symbol_cluster(config.symbol_cluster_path)

    # Initialize feed_dict.
    feed_dict = {
        'image_id': [],
        'img_features': [],
        'roi_features': [],
        'number_of_statements': [],
        'statement_strings': [],
        'statement_lengths': [],
        'number_of_densecaps': [],
        'densecap_strings': [],
        'densecap_lengths': [],
        'number_of_symbols': [],
        'symbols': [],
    }
    if split != 'train':
        feed_dict.update({
            'eval_statement_strings': [],
            'eval_statement_lengths': [],
        })

    total_images = total_statements = 0

    # Split training data for validation purposes.
    stmt_annots = stmt_annots.items()
    if split == 'valid':
        stmt_annots = stmt_annots[:config.number_of_val_examples]
    elif split == 'train':
        stmt_annots = stmt_annots[config.number_of_val_examples:]
    logging.info('Processing %i %s records...', len(stmt_annots), split)

    if config.debug:
        logging.warn('DEBUG MODE!!!!!!!')
        stmt_annots = stmt_annots[:100]

    for index, (image_id, annot) in enumerate(stmt_annots):
        # Pad action-reason statements.
        (number_of_statements, statement_strings,
         statement_lengths) = encode_and_pad_sentences(stmt_vocab,
                                                       annot['pos_examples'],
                                                       config.max_stmts_per_image,
                                                       config.max_stmt_len)

        # Pad densecap sentences.
        if not config.use_single_densecap:
            (number_of_densecaps, densecap_strings,
             densecap_lengths) = encode_and_pad_sentences(
                 dense_vocab, dense_annots[image_id],
                 config.max_densecaps_per_image, config.max_densecap_len)
        else:
            # Concatenate all densecaps to form a single sentence.
            dense_string_concat = ' '.join(dense_annots[image_id])
            (number_of_densecaps, densecap_strings,
             densecap_lengths) = encode_and_pad_sentences(
                 dense_vocab, [dense_string_concat], 1,
                 config.max_densecap_len * config.max_densecaps_per_image)

        # Pad symbols.
        symbols = symbol_annots.get(image_id, [])
        number_of_symbols = len(symbols)
        symbols += [0] * config.max_symbols_per_image
        symbols = symbols[:config.max_symbols_per_image]

        feed_dict['image_id'].append(image_id)
        feed_dict['img_features'].append(image_features[image_id])
        feed_dict['roi_features'].append(region_features[image_id])
        feed_dict['number_of_statements'].append(
            np.array(number_of_statements, dtype=np.int32))
        feed_dict['statement_strings'].append(statement_strings)
        feed_dict['statement_lengths'].append(statement_lengths)
        feed_dict['number_of_densecaps'].append(
            np.array(number_of_densecaps, dtype=np.int32))
        feed_dict['densecap_strings'].append(densecap_strings)
        feed_dict['densecap_lengths'].append(densecap_lengths)
        feed_dict['number_of_symbols'].append(
            np.array(number_of_symbols, dtype=np.int32))
        feed_dict['symbols'].append(np.array(symbols))

        if split != 'train':
            # Pad strings for evaluation purposes.
            (number_of_eval_statements, eval_statement_strings,
             eval_statement_lengths) = encode_and_pad_sentences(
                 stmt_vocab, annot['all_examples'],
                 config.number_of_val_stmts_per_image, config.max_stmt_len)
            assert number_of_eval_statements == config.number_of_val_stmts_per_image

            feed_dict['eval_statement_strings'].append(eval_statement_strings)
            feed_dict['eval_statement_lengths'].append(eval_statement_lengths)

        total_images += 1
        total_statements += number_of_statements
        if index % 1000 == 0:
            logging.info('Load on %i/%i', index, len(stmt_annots))

    logging.info('Load %i images with %i statements.', total_images,
                 total_statements)

    # Legacy: GPU or CPU mode.
    if config.data_provider_mode == ads_mem_examples_pb2.AdsMemExamples.FROM_CPU:
        # Re-key feed_dict from string names to the actual placeholders, stacking
        # the per-image arrays. Iterating over items() while deleting keys relies on
        # Python 2's items() returning a list snapshot.
        for k, v in feed_dict.items():
            feed_dict[data_placeholders[k]] = np.stack(v)
            del feed_dict[k]
        return data_placeholders, feed_dict
    # elif config.data_provider_mode == ads_mem_examples_pb2.AdsMemExamples.FROM_GPU:
    #     data_tensors = {}
    #     for k, v in feed_dict.items():
    #         data_tensors[k] = tf.constant(np.stack(v))
    #     return data_tensors, {}

    raise ValueError('Unknown mode %i.' % config.data_provider_mode)
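# Tiny illustration (assumed shapes and keys) of the FROM_CPU re-keying above: the
# per-image lists collected under string keys are stacked along axis 0 and the keys
# are swapped for the matching placeholders, so the result can be passed directly as
# sess.run(fetches, feed_dict=feed_dict). list() makes the loop safe on Python 3 as
# well; the original relies on Python 2's items() returning a list.
import numpy as np
import tensorflow as tf

placeholders = {'img_features': tf.placeholder(tf.float32, [None, 4])}
feed = {'img_features': [np.zeros(4, np.float32), np.ones(4, np.float32)]}
for k, v in list(feed.items()):
    feed[placeholders[k]] = np.stack(v)  # shape (2, 4)
    del feed[k]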