Example 1
def revise_sentiment(annot, mapping=None):
    """Revises sentiment annotations.
  """
    if mapping is None:
        mapping = _default_mapping
        logging.warn('Use default mapping.')

    words = [_stemmer.stem(x.strip(',.')) for x in annot.lower().split()]
    words = [mapping[x] for x in words if x in mapping]
    return words
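
# A minimal usage sketch (not part of the original module): revise_sentiment
# relies on module-level _stemmer and _default_mapping objects defined elsewhere,
# so trivial stand-ins are substituted here just to show the call pattern.
# Names and values below are assumptions.
import logging

_default_mapping = {'good': 1, 'bad': -1}  # hypothetical sentiment mapping


class _IdentityStemmer(object):
    """Stands in for the real stemmer (e.g. a Porter stemmer)."""

    def stem(self, word):
        return word


_stemmer = _IdentityStemmer()

print(revise_sentiment('A good movie, not bad.'))  # -> [1, -1]
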

def evaluate(results, groundtruths):
    """Evaluates the statement ranking task.

    Args:
      results: a dict mapping from image_id to a dict whose 'distances' entry
        holds 15 float numbers, the distances to the candidate statements.
      groundtruths: a dict mapping from image_id to a dict with 'pos_examples'
        and 'all_examples' entries.

    Returns:
      metrics: a dict mapping from metric name to score, involving:
        accuracy: the ratio of correct top-1 predictions.
        recall_at_3: the number of groundtruth statements ranked in the top 3.
        rank_med: the median value of the groundtruths' ranks.
        rank_avg: the average value of the groundtruths' ranks.
        rank_min: the minimum value of the groundtruths' ranks.
    """
    if len(results) != len(groundtruths):
        logging.warn('size of gts: %i, size of res: %i', len(groundtruths),
                     len(results))

    all_accuracy, all_recall_at_3 = [], []
    all_rank_min, all_rank_avg, all_rank_med = [], [], []

    for image_id, result in results.iteritems():
        assert image_id in groundtruths
        distances = result['distances']

        pos_examples = groundtruths[image_id]['pos_examples']
        all_examples = groundtruths[image_id]['all_examples']

        distances = np.array(distances)

        ranking_r = distances.argsort()
        ranking = np.array(ranking_r)
        for i, rank in enumerate(ranking_r):
            ranking[rank] = i

        positions = ranking[[
            all_examples.index(example) for example in pos_examples
        ]]
        positions = np.sort(positions)

        all_accuracy.append(positions[0] == 0)
        all_recall_at_3.append(sum([1 for pos in positions if pos < 3]))
        all_rank_min.append(1 + positions[0])
        all_rank_avg.append(1 + np.mean(positions))
        all_rank_med.append(1 + np.median(positions))

    mean_func = lambda x: round(np.array(x).astype(np.float).mean(), 4)
    eval_results = {
        'accuracy': mean_func(all_accuracy),
        'recall_at_3': mean_func(all_recall_at_3),
        'rank_min': mean_func(all_rank_min),
        'rank_avg': mean_func(all_rank_avg),
        'rank_med': mean_func(all_rank_med),
    }
    return eval_results
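
# A standalone sketch (not from the original file) of the rank-inversion trick
# used in evaluate above: argsort gives the sorted order, and scattering indices
# back through that order yields each item's rank. The vectorized assignment
# below is equivalent to the explicit loop over ranking_r.
import numpy as np

distances = np.array([0.9, 0.1, 0.5])
order = distances.argsort()             # [1, 2, 0]: indices sorted by distance
ranks = np.empty_like(order)
ranks[order] = np.arange(len(order))    # [2, 0, 1]: rank of each original item
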
Example 3
    def __init__(self, vocab_file, max_size):
        """Creates a vocab of up to max_size words, reading from the vocab_file. If max_size is 0, reads the entire vocab file.

        Args:
          vocab_file: path to the vocab file, which is assumed to contain "<word> <frequency>" on each line, sorted with most frequent word first. This code doesn't actually use the frequencies, though.
          max_size: integer. The maximum size of the resulting Vocabulary."""
        self._word_to_id = {}
        self._id_to_word = {}
        self._count = 0  # keeps track of total number of words in the Vocab

        # [UNK], [PAD], [START] and [STOP] get the ids 0,1,2,3.
        for w in [UNKNOWN_TOKEN, PAD_TOKEN, START_DECODING, STOP_DECODING]:
            self._word_to_id[w] = self._count
            self._id_to_word[self._count] = w
            self._count += 1

        # Read the vocab file and add words up to max_size
        with file_io.FileIO(vocab_file, 'r') as vocab_f:
            for line in vocab_f:
                pieces = line.split()
                if len(pieces) != 2:
                    log.warn(
                        'Warning: incorrectly formatted line in vocabulary file: %s\n'
                        % line)
                    continue
                w = pieces[0]
                if w in [
                        SENTENCE_START, SENTENCE_END, UNKNOWN_TOKEN, PAD_TOKEN,
                        START_DECODING, STOP_DECODING
                ]:
                    raise Exception(
                        '<s>, </s>, [UNK], [PAD], [START] and [STOP] shouldn\'t be in the vocab file, but %s is'
                        % w)
                if w in self._word_to_id:
                    raise Exception('Duplicated word in vocabulary file: %s' %
                                    w)
                self._word_to_id[w] = self._count
                self._id_to_word[self._count] = w
                self._count += 1
                if max_size != 0 and self._count >= max_size:
                    log.info(
                        "max_size of vocab was specified as %i; we now have %i words. Stopping reading."
                        % (max_size, self._count))
                    break

        log.info(
            "Finished constructing vocabulary of %i total words. Last word added: %s"
            % (self._count, self._id_to_word[self._count - 1]))
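
# Hypothetical usage, assuming the enclosing class is named Vocab, that the
# special-token constants are defined elsewhere in the module, and that
# 'data/vocab.txt' is a plain-text "<word> <frequency>" listing (the path is
# made up for illustration).
vocab = Vocab('data/vocab.txt', max_size=50000)
print(vocab._count)          # number of ids assigned, special tokens included
print(vocab._id_to_word[0])  # UNKNOWN_TOKEN
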
Example 4
def __run_eval(model, data_dir, coverage, conf, batch_size):
    checkpoint_dir = os.path.join(
        conf.model_dir, 'eval')  # make a subdir of the root dir for eval data
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    # this is where checkpoints of best models are saved
    bestmodel_save_path = os.path.join(checkpoint_dir, 'bestmodel')
    best_loss = None
    step = 0
    seen_steps = set()
    do_eval = True
    with model.build_graph().as_default():
        ds = etl.dataset(data_dir, batch_size)
        iterator = ds.make_initializable_iterator()
        saver = tf.train.Saver(
            max_to_keep=3)  # we will keep 3 best checkpoints at a time
        summary_writer = tf.summary.FileWriter(checkpoint_dir)
        with tf.Session(config=conf.session_config) as sess:
            # run eval at least once and until all checkpoints are evaluated
            while do_eval:
                # load a new checkpoint from training
                util.load_ckpt(saver, sess, log_root=conf.model_dir)
                running_avg_loss = 0
                # init new epoch
                sess.run(iterator.initializer)
                next_batch = iterator.get_next()
                batch_count = 0
                t0 = time.time()
                try:
                    while True:
                        batch_t0 = time.time()
                        results = model.run_eval_step(sess, next_batch)
                        batch_t1 = time.time()
                        batch_count += 1
                        step = results['global_step']
                        if step in seen_steps:
                            do_eval = False
                            # this checkpoint has already been evaluated, do not save it.
                            running_avg_loss = 9999
                            break
                        loss = results['loss']
                        if not np.isfinite(loss):
                            log.warn('loss is nan. Skip batch {}'.format(
                                batch_count))
                            continue
                        summaries = results['summaries']
                        summary_writer.add_summary(summaries, step)
                        # calculate running avg loss
                        running_avg_loss = calc_running_avg_loss(
                            np.asscalar(loss), running_avg_loss,
                            summary_writer, step)
                        msg = 'eval step={}, batch={}, ra_loss={:.4f}, loss={:.4f}, secs={}'.format(
                            step, batch_count, running_avg_loss, loss,
                            int(batch_t1 - batch_t0))
                        if coverage:
                            coverage_loss = results['coverage_loss']
                            msg += ", coverage_loss={:.4f}".format(
                                coverage_loss)
                        log.info(msg)
                        # flush the summary writer every so often
                        if batch_count % 10 == 0:
                            summary_writer.flush()
                except tf.errors.OutOfRangeError:
                    seen_steps.add(step)
                    t1 = time.time()
                    mins = int((t1 - t0) / 60)
                    log.info('eval end of epoch, mins={}'.format(mins))
                finally:
                    summary_writer.flush()
                    # If running_avg_loss is best so far, save this checkpoint (early stopping).
                    # These checkpoints will appear as bestmodel-<iteration_number> in the eval dir
                    if best_loss is None or running_avg_loss < best_loss:
                        best_loss = running_avg_loss
                        log.info(
                            'eval Found new best model with %.4f running_avg_loss. Saving %s',
                            best_loss, bestmodel_save_path)
                        saver.save(sess,
                                   bestmodel_save_path,
                                   global_step=step,
                                   latest_filename='checkpoint_best')
    log.info('eval done')
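
# Hedged sketch of the exponential smoothing that the calc_running_avg_loss
# helper above is expected to perform; the decay constant and the exact
# behaviour are assumptions, not taken from the original helper.
def running_avg(loss, prev_avg, decay=0.99):
    return loss if prev_avg == 0 else decay * prev_avg + (1 - decay) * loss
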
Example 5
def run(args):
    # collect preprocessed data
    input_files = [args.questions, args.non_questions]
    for p in input_files:
        assert os.path.exists(p)

    if not os.path.exists(args.output_file_name) or args.overwrite:

        questions, non_questions = gather_stack_exchange_from_file(
            *input_files)
        glove_vocab, glove_vectors, glove_word2index = process_glove(
            glove_file=args.glove_path)

        (word2index, index2word, vectors,
         training_vocab) = set_data_to_new_vocab(questions, non_questions,
                                                 glove_vectors,
                                                 glove_word2index)
        if args.static_data:
            features, labels = create_static_features(
                questions,
                non_questions,
                word2index,
                size=args.num_samples,
                max_seq_length=args.max_seq_length,
                max_num_questions=args.max_num_questions,
                max_num_elements=args.max_elements,
                randomize_num_questions=args.rand_num_questions)
            data = StaticDataContainer(questions, non_questions,
                                       training_vocab, vectors, word2index,
                                       index2word, features, labels,
                                       args.num_samples, args.mini_num_samples,
                                       args.test_fraction, args.max_seq_length,
                                       args.max_elements, args.description)
            with open(args.output_file_name + '.static.pkl',
                      'wb+') as container:
                pickle.dump(data, container, protocol=-1)

        if args.gen_data:
            data = GeneratorDataContainer(
                questions,
                non_questions,
                training_vocab,
                vectors,
                word2index,
                index2word,
                args.max_seq_length,
                args.max_elements,
                args.max_num_questions,
                args.description,
                test_size=int(args.test_fraction * 10000),
                randomize_num_questions=args.rand_num_questions)
            with open(args.output_file_name + '.generator.pkl',
                      'wb+') as container:
                pickle.dump(data, container, protocol=-1)

    else:
        logging.warn(
            'pickle files already exist; set overwrite to true if you want to '
            'refresh the data (it is randomly created each time)')
        sys.exit()

    return data
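
# Hedged read-back sketch for the pickle written above. The file name is a
# made-up example; protocol=-1 on the writer simply means "use the highest
# pickle protocol available".
import pickle

with open('stack_exchange_data.static.pkl', 'rb') as f:  # hypothetical path
    data = pickle.load(f)
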
Example 6
    parser.add_argument('-z', '--from-scratch', action='store_true')  # noqa
    parser.add_argument('--static-data', action='store_true')
    parser.add_argument('--gen-data', action='store_true')
    parser.add_argument('-c', '--description', type=str, default=None)
    args = parser.parse_args()

    assert args.num_samples > args.mini_num_samples, "You must select more samples than mini_num_samples"
    assert args.static_data or args.gen_data, 'Need to choose static or generator type dataset'
    if args.static_data:
        assert int(
            args.num_samples - (args.num_samples * args.test_fraction)
        ) > args.mini_num_samples, "Set test fraction lower or set num samples higher"

    if os.path.exists(args.output_file_name) and not args.overwrite:
        logging.warn(
            'mini pickle files already exist; set overwrite to true if you want '
            'to refresh the data (it is randomly created each time)')
        sys.exit()

    if args.from_scratch:
        directory_str = os.path.join('.', 'stack_exchange', STACKS[2])
        filename = 'Comments.xml'
        _, _ = gather_stack_exchange_data_from_scratch(directory_str,
                                                       filename,
                                                       XMLS_DICT,
                                                       write=True)

    data_obj = run(args)
    if args.supress_meta:
        data_obj.display_metadata()
Example 7
def main(unused_argv):
    logging.set_verbosity(logging.INFO)

    start_new_model = FLAGS.start_new_model
    output_dir = FLAGS.output_dir

    init_learning_rate = FLAGS.init_learning_rate
    decay_steps = FLAGS.decay_steps
    decay_rate = FLAGS.decay_rate
    l1_reg_rate = FLAGS.l1_reg_rate
    l2_reg_rate = FLAGS.l2_reg_rate
    is_bootstrap = FLAGS.is_bootstrap
    train_epochs = FLAGS.train_epochs

    model_type, feature_names, feature_sizes = FLAGS.model_type, FLAGS.feature_names, FLAGS.feature_sizes
    reader = get_reader(model_type, feature_names, feature_sizes)
    train_data_pattern = FLAGS.train_data_pattern
    validate_data_pattern = FLAGS.validate_data_pattern
    batch_size = FLAGS.batch_size
    num_readers = FLAGS.num_readers

    if tf.gfile.Exists(path_join(output_dir, 'validate_data.pickle')):
        with open(path_join(output_dir, 'validate_data.pickle'), 'rb') as f:
            validate_data = pickle.load(f)

        with open(path_join(output_dir, 'validate_labels.pickle'), 'rb') as f:
            validate_labels = pickle.load(f)
    else:
        # Increase num_readers.
        validate_data_pipeline = DataPipeline(
            reader=reader,
            data_pattern=validate_data_pattern,
            batch_size=batch_size,
            num_readers=num_readers)

        # Sample validate set.
        _, validate_data, validate_labels, _ = random_sample(
            0.05,
            mask=(False, True, True, False),
            data_pipeline=validate_data_pipeline,
            name_scope='sample_validate')
        with open(path_join(output_dir, 'validate_data.pickle'), 'wb') as f:
            pickle.dump(validate_data, f)

        with open(path_join(output_dir, 'validate_labels.pickle'), 'wb') as f:
            pickle.dump(validate_labels, f)

    train_data_pipeline = DataPipeline(reader=reader,
                                       data_pattern=train_data_pattern,
                                       batch_size=batch_size,
                                       num_readers=num_readers)

    model_save_path = path_join(output_dir, 'mlp_fuse')
    if start_new_model and tf.gfile.Exists(model_save_path):
        logging.info('Starting a new model...')
        # Start new model, delete existing checkpoints.
        try:
            tf.gfile.DeleteRecursively(model_save_path)
        except tf.errors.OpError:
            logging.error('Failed to delete dir {}.'.format(model_save_path))
        else:
            logging.info(
                'Succeeded to delete train dir {}.'.format(model_save_path))

    # Set pos_weights for extremely imbalanced situation in one-vs-all classifiers.
    try:
        # Load sum_labels in training set, numpy float format to compute pos_weights.
        train_sum_labels = load_sum_labels()
        # num_neg / num_pos, assuming neg_weights === 1.0.
        pos_weights = np.sqrt(
            float(NUM_TRAIN_EXAMPLES) / train_sum_labels - 1.0)
        logging.info(
            'Computing pos_weights based on sum_labels in train set successfully.'
        )
    except IOError:
        logging.error('Cannot load train sum_labels. Use default value.')
        pos_weights = None
    finally:
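        # NOTE: this finally block always runs, so pos_weights is ultimately
        # reset to None regardless of whether the computation above succeeded.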
        logging.warn('Not to use positive weights.')
        pos_weights = None

    train(train_data_pipeline,
          epochs=train_epochs,
          pos_weights=pos_weights,
          l1_reg_rate=l1_reg_rate,
          l2_reg_rate=l2_reg_rate,
          init_learning_rate=init_learning_rate,
          bootstrap=is_bootstrap,
          validate_set=(validate_data, validate_labels),
          validate_fn=gap_fn,
          logdir=model_save_path)
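
# Worked example (made-up numbers) of the pos_weights formula above: the ratio
# num_neg / num_pos per class, softened by a square root.
import numpy as np

NUM_TRAIN_EXAMPLES = 1000
train_sum_labels = np.array([10., 250., 900.])  # positives per class (assumed)
pos_weights = np.sqrt(NUM_TRAIN_EXAMPLES / train_sum_labels - 1.0)
# -> approximately [9.95, 1.73, 0.33]
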
Example 8
    def fit(self,
            train_data_pipeline,
            start_new_model=False,
            tr_data_fn=None,
            tr_data_paras=None,
            validate_set=None,
            validate_fn=None,
            bootstrap=False,
            init_learning_rate=0.01,
            decay_steps=40000,
            decay_rate=0.95,
            epochs=None,
            l1_reg_rate=None,
            l2_reg_rate=0.01,
            pos_weights=None,
            initial_weights=None,
            initial_biases=None):
        """
        Logistic regression fit function.
        Args:
            train_data_pipeline: A namedtuple consisting of reader, data_pattern, batch_size and num_readers.
            start_new_model: If True, start a new model instead of restoring from existing checkpoints.
            tr_data_fn: A function that transforms the input data.
            tr_data_paras: A dict of extra parameters to pass to tr_data_fn.
            validate_set: If not None, validation loss is checked regularly; otherwise ignored.
            validate_fn: The function used to measure the performance of the learned model on the validate set.
            bootstrap: If True, sample training examples with replacement (differential weighting).
            init_learning_rate: Initial learning rate of the decayed gradient descent.
            decay_steps: Number of steps between learning-rate decays.
            decay_rate: Multiplicative learning-rate decay factor.
            epochs: Maximal number of epochs to train for.
            l1_reg_rate: l1 regularization rate; if None, no l1 regularization is imposed.
            l2_reg_rate: l2 regularization rate.
            pos_weights: Weights for imbalanced binary classes. Here num_pos << num_neg, so the weights should be > 1.0.
                If None, treated as 1.0 for all binary classifiers.
            initial_weights: If not None, the weights will be initialized with it.
            initial_biases: If not None, the biases will be initialized with it.
        Returns: None.
        """
        reader = train_data_pipeline.reader
        batch_size = train_data_pipeline.batch_size
        num_classes = reader.num_classes
        feature_names = reader.feature_names
        feature_sizes = reader.feature_sizes
        logging.info(
            'Logistic regression uses {} features with dims {}.'.format(
                feature_names, feature_sizes))

        raw_feature_size = sum(feature_sizes)

        self.train_data_pipeline = train_data_pipeline
        self.raw_feature_size = raw_feature_size
        self.feature_size = raw_feature_size
        self.num_classes = num_classes
        self.batch_size = batch_size
        self.tr_data_fn = tr_data_fn
        self.tr_data_paras = tr_data_paras
        self.bootstrap = bootstrap
        self.init_learning_rate = init_learning_rate
        self.decay_steps = decay_steps
        self.decay_rate = decay_rate
        self.epochs = epochs
        self.l1_reg_rate = l1_reg_rate
        self.l2_reg_rate = l2_reg_rate
        self.pos_weights = pos_weights
        self.initial_weights = initial_weights
        self.initial_biases = initial_biases

        # Check extra data transform function arguments.
        # If transform changes the features size, change it.
        if self.tr_data_fn is not None:
            if self.tr_data_paras is None:
                self.tr_data_paras = dict()
            else:
                if ('reshape' in self.tr_data_paras) and (
                        self.tr_data_paras['reshape'] is True):
                    self.feature_size = self.tr_data_paras['size']
                    logging.warn(
                        'Data transform changes the features size to {}.'.
                        format(self.feature_size))

            logging.debug('Data transform arguments are {}.'.format(
                self.tr_data_paras))
        else:
            self.tr_data_paras = dict()

        start_new_model = start_new_model or (not tf.gfile.Exists(self.logdir))

        # This is NECESSARY to avoid contaminating default graph.
        # Alternatively, we can define a member graph variable. When building a new graph or
        # restoring a graph, wrap the code into a similar contextmanager.
        self.graph = tf.Graph()
        with self.graph.as_default():
            if start_new_model:
                logging.info('Starting a new model...')
                # Start new model, delete existing checkpoints.
                if tf.gfile.Exists(self.logdir):
                    try:
                        tf.gfile.DeleteRecursively(self.logdir)
                    except tf.errors.OpError:
                        logging.error('Failed to delete dir {}.'.format(
                            self.logdir))
                    else:
                        logging.info(
                            'Succeeded to delete train dir {}.'.format(
                                self.logdir))
                else:
                    # Do nothing.
                    pass

                # Build graph, namely building a graph and initialize member variables associated with graph.
                self.saver = self._build_graph()
            else:
                self.saver = self._restore_graph()

            # After either building a graph or restoring a graph, graph is CONSTRUCTED successfully.
            # Get collections to be used in training.
            self.global_step = tf.get_collection('global_step')[0]
            self.init_op = tf.get_collection('init_op')[0]
            self.train_op = tf.get_collection('train_op')[0]
            self.summary_op = tf.get_collection('summary_op')[0]
            self.raw_features_batch = tf.get_collection(
                'raw_features_batch')[0]
            self.labels_batch = tf.get_collection('labels_batch')[0]
            self.loss = tf.get_collection('loss')[0]
            self.pred_prob = tf.get_collection('predictions')[0]

        if self._check_graph_initialized():
            logging.info('Succeeded to initialize logistic regression Graph.')
        else:
            logging.error('Failed to initialize logistic regression Graph.')

        # Start or restore training.
        # To avoid summary causing memory usage peak, manually save summaries.
        sv = tf.train.Supervisor(graph=self.graph,
                                 init_op=self.init_op,
                                 logdir=self.logdir,
                                 global_step=self.global_step,
                                 summary_op=None,
                                 save_model_secs=600,
                                 saver=self.saver)

        with sv.managed_session() as sess:
            logging.info("Entering training loop...")
            for step in range(self.max_train_steps):
                if sv.should_stop():
                    # Save the final model and break.
                    self.saver.save(sess,
                                    save_path='{}_{}'.format(
                                        sv.save_path, 'final'))
                    break

                if step % 500 == 0:
                    if validate_fn is not None:
                        _, summary, train_pred_prob_batch, train_labels_batch, global_step_val = sess.run(
                            [
                                self.train_op, self.summary_op, self.pred_prob,
                                self.labels_batch, self.global_step
                            ])

                        # Evaluate on train data.
                        train_per = validate_fn(
                            predictions=train_pred_prob_batch,
                            labels=train_labels_batch)
                        sv.summary_writer.add_summary(
                            MakeSummary(
                                'train/{}'.format(validate_fn.func_name),
                                train_per), global_step_val)
                        logging.info('Step {}, train {}: {}.'.format(
                            global_step_val, validate_fn.func_name, train_per))
                    else:
                        _, summary, global_step_val = sess.run(
                            [self.train_op, self.summary_op, self.global_step])

                    # Add train summary.
                    sv.summary_computed(sess,
                                        summary,
                                        global_step=global_step_val)

                    # Compute validate loss and performance (validate_fn).
                    if validate_set is not None:
                        validate_data, validate_labels = validate_set

                        # Compute validation loss.
                        num_validate_videos = validate_data.shape[0]
                        split_indices = np.linspace(
                            0,
                            num_validate_videos + 1,
                            num=max(
                                num_validate_videos // (2 * batch_size) + 1,
                                2),
                            dtype=np.int32)

                        validate_loss_vals, predictions = [], []
                        for i in range(len(split_indices) - 1):
                            start_ind = split_indices[i]
                            end_ind = split_indices[i + 1]

                            if validate_fn is not None:
                                ith_validate_loss_val, ith_predictions = sess.run(
                                    [self.loss, self.pred_prob],
                                    feed_dict={
                                        self.raw_features_batch:
                                        validate_data[start_ind:end_ind],
                                        self.labels_batch:
                                        validate_labels[start_ind:end_ind]
                                    })

                                validate_loss_vals.append(
                                    ith_validate_loss_val *
                                    (end_ind - start_ind))
                                predictions.append(ith_predictions)
                            else:
                                ith_validate_loss_val = sess.run(
                                    self.loss,
                                    feed_dict={
                                        self.raw_features_batch:
                                        validate_data[start_ind:end_ind],
                                        self.labels_batch:
                                        validate_labels[start_ind:end_ind]
                                    })

                                validate_loss_vals.append(
                                    ith_validate_loss_val *
                                    (end_ind - start_ind))

                        validate_loss_val = sum(
                            validate_loss_vals) / num_validate_videos
                        # Add validate summary.
                        sv.summary_writer.add_summary(
                            MakeSummary('validate/xentropy',
                                        validate_loss_val), global_step_val)

                        if validate_fn is not None:
                            validate_per = validate_fn(
                                predictions=np.concatenate(predictions,
                                                           axis=0),
                                labels=validate_labels)

                            sv.summary_writer.add_summary(
                                MakeSummary(
                                    'validate/{}'.format(
                                        validate_fn.func_name), validate_per),
                                global_step_val)
                            logging.info('Step {}, validate {}: {}.'.format(
                                global_step_val, validate_fn.func_name,
                                validate_per))

                elif step % 200 == 0:
                    _, summary, global_step_val = sess.run(
                        [self.train_op, self.summary_op, self.global_step])
                    sv.summary_computed(sess,
                                        summary,
                                        global_step=global_step_val)
                else:
                    sess.run(self.train_op)

            logging.info("Exited training loop.")

        # Session will close automatically when with clause exits.
        # sess.close()
        sv.stop()
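
# Standalone sketch (numbers invented) of the chunked-validation indexing used
# above: np.linspace produces split points and consecutive pairs become
# [start, end) slices; slicing past the end of the array is harmless.
import numpy as np

num_validate_videos, batch_size = 100, 10
split_indices = np.linspace(
    0, num_validate_videos + 1,
    num=max(num_validate_videos // (2 * batch_size) + 1, 2),
    dtype=np.int32)
# split_indices -> [0, 20, 40, 60, 80, 101]
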
Example 9
    def fit(self,
            data_pipeline=None,
            tr_data_fn=None,
            tr_data_paras=None,
            l2_regs=None,
            validate_set=None,
            line_search=True):
        """
        Compute weights and biases of linear classifier using normal equation. With line search for best l2_reg.
        Args:
            data_pipeline: A namedtuple consisting of the following elements.
                reader, video-level features reader or frame-level features reader.
                data_pattern, File Glob of data set.
                batch_size, How many examples to handle per time.
                num_readers, How many IO threads to prefetch examples.
            tr_data_fn: a function that transforms input data.
            tr_data_paras: A dict of extra parameters to pass to tr_data_fn.
            l2_regs: An array, each element represents how much the linear classifier weights should be penalized.
            validate_set: (data, labels) with dtype float32. The data set (numpy arrays) used to choose the best l2_reg.
                Sampled from whole validate set if necessary. If line_search is False, this argument is simply ignored.
            line_search: Boolean argument indicating whether to do a line search over l2_regs.

        Returns: Weights and biases fit on the given data set, where biases are appended as the last row.

        """
        logging.info('Entering linear classifier ...')

        batch_size = data_pipeline.batch_size
        reader = data_pipeline.reader
        num_classes = reader.num_classes
        feature_names = reader.feature_names
        feature_sizes = reader.feature_sizes
        raw_feature_size = sum(feature_sizes)
        feature_size = raw_feature_size
        logging.info('Linear regression uses {} features with dims {}.'.format(
            feature_names, feature_sizes))

        if line_search:
            # Both l2_regs and validate_set are required.
            if l2_regs is None:
                raise ValueError('There is no l2_regs to do line search.')
            else:
                logging.info('l2_regs is {}.'.format(l2_regs))

            if validate_set is None:
                raise ValueError(
                    'There is no validate_set to do line search for l2_reg.')
            else:
                validate_data, validate_labels = validate_set

        else:
            # Simply fit the training set. Make l2_regs have only one element. And ignore validate_set.
            if l2_regs is None:
                l2_regs = [0.001]
            elif isinstance(l2_regs, float):
                l2_regs = [l2_regs]
            elif isinstance(l2_regs, list) or isinstance(l2_regs, tuple):
                l2_regs = l2_regs[:1]
            logging.info('No line search, l2_regs is {}.'.format(l2_regs))
            if validate_set is None:
                # Important! To make the graph construction successful.
                validate_data = np.zeros([1, raw_feature_size],
                                         dtype=np.float32)
                validate_labels = np.zeros([1, num_classes], dtype=np.float32)
            else:
                validate_data, validate_labels = validate_set

        # Check validate data and labels shape.
        logging.info(
            'validate set: data has shape {}, labels has shape {}.'.format(
                validate_data.shape, validate_labels.shape))
        if (validate_data.shape[-1] != raw_feature_size) or (
                validate_labels.shape[-1] != num_classes):
            raise ValueError(
                'validate set shape does not conform to the training set.')

        # TO BE CAUTIOUS! THE FOLLOWING MAY HAVE TO DEAL WITH FEATURE SIZE CHANGE.
        # Check extra data transform function arguments.
        # If transform changes the features size, change it.
        if tr_data_fn is not None:
            if tr_data_paras is None:
                tr_data_paras = {}
            else:
                if ('reshape' in tr_data_paras) and (tr_data_paras['reshape']
                                                     is True):
                    feature_size = tr_data_paras['size']
                    logging.warn(
                        'Data transform changes the features size to {}.'.
                        format(feature_size))

        # Method - append an all-one col to X by using block matrix multiplication (all-one col is treated as a block).
        # Create the graph to traverse all data once.
        with tf.Graph().as_default() as graph:
            global_step = tf.Variable(initial_value=0,
                                      trainable=False,
                                      dtype=tf.int32,
                                      name='global_step')
            global_step_inc_op = tf.assign_add(global_step, 1)

            # X.transpose * X
            norm_equ_1_initializer = tf.placeholder(
                tf.float32, shape=[feature_size, feature_size])
            norm_equ_1 = tf.Variable(initial_value=norm_equ_1_initializer,
                                     collections=[],
                                     name='X_Tr_X')

            # X.transpose * Y
            norm_equ_2_initializer = tf.placeholder(
                tf.float32, shape=[feature_size, num_classes])
            norm_equ_2 = tf.Variable(initial_value=norm_equ_2_initializer,
                                     collections=[],
                                     name='X_Tr_Y')

            example_count = tf.Variable(initial_value=0.0,
                                        name='example_count')
            features_sum = tf.Variable(initial_value=tf.zeros([feature_size]),
                                       name='features_sum')
            labels_sum = tf.Variable(initial_value=tf.zeros([num_classes]),
                                     name='labels_sum')

            id_batch, raw_features_batch, labels_batch, num_frames_batch = (
                get_input_data_tensors(data_pipeline,
                                       num_epochs=1,
                                       name_scope='input'))
            if tr_data_fn is None:
                transformed_features_batch = tf.identity(raw_features_batch)
            else:
                transformed_features_batch = tr_data_fn(
                    raw_features_batch, **tr_data_paras)

            with tf.name_scope('batch_increment'):
                transformed_features_batch_tr = tf.matrix_transpose(
                    transformed_features_batch, name='X_Tr')
                float_labels_batch = tf.cast(labels_batch, tf.float32)
                batch_norm_equ_1 = tf.matmul(transformed_features_batch_tr,
                                             transformed_features_batch,
                                             name='batch_norm_equ_1')
                # batch_norm_equ_1 = tf.add_n(tf.map_fn(lambda x: tf.einsum('i,j->ij', x, x),
                #                                       transformed_features_batch_tr), name='X_Tr_X')
                batch_norm_equ_2 = tf.matmul(transformed_features_batch_tr,
                                             float_labels_batch,
                                             name='batch_norm_equ_2')
                batch_example_count = tf.cast(
                    tf.shape(transformed_features_batch)[0],
                    tf.float32,
                    name='batch_example_count')
                batch_features_sum = tf.reduce_sum(transformed_features_batch,
                                                   axis=0,
                                                   name='batch_features_sum')
                batch_labels_sum = tf.reduce_sum(float_labels_batch,
                                                 axis=0,
                                                 name='batch_labels_sum')

            with tf.name_scope('update_ops'):
                update_norm_equ_1_op = tf.assign_add(norm_equ_1,
                                                     batch_norm_equ_1)
                update_norm_equ_2_op = tf.assign_add(norm_equ_2,
                                                     batch_norm_equ_2)
                update_example_count = tf.assign_add(example_count,
                                                     batch_example_count)
                update_features_sum = tf.assign_add(features_sum,
                                                    batch_features_sum)
                update_labels_sum = tf.assign_add(labels_sum, batch_labels_sum)

            with tf.control_dependencies([
                    update_norm_equ_1_op, update_norm_equ_2_op,
                    update_example_count, update_features_sum,
                    update_labels_sum, global_step_inc_op
            ]):
                update_equ_non_op = tf.no_op(name='unified_update_op')

            with tf.name_scope('solution'):
                # After all data being handled, compute weights.
                l2_reg_ph = tf.placeholder(tf.float32, shape=[])
                l2_reg_term = tf.diag(tf.fill([feature_size], l2_reg_ph),
                                      name='l2_reg')
                # X.transpose * X + lambda * Id, where d is the feature dimension.
                norm_equ_1_with_reg = tf.add(norm_equ_1, l2_reg_term)

                # Concat other blocks to form the final norm equation terms.
                final_norm_equ_1_top = tf.concat(
                    [norm_equ_1_with_reg,
                     tf.expand_dims(features_sum, 1)], 1)
                final_norm_equ_1_bot = tf.concat(
                    [features_sum,
                     tf.expand_dims(example_count, 0)], 0)
                final_norm_equ_1 = tf.concat([
                    final_norm_equ_1_top,
                    tf.expand_dims(final_norm_equ_1_bot, 0)
                ],
                                             0,
                                             name='norm_equ_1')
                final_norm_equ_2 = tf.concat(
                    [norm_equ_2, tf.expand_dims(labels_sum, 0)],
                    0,
                    name='norm_equ_2')

                # The last row is the biases.
                weights_biases = tf.matrix_solve(final_norm_equ_1,
                                                 final_norm_equ_2,
                                                 name='weights_biases')

                weights = weights_biases[:-1]
                biases = weights_biases[-1]

            with tf.name_scope('validate_loss'):
                validate_x_pl = tf.placeholder(tf.float32,
                                               shape=[None, raw_feature_size],
                                               name='validate_data')

                validate_y_pl = tf.placeholder(tf.float32,
                                               shape=[None, num_classes],
                                               name='validate_labels')

                if tr_data_fn is None:
                    validate_x_transformed = tf.identity(validate_x_pl)
                else:
                    validate_x_transformed = tr_data_fn(validate_x_pl,
                                                        reuse=True,
                                                        **tr_data_paras)

                predictions = tf.matmul(validate_x_transformed,
                                        weights) + biases
                loss = tf.sqrt(tf.reduce_mean(
                    tf.squared_difference(predictions, validate_y_pl)),
                               name='rmse')
                # pred_labels = tf.greater_equal(predictions, 0.0, name='pred_labels')

            summary_op = tf.summary.merge_all()

            init_op = tf.group(tf.global_variables_initializer(),
                               tf.local_variables_initializer(),
                               name='init_glo_loc_var')

        sess = tf.Session(graph=graph)
        # Initialize variables.
        sess.run(init_op)
        sess.run(
            [norm_equ_1.initializer, norm_equ_2.initializer],
            feed_dict={
                norm_equ_1_initializer:
                np.zeros([feature_size, feature_size], dtype=np.float32),
                norm_equ_2_initializer:
                np.zeros([feature_size, num_classes], dtype=np.float32)
            })

        summary_writer = tf.summary.FileWriter(self.logdir, graph=sess.graph)

        # Start input enqueue threads.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        try:
            while not coord.should_stop():
                _, summary, global_step_val = sess.run(
                    [update_equ_non_op, summary_op, global_step])
                summary_writer.add_summary(summary,
                                           global_step=global_step_val)
        except tf.errors.OutOfRangeError:
            logging.info(
                'Finished normal equation terms computation -- one epoch done.'
            )
        finally:
            # When done, ask the threads to stop.
            coord.request_stop()
            summary_writer.close()

        # Wait for threads to finish.
        coord.join(threads)

        # Do line search.
        best_weights_val, best_biases_val = None, None
        best_l2_reg = 0
        min_loss = np.PINF

        for l2_reg in l2_regs:
            # Compute regularized weights.
            weights_val, biases_val = sess.run([weights, biases],
                                               feed_dict={l2_reg_ph: l2_reg})
            # Compute validation loss.
            num_validate_videos = validate_data.shape[0]
            split_indices = np.linspace(
                0,
                num_validate_videos + 1,
                num=max(num_validate_videos // batch_size + 1, 2),
                dtype=np.int32)
            loss_vals = []
            for i in range(len(split_indices) - 1):
                start_ind = split_indices[i]
                end_ind = split_indices[i + 1]

                # Avoid re-computing weights and biases (Otherwise, l2_reg_ph is necessary).
                ith_loss_val = sess.run(loss,
                                        feed_dict={
                                            validate_x_pl:
                                            validate_data[start_ind:end_ind],
                                            validate_y_pl:
                                            validate_labels[start_ind:end_ind],
                                            weights:
                                            weights_val,
                                            biases:
                                            biases_val
                                        })

                loss_vals.append(ith_loss_val * (end_ind - start_ind))

            validate_loss_val = sum(loss_vals) / num_validate_videos

            logging.info('l2_reg {} leads to rmse loss {}.'.format(
                l2_reg, validate_loss_val))
            if validate_loss_val < min_loss:
                best_weights_val, best_biases_val = weights_val, biases_val
                min_loss = validate_loss_val
                best_l2_reg = l2_reg

        sess.close()

        if (not line_search) and (validate_set is None):
            min_loss = None

        logging.info('The best l2_reg is {} with rmse loss {}.'.format(
            best_l2_reg, min_loss))
        logging.info('Exiting linear classifier ...')

        self.weights = best_weights_val
        self.biases = best_biases_val
        self.rmse = min_loss
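
# A compact NumPy sketch (separate from the class above) of the same closed
# form: ridge regression solved with the bias handled by appending an all-ones
# column to X, so the bias ends up as the last row of the solution, matching
# the block construction in the graph.
import numpy as np


def ridge_with_bias(X, Y, l2_reg):
    n, d = X.shape
    Xb = np.hstack([X, np.ones((n, 1), dtype=X.dtype)])
    A = Xb.T.dot(Xb)
    A[:d, :d] += l2_reg * np.eye(d)   # the bias row/column is not regularized
    W = np.linalg.solve(A, Xb.T.dot(Y))
    return W[:-1], W[-1]              # weights (d x k), biases (k,)
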
Example 10
def _get_data_placeholders(config, split):
    """Returns data placeholder to feed the dataset.

  Args:
    config: an instance of ads_mem_examples_pb2.AdsMemExamples.

  Returns:
    data_placeholders: a dict mapping from name to placeholders.
    feed_dict: a dict mapping from name to data.
  """
    # Create placeholders.
    data_placeholders = {
        'image_id':
        tf.placeholder(tf.string, [None]),
        'img_features':
        tf.placeholder(tf.float32, [None, config.feature_dimensions]),
        'roi_features':
        tf.placeholder(
            tf.float32,
            [None, config.number_of_regions, config.feature_dimensions]),
        'number_of_statements':
        tf.placeholder(tf.int32, [None]),
        'statement_strings':
        tf.placeholder(
            tf.int32, [None, config.max_stmts_per_image, config.max_stmt_len]),
        'statement_lengths':
        tf.placeholder(tf.int32, [None, config.max_stmts_per_image]),
        'number_of_symbols':
        tf.placeholder(tf.int32, [None]),
        'symbols':
        tf.placeholder(tf.int32, [None, config.max_symbols_per_image]),
    }
    if not config.use_single_densecap:
        data_placeholders.update({
            'number_of_densecaps':
            tf.placeholder(tf.int32, [None]),
            'densecap_strings':
            tf.placeholder(tf.int32, [
                None, config.max_densecaps_per_image, config.max_densecap_len
            ]),
            'densecap_lengths':
            tf.placeholder(tf.int32, [None, config.max_densecaps_per_image]),
        })
    else:
        data_placeholders.update({
            'number_of_densecaps':
            tf.placeholder(tf.int32, [None]),
            'densecap_strings':
            tf.placeholder(tf.int32, [
                None, 1,
                config.max_densecaps_per_image * config.max_densecap_len
            ]),
            'densecap_lengths':
            tf.placeholder(tf.int32, [None, 1]),
        })

    if split != 'train':
        data_placeholders.update({
            'eval_statement_strings':
            tf.placeholder(tf.int32, [
                None, config.number_of_val_stmts_per_image, config.max_stmt_len
            ]),
            'eval_statement_lengths':
            tf.placeholder(tf.int32,
                           [None, config.number_of_val_stmts_per_image]),
        })

    # Load annotations and image features.
    assert tf.gfile.Exists(config.image_feature_path)
    assert tf.gfile.Exists(config.region_feature_path)
    assert tf.gfile.Exists(config.statement_vocab_path)
    assert tf.gfile.Exists(config.statement_annot_path)
    assert tf.gfile.Exists(config.densecap_vocab_path)
    assert tf.gfile.Exists(config.densecap_annot_path)
    assert tf.gfile.Exists(config.symbol_annot_path)
    assert tf.gfile.Exists(config.symbol_cluster_path)

    # Image features.
    start = time.time()
    image_features = np.load(config.image_feature_path).item()
    region_features = np.load(config.region_feature_path).item()
    logging.info(
        'Image features are loaded, cost=%is, img_len=%i, roi_len=%i.',
        time.time() - start, len(image_features), len(region_features))

    # Action-reason annotations.
    start = time.time()
    stmt_annots = load_action_reason_annots(config.statement_annot_path)
    logging.info('Annotations are loaded, cost=%is, path=%s, len=%i.',
                 time.time() - start, config.statement_annot_path,
                 len(stmt_annots))

    stmt_vocab = load_vocab(config.statement_vocab_path)
    logging.info('Load vocab from %s, vocab_size=%i',
                 config.statement_vocab_path, len(stmt_vocab))

    # Densecap annotations.
    start = time.time()
    dense_annots = load_densecap_annots(config.densecap_annot_path,
                                        config.max_densecaps_per_image)
    logging.info('Dense annotations are loaded, cost=%is, path=%s, len=%i.',
                 time.time() - start, config.densecap_annot_path,
                 len(dense_annots))

    dense_vocab = load_vocab(config.densecap_vocab_path)
    logging.info('Load vocab from %s, vocab_size=%i',
                 config.densecap_vocab_path, len(dense_vocab))

    # Symbol annotations.
    start = time.time()
    symbol_annots = load_raw_annots(config.symbol_annot_path)
    logging.info('Symbol annotations are loaded, cost=%is, path=%s, len=%i.',
                 time.time() - start, config.symbol_annot_path,
                 len(symbol_annots))
    word_to_id, id_to_symbol = load_symbol_cluster(config.symbol_cluster_path)

    # Initialize feed_dict.
    feed_dict = {
        'image_id': [],
        'img_features': [],
        'roi_features': [],
        'number_of_statements': [],
        'statement_strings': [],
        'statement_lengths': [],
        'number_of_densecaps': [],
        'densecap_strings': [],
        'densecap_lengths': [],
        'number_of_symbols': [],
        'symbols': [],
    }
    if split != 'train':
        feed_dict.update({
            'eval_statement_strings': [],
            'eval_statement_lengths': [],
        })

    total_images = total_statements = 0

    # Split training data for validation purpose.
    stmt_annots = stmt_annots.items()
    if split == 'valid':
        stmt_annots = stmt_annots[:config.number_of_val_examples]
    elif split == 'train':
        stmt_annots = stmt_annots[config.number_of_val_examples:]
    logging.info('Processing %i %s records...', len(stmt_annots), split)

    if config.debug:
        logging.warn('DEBUG MODE!!!!!!!')
        stmt_annots = stmt_annots[:100]

    for index, (image_id, annot) in enumerate(stmt_annots):
        # Pad action-reason.
        (number_of_statements, statement_strings,
         statement_lengths) = encode_and_pad_sentences(
             stmt_vocab, annot['pos_examples'], config.max_stmts_per_image,
             config.max_stmt_len)

        # Pad densecap.
        if not config.use_single_densecap:
            (number_of_densecaps, densecap_strings,
             densecap_lengths) = encode_and_pad_sentences(
                 dense_vocab, dense_annots[image_id],
                 config.max_densecaps_per_image, config.max_densecap_len)
        else:  # Concatenate all densecaps to form a single sentence.
            dense_string_concat = ' '.join(dense_annots[image_id])
            (number_of_densecaps, densecap_strings,
             densecap_lengths) = encode_and_pad_sentences(
                 dense_vocab, [dense_string_concat], 1,
                 config.max_densecap_len * config.max_densecaps_per_image)

        # Pad symbols.
        symbols = symbol_annots.get(image_id, [])
        number_of_symbols = len(symbols)
        symbols += [0] * config.max_symbols_per_image
        symbols = symbols[:config.max_symbols_per_image]

        feed_dict['image_id'].append(image_id)
        feed_dict['img_features'].append(image_features[image_id])
        feed_dict['roi_features'].append(region_features[image_id])
        feed_dict['number_of_statements'].append(
            np.array(number_of_statements, dtype=np.int32))
        feed_dict['statement_strings'].append(statement_strings)
        feed_dict['statement_lengths'].append(statement_lengths)
        feed_dict['number_of_densecaps'].append(
            np.array(number_of_densecaps, dtype=np.int32))
        feed_dict['densecap_strings'].append(densecap_strings)
        feed_dict['densecap_lengths'].append(densecap_lengths)
        feed_dict['number_of_symbols'].append(
            np.array(number_of_symbols, dtype=np.int32))
        feed_dict['symbols'].append(np.array(symbols))

        if split != 'train':
            # Pad strings for evaluation purpose.
            (number_of_eval_statements, eval_statement_strings,
             eval_statement_lengths) = encode_and_pad_sentences(
                 stmt_vocab, annot['all_examples'],
                 config.number_of_val_stmts_per_image, config.max_stmt_len)
            assert number_of_eval_statements == config.number_of_val_stmts_per_image
            feed_dict['eval_statement_strings'].append(eval_statement_strings)
            feed_dict['eval_statement_lengths'].append(eval_statement_lengths)

        total_images += 1
        total_statements += number_of_statements

        if index % 1000 == 0:
            logging.info('Load on %i/%i', index, len(stmt_annots))

    logging.info('Load %i images with %i statements.', total_images,
                 total_statements)

    # Legacy: GPU or CPU mode.
    if config.data_provider_mode == ads_mem_examples_pb2.AdsMemExamples.FROM_CPU:
        for k, v in feed_dict.items():
            feed_dict[data_placeholders[k]] = np.stack(v)
            del feed_dict[k]
        return data_placeholders, feed_dict


    # elif config.data_provider_mode == ads_mem_examples_pb2.AdsMemExamples.FROM_GPU:
    #     data_tensors = {}
    #     for k, v in feed_dict.items():
    #         data_tensors[k] = tf.constant(np.stack(v))
    #     return data_tensors, {}

    raise ValueError('Unknown mode %i.' % config.data_provider_mode)
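
# Hypothetical usage (config loading and the session setup are assumptions): in
# the FROM_CPU branch the returned feed_dict is already keyed by the placeholder
# tensors, so the pair can be passed straight to Session.run.
placeholders, feed_dict = _get_data_placeholders(config, split='valid')
with tf.Session() as sess:
    image_ids = sess.run(placeholders['image_id'], feed_dict=feed_dict)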