Code Example #1
    def __init__(self, cnn_model_path, source_path, target_path, vocab_path,
                 sent_len, labeled_save_dir):
        """

        :param cnn_model_path: Path to a trained cnn model.
        :param source_path: Path to instance data, the latter part of which will be labeled during active learning.
        :param target_path: Path to labels for already labeled part of the data.
        :param vocab_path: Path to vocab file.
        :param labeled_save_dir: Directory to which the labeled files will be stored.
        """
        unlabeled_data = util.read_data_unlabeled_part(source_path,
                                                       target_path,
                                                       sent_len,
                                                       shuffle=False)
        self.unlabeled_data = np.array(unlabeled_data)
        self.data_size = self.unlabeled_data.shape[0]

        self.labeled_data, self.labeled_result = util.read_data_labeled_part(
            source_path, target_path, sent_len, shuffle=False)

        sentence_indices_input = self.unlabeled_data[:, :-2]
        self.vocab_path = vocab_path
        _, rev_vocab = preprocessing_util.initialize_vocabulary(vocab_path)
        self.sentence_input = preprocessing_util.indices_to_sentences(
            sentence_indices_input, rev_vocab)
        self.kp_indices_input = self.unlabeled_data[:, -2:]

        for i, sentence in enumerate(self.sentence_input):
            # Label the key phrases of interest in the current sentence with *.
            sentence[self.kp_indices_input[i, 0]] += '*'
            sentence[self.kp_indices_input[i, 1]] += '*'

        self.update_labeled_save_dir(labeled_save_dir)

        label_config = util.load_from_dump(
            os.path.join(cnn_model_path, 'flags.cPickle'))
        label_config['train_dir'] = cnn_model_path
        _, predicted_label = label(self.unlabeled_data, config=label_config)

        assert predicted_label.shape[0] == self.data_size

        predicted_label_exp = np.exp(predicted_label)
        predicted_label_softmax = predicted_label_exp / np.sum(
            predicted_label_exp, axis=1, keepdims=True)
        # Entropy = -sum(p * log p), so this is the negative of the entropy;
        # dropping the sign lets an ascending argsort put the most uncertain
        # instances first.
        predicted_label_entropy = np.sum(
            np.multiply(predicted_label_softmax,
                        np.log(predicted_label_softmax)), axis=1)

        # The following ranks which question should be asked first. It uses
        # entropy, but there might be some implementation errors.
        self.predicted_label_entropy_argsort = np.argsort(
            predicted_label_entropy, axis=0).tolist()

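The ranking step above is compact enough to check in isolation. Below is a minimal, self-contained sketch (numpy only; the logits are made up for illustration) of the same softmax-then-negative-entropy ordering:

import numpy as np

# Hypothetical logits for four instances over the three classes
# [A is-a B, B is-a A, Neither].
logits = np.array([[4.0, 0.1, 0.2],   # confident
                   [0.5, 0.4, 0.6],   # near-uniform, very uncertain
                   [3.0, 2.9, 0.1],   # two-way tie
                   [0.1, 0.1, 5.0]])  # very confident

softmax = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)
# sum(p * log p) is the negative entropy, so an ascending argsort puts
# the highest-entropy (most uncertain) instances first.
neg_entropy = np.sum(softmax * np.log(softmax), axis=1)
print(np.argsort(neg_entropy).tolist())
# -> [1, 2, 0, 3]: the near-uniform and tied rows come first
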
Code Example #2
def main(argv=None):
    restore_param = util.load_from_dump(
        os.path.join(FLAGS.train_dir, 'flags.cPickle'))
    restore_param['train_dir'] = FLAGS.train_dir

    source_path = os.path.join(restore_param['data_dir'],
                               'test_cs_unlabeled_data_combined.txt')
    target_path = os.path.join(restore_param['data_dir'],
                               'test_cs_labels_combined.txt')
    vocab_path = os.path.join(restore_param['data_dir'],
                              'test_cs_vocab_combined')
    data = util.read_data_unlabeled_part(source_path, target_path,
                                         restore_param['sent_len'])

    x_input, actual_output = label(data, restore_param)

    actual_output_exp = np.exp(actual_output)
    actual_output_softmax = actual_output_exp / np.sum(
        actual_output_exp, axis=1, keepdims=True)
    actual_output_argmax = np.argmax(actual_output_softmax, axis=1)
    actual_output_softmax_sorted = np.argsort(
        -np.max(actual_output_softmax[..., :2], axis=1)).tolist()

    sentence_indices_input = x_input[:, :-2]
    _, rev_vocab = preprocessing_util.initialize_vocabulary(vocab_path)
    sentence_input = preprocessing_util.indices_to_sentences(
        sentence_indices_input, rev_vocab)

    kp_indices_input = x_input[:, -2:]

    print('Type\tSentence\t\tProbability [A is-a B, B is-a A, Neither]')

    for sentence_i in actual_output_softmax_sorted:
        sentence = sentence_input[sentence_i]
        # Label the key phrases of interest in the current sentence with *.
        sentence[kp_indices_input[sentence_i, 1]] += '*'
        sentence[kp_indices_input[sentence_i, 0]] += '*'
        if actual_output_argmax[sentence_i] == 2:
            # Instances are sorted by is-a confidence, so stop at the first
            # one predicted as 'Neither'.
            break
        if actual_output_argmax[sentence_i] == 0:
            current_type = 'A is-a B'
        elif actual_output_argmax[sentence_i] == 1:
            current_type = 'B is-a A'

        print('%s\t%s\t\t%s\t' % (current_type, ' '.join(sentence),
                                  str(actual_output_softmax[sentence_i])))
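The sort used here, an argsort over the negated maximum of the first two softmax columns, visits instances from the most to the least confident "is-a" prediction, and the break on the first "Neither" keeps only confident is-a instances in the printout. A small check of the ordering with made-up probabilities:

import numpy as np

# Hypothetical softmax rows over [A is-a B, B is-a A, Neither].
softmax = np.array([[0.2, 0.3, 0.5],
                    [0.9, 0.05, 0.05],
                    [0.1, 0.6, 0.3]])
# Negating turns argsort's ascending order into descending confidence.
order = np.argsort(-np.max(softmax[..., :2], axis=1)).tolist()
print(order)  # [1, 2, 0]: most confident is-a predictions first
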
Code Example #3
File: eval.py  Project: jerryli27/NewMaster
def main(argv=None):
    restore_param = util.load_from_dump(os.path.join(FLAGS.train_dir, 'flags.cPickle'))
    restore_param['train_dir'] = FLAGS.train_dir

    source_path = os.path.join(restore_param['data_dir'], 'test_cs_unlabeled_data_combined.txt')
    target_path = os.path.join(restore_param['data_dir'], 'test_cs_labels_combined.txt')
    vocab_path = os.path.join(restore_param['data_dir'], 'test_cs_vocab_combined')
    _, data = util.read_data(source_path, target_path, restore_param['sent_len'],
                             train_size=restore_param['train_size'], hide_key_phrases=restore_param.get('hide_key_phrases', False))

    pre, rec, x_input, expected_output, actual_output = evaluate(data, restore_param)

    actual_output_exp = np.exp(actual_output)
    actual_output_softmax = actual_output_exp / np.sum(actual_output_exp, axis=1, keepdims=True)

    output_difference = np.sum(np.abs(actual_output_softmax - expected_output), axis=1)


    sentence_indices_input = x_input[:, :-2]
    _, rev_vocab = preprocessing_util.initialize_vocabulary(vocab_path)
    sentence_input = preprocessing_util.indices_to_sentences(sentence_indices_input, rev_vocab)

    kp_indices_input = x_input[:, -2:]

    print('Diff\tType\tSentence\t\tExpected Score (A is-a B, B is-a A, Neither)\tActual Score')
    for sentence_i, sentence in enumerate(sentence_input):
        # Label the key phrases of interest in the current sentence with *.
        sentence[kp_indices_input[sentence_i, 1]] += '*'
        sentence[kp_indices_input[sentence_i, 0]] += '*'
        current_type = 'Neither'
        if expected_output[sentence_i, 0] == 1:
            current_type = 'A is-a B'
        elif expected_output[sentence_i, 1] == 1:
            current_type = 'B is-a A'

        print('%.3f\t%s\t%s\t\t%s\t%s\t'
              % (output_difference[sentence_i], current_type, ' '.join(sentence), str(expected_output[sentence_i]), str(actual_output_softmax[sentence_i])))

    util.dump_to_file(os.path.join(FLAGS.train_dir, 'results.cPickle'), {'precision': pre, 'recall': rec})
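The output_difference column above is the L1 distance between each predicted softmax row and the one-hot expected label: 0 means a perfect match and 2 is the worst case. A tiny illustration with made-up values:

import numpy as np

expected = np.array([[1, 0, 0],
                     [0, 0, 1]])         # one-hot gold labels
predicted = np.array([[0.7, 0.2, 0.1],
                      [0.5, 0.4, 0.1]])  # model softmax outputs
diff = np.sum(np.abs(predicted - expected), axis=1)
print(diff)  # -> approximately [0.6, 1.8]: the second prediction disagrees badly
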
Code Example #4
def main(argv=None):
    # Flags are defined in train.py
    if FLAGS.hide_key_phrases:
        raise AssertionError(
            "Please turn hide_key_phrases off for co-training.")

    # First generate cross validation data if it does not exist.

    if not os.path.exists(FLAGS.cross_validation_dir):
        print("Cross validation data folder does not exist. Creating one.")
        os.mkdir(FLAGS.cross_validation_dir)
        source_path = os.path.join(
            FLAGS.data_dir, 'test_cs_unlabeled_data_combined_inferred.txt')
        target_path = os.path.join(FLAGS.data_dir,
                                   'test_cs_labels_combined_inferred.txt')
        cross_validation_split(source_path,
                               target_path,
                               FLAGS.cross_validation_dir,
                               fold_number=FLAGS.cross_validation_fold)

    for cross_val_round_i in range(FLAGS.cross_validation_fold):

        if not os.path.exists(FLAGS.train_dir):
            os.mkdir(FLAGS.train_dir)
        latest_sentence_checkpoint_dir = None
        latest_pair_checkpoint_dir = None
        latest_checkpoint_dir = None
        used_unlabeled_kp_pair_set = set()
        # The validation set is separate from the test and training set from the very beginning.
        val_source_path = os.path.join(
            FLAGS.cross_validation_dir,
            "cross_validation_val_%d_data.txt" % (cross_val_round_i))
        val_target_path = os.path.join(
            FLAGS.cross_validation_dir,
            "cross_validation_val_%d_labels.txt" % (cross_val_round_i))
        val_labeled_data, val_labeled_result = util.read_data_labeled_part(
            val_source_path, val_target_path, FLAGS.sent_len, shuffle=False)
        # For legacy code reasons, I have to add a None column to the training data...
        val_data = np.array(
            zip(val_labeled_data, val_labeled_result,
                [None] * val_labeled_result.shape[0]))
        val_precision = []
        val_recall = []
        val_pr_auc = []  # Precision recall area under the curve.

        for round_i in range(FLAGS.max_co_training_rounds):
            # load dataset

            if round_i == 0:

                source_path = os.path.join(
                    FLAGS.cross_validation_dir,
                    "cross_validation_train_%d_data.txt" % (cross_val_round_i))
                target_path = os.path.join(
                    FLAGS.cross_validation_dir,
                    "cross_validation_train_%d_labels.txt" %
                    (cross_val_round_i))
            else:
                source_path = os.path.join(
                    latest_checkpoint_dir,
                    'test_cs_unlabeled_data_combined_round_%d.txt' % (round_i))
                target_path = os.path.join(
                    latest_checkpoint_dir,
                    'test_cs_labels_combined_round_%d.txt' % (round_i))
            train_data, test_data = util.read_data(source_path,
                                                   target_path,
                                                   FLAGS.sent_len,
                                                   attention_path=None,
                                                   train_size=FLAGS.train_size,
                                                   hide_key_phrases=False)
            # TODO: possibly also gather all sentences sharing the same key phrase pair here.
            train_data_hide_kp, test_data_hide_kp = util.read_data(
                source_path,
                target_path,
                FLAGS.sent_len,
                attention_path=None,
                train_size=FLAGS.train_size,
                hide_key_phrases=True)

            print("Round %d. Reading labeled data from previous round." %
                  (round_i))
            labeled_data, labeled_result = util.read_data_labeled_part(
                source_path, target_path, FLAGS.sent_len, shuffle=False)
            unlabeled_data = util.read_data_unlabeled_part(
                source_path,
                target_path,
                FLAGS.sent_len,
                shuffle=False,
                hide_key_phrases=False)
            unlabeled_data_hide_kp = util.read_data_unlabeled_part(
                source_path,
                target_path,
                FLAGS.sent_len,
                shuffle=False,
                hide_key_phrases=True)

            # For each round, we draw a fresh set of unlabeled data and label them using the trained classifier.
            current_unlabeled_data, used_unlabeled_kp_pair_set, current_drawn_indices = draw_from_unused_unlabeled(
                unlabeled_data, used_unlabeled_kp_pair_set,
                FLAGS.test_size_per_round)
            current_unlabeled_data_hide_kp = [
                unlabeled_data_hide_kp[i] for i in current_drawn_indices
            ]
            # Unlike the older draw_from_unlabeled, this version keeps track of
            # which instances have already been used and throws them away.

            additional_label_index = []
            additional_label_result = []

            for classifier_i in range(2):
                additional_label_index.append([])
                additional_label_result.append([])
                if _is_sentence_train(classifier_i):
                    train.train(train_data_hide_kp, test_data_hide_kp)
                    latest_sentence_checkpoint_dir = util.get_latest_checkpoint_dir(
                        FLAGS.train_dir)
                else:
                    train_kp_pair_classifier.train(train_data, test_data)
                    latest_pair_checkpoint_dir = util.get_latest_checkpoint_dir(
                        FLAGS.train_dir)

                # Refresh the latest checkpoint.
                latest_checkpoint_dir = util.get_latest_checkpoint_dir(
                    FLAGS.train_dir)
                restore_param = util.load_from_dump(
                    os.path.join(latest_checkpoint_dir, 'flags.cPickle'))
                restore_param['train_dir'] = latest_checkpoint_dir
                if _is_sentence_train(classifier_i):
                    x_input, actual_output = label.label(
                        current_unlabeled_data_hide_kp, restore_param)
                else:
                    x_input, actual_output = train_kp_pair_classifier.label(
                        current_unlabeled_data, restore_param)

                actual_output_exp = np.exp(actual_output)
                actual_output_softmax = actual_output_exp / np.sum(
                    actual_output_exp, axis=1, keepdims=True)
                actual_output_argmax = np.argmax(actual_output_softmax, axis=1)
                # If we do not want "Neither" relation, then calculate max on only the first 2 dimensions.
                # sentence_i_list = np.argsort(-np.max(actual_output_softmax[..., :2], axis=1)).tolist()
                if FLAGS.use_product_method:
                    sentence_i_list = range(actual_output_softmax.shape[0])
                else:
                    sentence_i_list = np.argsort(
                        -np.max(actual_output_softmax, axis=1)).tolist()

                # We need the version with key phrases not replaced in order to print things correctly.
                sentence_indices_input = current_unlabeled_data[:, :-2]
                vocab_path = os.path.join(restore_param['data_dir'],
                                          'test_cs_vocab_combined')
                _, rev_vocab = preprocessing_util.initialize_vocabulary(
                    vocab_path)
                sentence_input = preprocessing_util.indices_to_sentences(
                    sentence_indices_input, rev_vocab, ignore_pad=True)

                kp_indices_input = current_unlabeled_data[:, -2:]

                with open(
                        os.path.join(latest_checkpoint_dir,
                                     'added_instances.tsv'),
                        "w") as inferred_instances_f:

                    inferred_instances_f.write(
                        'Type\tSentence\t\tProbability [A is-a B, B is-a A, Neither]\n'
                    )
                    additional_label_num_positive = 0
                    additional_label_num_negative = 0
                    for sentence_i in sentence_i_list:
                        sentence = sentence_input[sentence_i]
                        # Label the key phrases of interest in the current sentence with *.
                        sentence[kp_indices_input[sentence_i, 1]] += '*'
                        sentence[kp_indices_input[sentence_i, 0]] += '*'
                        if actual_output_argmax[sentence_i] == 2:
                            current_type = 'Neither'
                            if (not FLAGS.use_product_method and
                                    additional_label_num_negative >= FLAGS.co_training_has_relation_num_label_negative):
                                continue
                            else:
                                additional_label_num_negative += 1
                        elif actual_output_argmax[sentence_i] == 0:
                            current_type = 'A is-a B'
                            if (not FLAGS.use_product_method and
                                    additional_label_num_positive >= FLAGS.co_training_has_relation_num_label_positive):
                                continue
                            else:
                                additional_label_num_positive += 1
                        elif actual_output_argmax[sentence_i] == 1:
                            current_type = 'B is-a A'
                            if (not FLAGS.use_product_method and
                                    additional_label_num_positive >= FLAGS.co_training_has_relation_num_label_positive):
                                continue
                            else:
                                additional_label_num_positive += 1

                        inferred_instances_f.write(
                            '%s\t%s\t\t%s\n' %
                            (current_type, ' '.join(sentence),
                             str(actual_output_softmax[sentence_i])))

                        if not FLAGS.use_product_method:
                            additional_label_index[classifier_i].append(
                                sentence_i)
                            # If use_product_method is off, then the result is the label.
                            current_additional_label_result = np.zeros((3, ))
                            current_additional_label_result[
                                actual_output_argmax[sentence_i]] = 1
                            additional_label_result[classifier_i].append(
                                current_additional_label_result)
                            if additional_label_num_positive >= FLAGS.co_training_has_relation_num_label_positive and \
                                additional_label_num_negative >= FLAGS.co_training_has_relation_num_label_negative:
                                break
                        else:
                            # If use_product_method is on, then the result is the output softmax, i.e. probability.
                            current_additional_label_result = actual_output_softmax[
                                sentence_i]
                            additional_label_result[classifier_i].append(
                                current_additional_label_result)

                print(
                    "Number of additional data points added through co-training classifier %d"
                    ": %d positives and %d negatives out of %d unlabeled instances."
                    % (classifier_i, additional_label_num_positive,
                       additional_label_num_negative, len(sentence_i_list)))

            # Check for conflicts and merge the additional labels produced by the two classifiers.
            if not FLAGS.use_product_method:
                merged_additional_label_index, merged_additional_label_result = check_conflict_and_merge(
                    additional_label_index, additional_label_result)
            else:
                merged_additional_label_index, merged_additional_label_result = compute_product_and_save(
                    additional_label_result, latest_checkpoint_dir,
                    sentence_input, kp_indices_input)

            latest_checkpoint_dir = util.get_latest_checkpoint_dir(
                FLAGS.train_dir)
            save_source_path = os.path.join(
                latest_checkpoint_dir,
                'test_cs_unlabeled_data_combined_round_%d.txt' % (round_i + 1))
            save_target_path = os.path.join(
                latest_checkpoint_dir,
                'test_cs_labels_combined_round_%d.txt' % (round_i + 1))
            # Now recover the original index in the unlabeled data.
            merged_additional_label_index = [
                current_drawn_indices[i] for i in merged_additional_label_index
            ]
            # Save the additionally labeled 2p+2n examples.
            save_additional_label(unlabeled_data,
                                  merged_additional_label_index,
                                  merged_additional_label_result, labeled_data,
                                  labeled_result, save_source_path,
                                  save_target_path)

            # Remove the inferred instances from the pool of unlabeled data
            # that we draw from at each round.
            before_inference_unlabeled_data = util.read_data_unlabeled_part(
                save_source_path,
                save_target_path,
                FLAGS.sent_len,
                shuffle=False)
            inferred_additional_label_index, inferred_additional_label_result = infer_from_labeled(
                save_source_path,
                save_target_path,
                FLAGS.sent_len,
                vocab_path,
                do_save=True,
                save_source_path=save_source_path,
                save_target_path=save_target_path)
            inferred_additional_data = before_inference_unlabeled_data[
                inferred_additional_label_index]
            inferred_additional_sentence_index = inferred_additional_data[:, :-2]
            inferred_additional_kp_index = inferred_additional_data[:, -2:]
            inferred_additional_sentence_input = preprocessing_util.indices_to_sentences(
                inferred_additional_sentence_index, rev_vocab, ignore_pad=True)

            inferred_additional_label_result_argmax = np.argmax(
                inferred_additional_label_result, axis=1)
            with open(
                    os.path.join(latest_checkpoint_dir,
                                 'inferred_instances.tsv'),
                    "w") as inferred_instances_f:
                inferred_instances_f.write('Type\tSentence\n')

                for sentence_i in range(inferred_additional_kp_index.shape[0]):
                    sentence = inferred_additional_sentence_input[sentence_i]
                    # Label the key phrases of interest in the current sentence with *.
                    sentence[inferred_additional_kp_index[sentence_i,
                                                          1]] += '*'
                    sentence[inferred_additional_kp_index[sentence_i,
                                                          0]] += '*'
                    if inferred_additional_label_result_argmax[sentence_i] == 2:
                        current_type = 'Neither'
                    elif inferred_additional_label_result_argmax[sentence_i] == 0:
                        current_type = 'A is-a B'
                    elif inferred_additional_label_result_argmax[sentence_i] == 1:
                        current_type = 'B is-a A'
                    inferred_instances_f.write(
                        '%s\t%s\n' % (current_type, ' '.join(sentence)))

            # All that is left is to use the validation dataset to compute the
            # area under the precision-recall curve.
            val_precision.append([[[] for _ in range(3)] for _ in range(3)])
            val_recall.append([[[] for _ in range(3)] for _ in range(3)])
            val_pr_auc.append([[0.0, 0.0, 0.0] for _ in range(3)])
            # Each round, compute precision-recall for classifier 1, classifier 2, and the combination.
            for classifier_j in range(3):
                if classifier_j == 0:
                    # Use classifier 1.
                    restore_param = util.load_from_dump(
                        os.path.join(latest_sentence_checkpoint_dir,
                                     'flags.cPickle'))
                    restore_param['train_dir'] = latest_sentence_checkpoint_dir
                    _, val_actual_output = label.label(val_labeled_data,
                                                       restore_param)
                elif classifier_j == 1:
                    # Use classifier 2.
                    restore_param = util.load_from_dump(
                        os.path.join(latest_pair_checkpoint_dir,
                                     'flags.cPickle'))
                    restore_param['train_dir'] = latest_pair_checkpoint_dir
                    _, val_actual_output = train_kp_pair_classifier.label(
                        val_labeled_data, restore_param)
                else:
                    # Use both classifiers: since this design cares more about
                    # precision than recall, label an instance as having a
                    # subcategory relation only when both classifiers agree,
                    # and otherwise output no relation, aka `Neither`.
                    restore_param = util.load_from_dump(
                        os.path.join(latest_sentence_checkpoint_dir,
                                     'flags.cPickle'))
                    restore_param['train_dir'] = latest_sentence_checkpoint_dir
                    _, val_actual_output_sentence = label.label(
                        val_labeled_data, restore_param)
                    restore_param = util.load_from_dump(
                        os.path.join(latest_pair_checkpoint_dir,
                                     'flags.cPickle'))
                    restore_param['train_dir'] = latest_pair_checkpoint_dir
                    _, val_actual_output_pair = train_kp_pair_classifier.label(
                        val_labeled_data, restore_param)
                    val_actual_output_sentence_argmax = np.argmax(
                        val_actual_output_sentence, axis=1)
                    val_actual_output_pair_argmax = np.argmax(
                        val_actual_output_pair, axis=1)

                    # Label the actual output as [1,0,0] if both classify as
                    # A is-a B, [0,1,0] if both classify as B is-a A, and
                    # [0,0,1] in all other situations.
                    val_actual_output = np.array([
                        [1 if k == val_actual_output_sentence_argmax[j] else 0
                         for k in range(3)]
                        if val_actual_output_sentence_argmax[j] ==
                        val_actual_output_pair_argmax[j]
                        else [0, 0, 1]
                        for j in range(val_actual_output_sentence.shape[0])
                    ])

                val_actual_output_exp = np.exp(val_actual_output)
                val_actual_output_softmax = val_actual_output_exp / np.sum(
                    val_actual_output_exp, axis=1, keepdims=True)
                for i in range(3):
                    val_precision[round_i][classifier_j][i], \
                        val_recall[round_i][classifier_j][i], _ = \
                        precision_recall_curve(val_labeled_result[:, i],
                                               val_actual_output_softmax[:, i])
                    val_pr_auc[round_i][classifier_j][i] = average_precision_score(
                        val_labeled_result[:, i],
                        val_actual_output_softmax[:, i])

        # Lastly, write the precision-recall AUC file for each classifier and each category.
        with open(os.path.join(latest_checkpoint_dir, 'pr_auc.tsv'), "w") as f:
            for classifier_j in range(3):
                for i in range(3):
                    f.write(
                        "Classifier%d_%s\t%s\n" %
                        (classifier_j, CATEGORY_NAME[i], "\t".join([
                            str(val_pr_auc[round_i][classifier_j][i])
                            for round_i in range(FLAGS.max_co_training_rounds)
                        ])))

        np.save(os.path.join(latest_checkpoint_dir, 'precision_recall_data'),
                np.array([val_precision, val_recall, val_pr_auc]))
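The combined-classifier rule in the validation loop (the classifier_j == 2 branch) is easier to read outside the list comprehension. Below is a minimal sketch of the same agreement rule with made-up argmax vectors; agree_or_neither is a hypothetical name, not part of the project:

import numpy as np

def agree_or_neither(argmax_a, argmax_b):
    # Keep a label only when both classifiers agree; otherwise 'Neither'.
    out = np.zeros((len(argmax_a), 3), dtype=int)
    for j, (a, b) in enumerate(zip(argmax_a, argmax_b)):
        out[j, a if a == b else 2] = 1
    return out

print(agree_or_neither([0, 1, 0], [0, 2, 1]).tolist())
# [[1, 0, 0],  both said 'A is-a B'
#  [0, 0, 1],  disagreement -> 'Neither'
#  [0, 0, 1]]  disagreement -> 'Neither'
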
Code Example #5
def main(argv=None):
    restore_param = util.load_from_dump(os.path.join(FLAGS.train_dir, 'flags.cPickle'))
    restore_param['train_dir'] = FLAGS.train_dir

    source_path = os.path.join(restore_param['data_dir'], 'test_cs_unlabeled_data_combined.txt')
    target_path = os.path.join(restore_param['data_dir'], 'test_cs_labels_combined.txt')
    vocab_path = os.path.join(restore_param['data_dir'], 'test_cs_vocab_combined')
    unlabeled_data = util.read_data_unlabeled_part(source_path, target_path, restore_param['sent_len'])
    data_size = unlabeled_data.shape[0]

    x_input, actual_output = label(unlabeled_data, restore_param)

    actual_output_exp = np.exp(actual_output)
    actual_output_softmax = actual_output_exp / np.sum(actual_output_exp, axis=1, keepdims=True)
    actual_output_argmax = np.argmax(actual_output_softmax, axis=1)
    # Entropy = -sum(p * log p), so this is the negative of the entropy;
    # dropping the sign lets an ascending argsort put the most uncertain
    # instances first.
    actual_output_entropy = np.sum(np.multiply(actual_output_softmax, np.log(actual_output_softmax)), axis=1)

    # The following are ways to rank which question should be asked first.
    # The first uses entropy; in theory ascending order on the negative
    # entropy puts the most uncertain instances first, but in practice it
    # does not seem to, perhaps because the model is too sure of everything.
    actual_output_entropy_argsort = np.argsort(actual_output_entropy, axis=0)
    # The second uses the softmax probability and asks first about the
    # instance with the highest probability in the first two classes:
    # actual_output_entropy_argsort = np.argsort(-np.max(actual_output_softmax[...,:2], axis=1))

    sentence_indices_input = x_input[:, :-2]
    _, rev_vocab = preprocessing_util.initialize_vocabulary(vocab_path)
    sentence_input = preprocessing_util.indices_to_sentences(sentence_indices_input, rev_vocab)

    kp_indices_input = x_input[:, -2:]

    user_input = -1
    num_user_labeled = 0
    user_label_results = []
    while user_input != 4 and num_user_labeled < data_size:
        sentence_i = actual_output_entropy_argsort[num_user_labeled]
        sentence = sentence_input[sentence_i]
        print('Key phrase pair\tSentence\t\tPredicted Score (A is-a B, B is-a A, Neither)\t')

        current_key_phrase_pair = (sentence[kp_indices_input[sentence_i, 0]] + ' ' +
                                   sentence[kp_indices_input[sentence_i, 1]])
        # Label the key phrases of interest in the current sentence with *.
        sentence[kp_indices_input[sentence_i, 1]] += '*'
        sentence[kp_indices_input[sentence_i, 0]] += '*'
        print('%s\n%s\t\t%s\t'
              % (current_key_phrase_pair, ' '.join(sentence), str(actual_output_softmax[sentence_i])))
        user_input = raw_input('In your opinion, what should be the category of the key phrase pair? '
                                   'Please enter 1, 2, or 3. Enter 4 to stop answering.\n'
                                   '1. A is-a B\n2. B is-a A\n3. Neither.')
        user_input = util.get_valid_user_input(user_input, 1, 4)

        if user_input != 4:
            user_label_result = np.array([0, 0, 0])
            user_label_result[user_input - 1] = 1
            user_label_results.append(user_label_result)
            num_user_labeled += 1

    actual_output_entropy_indices = actual_output_entropy_argsort[:num_user_labeled]

    if len(user_label_results) > 0:
        labeled_data, labeled_result = util.read_data_labeled_part(
            source_path, target_path, restore_param['sent_len'], shuffle=False)
        user_label_results = np.array(user_label_results)
        save_additional_label(unlabeled_data, actual_output_entropy_indices,
                              user_label_results, labeled_data, labeled_result,
                              source_path, target_path)
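The two ranking strategies discussed in the comments can genuinely disagree about which instance to ask about first. A small comparison on made-up probabilities:

import numpy as np

softmax = np.array([[0.34, 0.33, 0.33],   # maximally uncertain
                    [0.49, 0.02, 0.49],   # torn between 'A is-a B' and 'Neither'
                    [0.80, 0.15, 0.05]])  # fairly confident 'A is-a B'
by_entropy = np.argsort(np.sum(softmax * np.log(softmax), axis=1)).tolist()
by_confidence = np.argsort(-np.max(softmax[..., :2], axis=1)).tolist()
print(by_entropy)     # [0, 1, 2]: most uncertain first
print(by_confidence)  # [2, 1, 0]: most confident is-a first
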
Code Example #6
def main(argv=None):
    restore_param = util.load_from_dump(
        os.path.join(FLAGS.train_dir, 'flags.cPickle'))
    restore_param['train_dir'] = FLAGS.train_dir
    # Optional positional overrides; fall back to the defaults below so that
    # source_path and target_path are never referenced before assignment.
    source_path = argv[1] if argv is not None and len(argv) > 1 else None
    target_path = argv[2] if argv is not None and len(argv) > 2 else None
    if source_path is None:
        source_path = os.path.join(restore_param['data_dir'],
                                   'test_cs_unlabeled_data_combined.txt')
    if target_path is None:
        target_path = os.path.join(restore_param['data_dir'],
                                   'test_cs_labels_combined.txt')
    vocab_path = os.path.join(restore_param['data_dir'],
                              'test_cs_vocab_combined')

    labeled_data, labeled_result = util.read_data_labeled_part(
        source_path, target_path, restore_param['sent_len'], shuffle=False)
    labeled_data = np.array(labeled_data)
    labeled_result = np.array(labeled_result)
    data_size = labeled_data.shape[0]

    sentence_indices_input = labeled_data[:, :-2]
    _, rev_vocab = preprocessing_util.initialize_vocabulary(vocab_path)
    sentence_input = preprocessing_util.indices_to_sentences(
        sentence_indices_input, rev_vocab)

    kp_indices_input = labeled_data[:, -2:]

    with open(os.path.join(FLAGS.train_dir, 'labeled_dataset_human.csv'),
              'w') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow([
            'Key phrase pair (separated by one space)',
            'Sentence (key phrases labeled with *)', '(Label) A is-a B',
            'B is-a A', 'Neither'
        ])
        for sentence_i in range(data_size):
            sentence = sentence_input[sentence_i]

            current_key_phrase_pair = (sentence[kp_indices_input[sentence_i, 0]] +
                                       ' ' + sentence[kp_indices_input[sentence_i, 1]])
            # Label the key phrases of interest in the current sentence with *.
            sentence[kp_indices_input[sentence_i, 1]] += '*'
            sentence[kp_indices_input[sentence_i, 0]] += '*'
            csv_writer.writerow([current_key_phrase_pair, ' '.join(sentence)] +
                                labeled_result[sentence_i, ...].tolist())
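
Across all of these examples the encoded data rows share one layout: every column except the last two holds a token index, and the final two columns hold the positions of the two key phrases inside the sentence. That is why each snippet slices with [:, :-2] and [:, -2:] and then appends '*' at those positions. A hypothetical encoded row makes the slicing explicit:

import numpy as np

# Hypothetical encoded row: six token indices (sent_len = 6, 0 = padding)
# followed by the positions of the two key phrases.
row = np.array([[12, 7, 45, 3, 0, 0, 1, 2]])
sentence_indices = row[:, :-2]
kp_indices = row[:, -2:]
print(sentence_indices)  # [[12  7 45  3  0  0]]
print(kp_indices)        # [[1 2]]: key phrases at tokens 1 and 2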