Example #1
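These listings omit their imports and shared helpers. A minimal preamble that would make them importable is sketched below; the module providing sent2features / sent2labels and the utils helpers (phrase_acc, string_vectorize, avr_edit_distance) are assumptions about the surrounding project and are not shown here.

# Assumed imports for the examples below (a sketch, not part of the original listings).
import math
import operator
import sys

import numpy as np
import scipy.stats
import sklearn_crfsuite
from scipy import spatial
from sklearn.cluster import KMeans

import utils  # assumed project helpers: phrase_acc, string_vectorize, avr_edit_distance
from features import sent2features, sent2labels  # hypothetical module with the CRF feature/label extractors
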
def cv_edit_active_learn(args):

    # Read the input args.
    train_idx = args['train_idx']
    test_idx = args['test_idx']
    dataset = args['dataset']
    strings = args['strings']
    max_samples_batch = args['max_samples_batch']
    batch_size = args['batch_size']

    # Initialize arrays to store results.
    phrase_acc = np.zeros([max_samples_batch+1])
    out_acc = np.zeros([max_samples_batch+1])
    label_count = np.zeros([max_samples_batch+1])
    pseudo_acc = np.zeros([max_samples_batch+1])

    # Define training set and testing set and corresponding original strings.
    train_set = [dataset[i] for i in train_idx]
    test_set = [dataset[i] for i in test_idx]
    train_string = [strings[i] for i in train_idx]
    test_string = [strings[i] for i in test_idx]

    # Define an initial actual training set and the training pool (unlabeled data).
    initial_size = 2
    train_set_current = train_set[:initial_size]
    train_set_new = train_set[initial_size:]
    train_string_current = train_string[:initial_size]
    train_string_new = train_string[initial_size:]
    count = 0
    for i in range(initial_size):
        count += len(train_string[i])

    # Obtain testing features and labels.
    X_test = [sent2features(s) for s in test_set]
    y_test = [sent2labels(s) for s in test_set]

    # Train a CRF using the current training set.
    X_train_current = [sent2features(s) for s in train_set_current]
    y_train_current = [sent2labels(s) for s in train_set_current]
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True
    )
    crf.fit(X_train_current, y_train_current)

    # Use the estimator.
    y_pred = crf.predict(X_test)
    phrase_count, phrase_correct, out_count, out_correct = utils.phrase_acc(y_test, y_pred)
    phrase_acc[0] = phrase_correct / phrase_count
    out_acc[0] = out_correct / out_count
    label_count[0] = count
    pseudo_acc[0] = 1   # There is no pseudo-label at the beginning.

    # len_test = len(test_set)
    initial_budget = 100
    if count >= initial_budget:
        print('Error: initial budget does not exceed the initial number of labels.')
        label_threshold = count  # fall back so the code below still has a defined threshold
    else:
        label_threshold = initial_budget

    for num_training in range(max_samples_batch):

        # Want to look at the confidence (entropy for each character of each string) on unlabeled data.
        label_list = crf.tagger_.labels()
        entropy_list = []
        for i in train_set_new:
            crf.tagger_.set(sent2features(i))
            entropy_seq = []
            len_ptname = len(i)
            for j in range(len_ptname):
                marginal_prob = [crf.tagger_.marginal(k, j) for k in label_list]
                entropy_seq.append(scipy.stats.entropy(marginal_prob))
            entropy_list.append(entropy_seq)

        # Select the string with the largest entropy sum.
        candidate_score = []
        for i in range(len(entropy_list)):
            candidate_score.append(sum(entropy_list[i]))
        sort_idx = np.argmax(candidate_score)

        # Find the sample with minimum confidence and only label the part with low confidence.
        entropy_tmp = entropy_list[sort_idx]
        y_sequence = crf.tagger_.tag(sent2features(train_set_new[sort_idx]))
        mean_entropy_tmp = np.mean(entropy_tmp)
        std_entropy_tmp = np.std(entropy_tmp)
        len_ptname = len(train_set_new[sort_idx])
        z_score = [(entropy_tmp[i] - mean_entropy_tmp) / std_entropy_tmp for i in range(len_ptname)]
        label_index = []
        pseudo_label_total = 0
        pseudo_label_correct = 0
        z_score_sort = np.argsort(z_score, kind='mergesort').tolist()
        for i in range(int(math.ceil(len_ptname / 2.0))):
            label_index.append(z_score_sort[-i - 1])
        if count+len(label_index) <= label_threshold:
            for i in label_index:
                count += 1
                if y_sequence[i] == sent2labels(train_set_new[sort_idx])[i]:
                    pseudo_label_correct += 1
                y_sequence[i] = sent2labels(train_set_new[sort_idx])[i]
                pseudo_label_total += 1
        else:
            label_threshold_tmp = label_threshold - count
            sorted_z_score_index = np.argsort(z_score, kind='mergesort').tolist()
            for i in range(label_threshold_tmp):
                count += 1
                if y_sequence[sorted_z_score_index[-i-1]] == sent2labels(
                        train_set_new[sort_idx])[sorted_z_score_index[-i-1]]:
                    pseudo_label_correct += 1
                y_sequence[sorted_z_score_index[-i-1]] = sent2labels(
                    train_set_new[sort_idx])[sorted_z_score_index[-i-1]]
                pseudo_label_total += 1
        if count == label_threshold:
            label_threshold = label_threshold + 50
        label_count[num_training+1] = count
        if pseudo_label_total != 0:
            pseudo_acc[num_training + 1] = pseudo_label_correct/pseudo_label_total
        else:
            pseudo_acc[num_training + 1] = 1

        # Update training set.
        # sample_to_remove = [train_set_new[i] for i in sort_idx[:batch_size]]
        sample_to_remove = [train_set_new[sort_idx]]
        for i in sample_to_remove:
            train_set_current.append(i)
            train_set_new.remove(i)
            X_train_current.append(sent2features(i))
            # print(X_train_current)
            y_train_current.append(y_sequence)
        # string_to_remove = [train_string_new[i] for i in sort_idx[:batch_size]]
        string_to_remove = [train_string_new[sort_idx]]
        for i in string_to_remove:
            train_string_current.append(i)
            train_string_new.remove(i)

        # Train the CRF.
        # crf = sklearn_crfsuite.CRF(
        #     algorithm='lbfgs',
        #     c1=0.1,
        #     c2=0.1,
        #     max_iterations=100,
        #     all_possible_transitions=True
        # )
        crf.fit(X_train_current, y_train_current)

        # Use the estimator.
        y_pred = crf.predict(X_test)
        phrase_count, phrase_correct, out_count, out_correct = utils.phrase_acc(y_test, y_pred)
        # print(phrase_count, phrase_correct, out_count, out_correct)
        phrase_acc[num_training+1] = phrase_correct / phrase_count
        out_acc[num_training+1] = out_correct / out_count

    return phrase_acc, out_acc, label_count, pseudo_acc
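
Example #1 above, like the variants that follow, takes a single dict of arguments. A hypothetical invocation for one cross-validation fold might look like the sketch below; the index split and the dataset / strings objects are placeholders for the surrounding project's data.

# Hypothetical driver for one fold (a sketch; dataset and strings come from the caller).
args = {
    'train_idx': list(range(0, 80)),    # indices into dataset used as the training pool
    'test_idx': list(range(80, 100)),   # indices into dataset used as the test set
    'dataset': dataset,                 # list of labeled token sequences
    'strings': strings,                 # raw strings aligned with dataset
    'max_samples_batch': 50,            # number of active-learning iterations
    'batch_size': 1,                    # instances queried per iteration
}
phrase_acc, out_acc, label_count, pseudo_acc = cv_edit_active_learn(args)
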
Example #2
def cv_edit_active_learn(args):

    # Read the input args.
    train_idx = args['train_idx']
    test_idx = args['test_idx']
    dataset = args['dataset']
    strings = args['strings']
    max_samples_batch = args['max_samples_batch']
    batch_size = args['batch_size']

    # Initialize arrays to store results.
    phrase_acc = np.zeros([max_samples_batch + 1])
    out_acc = np.zeros([max_samples_batch + 1])
    label_count = np.zeros([max_samples_batch + 1])
    pseudo_acc = np.zeros([max_samples_batch + 1])

    # Define training set and testing set and corresponding original strings.
    train_set = [dataset[i] for i in train_idx]
    test_set = [dataset[i] for i in test_idx]
    train_string = [strings[i] for i in train_idx]
    test_string = [strings[i] for i in test_idx]

    # Define an initial actual training set and the training pool (unlabeled data).
    initial_size = 2
    train_set_current = train_set[:initial_size]
    train_set_new = train_set[initial_size:]
    train_string_current = train_string[:initial_size]
    train_string_new = train_string[initial_size:]
    count = 0
    for i in range(initial_size):
        count += len(train_string[i])

    # Obtain testing features and labels.
    X_test = [sent2features(s) for s in test_set]
    y_test = [sent2labels(s) for s in test_set]

    # Train a CRF using the current training set.
    X_train_current = [sent2features(s) for s in train_set_current]
    y_train_current = [sent2labels(s) for s in train_set_current]
    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               c1=0.1,
                               c2=0.1,
                               max_iterations=100,
                               all_possible_transitions=True)
    crf.fit(X_train_current, y_train_current)

    # Use the estimator.
    y_pred = crf.predict(X_test)
    phrase_count, phrase_correct, out_count, out_correct = utils.phrase_acc(
        y_test, y_pred)
    phrase_acc[0] = phrase_correct / phrase_count
    out_acc[0] = out_correct / out_count
    label_count[0] = count
    pseudo_acc[0] = 1  # There is no pseudo-label at the beginning.

    # Vectorize the test and unlabeled strings and cluster the test set.
    num_cluster = 5
    total_string = test_string[:]
    total_string.extend(train_string_new)
    vec, _ = utils.string_vectorize(total_string)
    test_vec = vec[:len(test_string)]
    train_new_vec = vec[len(test_string):].tolist()
    kmeans = KMeans(n_clusters=num_cluster, random_state=0).fit(test_vec)
    cluster_centers = kmeans.cluster_centers_
    cluster_labels = kmeans.labels_

    # Calculate cluster size.
    cluster_size = np.zeros(num_cluster)
    for i in cluster_labels:
        cluster_size[i] += 1
    largest_cluster = np.argmax(cluster_size)
    weight_cluster = [i / sum(cluster_size) for i in cluster_size]

    # Calculate the representativeness of each test sample as the distance to its corresponding cluster center.
    len_test = len(test_set)
    dist_list = np.zeros(len_test)
    for i in range(len_test):
        dist_list[i] = np.linalg.norm(test_vec[i] -
                                      cluster_centers[cluster_labels[i]])

    # Weighted distance to cluster centers for each unlabeled instance.
    distance_to_cluster = []
    for i in range(len(train_new_vec)):
        weighted_distance = [
            weight_cluster[j] *
            np.linalg.norm(train_new_vec[i] - cluster_centers[j])
            for j in range(num_cluster)
        ]
        distance_to_cluster.append(sum(weighted_distance))

    # len_test = len(test_set)
    initial_budget = 100
    if count >= initial_budget:
        print('Error: initial budget does not exceed the initial number of labels.')
        label_threshold = count  # fall back so the code below still has a defined threshold
    else:
        label_threshold = initial_budget

    for num_training in range(max_samples_batch):

        # Want to look at the model confidence using entropy.
        # Calculate entropy for each character of each string in the test set.
        label_list = crf.tagger_.labels()
        entropy_list = []
        for i in test_set:
            len_ptname = len(i)
            crf.tagger_.set(sent2features(i))
            entropy_seq = []
            for j in range(len_ptname):
                marginal_prob = [
                    crf.tagger_.marginal(k, j) for k in label_list
                ]
                entropy_seq.append(scipy.stats.entropy(marginal_prob))
            entropy_list.append(entropy_seq)

        # Sort the test set based on the average entropy (previously entropy sum).
        entropy_sum = [sum(i) / len(i) for i in entropy_list]
        sort_idx_temp = np.argsort(-np.array(entropy_sum),
                                   kind='mergesort').tolist()

        # Select the string with the minimum average distance to the selected group.
        temp_set = [test_string[i] for i in sort_idx_temp[:3]]
        distance = utils.avr_edit_distance(temp_set, train_string_new, True)
        # sort_idx = np.argsort(distance, kind='mergesort').tolist()
        sort_idx = np.argmin(distance)

        # Find the sample with the maximal score and only label the part with low confidence/high entropy.
        y_sequence = crf.tagger_.tag(sent2features(
            train_set_new[sort_idx]))  # generate pseudo-label firstly
        entropy_tmp = []
        len_ptname = len(train_set_new[sort_idx])
        for j in range(len_ptname):
            marginal_prob = [crf.tagger_.marginal(k, j) for k in label_list]
            entropy_tmp.append(scipy.stats.entropy(marginal_prob))
        mean_entropy_tmp = np.mean(entropy_tmp)
        std_entropy_tmp = np.std(entropy_tmp)
        z_score = [(entropy_tmp[i] - mean_entropy_tmp) / std_entropy_tmp
                   for i in range(len_ptname)]
        y_sequence_truth = sent2labels(train_set_new[sort_idx])
        # print(entropy_tmp, z_score, y_sequence, y_sequence_truth)
        label_index = []
        pseudo_label_total = 0
        pseudo_label_correct = 0
        z_score_sort = np.argsort(z_score, kind='mergesort').tolist()
        for i in range(int(math.ceil(len_ptname / 2.0))):
            label_index.append(z_score_sort[-i - 1])
        if count + len(label_index) <= label_threshold:
            for i in label_index:
                count += 1
                if y_sequence[i] == sent2labels(train_set_new[sort_idx])[i]:
                    pseudo_label_correct += 1
                y_sequence[i] = sent2labels(train_set_new[sort_idx])[i]
                pseudo_label_total += 1
        else:
            label_threshold_tmp = label_threshold - count
            sorted_z_score_index = np.argsort(z_score,
                                              kind='mergesort').tolist()
            for i in range(label_threshold_tmp):
                count += 1
                if y_sequence[sorted_z_score_index[-i - 1]] == sent2labels(
                        train_set_new[sort_idx])[sorted_z_score_index[-i - 1]]:
                    pseudo_label_correct += 1
                y_sequence[sorted_z_score_index[-i - 1]] = sent2labels(
                    train_set_new[sort_idx])[sorted_z_score_index[-i - 1]]
                pseudo_label_total += 1
        if count == label_threshold:
            label_threshold = label_threshold + 50
        label_count[num_training + 1] = count
        if pseudo_label_total != 0:
            pseudo_acc[num_training + 1] = pseudo_label_correct / pseudo_label_total
        else:
            pseudo_acc[num_training + 1] = 1

        # Update training set.
        # sample_to_remove = [train_set_new[i] for i in sort_idx[:batch_size]]
        sample_to_remove = [train_set_new[sort_idx]]
        for i in sample_to_remove:
            train_set_current.append(i)
            train_set_new.remove(i)
            X_train_current.append(sent2features(i))
            y_train_current.append(y_sequence)
            # print(X_train_current)
        # string_to_remove = [train_string_new[i] for i in sort_idx[:batch_size]]
        string_to_remove = [train_string_new[sort_idx]]
        for i in string_to_remove:
            train_string_current.append(i)
            train_string_new.remove(i)
        # Remove the pre-calculated vectors and distances.
        del train_new_vec[sort_idx]
        del distance_to_cluster[sort_idx]

        # # define fixed parameters and parameters to search
        # crf = sklearn_crfsuite.CRF(
        #     algorithm='lbfgs',
        #     max_iterations=100,
        #     all_possible_transitions=True
        # )
        # params_space = {
        #     'c1': scipy.stats.expon(scale=0.5),
        #     'c2': scipy.stats.expon(scale=0.05),
        # }
        #
        # # search
        # rs = RandomizedSearchCV(crf, params_space,
        #                         cv=2,
        #                         verbose=1,
        #                         n_jobs=-1,
        #                         n_iter=5)
        # rs.fit(X_train_current, y_train_current)
        #
        # print('best params:', rs.best_params_)
        # print('best CV score:', rs.best_score_)
        # crf = rs.best_estimator_

        # Train the CRF.
        crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                   c1=0.1,
                                   c2=0.1,
                                   max_iterations=100,
                                   all_possible_transitions=True)
        crf.fit(X_train_current, y_train_current)

        # Use the estimator.
        y_pred = crf.predict(X_test)
        phrase_count, phrase_correct, out_count, out_correct = utils.phrase_acc(
            y_test, y_pred)
        phrase_acc[num_training + 1] = phrase_correct / phrase_count
        out_acc[num_training + 1] = out_correct / out_count

    return phrase_acc, out_acc, label_count, pseudo_acc
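
The uncertainty measure used throughout these examples is the entropy of the tagger's marginal label distribution at each position. Factored out of the inline loops above, the computation is roughly the following sketch (using the python-crfsuite Tagger API the listings already rely on).

def token_entropies(crf, sent):
    # Entropy of the marginal label distribution at every position of one
    # sequence, mirroring the inline loops in the examples above.
    labels = crf.tagger_.labels()
    crf.tagger_.set(sent2features(sent))
    return [
        scipy.stats.entropy([crf.tagger_.marginal(k, j) for k in labels])
        for j in range(len(sent))
    ]
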
Example #3
def cv_edit_active_learn(args):

    # Read the input args.
    train_idx = args['train_idx']
    test_idx = args['test_idx']
    dataset = args['dataset']
    strings = args['strings']
    max_samples_batch = args['max_samples_batch']
    batch_size = args['batch_size']

    # Initialize arrays to store results.
    phrase_acc = np.zeros([max_samples_batch + 1])
    out_acc = np.zeros([max_samples_batch + 1])
    label_count = np.zeros([max_samples_batch + 1])
    pseudo_acc = np.zeros([max_samples_batch + 1])

    # Define training set and testing set and corresponding original strings.
    train_set = [dataset[i] for i in train_idx]
    test_set = [dataset[i] for i in test_idx]
    train_string = [strings[i] for i in train_idx]
    test_string = [strings[i] for i in test_idx]

    # Define an initial actual training set and the training pool (unlabeled data).
    initial_size = 10
    train_set_current = train_set[:initial_size]
    train_set_new = train_set[initial_size:]
    train_string_current = train_string[:initial_size]
    train_string_new = train_string[initial_size:]
    print(train_string_current)
    count = 0
    for i in range(initial_size):
        count += len(train_string[i])

    # Obtain testing features and labels.
    X_test = [sent2features(s) for s in test_set]
    y_test = [sent2labels(s) for s in test_set]

    # Train a CRF using the current training set.
    X_train_current = [sent2features(s) for s in train_set_current]
    y_train_current = [sent2labels(s) for s in train_set_current]
    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               c1=0.1,
                               c2=0.1,
                               max_iterations=100,
                               all_possible_transitions=True)
    crf.fit(X_train_current, y_train_current)

    # Use the estimator.
    y_pred = crf.predict(X_test)
    phrase_count, phrase_correct, out_count, out_correct = utils.phrase_acc(
        y_test, y_pred)
    phrase_acc[0] = phrase_correct / phrase_count
    out_acc[0] = out_correct / out_count
    label_count[0] = count
    pseudo_acc[0] = 1  # There is no pseudo-label at the beginning.

    # len_test = len(test_set)
    initial_budget = 100
    if count >= initial_budget:
        print('Error: initial budget does not exceed the initial number of labels.')
        label_threshold = count  # fall back so the code below still has a defined threshold
    else:
        label_threshold = initial_budget

    new_instance_idx = []  # record the indices of newly added instances in the training set
    pseudo_label_idx = []  # record the positions of pseudo labels
    visited_idx = []  # record the indices of visited instances in the unlabeled set

    for num_training in range(max_samples_batch):

        label_list = crf.tagger_.labels()
        # Want to look at the confidence (average entropy for each character of each string) on unlabeled data.
        entropy_list = []
        for i in range(len(train_set_new)):
            crf.tagger_.set(sent2features(train_set_new[i]))
            entropy_seq = []
            len_ptname = len(train_set_new[i])
            if i in visited_idx:
                revisit_idx_re = visited_idx.index(i)
                unlabeled_part = pseudo_label_idx[revisit_idx_re]
                for j in unlabeled_part:
                    marginal_prob = [
                        crf.tagger_.marginal(k, j) for k in label_list
                    ]
                    entropy_seq.append(scipy.stats.entropy(marginal_prob))
            else:
                for j in range(len_ptname):
                    marginal_prob = [
                        crf.tagger_.marginal(k, j) for k in label_list
                    ]
                    entropy_seq.append(scipy.stats.entropy(marginal_prob))
            entropy_list.append(entropy_seq)

        # Select the string with the largest average entropy.
        candidate_score = []
        for i in range(len(entropy_list)):
            candidate_score.append(sum(entropy_list[i]) / len(entropy_list[i]))
        sort_idx = np.argmax(candidate_score)

        # # Calculate the confidence on the training pool (train_set_new) using the current CRF.
        # X_train_new = [sent2features(s) for s in train_set_new]
        # len_train_new = len(train_set_new)
        # prob_list = []
        # for i in range(len_train_new):
        #     y_sequence = crf.tagger_.tag(X_train_new[i])
        #     # normalized sequence probability
        #     prob_norm = math.exp(math.log(crf.tagger_.probability(y_sequence)) / len(train_string_new[i]))
        #     prob_list.append(prob_norm)
        #
        # # Sort the training pool based on confidence.
        # sort_idx = np.argsort(np.array(prob_list), kind='mergesort').tolist()
        # sort_idx = sort_idx[0]

        # Check whether this instance has been selected before (a revisit).
        if sort_idx in visited_idx:
            revisit_flag = True
        else:
            revisit_flag = False

        if revisit_flag:
            revisit_idx_un = sort_idx  # the instance index in the unlabeled set
            revisit_idx_re = visited_idx.index(
                sort_idx)  # the instance index in the tracking record
            revisit_idx_tr = new_instance_idx[
                revisit_idx_re]  # the instance index in the training set
            # Update the pseudo label to manual label in the training set.
            y_train_current[revisit_idx_tr] = sent2labels(
                train_set_current[revisit_idx_tr])
            # Update the unlabeled set.
            del train_set_new[revisit_idx_un]
            del train_string_new[revisit_idx_un]
            # Update the tracking record.
            count += len(pseudo_label_idx[revisit_idx_re])
            del new_instance_idx[revisit_idx_re]
            del pseudo_label_idx[revisit_idx_re]
            del visited_idx[revisit_idx_re]
            for i in range(len(visited_idx)):
                if visited_idx[i] > revisit_idx_un:
                    visited_idx[i] = visited_idx[i] - 1
            label_count[num_training + 1] = count
        else:
            # Apply z-scores to pseudo-label.
            visited_idx.append(sort_idx)
            entropy_tmp = entropy_list[sort_idx]
            len_ptname = len(entropy_tmp)
            y_sequence = crf.tagger_.tag(sent2features(
                train_set_new[sort_idx]))
            mean_entropy_tmp = np.mean(entropy_tmp)
            std_entropy_tmp = np.std(entropy_tmp)
            z_score = [(entropy_tmp[i] - mean_entropy_tmp) / std_entropy_tmp
                       for i in range(len_ptname)]
            label_index = []
            for i in range(len_ptname):
                if z_score[i] > 0.0:
                    count += 1
                    y_sequence[i] = sent2labels(train_set_new[sort_idx])[i]
                    label_index.append(i)
            pseudo_index = [
                i for i in range(len_ptname) if i not in label_index
            ]
            pseudo_label_idx.append(pseudo_index)
            label_count[num_training + 1] = count

            # Update training set.
            new_instance_idx.append(len(train_string_current))
            train_set_current.append(train_set_new[sort_idx])
            train_string_current.append(train_string_new[sort_idx])
            # X_train_current.append(sent2features(train_set_new[sort_idx]))
            y_train_current.append(y_sequence)
            X_train_current = [sent2features(s) for s in train_set_current]
            # print(train_string_current, y_train_current[-1])
            # del train_set_new[sort_idx]
            # del train_string_new[sort_idx]

        # Update the pseudo labels using the current CRF.
        new_instance_count = 0
        for i in new_instance_idx:
            current_label_seq = y_train_current[i]
            new_pseudo_label_seq = crf.tagger_.tag(X_train_current[i])
            for j in pseudo_label_idx[new_instance_count]:
                current_label_seq[j] = new_pseudo_label_seq[j]
            y_train_current[i] = current_label_seq
            new_instance_count += 1

        # Train the CRF.
        # crf = sklearn_crfsuite.CRF(
        #     algorithm='lbfgs',
        #     c1=0.1,
        #     c2=0.1,
        #     max_iterations=100,
        #     all_possible_transitions=True
        # )
        crf.fit(X_train_current, y_train_current)

        # Use the estimator.
        y_pred = crf.predict(X_test)
        phrase_count, phrase_correct, out_count, out_correct = utils.phrase_acc(
            y_test, y_pred)
        # print(phrase_count, phrase_correct, out_count, out_correct)
        phrase_acc[num_training + 1] = phrase_correct / phrase_count
        out_acc[num_training + 1] = out_correct / out_count

    return phrase_acc, out_acc, label_count, pseudo_acc
Example #4
def cv_edit_active_learn(args):

    # Read the input args.
    train_idx = args['train_idx']
    test_idx = args['test_idx']
    dataset = args['dataset']
    strings = args['strings']
    max_samples_batch = args['max_samples_batch']
    batch_size = args['batch_size']

    # Initialize arrays to store results.
    phrase_acc = np.zeros([max_samples_batch + 1])
    out_acc = np.zeros([max_samples_batch + 1])
    label_count = np.zeros([max_samples_batch + 1])
    pseudo_acc = np.zeros([max_samples_batch + 1])

    # Define training set and testing set and corresponding original strings.
    train_set = [dataset[i] for i in train_idx]
    test_set = [dataset[i] for i in test_idx]
    train_string = [strings[i] for i in train_idx]
    test_string = [strings[i] for i in test_idx]

    # Define an initial actual training set and the training pool (unlabeled data).
    initial_size = 10
    train_set_current = train_set[:initial_size]
    train_set_new = train_set[initial_size:]
    train_string_current = train_string[:initial_size]
    train_string_new = train_string[initial_size:]
    count = 0
    for i in range(initial_size):
        count += len(train_string[i])

    # Obtain testing features and labels.
    X_test = [sent2features(s) for s in test_set]
    y_test = [sent2labels(s) for s in test_set]

    # Train a CRF using the current training set.
    X_train_current = [sent2features(s) for s in train_set_current]
    y_train_current = [sent2labels(s) for s in train_set_current]
    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               c1=0.1,
                               c2=0.1,
                               max_iterations=100,
                               all_possible_transitions=True)
    crf.fit(X_train_current, y_train_current)

    # Use the estimator.
    y_pred = crf.predict(X_test)
    phrase_count, phrase_correct, out_count, out_correct = utils.phrase_acc(
        y_test, y_pred)
    phrase_acc[0] = phrase_correct / phrase_count
    out_acc[0] = out_correct / out_count
    label_count[0] = count
    pseudo_acc[0] = 1  # There is no pseudo-label at the beginning.

    # Vectorize the test and unlabeled strings.
    total_string = test_string[:]
    total_string.extend(train_string_new)
    vec, _ = utils.string_vectorize(total_string)
    test_vec = vec[:len(test_string)].tolist()
    train_new_vec = vec[len(test_string):].tolist()

    # Pre-calculate similarity.
    # This will be efficient if the number of iterations is large.
    sim_matrix = np.zeros((len(train_new_vec), len(test_vec)))
    for i in range(len(train_new_vec)):
        for j in range(len(test_vec)):
            sim_matrix[i, j] = 1 - spatial.distance.cosine(
                train_new_vec[i], test_vec[j])

    len_test = len(test_set)

    initial_budget = 100
    if count >= initial_budget:
        print('Error: initial budget does not exceed the initial number of labels.')
        label_threshold = count  # fall back so the code below still has a defined threshold
    else:
        label_threshold = initial_budget

    new_instance_idx = []  # record the indices of newly added instances in the training set
    pseudo_label_idx = []  # record the positions of pseudo labels
    visited_idx = []  # record the indices of visited instances in the unlabeled set

    for num_training in range(max_samples_batch):

        label_list = crf.tagger_.labels()
        # # Want to look at the model confidence on the test set.
        # entropy_list = []
        # for i in test_set:
        #     len_ptname = len(i)
        #     crf.tagger_.set(sent2features(i))
        #     entropy_seq = []
        #     for j in range(len_ptname):
        #         marginal_prob = [crf.tagger_.marginal(k, j) for k in label_list]
        #         entropy_seq.append(scipy.stats.entropy(marginal_prob))
        #     entropy_list.append(entropy_seq)
        #
        # # Sort the test set based on the average entropy.
        # entropy_sum = [sum(i)/len(i) for i in entropy_list]
        # sort_idx_temp = np.argsort(-np.array(entropy_sum), kind='mergesort').tolist()

        # Calculate the confidence on the test set using the current CRF.
        prob_list = []
        for i in range(len_test):
            # crf.tagger_.set(X_train_new[i])
            y_sequence = crf.tagger_.tag(X_test[i])
            # print(crf.tagger_.probability(y_sequence))
            # normalized sequence probability
            prob_norm = math.exp(
                math.log(crf.tagger_.probability(y_sequence)) /
                len(test_string[i]))
            prob_list.append(prob_norm)

        # Sort the test set based on confidence.
        sort_idx_temp = np.argsort(np.array(prob_list),
                                   kind='mergesort').tolist()

        # Calculate the average similarity between the unlabeled samples and the selected test samples.
        group_size = 1
        avr_sim = np.sum(sim_matrix[:, sort_idx_temp[:group_size]],
                         axis=1) / group_size
        distance = avr_sim

        # Weight each unlabeled sample's uncertainty (entropy) by this similarity score.
        entropy_list = []
        len_train_new = len(train_set_new)
        for i in range(len_train_new):
            crf.tagger_.set(sent2features(train_set_new[i]))
            entropy_seq = []
            len_ptname = len(train_set_new[i])
            if i in visited_idx:
                revisit_idx_re = visited_idx.index(i)
                unlabeled_part = pseudo_label_idx[revisit_idx_re]
                for j in unlabeled_part:
                    marginal_prob = [
                        crf.tagger_.marginal(k, j) for k in label_list
                    ]
                    entropy_seq.append(scipy.stats.entropy(marginal_prob))
            else:
                for j in range(len_ptname):
                    marginal_prob = [
                        crf.tagger_.marginal(k, j) for k in label_list
                    ]
                    entropy_seq.append(scipy.stats.entropy(marginal_prob))
            entropy_list.append(entropy_seq)

        entropy_list_mean = []
        for i in range(len(entropy_list)):
            entropy_list_mean.append(
                sum(entropy_list[i]) / len(entropy_list[i]))

        candidate_score = []
        for i in range(len_train_new):
            if distance[i] == 0:
                candidate_score.append(sys.float_info.max)
            else:
                candidate_score.append(entropy_list_mean[i] * distance[i])

        # Obtain the candidate index.
        sort_idx = np.argsort(candidate_score, kind='mergesort').tolist()
        sort_idx.reverse()
        sort_idx = sort_idx[0]

        # Check whether this instance has been selected before (a revisit).
        if sort_idx in visited_idx:
            revisit_flag = True
        else:
            revisit_flag = False

        if revisit_flag:
            revisit_idx_un = sort_idx  # the instance index in the unlabeled set
            revisit_idx_re = visited_idx.index(
                sort_idx)  # the instance index in the tracking record
            revisit_idx_tr = new_instance_idx[
                revisit_idx_re]  # the instance index in the training set
            # Update the pseudo label to manual label in the training set.
            y_train_current[revisit_idx_tr] = sent2labels(
                train_set_current[revisit_idx_tr])
            # Update the unlabeled set.
            del train_set_new[revisit_idx_un]
            del train_string_new[revisit_idx_un]
            del train_new_vec[revisit_idx_un]
            sim_matrix = np.delete(sim_matrix, revisit_idx_un, 0)
            # Update the tracking record.
            count += len(pseudo_label_idx[revisit_idx_re])
            del new_instance_idx[revisit_idx_re]
            del pseudo_label_idx[revisit_idx_re]
            del visited_idx[revisit_idx_re]
            for i in range(len(visited_idx)):
                if visited_idx[i] > revisit_idx_un:
                    visited_idx[i] = visited_idx[i] - 1
            label_count[num_training + 1] = count
        else:
            # Exhaustive search over all substrings.
            # Search substrings of length 2 up to len_ptname - 1 (the full string is excluded).
            visited_idx.append(sort_idx)
            y_sequence = crf.tagger_.tag(sent2features(
                train_set_new[sort_idx]))  # generate pseudo-label firstly
            candidate_entropy_list = []
            len_ptname = len(train_set_new[sort_idx])
            for j in range(len_ptname):
                marginal_prob = [
                    crf.tagger_.marginal(k, j) for k in label_list
                ]
                candidate_entropy_list.append(
                    scipy.stats.entropy(marginal_prob))
                # sorted_marginal_prob = np.sort(marginal_prob, kind='mergesort').tolist()
                # sorted_marginal_prob.reverse()
                # candidate_entropy_list.append(sorted_marginal_prob[0]-sorted_marginal_prob[1])
            substring_score = {}
            for i in range(len_ptname - 1):
                # Use len_ptname + 1 here to include the full string.
                for j in range(i + 2, len_ptname):
                    selected_entropy = sum(
                        candidate_entropy_list[i:j]) / (j - i)
                    rest_entropy = (sum(candidate_entropy_list) - sum(
                        candidate_entropy_list[i:j])) / (len_ptname - (j - i))
                    substring_score[(i, j)] = selected_entropy - rest_entropy

            # Rank the substrings based on their scores in descending order.
            sorted_substring_score = sorted(substring_score.items(),
                                            key=operator.itemgetter(1))
            sorted_substring_score.reverse()
            index_tuple1 = sorted_substring_score[0][0]
            index_tuple2 = sorted_substring_score[1][0]
            index_tuple3 = sorted_substring_score[2][0]
            label_index1 = []
            label_index2 = []
            label_index3 = []
            for i in range(index_tuple1[0], index_tuple1[1]):
                label_index1.append(i)
            for i in range(index_tuple2[0], index_tuple2[1]):
                label_index2.append(i)
            for i in range(index_tuple3[0], index_tuple3[1]):
                label_index3.append(i)
            label_index = list(set(label_index1 + label_index2 + label_index3))
            pseudo_index = [
                i for i in range(len_ptname) if i not in label_index
            ]
            pseudo_label_idx.append(pseudo_index)
            # print(label_index, pseudo_index, train_string_new[sort_idx], y_sequence)

            # Replace pseudo-labels with manual labels at the selected positions and record pseudo-label accuracy.
            y_sequence_truth = sent2labels(train_set_new[sort_idx])
            pseudo_label_total = 0
            pseudo_label_correct = 0
            for i in label_index:
                count += 1
                if y_sequence[i] == y_sequence_truth[i]:
                    pseudo_label_correct += 1
                y_sequence[i] = y_sequence_truth[i]
                pseudo_label_total += 1
            label_count[num_training + 1] = count
            if pseudo_label_total != 0:
                pseudo_acc[num_training + 1] = pseudo_label_correct / pseudo_label_total
            else:
                pseudo_acc[num_training + 1] = 1

            # Update training set.
            new_instance_idx.append(len(train_string_current))
            train_set_current.append(train_set_new[sort_idx])
            train_string_current.append(train_string_new[sort_idx])
            # X_train_current.append(sent2features(train_set_new[sort_idx]))
            y_train_current.append(y_sequence)
            X_train_current = [sent2features(s) for s in train_set_current]
            # del train_set_new[sort_idx]
            # del train_string_new[sort_idx]
            # del train_new_vec[sort_idx]
            # sim_matrix = np.delete(sim_matrix, sort_idx, 0)

        # Update the pseudo labels using the current CRF.
        new_instance_count = 0
        for i in new_instance_idx:
            current_label_seq = y_train_current[i]
            new_pseudo_label_seq = crf.tagger_.tag(X_train_current[i])
            for j in pseudo_label_idx[new_instance_count]:
                current_label_seq[j] = new_pseudo_label_seq[j]
            y_train_current[i] = current_label_seq
            new_instance_count += 1

        # Train the CRF.
        crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                   c1=0.1,
                                   c2=0.1,
                                   max_iterations=100,
                                   all_possible_transitions=True)
        crf.fit(X_train_current, y_train_current)

        # Use the estimator.
        y_pred = crf.predict(X_test)
        phrase_count, phrase_correct, out_count, out_correct = utils.phrase_acc(
            y_test, y_pred)
        phrase_acc[num_training + 1] = phrase_correct / phrase_count
        out_acc[num_training + 1] = out_correct / out_count

    return phrase_acc, out_acc, label_count, pseudo_acc
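
The substring selection in Example #4 above scores every contiguous substring by how much more uncertain it is than the rest of the string, then manually labels the union of the top-scoring substrings. Pulled out of the loop, the scoring is roughly the sketch below.

def substring_scores(entropies):
    # Score every contiguous substring [i, j) by the gap between its mean
    # entropy and the mean entropy of the remaining positions (a sketch of
    # the scoring used in Example #4; the full string is excluded).
    n = len(entropies)
    total = sum(entropies)
    scores = {}
    for i in range(n - 1):
        for j in range(i + 2, n):
            inside = sum(entropies[i:j]) / (j - i)
            outside = (total - sum(entropies[i:j])) / (n - (j - i))
            scores[(i, j)] = inside - outside
    return scores
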
Example #5
def cv_edit_active_learn(args):

    # Read the input args.
    train_idx = args['train_idx']
    test_idx = args['test_idx']
    dataset = args['dataset']
    strings = args['strings']
    max_samples_batch = args['max_samples_batch']
    batch_size = args['batch_size']

    # Initialize arrays to store results.
    phrase_acc = np.zeros([max_samples_batch])
    out_acc = np.zeros([max_samples_batch])
    label_count = np.zeros([max_samples_batch])
    count = 0

    # Define training set and testing set and corresponding original strings.
    train_set = [dataset[i] for i in train_idx]
    test_set = [dataset[i] for i in test_idx]
    train_string = [strings[i] for i in train_idx]
    test_string = [strings[i] for i in test_idx]

    # Define an initial actual training set and the training pool (unlabeled data).
    train_set_current = train_set[:2]
    train_set_new = train_set[2:]
    train_string_current = train_string[:2]
    train_string_new = train_string[2:]

    # Obtain testing features and labels.
    X_test = [sent2features(s) for s in test_set]
    y_test = [sent2labels(s) for s in test_set]

    # Train a CRF using the current training set.
    X_train_current = [sent2features(s) for s in train_set_current]
    y_train_current = [sent2labels(s) for s in train_set_current]
    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               c1=0.1,
                               c2=0.1,
                               max_iterations=100,
                               all_possible_transitions=True)
    crf.fit(X_train_current, y_train_current)

    # Vectorize the test and unlabeled strings and cluster the test set.
    num_cluster = 5
    total_string = test_string[:]
    total_string.extend(train_string_new)
    vec, _ = utils.string_vectorize(total_string)
    test_vec = vec[:len(test_string)]
    train_new_vec = vec[len(test_string):].tolist()
    kmeans = KMeans(n_clusters=num_cluster, random_state=0).fit(test_vec)
    cluster_centers = kmeans.cluster_centers_
    cluster_labels = kmeans.labels_

    # Calculate cluster size.
    cluster_size = np.zeros(num_cluster)
    for i in cluster_labels:
        cluster_size[i] += 1
    largest_cluster = np.argmax(cluster_size)
    weight_cluster = [i / sum(cluster_size) for i in cluster_size]

    # Calculate the representativeness of each test sample as the distance to its corresponding cluster center.
    len_test = len(test_set)
    dist_list = np.zeros(len_test)
    for i in range(len_test):
        dist_list[i] = np.linalg.norm(test_vec[i] -
                                      cluster_centers[cluster_labels[i]])

    # Weighted distance to cluster centers for each unlabeled instance.
    distance_to_cluster = []
    for i in range(len(train_new_vec)):
        weighted_distance = [
            weight_cluster[j] *
            np.linalg.norm(train_new_vec[i] - cluster_centers[j])
            for j in range(num_cluster)
        ]
        distance_to_cluster.append(sum(weighted_distance))

    len_test = len(test_set)
    len_ptname = len(test_set[0])

    for num_training in range(max_samples_batch):

        # Want to look at the model confidence using entropy.
        # Calculate entropy for each character of each string in the unlabeled set.
        label_list = crf.tagger_.labels()
        entropy_list = []
        for i in train_set_new:
            crf.tagger_.set(sent2features(i))
            entropy_seq = []
            for j in range(len_ptname):
                marginal_prob = [
                    crf.tagger_.marginal(k, j) for k in label_list
                ]
                entropy_seq.append(scipy.stats.entropy(marginal_prob))
            entropy_list.append(entropy_seq)

        # Select the string with the largest candidate score.
        candidate_score = []
        for i in range(len(entropy_list)):
            candidate_score.append(
                sum(entropy_list[i]) / distance_to_cluster[i])
        sort_idx = np.argmax(candidate_score)

        # Find the sample with the maximal score and only label the part with low confidence/high entropy.
        y_sequence = crf.tagger_.tag(sent2features(
            train_set_new[sort_idx]))  # generate pseudo-label firstly
        entropy_tmp = entropy_list[sort_idx]
        mean_entropy_tmp = np.mean(entropy_tmp)
        std_entropy_tmp = np.std(entropy_tmp)
        z_score = [(entropy_tmp[i] - mean_entropy_tmp) / std_entropy_tmp
                   for i in range(len_ptname)]
        y_sequence_truth = sent2labels(train_set_new[sort_idx])
        # print(entropy_tmp, z_score, y_sequence, y_sequence_truth)
        for i in range(len_ptname):
            if z_score[i] > 0.1:
                count += 1
                y_sequence[i] = y_sequence_truth[i]
        label_count[num_training] = count

        # Update training set.
        # sample_to_remove = [train_set_new[i] for i in sort_idx[:batch_size]]
        sample_to_remove = [train_set_new[sort_idx]]
        for i in sample_to_remove:
            train_set_current.append(i)
            train_set_new.remove(i)
            X_train_current.append(sent2features(i))
            y_train_current.append(y_sequence)
            # print(X_train_current)
        # string_to_remove = [train_string_new[i] for i in sort_idx[:batch_size]]
        string_to_remove = [train_string_new[sort_idx]]
        for i in string_to_remove:
            train_string_current.append(i)
            train_string_new.remove(i)
        # Remove the pre-calculated vectors and distances.
        del train_new_vec[sort_idx]
        del distance_to_cluster[sort_idx]

        # # define fixed parameters and parameters to search
        # crf = sklearn_crfsuite.CRF(
        #     algorithm='lbfgs',
        #     max_iterations=100,
        #     all_possible_transitions=True
        # )
        # params_space = {
        #     'c1': scipy.stats.expon(scale=0.5),
        #     'c2': scipy.stats.expon(scale=0.05),
        # }
        #
        # # search
        # rs = RandomizedSearchCV(crf, params_space,
        #                         cv=2,
        #                         verbose=1,
        #                         n_jobs=-1,
        #                         n_iter=5)
        # rs.fit(X_train_current, y_train_current)
        #
        # print('best params:', rs.best_params_)
        # print('best CV score:', rs.best_score_)
        # crf = rs.best_estimator_

        # Train the CRF.
        crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                   c1=0.1,
                                   c2=0.1,
                                   max_iterations=100,
                                   all_possible_transitions=True)
        crf.fit(X_train_current, y_train_current)

        # Use the estimator.
        y_pred = crf.predict(X_test)
        phrase_count, phrase_correct, out_count, out_correct = utils.phrase_acc(
            y_test, y_pred)
        # print(phrase_count, phrase_correct, out_count, out_correct)
        phrase_acc[num_training] = phrase_correct / phrase_count
        out_acc[num_training] = out_correct / out_count

    return phrase_acc, out_acc, label_count
Example #6
def cv_edit_active_learn(args):

    # Read the input args.
    train_idx = args['train_idx']
    test_idx = args['test_idx']
    dataset = args['dataset']
    strings = args['strings']
    max_samples_batch = args['max_samples_batch']
    batch_size = args['batch_size']

    phrase_acc = np.zeros([max_samples_batch + 1])
    out_acc = np.zeros([max_samples_batch + 1])
    label_count = np.zeros([max_samples_batch + 1])

    # Define training set and testing set.
    train_set = [dataset[i] for i in train_idx]
    test_set = [dataset[i] for i in test_idx]
    train_string = [strings[i] for i in train_idx]
    test_string = [strings[i] for i in test_idx]

    # Define an initial actual training set from the training pool.
    initial_size = 10
    train_set_current = train_set[:initial_size]
    train_set_new = train_set[initial_size:]
    train_string_current = train_string[:initial_size]
    train_string_new = train_string[initial_size:]
    for i in range(initial_size):
        label_count[0] += len(train_string[i])

    # Obtain testing features and labels.
    X_test = [sent2features(s) for s in test_set]
    y_test = [sent2labels(s) for s in test_set]

    # Train a CRF using the current training set.
    X_train_current = [sent2features(s) for s in train_set_current]
    y_train_current = [sent2labels(s) for s in train_set_current]
    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               c1=0.1,
                               c2=0.1,
                               max_iterations=100,
                               all_possible_transitions=True)
    crf.fit(X_train_current, y_train_current)

    # Use the estimator.
    y_pred = crf.predict(X_test)
    phrase_count, phrase_correct, out_count, out_correct = utils.phrase_acc(
        y_test, y_pred)
    phrase_acc[0] = phrase_correct / phrase_count
    out_acc[0] = out_correct / out_count

    for num_training in range(max_samples_batch):

        # Calculate the confidence on the training pool (train_set_new) using the current CRF.
        X_train_new = [sent2features(s) for s in train_set_new]
        len_train_new = len(train_set_new)
        prob_list = []
        for i in range(len_train_new):
            #crf.tagger_.set(X_train_new[i])
            y_sequence = crf.tagger_.tag(X_train_new[i])
            #print(crf.tagger_.probability(y_sequence))
            # normalized sequence probability
            prob_norm = math.exp(
                math.log(crf.tagger_.probability(y_sequence)) /
                len(train_string_new[i]))
            prob_list.append(prob_norm)

        # Sort the training pool based on confidence.
        sort_idx = np.argsort(np.array(prob_list), kind='mergesort').tolist()

        # if (num_training>=0)&(num_training<=20):
        #     print([train_string_new[i] for i in sort_idx[:batch_size]])

        label_count[num_training + 1] = label_count[num_training] + len(
            train_set_new[sort_idx[0]])  # assume batch_size = 1
        # update training set
        sample_to_remove = [train_set_new[i] for i in sort_idx[:batch_size]]
        for i in sample_to_remove:
            train_set_current.append(i)
            train_set_new.remove(i)

        # Obtain current training features.
        X_train_current = [sent2features(s) for s in train_set_current]
        y_train_current = [sent2labels(s) for s in train_set_current]

        # Train the CRF.
        crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                   c1=0.1,
                                   c2=0.1,
                                   max_iterations=100,
                                   all_possible_transitions=True)
        crf.fit(X_train_current, y_train_current)

        # Use the estimator.
        y_pred = crf.predict(X_test)
        phrase_count, phrase_correct, out_count, out_correct = utils.phrase_acc(
            y_test, y_pred)
        # print(phrase_count, phrase_correct, out_count, out_correct)
        phrase_acc[num_training + 1] = phrase_correct / phrase_count
        out_acc[num_training + 1] = out_correct / out_count

    return phrase_acc, out_acc, label_count
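
The confidence score in Example #6 above (and in Example #7 below) is the sequence probability normalized by string length, i.e. the n-th root of the tagger's probability for its best label sequence, where n is the length of the string. As a standalone helper this is roughly the following sketch.

def normalized_confidence(crf, xseq, length):
    # Length-normalized probability of the tagger's best label sequence,
    # exp(log P(y*) / length), as computed inline above.
    y_best = crf.tagger_.tag(xseq)  # tag() also sets xseq as the current item
    return math.exp(math.log(crf.tagger_.probability(y_best)) / length)
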
Example #7
def cv_edit_active_learn(args):

    # Read the input args.
    train_idx = args['train_idx']
    test_idx = args['test_idx']
    dataset = args['dataset']
    strings = args['strings']
    max_samples_batch = args['max_samples_batch']
    batch_size = args['batch_size']

    phrase_acc = np.zeros([max_samples_batch + 1])
    out_acc = np.zeros([max_samples_batch + 1])
    label_count = np.zeros([max_samples_batch + 1])

    # Define training set and testing set.
    train_set = [dataset[i] for i in train_idx]
    test_set = [dataset[i] for i in test_idx]
    train_string = [strings[i] for i in train_idx]
    test_string = [strings[i] for i in test_idx]

    # Define an initial actual training set from the training pool.
    initial_size = 10
    train_set_current = train_set[:initial_size]
    train_set_new = train_set[initial_size:]
    train_string_current = train_string[:initial_size]
    train_string_new = train_string[initial_size:]
    for i in range(initial_size):
        label_count[0] += len(train_string[i])

    # Obtain testing features and labels.
    X_test = [sent2features(s) for s in test_set]
    y_test = [sent2labels(s) for s in test_set]

    # Train a CRF using the current training set.
    X_train_current = [sent2features(s) for s in train_set_current]
    y_train_current = [sent2labels(s) for s in train_set_current]
    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               c1=0.1,
                               c2=0.1,
                               max_iterations=100,
                               all_possible_transitions=True)
    crf.fit(X_train_current, y_train_current)

    # Use the estimator.
    y_pred = crf.predict(X_test)
    phrase_count, phrase_correct, out_count, out_correct = utils.phrase_acc(
        y_test, y_pred)
    phrase_acc[0] = phrase_correct / phrase_count
    out_acc[0] = out_correct / out_count

    # Vectorize the unlabeled set.
    vec, _ = utils.string_vectorize(train_string_new)
    vec = vec.tolist()

    # Pre-calculate similarity.
    ini_unlabeled_size = len(vec)
    sim_matrix = np.zeros((ini_unlabeled_size, ini_unlabeled_size))
    for i in range(ini_unlabeled_size):
        for j in range(i):
            sim_matrix[i, j] = 1 - spatial.distance.cosine(vec[i], vec[j])
            sim_matrix[j, i] = sim_matrix[i, j]

    for num_training in range(max_samples_batch):

        # Calculate the confidence on the training pool (train_set_new) using the current CRF.
        X_train_new = [sent2features(s) for s in train_set_new]
        len_train_new = len(train_set_new)
        prob_list = []
        for i in range(len_train_new):
            #crf.tagger_.set(X_train_new[i])
            y_sequence = crf.tagger_.tag(X_train_new[i])
            #print(crf.tagger_.probability(y_sequence))
            # normalized sequence probability
            prob_norm = math.exp(
                math.log(crf.tagger_.probability(y_sequence)) /
                len(train_string_new[i]))
            prob_list.append(1 - prob_norm)

        # Calculate the average similarity to all other unlabeled samples.
        sim_list = np.sum(sim_matrix, axis=0) / len_train_new

        # Calculate information density.
        info_den = [prob_list[i] * sim_list[i] for i in range(len_train_new)]

        # Sort the training pool based on confidence.
        sort_idx = np.argsort(-np.array(info_den), kind='mergesort').tolist()

        # if (num_training>=20)&(num_training<=40):
        #     print([train_string_new[i] for i in sort_idx[:batch_size]])

        # update training set
        label_count[num_training + 1] = label_count[num_training] + len(
            train_set_new[sort_idx[0]])  # assume batch_size = 1
        # sample_to_remove = [train_set_new[i] for i in sort_idx[:batch_size]]
        # for i in sample_to_remove:
        #     train_set_current.append(i)
        #     train_set_new.remove(i)
        idx_to_remove = sort_idx[:batch_size]
        idx_to_remove = np.sort(idx_to_remove, kind='mergesort').tolist()
        for i in range(batch_size):
            sim_matrix = np.delete(sim_matrix, idx_to_remove[-i - 1], 0)
            sim_matrix = np.delete(sim_matrix, idx_to_remove[-i - 1], 1)
            train_set_current.append(train_set_new[idx_to_remove[-i - 1]])
            del train_set_new[idx_to_remove[-i - 1]]

        # Obtain current training features.
        X_train_current = [sent2features(s) for s in train_set_current]
        y_train_current = [sent2labels(s) for s in train_set_current]

        # Train the CRF.
        crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                   c1=0.1,
                                   c2=0.1,
                                   max_iterations=100,
                                   all_possible_transitions=True)
        crf.fit(X_train_current, y_train_current)

        # Use the estimator.
        y_pred = crf.predict(X_test)
        phrase_count, phrase_correct, out_count, out_correct = utils.phrase_acc(
            y_test, y_pred)
        # print(phrase_count, phrase_correct, out_count, out_correct)
        phrase_acc[num_training + 1] = phrase_correct / phrase_count
        out_acc[num_training + 1] = out_correct / out_count

    return phrase_acc, out_acc, label_count
Example #8
def cv_edit_active_learn(args):

    # Read the input args.
    train_idx = args['train_idx']
    test_idx = args['test_idx']
    dataset = args['dataset']
    strings = args['strings']
    max_samples_batch = args['max_samples_batch']
    batch_size = args['batch_size']

    # Initialize arrays to store results.
    phrase_acc = np.zeros([max_samples_batch])
    out_acc = np.zeros([max_samples_batch])
    label_count = np.zeros([max_samples_batch])
    pseudo_acc = np.zeros([max_samples_batch])
    count = 0

    # Define training set and testing set and corresponding original strings.
    train_set = [dataset[i] for i in train_idx]
    test_set = [dataset[i] for i in test_idx]
    train_string = [strings[i] for i in train_idx]
    test_string = [strings[i] for i in test_idx]

    # Define an initial actual training set and the training pool (unlabeled data).
    train_set_current = train_set[:2]
    train_set_new = train_set[2:]
    train_string_current = train_string[:2]
    train_string_new = train_string[2:]

    # Obtain testing features and labels.
    X_test = [sent2features(s) for s in test_set]
    y_test = [sent2labels(s) for s in test_set]

    # Train a CRF using the current training set.
    X_train_current = [sent2features(s) for s in train_set_current]
    y_train_current = [sent2labels(s) for s in train_set_current]
    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               c1=0.1,
                               c2=0.1,
                               max_iterations=100,
                               all_possible_transitions=True)
    crf.fit(X_train_current, y_train_current)

    # Vectorize the test and unlabeled strings and cluster the test set.
    num_cluster = 5
    total_string = test_string[:]
    total_string.extend(train_string_new)
    vec, _ = utils.string_vectorize(total_string)
    test_vec = vec[:len(test_string)]
    train_new_vec = vec[len(test_string):].tolist()
    kmeans = KMeans(n_clusters=num_cluster, random_state=0).fit(test_vec)
    cluster_centers = kmeans.cluster_centers_
    cluster_labels = kmeans.labels_

    # Calculate cluster size.
    cluster_size = np.zeros(num_cluster)
    for i in cluster_labels:
        cluster_size[i] += 1
    largest_cluster = np.argmax(cluster_size)
    weight_cluster = [i / sum(cluster_size) for i in cluster_size]
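    # weight_cluster[i] is the fraction of test samples that fall in cluster i; it
    # weights the distance of each unlabeled instance to that cluster center below.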

    # Calculate the representativeness of each test sample as its distance to its cluster center.
    len_test = len(test_set)
    dist_list = np.zeros(len_test)
    for i in range(len_test):
        dist_list[i] = np.linalg.norm(test_vec[i] -
                                      cluster_centers[cluster_labels[i]])

    # Weighted distance to cluster centers for each unlabeled instance.
    distance_to_cluster = []
    for i in range(len(train_new_vec)):
        weighted_distance = [
            weight_cluster[j] *
            np.linalg.norm(train_new_vec[i] - cluster_centers[j])
            for j in range(num_cluster)
        ]
        distance_to_cluster.append(sum(weighted_distance))

    len_test = len(test_set)
    len_ptname = len(test_set[0])
    labeled_instance = []  # Indices of partially labeled instances in the unlabeled set
    labeled_positions = []  # Manually labeled positions of each partially labeled instance
    labeled_istance_train = []  # Position in the training set where each instance was added

    for num_training in range(max_samples_batch):

        # Want to look at the model confidence using entropy.
        # Calculate entropy for each character of each string in the test set.
        label_list = crf.tagger_.labels()
        entropy_list = []
        for i in test_set:
            crf.tagger_.set(sent2features(i))
            entropy_seq = []
            for j in range(len_ptname):
                marginal_prob = [
                    crf.tagger_.marginal(k, j) for k in label_list
                ]
                entropy_seq.append(scipy.stats.entropy(marginal_prob))
            entropy_list.append(entropy_seq)

        # Sort the test set based on the entropy sum.
        entropy_sum = [sum(i) for i in entropy_list]
        sort_idx_temp = np.argsort(-np.array(entropy_sum),
                                   kind='mergesort').tolist()

        # Select the string with the minimum average distance to the selected group.
        temp_set = [test_string[i] for i in sort_idx_temp[:2]]
        distance = utils.avr_edit_distance(temp_set, train_string_new, True)
        # sorted_idx = np.argsort(distance, kind='mergesort').tolist()
        sort_idx = np.argmin(distance)
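        # sort_idx indexes into train_string_new / train_set_new: the unlabeled instance
        # closest (by average edit distance) to the two most uncertain test strings. If
        # it was already partially labeled in an earlier iteration, revisit_flag is set
        # below and only its still-unlabeled positions are considered for new labels.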

        # Store the index of the selected instance.
        if sort_idx not in labeled_instance:
            labeled_instance.append(sort_idx)
            revisit_flag = False
        else:
            revisit_flag = True

        # Only label the part with low confidence/high entropy.
        y_sequence = crf.tagger_.tag(sent2features(
            train_set_new[sort_idx]))  # generate pseudo-labels first
        entropy_tmp = []
        for j in range(len_ptname):
            marginal_prob = [crf.tagger_.marginal(k, j) for k in label_list]
            entropy_tmp.append(scipy.stats.entropy(marginal_prob))
        y_sequence_truth = sent2labels(train_set_new[sort_idx])

        tmp_position = []
        tmp_count = 0
        if revisit_flag:
            tmp_position = labeled_positions[labeled_instance.index(sort_idx)]
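            # Recompute entropy z-scores only over positions that are not yet manually
            # labeled; already-labeled positions get a sentinel z-score of -100 below so
            # they are never queried again, while a single remaining unlabeled position
            # gets 100 so that it is always queried.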
            entropy_tmp_revise = []
            for i in range(len_ptname):
                if i not in tmp_position:
                    entropy_tmp_revise.append(entropy_tmp[i])
            mean_entropy_tmp = np.mean(entropy_tmp_revise)
            std_entropy_tmp = np.std(entropy_tmp_revise)
            if len(entropy_tmp_revise) == 1:
                z_score_revise = [100]
            else:
                z_score_revise = [(entropy_tmp_revise[i] - mean_entropy_tmp) /
                                  std_entropy_tmp
                                  for i in range(len(entropy_tmp_revise))]
            z_score = []
            j = 0
            for i in range(len_ptname):
                if i in tmp_position:
                    z_score.append(-100)
                else:
                    z_score.append(z_score_revise[j])
                    j = j + 1
            train_position = labeled_istance_train[labeled_instance.index(
                sort_idx)]
            y_sequence = y_train_current[train_position]
            for i in range(len_ptname):
                if i not in tmp_position:
                    if z_score[i] > 0:
                        count += 1
                        tmp_count += 1
                        y_sequence[i] = y_sequence_truth[i]
                        tmp_position.append(i)
            # Check if no character is labeled.
            if tmp_count == 0:
                for i in range(len_ptname):
                    if i not in tmp_position:
                        count += 1
                        tmp_count += 1
                        y_sequence[i] = y_sequence_truth[i]
                        tmp_position.append(i)
                        break
            # Sort the tmp_position.
            tmp_position = np.sort(tmp_position, kind='mergesort').tolist()
            labeled_positions[labeled_instance.index(sort_idx)] = tmp_position
        else:
            mean_entropy_tmp = np.mean(entropy_tmp)
            std_entropy_tmp = np.std(entropy_tmp)
            z_score = [(entropy_tmp[i] - mean_entropy_tmp) / std_entropy_tmp
                       for i in range(len_ptname)]
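            # Query the true label only for characters whose entropy is above the
            # per-string mean (positive z-score); the remaining characters keep the
            # pseudo-labels predicted by the current CRF.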
            for i in range(len_ptname):
                if z_score[i] > 0:
                    count += 1
                    tmp_count += 1
                    y_sequence[i] = y_sequence_truth[i]
                    tmp_position.append(i)
            labeled_positions.append(tmp_position)
        label_count[num_training] = count

        # Update training set.
        if revisit_flag:
            y_train_current[train_position] = y_sequence
        else:
            # sample_to_remove = [train_set_new[i] for i in sort_idx[:batch_size]]
            sample_to_remove = [train_set_new[sort_idx]]
            for i in sample_to_remove:
                train_set_current.append(i)
                #train_set_new.remove(i)
                X_train_current.append(sent2features(i))
                y_train_current.append(y_sequence)
                # print(X_train_current)
            # string_to_remove = [train_string_new[i] for i in sort_idx[:batch_size]]
            string_to_remove = [train_string_new[sort_idx]]
            for i in string_to_remove:
                train_string_current.append(i)
                labeled_istance_train.append(len(train_string_current) - 1)
                # train_string_new.remove(i)
            # Remove the pre-calculated vectors and distances.
            #del train_new_vec[sort_idx]
            #del distance_to_cluster[sort_idx]

        # Remove the full labeled instances from the unlabeled set.
        tmp_idx_record = []
        for i in range(len(labeled_positions)):
            if len(labeled_positions[i]) == len_ptname:
                tmp_idx_record.append(i)
        idx_record = []
        for i in range(len(tmp_idx_record)):
            # Walk tmp_idx_record from its last (largest) entry so the earlier
            # bookkeeping indices remain valid while we delete.
            j = tmp_idx_record[len(tmp_idx_record) - i - 1]
            idx_record.append(labeled_instance[j])
            del labeled_instance[j]
            del labeled_positions[j]
            del labeled_istance_train[j]
        idx_record = np.sort(idx_record, kind='mergesort').tolist()
        for i in range(len(idx_record)):
            # Remove fully labeled instances from the unlabeled pool, largest index
            # first, and shift the stored indices of the remaining partial instances.
            j = idx_record[len(idx_record) - i - 1]
            del train_set_new[j]
            del train_string_new[j]
            labeled_instance = [k - 1 if k > j else k for k in labeled_instance]

        # Train the CRF.
        crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                   c1=0.1,
                                   c2=0.1,
                                   max_iterations=100,
                                   all_possible_transitions=True)
        crf.fit(X_train_current, y_train_current)

        # Use the estimator.
        y_pred = crf.predict(X_test)
        phrase_count, phrase_correct, out_count, out_correct = utils.phrase_acc(
            y_test, y_pred)
        # print(phrase_count, phrase_correct, out_count, out_correct)
        phrase_acc[num_training] = phrase_correct / phrase_count
        out_acc[num_training] = out_correct / out_count

    return phrase_acc, out_acc, label_count
def cv_edit_active_learn(args):

    # Read the input args.
    train_idx = args['train_idx']
    test_idx = args['test_idx']
    dataset = args['dataset']
    strings = args['strings']
    max_samples_batch = args['max_samples_batch']
    batch_size = args['batch_size']

    # Initialize arrays to store results.
    phrase_acc = np.zeros([max_samples_batch])
    out_acc = np.zeros([max_samples_batch])
    label_count = np.zeros([max_samples_batch])
    count = 0

    # Define training set and testing set and corresponding original strings.
    train_set = [dataset[i] for i in train_idx]
    test_set = [dataset[i] for i in test_idx]
    train_string = [strings[i] for i in train_idx]
    test_string = [strings[i] for i in test_idx]

    # Define an initial actual training set and the training pool (unlabeled data).
    train_set_current = train_set[:2]
    train_set_new = train_set[2:]
    train_string_current = train_string[:2]
    train_string_new = train_string[2:]

    # Obtain testing features and labels.
    X_test = [sent2features(s) for s in test_set]
    y_test = [sent2labels(s) for s in test_set]

    # Train a CRF using the current training set.
    X_train_current = [sent2features(s) for s in train_set_current]
    y_train_current = [sent2labels(s) for s in train_set_current]
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True
    )
    crf.fit(X_train_current, y_train_current)

    # len_test = len(test_set)
    len_ptname = len(test_set[0])
    for num_training in range(max_samples_batch):

        # Want to look at the confidence (entropy for each character of each string) on unlabeled data.
        label_list = crf.tagger_.labels()
        entropy_list = []
        for i in train_set_new:
            crf.tagger_.set(sent2features(i))
            entropy_seq = []
            for j in range(len_ptname):
                marginal_prob = [crf.tagger_.marginal(k, j) for k in label_list]
                entropy_seq.append(scipy.stats.entropy(marginal_prob))
            entropy_list.append(entropy_seq)

        # # Select the string with the largest entropy difference.
        # difference_list = []
        # for i in entropy_list:
        #     difference_list.append(max(i) - min(i))
        # sort_idx = np.argmax(difference_list)

        # Select the string with the largest entropy sum.
        candidate_score = []
        for i in range(len(entropy_list)):
            candidate_score.append(sum(entropy_list[i]))
        sort_idx = np.argmax(candidate_score)

        # Find the sample with minimum confidence and only label the part with low confidence.
        entropy_tmp = entropy_list[sort_idx]
        y_sequence = crf.tagger_.tag(sent2features(train_set_new[sort_idx]))
        mean_entropy_tmp = np.mean(entropy_tmp)
        std_entropy_tmp = np.std(entropy_tmp)
        z_score = [(entropy_tmp[i] - mean_entropy_tmp) / std_entropy_tmp for i in range(len_ptname)]
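        # Query the true label only for characters whose entropy z-score exceeds 0.1;
        # the remaining characters keep the pseudo-labels predicted by the current CRF.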
        for i in range(len_ptname):
            if z_score[i] > 0.1:
                count += 1
                y_sequence[i] = sent2labels(train_set_new[sort_idx])[i]
        label_count[num_training] = count

        # Update training set.
        # sample_to_remove = [train_set_new[i] for i in sort_idx[:batch_size]]
        sample_to_remove = [train_set_new[sort_idx]]
        for i in sample_to_remove:
            train_set_current.append(i)
            train_set_new.remove(i)
            X_train_current.append(sent2features(i))
            # print(X_train_current)
            y_train_current.append(y_sequence)
        # string_to_remove = [train_string_new[i] for i in sort_idx[:batch_size]]
        string_to_remove = [train_string_new[sort_idx]]
        for i in string_to_remove:
            train_string_current.append(i)
            train_string_new.remove(i)

        # Train the CRF.
        crf = sklearn_crfsuite.CRF(
            algorithm='lbfgs',
            c1=0.1,
            c2=0.1,
            max_iterations=100,
            all_possible_transitions=True
        )
        crf.fit(X_train_current, y_train_current)

        # Use the estimator.
        y_pred = crf.predict(X_test)
        phrase_count, phrase_correct, out_count, out_correct = utils.phrase_acc(y_test, y_pred)
        # print(phrase_count, phrase_correct, out_count, out_correct)
        phrase_acc[num_training] = phrase_correct / phrase_count
        out_acc[num_training] = out_correct / out_count

    return phrase_acc, out_acc, label_count
Example #10
def cv_edit_active_learn(args):

    # Read the input args.
    train_idx = args['train_idx']
    test_idx = args['test_idx']
    dataset = args['dataset']
    strings = args['strings']
    max_samples_batch = args['max_samples_batch']
    batch_size = args['batch_size']

    phrase_acc = np.zeros([max_samples_batch+1])
    out_acc = np.zeros([max_samples_batch+1])
    label_count = np.zeros([max_samples_batch+1])

    # Define training set and testing set.
    train_set = [dataset[i] for i in train_idx]
    test_set = [dataset[i] for i in test_idx]
    train_string = [strings[i] for i in train_idx]
    test_string = [strings[i] for i in test_idx]

    # Define an initial actual training set from the training pool.
    initial_size = 10
    train_set_current = train_set[:initial_size]
    train_set_new = train_set[initial_size:]
    train_string_current = train_string[:initial_size]
    train_string_new = train_string[initial_size:]
    for i in range(initial_size):
        label_count[0] += len(train_string[i])

    # Obtain testing features and labels.
    X_test = [sent2features(s) for s in test_set]
    y_test = [sent2labels(s) for s in test_set]

    # Train a CRF using the current training set.
    X_train_current = [sent2features(s) for s in train_set_current]
    y_train_current = [sent2labels(s) for s in train_set_current]
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True
    )
    crf.fit(X_train_current, y_train_current)

    # Use the estimator.
    y_pred = crf.predict(X_test)
    phrase_count, phrase_correct, out_count, out_correct = utils.phrase_acc(y_test, y_pred)
    phrase_acc[0] = phrase_correct / phrase_count
    out_acc[0] = out_correct / out_count

    for num_training in range(max_samples_batch):

        # Uniformly take samples from the training pool.
        len_train_new = len(train_set_new)
        sample_idx = []
        for i in range(batch_size):
            rand_tmp = random.randint(0, len_train_new - 1)
            while rand_tmp in sample_idx:
                rand_tmp = random.randint(0, len_train_new - 1)
            sample_idx.append(rand_tmp)
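        # The while-loop above resamples on collision, so sample_idx ends up with
        # batch_size distinct random indices into the unlabeled pool.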

        label_count[num_training + 1] = label_count[num_training] + len(
            train_set_new[sample_idx[0]])  # assume batch_size = 1
        # update training strings
        string_to_remove = [train_string_new[i] for i in sample_idx[:batch_size]]
        for i in string_to_remove:
            train_string_current.append(i)
            train_string_new.remove(i)
        # update training set
        sample_to_remove = [train_set_new[i] for i in sample_idx[:batch_size]]
        for i in sample_to_remove:
            train_set_current.append(i)
            train_set_new.remove(i)

        # Obtain current training features.
        X_train_current = [sent2features(s) for s in train_set_current]
        y_train_current = [sent2labels(s) for s in train_set_current]

        # # define fixed parameters and parameters to search
        # crf = sklearn_crfsuite.CRF(
        #     algorithm='lbfgs',
        #     max_iterations=100,
        #     all_possible_transitions=True
        # )
        # params_space = {
        #     'c1': scipy.stats.expon(scale=0.5),
        #     'c2': scipy.stats.expon(scale=0.05),
        # }
        #
        # # search
        # rs = RandomizedSearchCV(crf, params_space,
        #                         cv=2,
        #                         verbose=1,
        #                         n_jobs=-1,
        #                         n_iter=5)
        # rs.fit(X_train_current, y_train_current)
        #
        # print('best params:', rs.best_params_)
        # print('best CV score:', rs.best_score_)
        # crf = rs.best_estimator_

        # Train the CRF.
        crf = sklearn_crfsuite.CRF(
            algorithm='lbfgs',
            c1=0.1,
            c2=0.1,
            max_iterations=100,
            all_possible_transitions=True
        )
        crf.fit(X_train_current, y_train_current)

        # Use the estimator.
        y_pred = crf.predict(X_test)
        phrase_count, phrase_correct, out_count, out_correct = utils.phrase_acc(y_test, y_pred)
        # print(phrase_count, phrase_correct, out_count, out_correct)
        phrase_acc[num_training+1] = phrase_correct / phrase_count
        out_acc[num_training+1] = out_correct / out_count

    return phrase_acc, out_acc, label_count
Example #11
def cv_edit_active_learn(args):

    # Read the input args.
    train_idx = args['train_idx']
    test_idx = args['test_idx']
    dataset = args['dataset']
    strings = args['strings']
    max_samples_batch = args['max_samples_batch']
    batch_size = args['batch_size']

    phrase_acc = np.zeros([max_samples_batch + 1])
    out_acc = np.zeros([max_samples_batch + 1])
    label_count = np.zeros([max_samples_batch + 1])

    # Define training set and testing set.
    train_set = [dataset[i] for i in train_idx]
    test_set = [dataset[i] for i in test_idx]
    train_string = [strings[i] for i in train_idx]
    test_string = [strings[i] for i in test_idx]

    # Define an initial actual training set from the training pool.
    initial_size = 10
    train_set_current = train_set[:initial_size]
    train_set_new = train_set[initial_size:]
    train_string_current = train_string[:initial_size]
    train_string_new = train_string[initial_size:]
    for i in range(initial_size):
        label_count[0] += len(train_string[i])

    # Obtain testing features and labels.
    X_test = [sent2features(s) for s in test_set]
    y_test = [sent2labels(s) for s in test_set]

    # Train a CRF using the current training set.
    X_train_current = [sent2features(s) for s in train_set_current]
    y_train_current = [sent2labels(s) for s in train_set_current]
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True
    )
    crf.fit(X_train_current, y_train_current)

    # Use the estimator.
    y_pred = crf.predict(X_test)
    phrase_count, phrase_correct, out_count, out_correct = utils.phrase_acc(y_test, y_pred)
    phrase_acc[0] = phrase_correct / phrase_count
    out_acc[0] = out_correct / out_count

    # Vectorize the test strings and the unlabeled training strings.
    total_string = test_string[:]
    total_string.extend(train_string_new)
    vec, _ = utils.string_vectorize(total_string)
    test_vec = vec[:len(test_string)].tolist()
    train_new_vec = vec[len(test_string):].tolist()

    # Pre-calculate similarity.
    # This will be efficient if the number of iterations is large.
    sim_matrix = np.zeros((len(train_new_vec), len(test_vec)))
    for i in range(len(train_new_vec)):
        for j in range(len(test_vec)):
            sim_matrix[i, j] = 1 - spatial.distance.cosine(train_new_vec[i], test_vec[j])
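    # sim_matrix[i, j] is the cosine similarity between unlabeled instance i and
    # test instance j; rows are deleted below as unlabeled instances get labeled.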

    len_test = len(test_set)

    for num_training in range(max_samples_batch):

        # Calculate the confidence on the testing set using the current CRF.
        prob_list = []
        for i in range(len_test):
            # crf.tagger_.set(X_train_new[i])
            y_sequence = crf.tagger_.tag(X_test[i])
            # print(crf.tagger_.probability(y_sequence))
            # normalized sequence probability: p(y | x) ** (1 / len), the per-character geometric mean
            prob_norm = math.exp(math.log(crf.tagger_.probability(y_sequence)) / len(test_string[i]))
            prob_list.append(prob_norm)

        # Sort the test set based on confidence.
        sort_idx_temp = np.argsort(np.array(prob_list), kind='mergesort').tolist()

        # Calculate the average similarity between the unlabeled samples and the selected test samples.
        # temp_set = [test_string[i] for i in sort_idx_temp[:5]]
        # distance = utils.avr_edit_distance(temp_set, train_string_new, True)
        group_size = 80
        avr_sim = np.sum(sim_matrix[:, sort_idx_temp[:group_size]], axis=1)/group_size
        distance = avr_sim

        # Weight each unlabeled instance's confidence by this average similarity.
        X_train_new = [sent2features(s) for s in train_set_new]
        len_train_new = len(train_set_new)
        prob_list_candidate = []
        for i in range(len_train_new):
            y_sequence = crf.tagger_.tag(X_train_new[i])
            prob_norm = math.exp(math.log(crf.tagger_.probability(y_sequence)) / len(train_string_new[i]))
            prob_list_candidate.append(prob_norm)
        candidate_score = []
        for i in range(len_train_new):
            if distance[i] == 0:
                candidate_score.append(sys.float_info.max)
            else:
                candidate_score.append(prob_list_candidate[i] / distance[i])
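        # candidate_score divides the normalized sequence confidence by the average
        # similarity to the most uncertain test samples, so low-confidence but
        # representative instances receive the smallest scores and are queried first.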

        sort_idx = np.argsort(candidate_score, kind='mergesort').tolist()

        # if (num_training>=20)&(num_training<=40):
        #     print([train_string_new[i] for i in sort_idx[:batch_size]])

        # Assume the batch size is 1.
        label_count[num_training + 1] = label_count[num_training] + len(train_set_new[sort_idx[0]])

        # update training set
        # sample_to_remove = [train_set_new[i] for i in sort_idx[:batch_size]]
        # for i in sample_to_remove:
        #     train_set_current.append(i)
        #     train_set_new.remove(i)
        # string_to_remove = [train_string_new[i] for i in sort_idx[:batch_size]]
        # for i in string_to_remove:
        #     train_string_current.append(i)
        #     train_string_new.remove(i)
        idx_to_remove = sort_idx[:batch_size]
        idx_to_remove = np.sort(idx_to_remove, kind='mergesort').tolist()
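        # Process idx_to_remove from its largest entry down so the remaining indices
        # stay valid after each deletion.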
        for i in range(batch_size):
            sim_matrix = np.delete(sim_matrix, idx_to_remove[-i-1], 0)
            train_set_current.append(train_set_new[idx_to_remove[-i-1]])
            del train_set_new[idx_to_remove[-i-1]]
            train_string_current.append(train_string_new[idx_to_remove[-i-1]])
            del train_string_new[idx_to_remove[-i-1]]

        # Obtain current training features.
        X_train_current = [sent2features(s) for s in train_set_current]
        y_train_current = [sent2labels(s) for s in train_set_current]

        # # define fixed parameters and parameters to search
        # crf = sklearn_crfsuite.CRF(
        #     algorithm='lbfgs',
        #     max_iterations=100,
        #     all_possible_transitions=True
        # )
        # params_space = {
        #     'c1': scipy.stats.expon(scale=0.5),
        #     'c2': scipy.stats.expon(scale=0.05),
        # }
        #
        # # search
        # rs = RandomizedSearchCV(crf, params_space,
        #                         cv=2,
        #                         verbose=1,
        #                         n_jobs=-1,
        #                         n_iter=5)
        # rs.fit(X_train_current, y_train_current)
        #
        # print('best params:', rs.best_params_)
        # print('best CV score:', rs.best_score_)
        # crf = rs.best_estimator_

        # Train the CRF.
        crf = sklearn_crfsuite.CRF(
            algorithm='lbfgs',
            c1=0.1,
            c2=0.1,
            max_iterations=100,
            all_possible_transitions=True
        )
        crf.fit(X_train_current, y_train_current)

        # Use the estimator.
        y_pred = crf.predict(X_test)
        # print('Iteration: ', num_training)
        # for i in range(10):
        #     print('\nstring: ', test_string[i])
        #     print('predction and ground truth:')
        #     print(y_pred[i])
        #     print(y_test[i])
        # print('\n')
        # print('\n')
        phrase_count, phrase_correct, out_count, out_correct = utils.phrase_acc(y_test, y_pred)
        # print(phrase_count, phrase_correct, out_count, out_correct)
        phrase_acc[num_training+1] = phrase_correct / phrase_count
        out_acc[num_training+1] = out_correct / out_count

    return phrase_acc, out_acc, label_count
def cv_edit_active_learn(args):

    # Read the input args.
    train_idx = args['train_idx']
    test_idx = args['test_idx']
    dataset = args['dataset']
    strings = args['strings']
    max_samples_batch = args['max_samples_batch']
    batch_size = args['batch_size']

    phrase_acc = np.zeros([max_samples_batch])
    out_acc = np.zeros([max_samples_batch])

    # Define training set and testing set.
    train_set = [dataset[i] for i in train_idx]
    test_set = [dataset[i] for i in test_idx]
    train_string = [strings[i] for i in train_idx]
    test_string = [strings[i] for i in test_idx]

    # Define an initial actual training set from the training pool.
    train_set_current = train_set[:2]
    train_set_new = train_set[2:]
    train_string_current = train_string[:2]
    train_string_new = train_string[2:]

    # Obtain testing features and labels.
    X_test = [sent2features(s) for s in test_set]
    y_test = [sent2labels(s) for s in test_set]

    # Train a CRF using the current training set.
    X_train_current = [sent2features(s) for s in train_set_current]
    y_train_current = [sent2labels(s) for s in train_set_current]
    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               c1=0.1,
                               c2=0.1,
                               max_iterations=100,
                               all_possible_transitions=True)
    crf.fit(X_train_current, y_train_current)

    # Vectorize the test and unlabeled strings, then cluster the test set with k-means.
    num_cluster = 5
    total_string = test_string[:]
    total_string.extend(train_string_new)
    vec, _ = utils.string_vectorize(total_string)
    test_vec = vec[:len(test_string)]
    train_new_vec = vec[len(test_string):].tolist()
    kmeans = KMeans(n_clusters=num_cluster, random_state=0).fit(test_vec)
    cluster_centers = kmeans.cluster_centers_
    cluster_labels = kmeans.labels_

    # Calculate cluster size.
    cluster_size = np.zeros(num_cluster)
    for i in cluster_labels:
        cluster_size[i] += 1
    largest_cluster = np.argmax(cluster_size)
    weight_cluster = [i / sum(cluster_size) for i in cluster_size]

    # Calculate the representativeness of each test sample as its distance to its cluster center.
    len_test = len(test_set)
    dist_list = np.zeros(len_test)
    for i in range(len_test):
        dist_list[i] = np.linalg.norm(test_vec[i] -
                                      cluster_centers[cluster_labels[i]])

    distance_to_cluster = []
    for i in range(len(train_new_vec)):
        weighted_distance = [
            weight_cluster[j] *
            np.linalg.norm(train_new_vec[i] - cluster_centers[j])
            for j in range(num_cluster)
        ]
        distance_to_cluster.append(sum(weighted_distance))
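    # distance_to_cluster[i] is the cluster-size-weighted distance of unlabeled
    # instance i to all test-set cluster centers.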

    for num_training in range(max_samples_batch):

        # Calculate the confidence on the unlabeled set using the current CRF.
        len_new = len(train_string_new)
        train_new_prob_list = np.zeros(len_new)
        for i in range(len_new):
            y_sequence = crf.tagger_.tag(sent2features(train_set_new[i]))
            train_new_prob_list[i] = crf.tagger_.probability(y_sequence)

        # # Construct a new indicator (confidence and representative) to pick out a sample from the test set.
        # test_indicator = [i[0] for i in zip(test_prob_list, dist_list)]
        #
        # # Sort the test set based on the new indicator.
        # sort_idx_temp = np.argsort(np.array(test_indicator), kind='mergesort').tolist()
        #
        # # Calculate the distance from unlabeled samples to the selected test sample(s).
        # tmp_set = [test_vec[i] for i in sort_idx_temp[:1]]
        # distance = np.zeros(len(train_new_vec))
        # for i in range(len(train_new_vec)):
        #     tmp_distance = [np.linalg.norm(train_new_vec[i] - j) for j in tmp_set]
        #     distance[i] = np.average(tmp_distance)

        # # Calculate the confidence on the unlabeled samples.
        # train_prob_list = []
        # len_unlabeled = len(train_set_new)
        # X_train_new = [sent2features(s) for s in train_set_new]
        # for i in range(len_unlabeled):
        #     y_sequence = crf.tagger_.tag(X_train_new[i])
        #     train_prob_list.append(crf.tagger_.probability(y_sequence))
        #
        # # Construct a new indicator (confidence and distance) to pick out unlabeled samples.
        # train_indicator = [i[0]*i[1] for i in zip(train_prob_list, distance)]
        train_indicator = [
            i[0] / i[1] for i in zip(train_new_prob_list, distance_to_cluster)
        ]
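        # The indicator divides the (unnormalized) sequence probability by the
        # weighted cluster distance; the unlabeled instances with the smallest
        # values are queried first after the ascending sort below.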

        # Sort the unlabeled samples based on the new indicator.
        sort_idx = np.argsort(train_indicator, kind='mergesort').tolist()

        # if (num_training>=20)&(num_training<=40):
        #     print([train_string_new[i] for i in sort_idx[:batch_size]])

        # update training set
        sample_to_remove = [train_set_new[i] for i in sort_idx[:batch_size]]
        for i in sample_to_remove:
            train_set_current.append(i)
            train_set_new.remove(i)
        string_to_remove = [train_string_new[i] for i in sort_idx[:batch_size]]
        for i in string_to_remove:
            train_string_current.append(i)
            train_string_new.remove(i)
        idx_for_delete = np.sort(sort_idx[:batch_size])
        for i in range(1, batch_size + 1, 1):
            del train_new_vec[idx_for_delete[-i]]
            del distance_to_cluster[idx_for_delete[-i]]

        # Obtain current training features.
        X_train_current = [sent2features(s) for s in train_set_current]
        y_train_current = [sent2labels(s) for s in train_set_current]

        # # define fixed parameters and parameters to search
        # crf = sklearn_crfsuite.CRF(
        #     algorithm='lbfgs',
        #     max_iterations=100,
        #     all_possible_transitions=True
        # )
        # params_space = {
        #     'c1': scipy.stats.expon(scale=0.5),
        #     'c2': scipy.stats.expon(scale=0.05),
        # }
        #
        # # search
        # rs = RandomizedSearchCV(crf, params_space,
        #                         cv=2,
        #                         verbose=1,
        #                         n_jobs=-1,
        #                         n_iter=5)
        # rs.fit(X_train_current, y_train_current)
        #
        # print('best params:', rs.best_params_)
        # print('best CV score:', rs.best_score_)
        # crf = rs.best_estimator_

        # Train the CRF.
        crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                   c1=0.1,
                                   c2=0.1,
                                   max_iterations=100,
                                   all_possible_transitions=True)
        crf.fit(X_train_current, y_train_current)

        # Use the estimator.
        y_pred = crf.predict(X_test)
        phrase_count, phrase_correct, out_count, out_correct = utils.phrase_acc(
            y_test, y_pred)
        # print(phrase_count, phrase_correct, out_count, out_correct)
        phrase_acc[num_training] = phrase_correct / phrase_count
        out_acc[num_training] = out_correct / out_count

    return phrase_acc, out_acc
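
The workers above all take the same argument dictionary. Below is a minimal usage sketch (not part of the original listing) of how one of them might be invoked for a single cross-validation fold; the fold split, the `dataset` and `strings` objects, and the budget values are assumptions about the surrounding pipeline, which is not shown here.

# Hypothetical single-fold driver (illustrative only): 'dataset' is assumed to be a
# list of labeled character sequences and 'strings' the corresponding raw point-name
# strings, as used by the worker bodies above.
fold_args = {
    'train_idx': list(range(80)),     # assumed fold split
    'test_idx': list(range(80, 100)),
    'dataset': dataset,               # assumed to be provided by the data-loading step
    'strings': strings,               # assumed to be provided by the data-loading step
    'max_samples_batch': 50,          # assumed query budget
    'batch_size': 1,
}
phrase_acc, out_acc = cv_edit_active_learn(fold_args)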