def cv_edit_active_learn(args):
    """Run entropy/cluster-distance active learning with partial labelling.

    A CRF is first trained on the first two training samples.  In every
    round each sample of the unlabeled pool is scored as

        (sum of per-position marginal entropies) / (weighted distance to the
        k-means centres of the vectorised test strings)

    and the best-scoring sample is moved to the training set.  Its labels
    are the CRF's own predictions (pseudo-labels), except for positions whose
    entropy z-score exceeds 0.1, which receive the ground-truth label and
    are counted as manually labeled.  The CRF is then retrained and scored
    on the test set.

    Args:
        args: dict with the keys
            'train_idx', 'test_idx': indices into ``dataset`` / ``strings``;
            'dataset': sequences accepted by ``sent2features`` and
                ``sent2labels`` (defined elsewhere in this file);
            'strings': raw strings aligned with ``dataset``;
            'max_samples_batch': number of active-learning rounds.
            A 'batch_size' entry may be present but is ignored: exactly one
            sample is selected per round.

    Returns:
        Tuple ``(phrase_acc, out_acc, label_count)`` of arrays of length
        ``max_samples_batch``.  ``phrase_acc`` / ``out_acc`` hold the ratios
        ``phrase_correct / phrase_count`` and ``out_correct / out_count``
        reported by ``utils.phrase_acc`` after each round; ``label_count``
        holds the cumulative number of manually labeled positions.
    """
    # Read the input args.
    train_idx = args['train_idx']
    test_idx = args['test_idx']
    dataset = args['dataset']
    strings = args['strings']
    max_samples_batch = args['max_samples_batch']

    # Initialize arrays to store results.
    phrase_acc = np.zeros([max_samples_batch])
    out_acc = np.zeros([max_samples_batch])
    label_count = np.zeros([max_samples_batch])
    count = 0

    # Define training set and testing set and corresponding original strings.
    train_set = [dataset[i] for i in train_idx]
    test_set = [dataset[i] for i in test_idx]
    train_string = [strings[i] for i in train_idx]
    test_string = [strings[i] for i in test_idx]

    # Initial labeled set and the unlabeled pool.  The pool lists
    # (train_set_new, train_string_new, distance_to_cluster) stay index-aligned.
    initial_size = 2
    train_set_current = train_set[:initial_size]
    train_set_new = train_set[initial_size:]
    train_string_new = train_string[initial_size:]

    # Obtain testing features and labels.
    X_test = [sent2features(s) for s in test_set]
    y_test = [sent2labels(s) for s in test_set]

    # Train a CRF using the current training set.  c1/c2 are fixed here
    # rather than searched.
    crf_params = dict(algorithm='lbfgs',
                      c1=0.1,
                      c2=0.1,
                      max_iterations=100,
                      all_possible_transitions=True)
    X_train_current = [sent2features(s) for s in train_set_current]
    y_train_current = [sent2labels(s) for s in train_set_current]
    crf = sklearn_crfsuite.CRF(**crf_params)
    crf.fit(X_train_current, y_train_current)

    # Vectorize test and pool strings together, then cluster the test part.
    num_cluster = 5
    vec, _ = utils.string_vectorize(test_string + train_string_new)
    test_vec = vec[:len(test_string)]
    train_new_vec = vec[len(test_string):].tolist()
    kmeans = KMeans(n_clusters=num_cluster, random_state=0).fit(test_vec)
    cluster_centers = kmeans.cluster_centers_

    # Weight every cluster by the share of test samples assigned to it.
    cluster_size = np.zeros(num_cluster)
    for cluster_label in kmeans.labels_:
        cluster_size[cluster_label] += 1
    weight_cluster = cluster_size / cluster_size.sum()

    # Weighted distance to cluster centers for each unlabeled instance.
    distance_to_cluster = [
        sum(weight * np.linalg.norm(vector - centre)
            for weight, centre in zip(weight_cluster, cluster_centers))
        for vector in train_new_vec
    ]

    for num_training in range(max_samples_batch):
        # Model confidence: entropy of the marginal label distribution at
        # every position of every pool sample.  The length is taken per
        # sample so samples of different lengths are handled correctly.
        label_list = crf.tagger_.labels()
        entropy_list = []
        for sample in train_set_new:
            crf.tagger_.set(sent2features(sample))
            entropy_list.append([
                scipy.stats.entropy(
                    [crf.tagger_.marginal(label, pos) for label in label_list])
                for pos in range(len(sample))
            ])

        # Select the string with the largest candidate score.
        candidate_score = [
            sum(entropies) / distance
            for entropies, distance in zip(entropy_list, distance_to_cluster)
        ]
        sort_idx = int(np.argmax(candidate_score))
        selected = train_set_new[sort_idx]

        # Start from pseudo-labels, then overwrite the low-confidence
        # (high-entropy) positions with the ground truth.
        y_sequence = crf.tagger_.tag(sent2features(selected))
        y_sequence_truth = sent2labels(selected)
        entropy_tmp = np.asarray(entropy_list[sort_idx])
        std_entropy_tmp = entropy_tmp.std()
        if std_entropy_tmp > 0:
            z_score = (entropy_tmp - entropy_tmp.mean()) / std_entropy_tmp
        else:
            # Constant entropy: no position stands out, nothing is queried.
            z_score = np.zeros(len(entropy_tmp))
        for pos, z in enumerate(z_score):
            if z > 0.1:
                count += 1
                y_sequence[pos] = y_sequence_truth[pos]
        label_count[num_training] = count

        # Move the selected sample from the pool to the training set.
        # Deleting by index (not by value) keeps all pool lists aligned even
        # when the pool contains equal samples.
        train_set_current.append(selected)
        X_train_current.append(sent2features(selected))
        y_train_current.append(y_sequence)
        del train_set_new[sort_idx]
        del train_string_new[sort_idx]
        del distance_to_cluster[sort_idx]

        # Train the CRF.
        crf = sklearn_crfsuite.CRF(**crf_params)
        crf.fit(X_train_current, y_train_current)

        # Use the estimator.
        y_pred = crf.predict(X_test)
        phrase_count, phrase_correct, out_count, out_correct = utils.phrase_acc(
            y_test, y_pred)
        phrase_acc[num_training] = phrase_correct / phrase_count
        out_acc[num_training] = out_correct / out_count

    return phrase_acc, out_acc, label_count
def cv_edit_active_learn(args):
    """Run budgeted partial-label active learning driven by the test set.

    A CRF is first trained on the first two training samples, whose
    characters all count as manually labeled.  In every round:

    1. test strings are ranked by mean per-position marginal entropy and the
       three least confident ones are kept;
    2. the pool string with the smallest value returned by
       ``utils.avr_edit_distance`` against those three is selected;
    3. the selected sample is pseudo-labeled by the CRF, and the
       ``ceil(len / 2)`` positions with the highest entropy z-score are
       overwritten with ground truth, capped by the remaining label budget
       (100 labels initially, raised by 50 each time it is reached);
    4. the sample joins the training set, the CRF is retrained and scored.

    Args:
        args: dict with the keys
            'train_idx', 'test_idx': indices into ``dataset`` / ``strings``;
            'dataset': sequences accepted by ``sent2features`` and
                ``sent2labels`` (defined elsewhere in this file);
            'strings': raw strings aligned with ``dataset``;
            'max_samples_batch': number of active-learning rounds.
            A 'batch_size' entry may be present but is ignored.

    Returns:
        Tuple ``(phrase_acc, out_acc, label_count, pseudo_acc)`` of arrays of
        length ``max_samples_batch + 1``; index 0 describes the initial
        model.  ``pseudo_acc`` is the fraction of manually labeled positions
        in a round where the CRF prediction already matched the ground truth
        (1 when nothing was labeled).

    Raises:
        ValueError: if the initial training samples already use up the
            initial label budget.
    """
    # Read the input args.
    train_idx = args['train_idx']
    test_idx = args['test_idx']
    dataset = args['dataset']
    strings = args['strings']
    max_samples_batch = args['max_samples_batch']

    # Initialize arrays to store results.
    phrase_acc = np.zeros([max_samples_batch + 1])
    out_acc = np.zeros([max_samples_batch + 1])
    label_count = np.zeros([max_samples_batch + 1])
    pseudo_acc = np.zeros([max_samples_batch + 1])

    # Define training set and testing set and corresponding original strings.
    train_set = [dataset[i] for i in train_idx]
    test_set = [dataset[i] for i in test_idx]
    train_string = [strings[i] for i in train_idx]
    test_string = [strings[i] for i in test_idx]

    # Initial labeled set and the unlabeled pool (index-aligned lists).
    initial_size = 2
    train_set_current = train_set[:initial_size]
    train_set_new = train_set[initial_size:]
    train_string_new = train_string[initial_size:]
    count = sum(len(s) for s in train_string[:initial_size])

    # Obtain testing features and labels.
    X_test = [sent2features(s) for s in test_set]
    y_test = [sent2labels(s) for s in test_set]

    # Train a CRF using the current training set.
    crf_params = dict(algorithm='lbfgs',
                      c1=0.1,
                      c2=0.1,
                      max_iterations=100,
                      all_possible_transitions=True)
    X_train_current = [sent2features(s) for s in train_set_current]
    y_train_current = [sent2labels(s) for s in train_set_current]
    crf = sklearn_crfsuite.CRF(**crf_params)
    crf.fit(X_train_current, y_train_current)

    # Use the estimator.
    y_pred = crf.predict(X_test)
    phrase_count, phrase_correct, out_count, out_correct = utils.phrase_acc(
        y_test, y_pred)
    phrase_acc[0] = phrase_correct / phrase_count
    out_acc[0] = out_correct / out_count
    label_count[0] = count
    pseudo_acc[0] = 1  # There is no pseudo-label at the beginning.

    # Label budget: fail early instead of continuing without a threshold.
    initial_budget = 100
    budget_step = 50
    if count >= initial_budget:
        raise ValueError(
            'Error: initial budget is less than initial number of labels.')
    label_threshold = initial_budget

    for num_training in range(max_samples_batch):
        label_list = crf.tagger_.labels()

        # Mean marginal entropy of every test string under the current CRF.
        entropy_mean = []
        for sample, features in zip(test_set, X_test):
            crf.tagger_.set(features)
            entropy_seq = [
                scipy.stats.entropy(
                    [crf.tagger_.marginal(label, pos) for label in label_list])
                for pos in range(len(sample))
            ]
            entropy_mean.append(sum(entropy_seq) / len(entropy_seq))

        # Least confident test strings first (stable sort keeps ties ordered).
        sort_idx_temp = np.argsort(-np.array(entropy_mean),
                                   kind='mergesort').tolist()

        # Select the pool string with the minimum average distance to the
        # three least confident test strings.
        temp_set = [test_string[i] for i in sort_idx_temp[:3]]
        distance = utils.avr_edit_distance(temp_set, train_string_new, True)
        sort_idx = int(np.argmin(distance))
        selected = train_set_new[sort_idx]
        len_ptname = len(selected)

        # Generate pseudo-labels first; tag() also loads the sequence into
        # the tagger, so the marginals below refer to the selected sample.
        y_sequence = crf.tagger_.tag(sent2features(selected))
        y_sequence_truth = sent2labels(selected)
        entropy_tmp = np.asarray([
            scipy.stats.entropy(
                [crf.tagger_.marginal(label, pos) for label in label_list])
            for pos in range(len_ptname)
        ])
        std_entropy_tmp = entropy_tmp.std()
        if std_entropy_tmp > 0:
            z_score = (entropy_tmp - entropy_tmp.mean()) / std_entropy_tmp
        else:
            # Constant entropy: all positions rank equally.
            z_score = np.zeros(len_ptname)

        # Positions ordered from highest to lowest z-score.
        ranked_positions = np.argsort(z_score,
                                      kind='mergesort')[::-1].tolist()

        # Label the top half of the positions, but never exceed the budget.
        num_to_label = min(int(math.ceil(len_ptname / 2.0)),
                           label_threshold - count)
        pseudo_label_total = 0
        pseudo_label_correct = 0
        for pos in ranked_positions[:max(num_to_label, 0)]:
            count += 1
            if y_sequence[pos] == y_sequence_truth[pos]:
                pseudo_label_correct += 1
            y_sequence[pos] = y_sequence_truth[pos]
            pseudo_label_total += 1

        # Budget reached: grant the next instalment.
        if count == label_threshold:
            label_threshold += budget_step

        label_count[num_training + 1] = count
        if pseudo_label_total != 0:
            pseudo_acc[num_training + 1] = (pseudo_label_correct /
                                            pseudo_label_total)
        else:
            pseudo_acc[num_training + 1] = 1

        # Move the selected sample from the pool to the training set.
        # Deleting by index keeps both pool lists aligned even when the pool
        # contains equal samples or equal strings.
        train_set_current.append(selected)
        X_train_current.append(sent2features(selected))
        y_train_current.append(y_sequence)
        del train_set_new[sort_idx]
        del train_string_new[sort_idx]

        # Train the CRF.
        crf = sklearn_crfsuite.CRF(**crf_params)
        crf.fit(X_train_current, y_train_current)

        # Use the estimator.
        y_pred = crf.predict(X_test)
        phrase_count, phrase_correct, out_count, out_correct = utils.phrase_acc(
            y_test, y_pred)
        phrase_acc[num_training + 1] = phrase_correct / phrase_count
        out_acc[num_training + 1] = out_correct / out_count

    return phrase_acc, out_acc, label_count, pseudo_acc
def cv_edit_active_learn(args):
    """Run substring-level active learning with revisits of pooled samples.

    A CRF is first trained on the first ten training samples.  In every
    round:

    1. the test sample with the lowest length-normalised sequence
       probability is found;
    2. every pool sample is scored as (mean marginal entropy over its not
       yet manually labeled positions) * (cosine similarity to that test
       sample), and the highest-scoring one is selected;
    3. on a first visit, the up to three best-scoring substrings (mean
       entropy inside minus mean entropy outside) are manually labeled, the
       remaining positions keep CRF pseudo-labels, and the sample is added
       to the training set while staying in the pool;
       on a revisit, the remaining pseudo-labels are replaced by ground
       truth and the sample leaves the pool;
    4. pseudo-labels of all partially labeled training samples are refreshed
       with the current CRF, which is then retrained and scored.

    Args:
        args: dict with the keys
            'train_idx', 'test_idx': indices into ``dataset`` / ``strings``;
            'dataset': sequences accepted by ``sent2features`` and
                ``sent2labels`` (defined elsewhere in this file);
            'strings': raw strings aligned with ``dataset``;
            'max_samples_batch': number of active-learning rounds.
            A 'batch_size' entry may be present but is ignored.

    Returns:
        Tuple ``(phrase_acc, out_acc, label_count, pseudo_acc)`` of arrays of
        length ``max_samples_batch + 1``; index 0 describes the initial
        model.  ``pseudo_acc`` is the fraction of manually labeled positions
        where the CRF prediction already matched the ground truth; it is
        only written on first visits and stays 0 for revisit rounds.
    """
    # Read the input args.
    train_idx = args['train_idx']
    test_idx = args['test_idx']
    dataset = args['dataset']
    strings = args['strings']
    max_samples_batch = args['max_samples_batch']

    # Initialize arrays to store results.
    phrase_acc = np.zeros([max_samples_batch + 1])
    out_acc = np.zeros([max_samples_batch + 1])
    label_count = np.zeros([max_samples_batch + 1])
    pseudo_acc = np.zeros([max_samples_batch + 1])

    # Define training set and testing set and corresponding original strings.
    train_set = [dataset[i] for i in train_idx]
    test_set = [dataset[i] for i in test_idx]
    train_string = [strings[i] for i in train_idx]
    test_string = [strings[i] for i in test_idx]

    # Initial labeled set and the unlabeled pool.  train_set_new,
    # train_string_new and the rows of sim_matrix stay index-aligned.
    initial_size = 10
    train_set_current = train_set[:initial_size]
    train_set_new = train_set[initial_size:]
    train_string_new = train_string[initial_size:]
    count = sum(len(s) for s in train_string[:initial_size])

    # Obtain testing features and labels.
    X_test = [sent2features(s) for s in test_set]
    y_test = [sent2labels(s) for s in test_set]

    # Train a CRF using the current training set.
    crf_params = dict(algorithm='lbfgs',
                      c1=0.1,
                      c2=0.1,
                      max_iterations=100,
                      all_possible_transitions=True)
    X_train_current = [sent2features(s) for s in train_set_current]
    y_train_current = [sent2labels(s) for s in train_set_current]
    crf = sklearn_crfsuite.CRF(**crf_params)
    crf.fit(X_train_current, y_train_current)

    # Use the estimator.
    y_pred = crf.predict(X_test)
    phrase_count, phrase_correct, out_count, out_correct = utils.phrase_acc(
        y_test, y_pred)
    phrase_acc[0] = phrase_correct / phrase_count
    out_acc[0] = out_correct / out_count
    label_count[0] = count
    pseudo_acc[0] = 1  # There is no pseudo-label at the beginning.

    # Vectorize test and pool strings together.
    vec, _ = utils.string_vectorize(test_string + train_string_new)
    test_vec = vec[:len(test_string)].tolist()
    train_new_vec = vec[len(test_string):].tolist()

    # Pre-calculate the pool-vs-test cosine similarity once; this pays off
    # when the number of iterations is large.
    sim_matrix = np.zeros((len(train_new_vec), len(test_vec)))
    for i, pool_vector in enumerate(train_new_vec):
        for j, test_vector in enumerate(test_vec):
            sim_matrix[i, j] = 1 - spatial.distance.cosine(
                pool_vector, test_vector)

    len_test = len(test_set)

    # The budget is only reported in this variant; it is not enforced below.
    initial_budget = 100
    if count >= initial_budget:
        print('Error: initial budget is less than initial number of labels.')

    # Parallel tracking lists for partially labeled samples:
    new_instance_idx = []  # index of the sample in the training set
    pseudo_label_idx = []  # positions that still carry pseudo-labels
    visited_idx = []  # index of the sample in the unlabeled pool

    for num_training in range(max_samples_batch):
        label_list = crf.tagger_.labels()

        # Confidence on the test set: length-normalised probability of the
        # predicted label sequence.
        prob_list = []
        for i in range(len_test):
            y_sequence = crf.tagger_.tag(X_test[i])
            prob_norm = math.exp(
                math.log(crf.tagger_.probability(y_sequence)) /
                len(test_string[i]))
            prob_list.append(prob_norm)

        # Least confident test samples first.
        sort_idx_temp = np.argsort(np.array(prob_list),
                                   kind='mergesort').tolist()

        # Average similarity between each pool sample and the selected
        # (least confident) test samples.
        group_size = 1
        distance = np.sum(sim_matrix[:, sort_idx_temp[:group_size]],
                          axis=1) / group_size

        # Mean entropy per pool sample; for already visited samples only the
        # positions that are still pseudo-labeled are considered.
        entropy_list_mean = []
        for pool_idx, sample in enumerate(train_set_new):
            crf.tagger_.set(sent2features(sample))
            if pool_idx in visited_idx:
                positions = pseudo_label_idx[visited_idx.index(pool_idx)]
            else:
                positions = range(len(sample))
            entropy_seq = [
                scipy.stats.entropy(
                    [crf.tagger_.marginal(label, pos) for label in label_list])
                for pos in positions
            ]
            entropy_list_mean.append(sum(entropy_seq) / len(entropy_seq))

        # NOTE(review): a similarity of exactly 0 yields the maximum score,
        # so such samples are picked first.  This looks like a leftover from
        # a score that divided by a distance - confirm it is intended.
        candidate_score = [
            sys.float_info.max if similarity == 0 else
            mean_entropy * similarity
            for mean_entropy, similarity in zip(entropy_list_mean, distance)
        ]

        # Best candidate; the last entry of a stable ascending sort, so ties
        # resolve to the highest pool index.
        sort_idx = int(np.argsort(candidate_score, kind='mergesort')[-1])

        if sort_idx in visited_idx:
            # Revisit: replace the remaining pseudo-labels by manual labels.
            revisit_idx_re = visited_idx.index(sort_idx)  # tracking record
            revisit_idx_tr = new_instance_idx[revisit_idx_re]  # training set
            y_train_current[revisit_idx_tr] = sent2labels(
                train_set_current[revisit_idx_tr])

            # The sample is now fully labeled: drop it from the pool.
            del train_set_new[sort_idx]
            del train_string_new[sort_idx]
            sim_matrix = np.delete(sim_matrix, sort_idx, 0)

            # Update the tracking record and shift the pool indices that
            # followed the removed sample.
            count += len(pseudo_label_idx[revisit_idx_re])
            del new_instance_idx[revisit_idx_re]
            del pseudo_label_idx[revisit_idx_re]
            del visited_idx[revisit_idx_re]
            visited_idx = [
                idx - 1 if idx > sort_idx else idx for idx in visited_idx
            ]
            label_count[num_training + 1] = count
        else:
            # First visit: exhaustive search through all substrings.
            visited_idx.append(sort_idx)
            selected = train_set_new[sort_idx]
            len_ptname = len(selected)

            # Generate pseudo-labels first; tag() also loads the sequence
            # into the tagger for the marginals below.
            y_sequence = crf.tagger_.tag(sent2features(selected))
            candidate_entropy_list = [
                scipy.stats.entropy(
                    [crf.tagger_.marginal(label, pos) for label in label_list])
                for pos in range(len_ptname)
            ]

            # Score substrings [i, j) of length >= 2 that end before the last
            # position: mean entropy inside minus mean entropy outside.
            total_entropy = sum(candidate_entropy_list)
            substring_score = {}
            for i in range(len_ptname - 1):
                for j in range(i + 2, len_ptname):
                    inside = sum(candidate_entropy_list[i:j])
                    selected_entropy = inside / (j - i)
                    rest_entropy = (total_entropy - inside) / (len_ptname -
                                                               (j - i))
                    substring_score[(i, j)] = selected_entropy - rest_entropy

            # Rank the substrings based on their scores in descending order.
            sorted_substring_score = sorted(substring_score.items(),
                                            key=operator.itemgetter(1))
            sorted_substring_score.reverse()

            # Manually label the union of the (up to) three best substrings.
            # Short samples have fewer than three candidates.
            label_positions = set()
            for (start, stop), _ in sorted_substring_score[:3]:
                label_positions.update(range(start, stop))
            label_index = sorted(label_positions)
            pseudo_index = [
                pos for pos in range(len_ptname) if pos not in label_positions
            ]
            pseudo_label_idx.append(pseudo_index)

            # Overwrite the pseudo-labels at the selected positions.
            y_sequence_truth = sent2labels(selected)
            pseudo_label_total = 0
            pseudo_label_correct = 0
            for pos in label_index:
                count += 1
                if y_sequence[pos] == y_sequence_truth[pos]:
                    pseudo_label_correct += 1
                y_sequence[pos] = y_sequence_truth[pos]
                pseudo_label_total += 1
            label_count[num_training + 1] = count
            if pseudo_label_total != 0:
                pseudo_acc[num_training + 1] = (pseudo_label_correct /
                                                pseudo_label_total)
            else:
                pseudo_acc[num_training + 1] = 1

            # Add the sample to the training set; it stays in the pool so it
            # can be revisited later.
            new_instance_idx.append(len(y_train_current))
            train_set_current.append(selected)
            X_train_current.append(sent2features(selected))
            y_train_current.append(y_sequence)

        # Refresh the pseudo labels of partially labeled training samples
        # using the current CRF.
        for train_pos, pseudo_positions in zip(new_instance_idx,
                                               pseudo_label_idx):
            refreshed = crf.tagger_.tag(X_train_current[train_pos])
            for pos in pseudo_positions:
                y_train_current[train_pos][pos] = refreshed[pos]

        # Train the CRF.
        crf = sklearn_crfsuite.CRF(**crf_params)
        crf.fit(X_train_current, y_train_current)

        # Use the estimator.
        y_pred = crf.predict(X_test)
        phrase_count, phrase_correct, out_count, out_correct = utils.phrase_acc(
            y_test, y_pred)
        phrase_acc[num_training + 1] = phrase_correct / phrase_count
        out_acc[num_training + 1] = out_correct / out_count

    return phrase_acc, out_acc, label_count, pseudo_acc
def cv_edit_active_learn(args):
    """Run information-density active learning with fully labeled samples.

    A CRF is first trained on the first ten training samples.  In every
    round each pool sample is scored as

        (1 - length-normalised probability of its predicted label sequence)
        * (mean cosine similarity to the pool samples)

    and the ``batch_size`` best-scoring samples are moved, with all their
    ground-truth labels, to the training set.  The CRF is then retrained and
    scored on the test set.

    Args:
        args: dict with the keys
            'train_idx', 'test_idx': indices into ``dataset`` / ``strings``;
            'dataset': sequences accepted by ``sent2features`` and
                ``sent2labels`` (defined elsewhere in this file);
            'strings': raw strings aligned with ``dataset``;
            'max_samples_batch': number of active-learning rounds;
            'batch_size': number of samples selected per round.

    Returns:
        Tuple ``(phrase_acc, out_acc, label_count)`` of arrays of length
        ``max_samples_batch + 1``; index 0 describes the initial model.
        ``label_count`` is the cumulative number of labeled positions.
    """
    # Read the input args.
    train_idx = args['train_idx']
    test_idx = args['test_idx']
    dataset = args['dataset']
    strings = args['strings']
    max_samples_batch = args['max_samples_batch']
    batch_size = args['batch_size']

    phrase_acc = np.zeros([max_samples_batch + 1])
    out_acc = np.zeros([max_samples_batch + 1])
    label_count = np.zeros([max_samples_batch + 1])

    # Define training set and testing set.
    train_set = [dataset[i] for i in train_idx]
    test_set = [dataset[i] for i in test_idx]
    train_string = [strings[i] for i in train_idx]

    # Initial labeled set and the unlabeled pool.  train_set_new,
    # train_string_new and both axes of sim_matrix stay index-aligned.
    initial_size = 10
    train_set_current = train_set[:initial_size]
    train_set_new = train_set[initial_size:]
    train_string_new = train_string[initial_size:]
    label_count[0] = sum(len(s) for s in train_string[:initial_size])

    # Obtain testing features and labels.
    X_test = [sent2features(s) for s in test_set]
    y_test = [sent2labels(s) for s in test_set]

    # Train a CRF using the current training set.
    crf_params = dict(algorithm='lbfgs',
                      c1=0.1,
                      c2=0.1,
                      max_iterations=100,
                      all_possible_transitions=True)
    X_train_current = [sent2features(s) for s in train_set_current]
    y_train_current = [sent2labels(s) for s in train_set_current]
    crf = sklearn_crfsuite.CRF(**crf_params)
    crf.fit(X_train_current, y_train_current)

    # Use the estimator.
    y_pred = crf.predict(X_test)
    phrase_count, phrase_correct, out_count, out_correct = utils.phrase_acc(
        y_test, y_pred)
    phrase_acc[0] = phrase_correct / phrase_count
    out_acc[0] = out_correct / out_count

    # Vectorize the unlabeled set.
    vec, _ = utils.string_vectorize(train_string_new)
    vec = vec.tolist()

    # Pre-calculate the symmetric pool-vs-pool cosine similarity; the
    # diagonal (self-similarity) is left at 0.
    ini_unlabeled_size = len(vec)
    sim_matrix = np.zeros((ini_unlabeled_size, ini_unlabeled_size))
    for i in range(ini_unlabeled_size):
        for j in range(i):
            sim_matrix[i, j] = 1 - spatial.distance.cosine(vec[i], vec[j])
            sim_matrix[j, i] = sim_matrix[i, j]

    for num_training in range(max_samples_batch):
        # Uncertainty of every pool sample under the current CRF:
        # 1 - length-normalised probability of the predicted sequence.
        len_train_new = len(train_set_new)
        prob_list = []
        for sample, text in zip(train_set_new, train_string_new):
            y_sequence = crf.tagger_.tag(sent2features(sample))
            prob_norm = math.exp(
                math.log(crf.tagger_.probability(y_sequence)) / len(text))
            prob_list.append(1 - prob_norm)

        # Average similarity of each pool sample to the pool.
        sim_list = np.sum(sim_matrix, axis=0) / len_train_new

        # Information density, highest first (stable sort keeps tie order).
        info_den = np.array(prob_list) * sim_list
        sort_idx = np.argsort(-info_den, kind='mergesort').tolist()

        # Selected pool indices, highest index first so that deleting one
        # entry does not shift the indices still to be processed.
        idx_to_remove = sorted(sort_idx[:batch_size], reverse=True)

        # Every position of every selected sample is labeled.
        label_count[num_training + 1] = label_count[num_training] + sum(
            len(train_set_new[idx]) for idx in idx_to_remove)

        # Move the selection from the pool to the training set.  The strings
        # are removed as well: they supply the length used to normalise the
        # sequence probability above and must stay aligned with the samples.
        for idx in idx_to_remove:
            train_set_current.append(train_set_new[idx])
            del train_set_new[idx]
            del train_string_new[idx]
        sim_matrix = np.delete(np.delete(sim_matrix, idx_to_remove, 0),
                               idx_to_remove, 1)

        # Obtain current training features.
        X_train_current = [sent2features(s) for s in train_set_current]
        y_train_current = [sent2labels(s) for s in train_set_current]

        # Train the CRF.
        crf = sklearn_crfsuite.CRF(**crf_params)
        crf.fit(X_train_current, y_train_current)

        # Use the estimator.
        y_pred = crf.predict(X_test)
        phrase_count, phrase_correct, out_count, out_correct = utils.phrase_acc(
            y_test, y_pred)
        phrase_acc[num_training + 1] = phrase_correct / phrase_count
        out_acc[num_training + 1] = out_correct / out_count

    return phrase_acc, out_acc, label_count
def cv_edit_active_learn(args):
    """Run partial-label active learning where pool samples can be revisited.

    A CRF is first trained on the first two training samples.  In every
    round:

    1. test strings are ranked by the sum of their per-position marginal
       entropies and the two least confident ones are kept;
    2. the pool string with the smallest value returned by
       ``utils.avr_edit_distance`` against those two is selected;
    3. on a first visit the sample is pseudo-labeled by the CRF, positions
       with a positive entropy z-score receive ground truth, and the sample
       is added to the training set while staying in the pool;
       on a revisit the z-score is computed over the still unlabeled
       positions only, positive ones receive ground truth, and if none
       qualifies the first unlabeled position is labeled so that every
       revisit makes progress;
    4. samples whose positions are all manually labeled leave the pool;
    5. the CRF is retrained and scored on the test set.

    Args:
        args: dict with the keys
            'train_idx', 'test_idx': indices into ``dataset`` / ``strings``;
            'dataset': sequences accepted by ``sent2features`` and
                ``sent2labels`` (defined elsewhere in this file);
            'strings': raw strings aligned with ``dataset``;
            'max_samples_batch': number of active-learning rounds.
            A 'batch_size' entry may be present but is ignored.

    Returns:
        Tuple ``(phrase_acc, out_acc, label_count)`` of arrays of length
        ``max_samples_batch``; ``label_count`` is the cumulative number of
        manually labeled positions.
    """
    # Read the input args.
    train_idx = args['train_idx']
    test_idx = args['test_idx']
    dataset = args['dataset']
    strings = args['strings']
    max_samples_batch = args['max_samples_batch']

    # Initialize arrays to store results.
    phrase_acc = np.zeros([max_samples_batch])
    out_acc = np.zeros([max_samples_batch])
    label_count = np.zeros([max_samples_batch])
    count = 0

    # Define training set and testing set and corresponding original strings.
    train_set = [dataset[i] for i in train_idx]
    test_set = [dataset[i] for i in test_idx]
    train_string = [strings[i] for i in train_idx]
    test_string = [strings[i] for i in test_idx]

    # Initial labeled set and the unlabeled pool (index-aligned lists).
    initial_size = 2
    train_set_current = train_set[:initial_size]
    train_set_new = train_set[initial_size:]
    train_string_new = train_string[initial_size:]

    # Obtain testing features and labels.
    X_test = [sent2features(s) for s in test_set]
    y_test = [sent2labels(s) for s in test_set]

    # Train a CRF using the current training set.
    crf_params = dict(algorithm='lbfgs',
                      c1=0.1,
                      c2=0.1,
                      max_iterations=100,
                      all_possible_transitions=True)
    X_train_current = [sent2features(s) for s in train_set_current]
    y_train_current = [sent2labels(s) for s in train_set_current]
    crf = sklearn_crfsuite.CRF(**crf_params)
    crf.fit(X_train_current, y_train_current)

    # Parallel tracking lists for partially labeled samples:
    labeled_instance = []  # index of the sample in the unlabeled pool
    labeled_positions = []  # sorted manually labeled positions
    labeled_istance_train = []  # index of the sample in the training set

    def z_scores(values):
        """Return z-scores of ``values``; all zeros when they do not vary."""
        values = np.asarray(values)
        std = values.std()
        if std > 0:
            return ((values - values.mean()) / std).tolist()
        return [0.0] * len(values)

    for num_training in range(max_samples_batch):
        label_list = crf.tagger_.labels()

        # Total marginal entropy of every test string under the current CRF.
        entropy_sum = []
        for sample, features in zip(test_set, X_test):
            crf.tagger_.set(features)
            entropy_sum.append(
                sum(
                    scipy.stats.entropy([
                        crf.tagger_.marginal(label, pos)
                        for label in label_list
                    ]) for pos in range(len(sample))))

        # Least confident test strings first.
        sort_idx_temp = np.argsort(-np.array(entropy_sum),
                                   kind='mergesort').tolist()

        # Select the pool string with the minimum average distance to the
        # two least confident test strings.
        temp_set = [test_string[i] for i in sort_idx_temp[:2]]
        distance = utils.avr_edit_distance(temp_set, train_string_new, True)
        sort_idx = int(np.argmin(distance))
        selected = train_set_new[sort_idx]
        len_ptname = len(selected)
        revisit_flag = sort_idx in labeled_instance

        # Generate pseudo-labels first; tag() also loads the sequence into
        # the tagger, so the marginals below refer to the selected sample.
        y_sequence = crf.tagger_.tag(sent2features(selected))
        entropy_tmp = [
            scipy.stats.entropy(
                [crf.tagger_.marginal(label, pos) for label in label_list])
            for pos in range(len_ptname)
        ]
        y_sequence_truth = sent2labels(selected)

        if revisit_flag:
            record = labeled_instance.index(sort_idx)
            done_positions = labeled_positions[record]
            open_positions = [
                pos for pos in range(len_ptname) if pos not in done_positions
            ]

            # Z-score over the still unlabeled positions only; a single
            # remaining position is always labeled.
            if len(open_positions) == 1:
                open_z = [100]
            else:
                open_z = z_scores([entropy_tmp[pos] for pos in open_positions])
            new_positions = [
                pos for pos, z in zip(open_positions, open_z) if z > 0
            ]
            # Guarantee progress when no position stands out.
            if not new_positions and open_positions:
                new_positions = [open_positions[0]]

            # Keep the labels stored in the training set and only overwrite
            # the newly queried positions.
            train_position = labeled_istance_train[record]
            y_sequence = y_train_current[train_position]
            for pos in new_positions:
                y_sequence[pos] = y_sequence_truth[pos]
            count += len(new_positions)
            labeled_positions[record] = sorted(done_positions + new_positions)
            y_train_current[train_position] = y_sequence
        else:
            new_positions = [
                pos for pos, z in enumerate(z_scores(entropy_tmp)) if z > 0
            ]
            for pos in new_positions:
                y_sequence[pos] = y_sequence_truth[pos]
            count += len(new_positions)

            # Register the sample and add it to the training set; it stays
            # in the pool so it can be revisited.
            labeled_instance.append(sort_idx)
            labeled_positions.append(new_positions)
            labeled_istance_train.append(len(y_train_current))
            train_set_current.append(selected)
            X_train_current.append(sent2features(selected))
            y_train_current.append(y_sequence)

        label_count[num_training] = count

        # Remove the fully labeled instances from the unlabeled set.  Records
        # are processed from the back so deleting one does not shift the
        # record indices still to be handled.
        finished_records = [
            record for record, positions in enumerate(labeled_positions)
            if len(positions) == len(train_set_new[labeled_instance[record]])
        ]
        for record in reversed(finished_records):
            pool_idx = labeled_instance[record]
            del labeled_instance[record]
            del labeled_positions[record]
            del labeled_istance_train[record]
            del train_set_new[pool_idx]
            del train_string_new[pool_idx]
            # Pool indices after the removed sample move down by one.
            labeled_instance = [
                idx - 1 if idx > pool_idx else idx for idx in labeled_instance
            ]

        # Train the CRF.
        crf = sklearn_crfsuite.CRF(**crf_params)
        crf.fit(X_train_current, y_train_current)

        # Use the estimator.
        y_pred = crf.predict(X_test)
        phrase_count, phrase_correct, out_count, out_correct = utils.phrase_acc(
            y_test, y_pred)
        phrase_acc[num_training] = phrase_correct / phrase_count
        out_acc[num_training] = out_correct / out_count

    return phrase_acc, out_acc, label_count
def _normalized_sequence_probability(tagger, features, length):
    """Return the length-normalized probability of the tagger's best labeling.

    The sequence is tagged with ``tagger.tag`` and the probability of that
    labeling is normalized as ``p ** (1 / length)`` so that sequences of
    different lengths are comparable.

    Args:
        tagger: Fitted tagger exposing ``tag`` and ``probability`` (here
            ``crf.tagger_``).
        features: Feature sequence of the sample to tag.
        length: Normalization length (the callers pass the length of the
            sample's raw string).

    Returns:
        The normalized probability, or 0.0 when the tagger reports a
        probability that is not positive.
    """
    y_sequence = tagger.tag(features)
    probability = tagger.probability(y_sequence)
    if probability <= 0:
        # math.log raises ValueError for 0; treat it as zero confidence.
        return 0.0
    return math.exp(math.log(probability) / length)


def cv_edit_active_learn(args):
    """Run pool-based active learning for a CRF and track test accuracy.

    The first ``initial_size`` training samples seed the model. In each of
    the ``max_samples_batch`` rounds the ``batch_size`` lowest-scoring pool
    samples are moved into the training set, the CRF is retrained and then
    evaluated on the test set. A pool sample's score is its length-normalized
    tagging probability divided by its average cosine similarity to the test
    samples the current model is least confident about, so low confidence
    and high similarity to "hard" test samples are both preferred.

    Args:
        args: Mapping with the keys
            'train_idx' / 'test_idx': indices into ``dataset`` and ``strings``;
            'dataset': samples accepted by ``sent2features``/``sent2labels``;
            'strings': raw strings, indexed in parallel with ``dataset``;
            'max_samples_batch': number of active-learning rounds;
            'batch_size': number of pool samples labeled per round.

    Returns:
        Tuple ``(phrase_acc, out_acc, label_count)`` of arrays with
        ``max_samples_batch + 1`` entries each. Index 0 describes the initial
        model and index ``k`` the model after round ``k``. ``label_count`` is
        the cumulative sum of the lengths of all labeled samples.
    """
    # Read the input args.
    train_idx = args['train_idx']
    test_idx = args['test_idx']
    dataset = args['dataset']
    strings = args['strings']
    max_samples_batch = args['max_samples_batch']
    batch_size = args['batch_size']

    # One extra slot: index 0 stores the results of the initial model.
    phrase_acc = np.zeros([max_samples_batch + 1])
    out_acc = np.zeros([max_samples_batch + 1])
    label_count = np.zeros([max_samples_batch + 1])

    # Define training set and testing set.
    train_set = [dataset[i] for i in train_idx]
    test_set = [dataset[i] for i in test_idx]
    train_string = [strings[i] for i in train_idx]
    test_string = [strings[i] for i in test_idx]

    # Define an initial actual training set from the training pool.
    initial_size = 10
    train_set_current = train_set[:initial_size]
    train_set_new = train_set[initial_size:]
    train_string_current = train_string[:initial_size]
    train_string_new = train_string[initial_size:]
    # Slicing keeps this valid when fewer than initial_size samples exist.
    label_count[0] = sum(len(s) for s in train_string[:initial_size])

    # Obtain testing features and labels.
    X_test = [sent2features(s) for s in test_set]
    y_test = [sent2labels(s) for s in test_set]

    # Train a CRF using the current training set.
    X_train_current = [sent2features(s) for s in train_set_current]
    y_train_current = [sent2labels(s) for s in train_set_current]
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True
    )
    crf.fit(X_train_current, y_train_current)

    # Evaluate the initial model.
    y_pred = crf.predict(X_test)
    phrase_count, phrase_correct, out_count, out_correct = utils.phrase_acc(
        y_test, y_pred)
    phrase_acc[0] = phrase_correct / phrase_count
    out_acc[0] = out_correct / out_count

    # Vectorize the test strings and the pool strings together so both
    # share one vector space.
    total_string = test_string[:]
    total_string.extend(train_string_new)
    vec, _ = utils.string_vectorize(total_string)
    test_vec = vec[:len(test_string)].tolist()
    train_new_vec = vec[len(test_string):].tolist()

    # Pre-calculate the pool-vs-test cosine similarity once; rows are
    # deleted in step with the pool so row i always matches train_set_new[i].
    sim_matrix = np.zeros((len(train_new_vec), len(test_vec)))
    for i in range(len(train_new_vec)):
        for j in range(len(test_vec)):
            sim_matrix[i, j] = 1 - spatial.distance.cosine(
                train_new_vec[i], test_vec[j])

    # Pool features are computed once and kept aligned with train_set_new
    # instead of being rebuilt in every round.
    X_train_new = [sent2features(s) for s in train_set_new]

    len_test = len(test_set)
    # Number of least-confident test samples used as the similarity target.
    group_size = 80

    for num_training in range(max_samples_batch):
        selected = []
        if train_set_new:
            tagger = crf.tagger_

            # Confidence of the current CRF on every test sample.
            prob_list = [
                _normalized_sequence_probability(
                    tagger, X_test[i], len(test_string[i]))
                for i in range(len_test)
            ]

            # Test samples ordered from least to most confident.
            sort_idx_temp = np.argsort(
                np.array(prob_list), kind='mergesort').tolist()
            least_confident = sort_idx_temp[:group_size]

            # Average similarity of each pool sample to those test samples.
            # Divide by the number actually selected, which is smaller than
            # group_size for small test sets.
            avr_sim = (np.sum(sim_matrix[:, least_confident], axis=1)
                       / max(len(least_confident), 1))

            # Confidence of the current CRF on every pool sample.
            prob_list_candidate = [
                _normalized_sequence_probability(
                    tagger, X_train_new[i], len(train_string_new[i]))
                for i in range(len(train_set_new))
            ]

            # Low confidence and high similarity give a low score; a zero
            # similarity pushes the sample to the very end of the ranking.
            candidate_score = [
                sys.float_info.max if sim == 0 else prob / sim
                for prob, sim in zip(prob_list_candidate, avr_sim)
            ]
            sort_idx = np.argsort(candidate_score, kind='mergesort').tolist()

            # Slicing yields fewer than batch_size indices when the pool is
            # nearly exhausted.
            selected = sorted(sort_idx[:batch_size])

        # Count the labels of every sample in the batch, not only the first.
        label_count[num_training + 1] = label_count[num_training] + sum(
            len(train_set_new[i]) for i in selected)

        if selected:
            # Move the batch from the pool to the training set. Going from
            # the highest index down keeps the remaining indices valid.
            for idx in reversed(selected):
                sample = train_set_new.pop(idx)
                train_set_current.append(sample)
                X_train_current.append(X_train_new.pop(idx))
                y_train_current.append(sent2labels(sample))
                train_string_current.append(train_string_new.pop(idx))
            sim_matrix = np.delete(sim_matrix, selected, 0)

            # Retrain the CRF on the enlarged training set.
            crf = sklearn_crfsuite.CRF(
                algorithm='lbfgs',
                c1=0.1,
                c2=0.1,
                max_iterations=100,
                all_possible_transitions=True
            )
            crf.fit(X_train_current, y_train_current)

        # Evaluate the current model (unchanged if the pool was empty).
        y_pred = crf.predict(X_test)
        phrase_count, phrase_correct, out_count, out_correct = \
            utils.phrase_acc(y_test, y_pred)
        phrase_acc[num_training + 1] = phrase_correct / phrase_count
        out_acc[num_training + 1] = out_correct / out_count

    return phrase_acc, out_acc, label_count
def cv_edit_active_learn(args):
    """Run pool-based active learning for a CRF using test-set clusters.

    The first two training samples seed the model. The test strings are
    clustered with k-means, and every pool sample gets a fixed "distance"
    equal to the cluster-size-weighted sum of its Euclidean distances to all
    cluster centers. In each of the ``max_samples_batch`` rounds the
    ``batch_size`` pool samples with the lowest ratio
    ``tagging probability / distance`` are moved into the training set, the
    CRF is retrained and then evaluated on the test set.

    Args:
        args: Mapping with the keys
            'train_idx' / 'test_idx': indices into ``dataset`` and ``strings``;
            'dataset': samples accepted by ``sent2features``/``sent2labels``;
            'strings': raw strings, indexed in parallel with ``dataset``;
            'max_samples_batch': number of active-learning rounds;
            'batch_size': number of pool samples labeled per round.

    Returns:
        Tuple ``(phrase_acc, out_acc)`` of arrays with ``max_samples_batch``
        entries; entry ``k`` holds the test accuracy after round ``k``.
    """
    # Read the input args.
    train_idx = args['train_idx']
    test_idx = args['test_idx']
    dataset = args['dataset']
    strings = args['strings']
    max_samples_batch = args['max_samples_batch']
    batch_size = args['batch_size']

    phrase_acc = np.zeros([max_samples_batch])
    out_acc = np.zeros([max_samples_batch])

    # Define training set and testing set.
    train_set = [dataset[i] for i in train_idx]
    test_set = [dataset[i] for i in test_idx]
    train_string = [strings[i] for i in train_idx]
    test_string = [strings[i] for i in test_idx]

    # Define an initial actual training set from the training pool.
    train_set_current = train_set[:2]
    train_set_new = train_set[2:]
    train_string_current = train_string[:2]
    train_string_new = train_string[2:]

    # Obtain testing features and labels.
    X_test = [sent2features(s) for s in test_set]
    y_test = [sent2labels(s) for s in test_set]

    # Train a CRF using the current training set.
    X_train_current = [sent2features(s) for s in train_set_current]
    y_train_current = [sent2labels(s) for s in train_set_current]
    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               c1=0.1,
                               c2=0.1,
                               max_iterations=100,
                               all_possible_transitions=True)
    crf.fit(X_train_current, y_train_current)

    # Vectorize test and pool strings together, then cluster the test set.
    num_cluster = 5
    total_string = test_string[:]
    total_string.extend(train_string_new)
    vec, _ = utils.string_vectorize(total_string)
    test_vec = vec[:len(test_string)]
    train_new_vec = vec[len(test_string):].tolist()
    kmeans = KMeans(n_clusters=num_cluster, random_state=0).fit(test_vec)
    cluster_centers = kmeans.cluster_centers_
    cluster_labels = kmeans.labels_

    # Weight every cluster by the fraction of test samples it contains.
    cluster_size = np.bincount(cluster_labels, minlength=num_cluster)
    weight_cluster = cluster_size / cluster_size.sum()

    # Weighted distance to the cluster centers for each pool sample. The
    # list is kept aligned with train_set_new / train_string_new by index.
    distance_to_cluster = [
        sum(weight_cluster[j]
            * np.linalg.norm(sample_vec - cluster_centers[j])
            for j in range(num_cluster))
        for sample_vec in train_new_vec
    ]

    for num_training in range(max_samples_batch):
        selected = []
        if train_set_new:
            # Confidence of the current CRF on every pool sample. Features
            # come from the dataset samples, exactly as for training.
            tagger = crf.tagger_
            train_new_prob_list = np.zeros(len(train_set_new))
            for i, sample in enumerate(train_set_new):
                y_sequence = tagger.tag(sent2features(sample))
                train_new_prob_list[i] = tagger.probability(y_sequence)

            # Indicator combining confidence and distance; lowest goes first.
            train_indicator = [
                prob / dist
                for prob, dist in zip(train_new_prob_list,
                                      distance_to_cluster)
            ]
            sort_idx = np.argsort(train_indicator, kind='mergesort').tolist()

            # Slicing yields fewer than batch_size indices when the pool is
            # nearly exhausted.
            selected = sort_idx[:batch_size]

        if selected:
            # Add the batch to the training set in ranking order.
            for idx in selected:
                sample = train_set_new[idx]
                train_set_current.append(sample)
                X_train_current.append(sent2features(sample))
                y_train_current.append(sent2labels(sample))
                train_string_current.append(train_string_new[idx])

            # Remove the batch from the pool by index (not by value, which
            # would hit the wrong element when samples are duplicated).
            # Going from the highest index down keeps the rest valid.
            for idx in sorted(selected, reverse=True):
                del train_set_new[idx]
                del train_string_new[idx]
                del distance_to_cluster[idx]

            # Retrain the CRF on the enlarged training set.
            crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                       c1=0.1,
                                       c2=0.1,
                                       max_iterations=100,
                                       all_possible_transitions=True)
            crf.fit(X_train_current, y_train_current)

        # Evaluate the current model (unchanged if the pool was empty).
        y_pred = crf.predict(X_test)
        phrase_count, phrase_correct, out_count, out_correct = \
            utils.phrase_acc(y_test, y_pred)
        phrase_acc[num_training] = phrase_correct / phrase_count
        out_acc[num_training] = out_correct / out_count

    return phrase_acc, out_acc