def __init__(self, cnn_model_path, source_path, target_path, vocab_path,
             sent_len, labeled_save_dir):
    """
    :param cnn_model_path: Path to a trained cnn model.
    :param source_path: Path to instance data, the latter part of which will
        be labeled during active learning.
    :param target_path: Path to labels for the already labeled part of the
        data.
    :param vocab_path: Path to the vocab file.
    :param sent_len: Maximum sentence length in tokens.
    :param labeled_save_dir: Directory in which the labeled files will be
        stored.
    """
    unlabeled_data = util.read_data_unlabeled_part(
        source_path, target_path, sent_len, shuffle=False)
    self.unlabeled_data = np.array(unlabeled_data)
    self.data_size = self.unlabeled_data.shape[0]
    self.labeled_data, self.labeled_result = util.read_data_labeled_part(
        source_path, target_path, sent_len, shuffle=False)
    sentence_indices_input = self.unlabeled_data[:, :-2]
    self.vocab_path = vocab_path
    _, rev_vocab = preprocessing_util.initialize_vocabulary(vocab_path)
    self.sentence_input = preprocessing_util.indices_to_sentences(
        sentence_indices_input, rev_vocab)
    self.kp_indices_input = self.unlabeled_data[:, -2:]
    for i, sentence in enumerate(self.sentence_input):
        # Mark the two key phrases of interest in the current sentence with *.
        sentence[self.kp_indices_input[i, 0]] += '*'
        sentence[self.kp_indices_input[i, 1]] += '*'

    self.update_labeled_save_dir(labeled_save_dir)

    label_config = util.load_from_dump(
        os.path.join(cnn_model_path, 'flags.cPickle'))
    label_config['train_dir'] = cnn_model_path
    _, predicted_label = label(self.unlabeled_data, config=label_config)
    assert predicted_label.shape[0] == self.data_size
    # Subtract the per-row max before exponentiating for numerical stability;
    # this does not change the softmax result.
    predicted_label_exp = np.exp(
        predicted_label - np.max(predicted_label, axis=1, keepdims=True))
    predicted_label_softmax = predicted_label_exp / np.sum(
        predicted_label_exp, axis=1, keepdims=True)
    # Entropy is -sum(p * log p), so this is the *negative* entropy; sorting
    # it in ascending order therefore puts the most uncertain (highest
    # entropy) instances first. The epsilon guards against log(0).
    predicted_label_entropy = np.sum(np.multiply(
        predicted_label_softmax,
        np.log(predicted_label_softmax + 1e-12)), axis=1)
    # Ranking of which question should be asked first: entropy-based, most
    # uncertain instance first. (There may still be implementation errors
    # here.)
    self.predicted_label_entropy_argsort = np.argsort(
        predicted_label_entropy, axis=0).tolist()
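# A minimal, self-contained sketch of the entropy-based uncertainty ranking
# used above, assuming plain NumPy logits. `rank_by_entropy` is a hypothetical
# helper name, not part of this codebase; it computes the (positive) entropy
# and sorts descending, which is equivalent to sorting the negative entropy
# ascending as done in __init__.
import numpy as np

def rank_by_entropy(logits):
    """Return instance indices sorted most-uncertain-first by softmax entropy."""
    # Numerically stable softmax.
    exp = np.exp(logits - np.max(logits, axis=1, keepdims=True))
    softmax = exp / np.sum(exp, axis=1, keepdims=True)
    # Entropy H = -sum(p * log p); higher H means a less certain prediction.
    entropy = -np.sum(softmax * np.log(softmax + 1e-12), axis=1)
    return np.argsort(-entropy).tolist()

# The middle row below has near-uniform probabilities, so it ranks first:
# rank_by_entropy(np.array([[5., 0., 0.], [1., 1., 1.], [3., 0., 1.]]))
# -> [1, 2, 0]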
def main(argv=None):
    restore_param = util.load_from_dump(
        os.path.join(FLAGS.train_dir, 'flags.cPickle'))
    restore_param['train_dir'] = FLAGS.train_dir

    source_path = os.path.join(restore_param['data_dir'],
                               'test_cs_unlabeled_data_combined.txt')
    target_path = os.path.join(restore_param['data_dir'],
                               'test_cs_labels_combined.txt')
    vocab_path = os.path.join(restore_param['data_dir'],
                              'test_cs_vocab_combined')
    data = util.read_data_unlabeled_part(source_path, target_path,
                                         restore_param['sent_len'])

    x_input, actual_output = label(data, restore_param)
    # Subtract the per-row max before exponentiating for numerical stability;
    # this does not change the softmax result.
    actual_output_exp = np.exp(
        actual_output - np.max(actual_output, axis=1, keepdims=True))
    actual_output_softmax = actual_output_exp / np.sum(
        actual_output_exp, axis=1, keepdims=True)
    actual_output_argmax = np.argmax(actual_output_softmax, axis=1)
    # Sort by the model's confidence in the two "has relation" classes, most
    # confident first.
    actual_output_softmax_sorted = np.argsort(
        -np.max(actual_output_softmax[..., :2], axis=1)).tolist()

    sentence_indices_input = x_input[:, :-2]
    _, rev_vocab = preprocessing_util.initialize_vocabulary(vocab_path)
    sentence_input = preprocessing_util.indices_to_sentences(
        sentence_indices_input, rev_vocab)
    kp_indices_input = x_input[:, -2:]

    print('Type\tSentence\t\tProbability [A is-a B, B is-a A, Neither]')
    for sentence_i in actual_output_softmax_sorted:
        sentence = sentence_input[sentence_i]
        # Mark the two key phrases of interest in the current sentence with *.
        sentence[kp_indices_input[sentence_i, 1]] += '*'
        sentence[kp_indices_input[sentence_i, 0]] += '*'
        if actual_output_argmax[sentence_i] == 2:
            # Instances are sorted by confidence in the first two classes, so
            # stop at the first 'Neither' prediction.
            break
        if actual_output_argmax[sentence_i] == 0:
            current_type = 'A is-a B'
        elif actual_output_argmax[sentence_i] == 1:
            current_type = 'B is-a A'
        print('%s\t%s\t\t%s\t' %
              (current_type, ' '.join(sentence),
               str(actual_output_softmax[sentence_i])))
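# A note on the instance layout assumed throughout these scripts: each row of
# the matrices returned by the util.read_data* functions holds `sent_len`
# token indices followed by the positions of the two key phrases, which is why
# the code above slices `[:, :-2]` and `[:, -2:]`. A made-up illustration:
import numpy as np

#                tokens (sent_len = 5)  key phrase positions
row = np.array([[12, 7, 4, 9, 0,        1, 3]])
token_indices = row[:, :-2]  # [[12, 7, 4, 9, 0]]
kp_positions = row[:, -2:]   # [[1, 3]]: the key phrases are tokens 1 and 3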
def main(argv=None):
    restore_param = util.load_from_dump(
        os.path.join(FLAGS.train_dir, 'flags.cPickle'))
    restore_param['train_dir'] = FLAGS.train_dir

    source_path = os.path.join(restore_param['data_dir'],
                               'test_cs_unlabeled_data_combined.txt')
    target_path = os.path.join(restore_param['data_dir'],
                               'test_cs_labels_combined.txt')
    vocab_path = os.path.join(restore_param['data_dir'],
                              'test_cs_vocab_combined')
    _, data = util.read_data(
        source_path, target_path, restore_param['sent_len'],
        train_size=restore_param['train_size'],
        hide_key_phrases=restore_param.get('hide_key_phrases', False))

    pre, rec, x_input, expected_output, actual_output = evaluate(
        data, restore_param)
    # Subtract the per-row max before exponentiating for numerical stability;
    # this does not change the softmax result.
    actual_output_exp = np.exp(
        actual_output - np.max(actual_output, axis=1, keepdims=True))
    actual_output_softmax = actual_output_exp / np.sum(
        actual_output_exp, axis=1, keepdims=True)
    # L1 distance between the predicted distribution and the one-hot expected
    # label; larger means a worse prediction.
    output_difference = np.sum(
        np.abs(actual_output_softmax - expected_output), axis=1)

    sentence_indices_input = x_input[:, :-2]
    _, rev_vocab = preprocessing_util.initialize_vocabulary(vocab_path)
    sentence_input = preprocessing_util.indices_to_sentences(
        sentence_indices_input, rev_vocab)
    kp_indices_input = x_input[:, -2:]

    print('Diff\tType\tSentence\t\t'
          'Expected Score (A is-a B, B is-a A, Neither)\tActual Score')
    for sentence_i, sentence in enumerate(sentence_input):
        # Mark the two key phrases of interest in the current sentence with *.
        sentence[kp_indices_input[sentence_i, 1]] += '*'
        sentence[kp_indices_input[sentence_i, 0]] += '*'
        current_type = 'Neither'
        if expected_output[sentence_i, 0] == 1:
            current_type = 'A is-a B'
        elif expected_output[sentence_i, 1] == 1:
            current_type = 'B is-a A'
        print('%.3f\t%s\t%s\t\t%s\t%s\t' %
              (output_difference[sentence_i], current_type,
               ' '.join(sentence), str(expected_output[sentence_i]),
               str(actual_output_softmax[sentence_i])))

    util.dump_to_file(os.path.join(FLAGS.train_dir, 'results.cPickle'),
                      {'precision': pre, 'recall': rec})
def main(argv=None):
    # Flags are defined in train.py.
    if FLAGS.hide_key_phrases:
        raise AssertionError(
            "Please turn hide_key_phrases off for co-training.")
    # First generate the cross validation data if it does not exist.
    if not os.path.exists(FLAGS.cross_validation_dir):
        print("Cross validation data folder does not exist. Creating one.")
        os.mkdir(FLAGS.cross_validation_dir)
        source_path = os.path.join(
            FLAGS.data_dir, 'test_cs_unlabeled_data_combined_inferred.txt')
        target_path = os.path.join(
            FLAGS.data_dir, 'test_cs_labels_combined_inferred.txt')
        cross_validation_split(source_path, target_path,
                               FLAGS.cross_validation_dir,
                               fold_number=FLAGS.cross_validation_fold)

    for cross_val_round_i in range(FLAGS.cross_validation_fold):
        if not os.path.exists(FLAGS.train_dir):
            os.mkdir(FLAGS.train_dir)
        latest_sentence_checkpoint_dir = None
        latest_pair_checkpoint_dir = None
        latest_checkpoint_dir = None
        used_unlabeled_kp_pair_set = set()

        # The validation set is kept separate from the test and training sets
        # from the very beginning.
        val_source_path = os.path.join(
            FLAGS.cross_validation_dir,
            "cross_validation_val_%d_data.txt" % cross_val_round_i)
        val_target_path = os.path.join(
            FLAGS.cross_validation_dir,
            "cross_validation_val_%d_labels.txt" % cross_val_round_i)
        val_labeled_data, val_labeled_result = util.read_data_labeled_part(
            val_source_path, val_target_path, FLAGS.sent_len, shuffle=False)
        # For legacy reasons the data format requires a third, unused column,
        # so pad with None. (list() keeps this working under Python 3's lazy
        # zip as well.)
        val_data = np.array(
            list(zip(val_labeled_data, val_labeled_result,
                     [None] * val_labeled_result.shape[0])))

        val_precision = []
        val_recall = []
        val_pr_auc = []  # Precision-recall area under the curve.

        for round_i in range(FLAGS.max_co_training_rounds):
            # Load the dataset: the original training split in the first
            # round, and the augmented data saved by the previous round
            # afterwards.
            if round_i == 0:
                source_path = os.path.join(
                    FLAGS.cross_validation_dir,
                    "cross_validation_train_%d_data.txt" % cross_val_round_i)
                target_path = os.path.join(
                    FLAGS.cross_validation_dir,
                    "cross_validation_train_%d_labels.txt" % cross_val_round_i)
            else:
                source_path = os.path.join(
                    latest_checkpoint_dir,
                    'test_cs_unlabeled_data_combined_round_%d.txt' % round_i)
                target_path = os.path.join(
                    latest_checkpoint_dir,
                    'test_cs_labels_combined_round_%d.txt' % round_i)
            train_data, test_data = util.read_data(
                source_path, target_path, FLAGS.sent_len, attention_path=None,
                train_size=FLAGS.train_size, hide_key_phrases=False)
            # TODO: getting all the sentences with the same key phrase pair
            # may need to be handled here as well.
            train_data_hide_kp, test_data_hide_kp = util.read_data(
                source_path, target_path, FLAGS.sent_len, attention_path=None,
                train_size=FLAGS.train_size, hide_key_phrases=True)
            print("Round %d. Reading labeled data from previous round."
                  % round_i)
            labeled_data, labeled_result = util.read_data_labeled_part(
                source_path, target_path, FLAGS.sent_len, shuffle=False)
            unlabeled_data = util.read_data_unlabeled_part(
                source_path, target_path, FLAGS.sent_len, shuffle=False,
                hide_key_phrases=False)
            unlabeled_data_hide_kp = util.read_data_unlabeled_part(
                source_path, target_path, FLAGS.sent_len, shuffle=False,
                hide_key_phrases=True)

            # For each round, draw a fresh set of unlabeled data and label it
            # using the trained classifiers.
            current_unlabeled_data, used_unlabeled_kp_pair_set, \
                current_drawn_indices = draw_from_unused_unlabeled(
                    unlabeled_data, used_unlabeled_kp_pair_set,
                    FLAGS.test_size_per_round)
            current_unlabeled_data_hide_kp = [
                unlabeled_data_hide_kp[i] for i in current_drawn_indices
            ]

            additional_label_index = []
            additional_label_result = []
            for classifier_i in range(2):
                additional_label_index.append([])
                additional_label_result.append([])
                if _is_sentence_train(classifier_i):
                    train.train(train_data_hide_kp, test_data_hide_kp)
                    latest_sentence_checkpoint_dir = \
                        util.get_latest_checkpoint_dir(FLAGS.train_dir)
                else:
                    train_kp_pair_classifier.train(train_data, test_data)
                    latest_pair_checkpoint_dir = \
                        util.get_latest_checkpoint_dir(FLAGS.train_dir)
                # Refresh the latest checkpoint.
                latest_checkpoint_dir = util.get_latest_checkpoint_dir(
                    FLAGS.train_dir)
                restore_param = util.load_from_dump(
                    os.path.join(latest_checkpoint_dir, 'flags.cPickle'))
                restore_param['train_dir'] = latest_checkpoint_dir
                if _is_sentence_train(classifier_i):
                    x_input, actual_output = label.label(
                        current_unlabeled_data_hide_kp, restore_param)
                else:
                    x_input, actual_output = train_kp_pair_classifier.label(
                        current_unlabeled_data, restore_param)
                # Subtract the per-row max before exponentiating for numerical
                # stability; this does not change the softmax result.
                actual_output_exp = np.exp(
                    actual_output - np.max(actual_output, axis=1,
                                           keepdims=True))
                actual_output_softmax = actual_output_exp / np.sum(
                    actual_output_exp, axis=1, keepdims=True)
                actual_output_argmax = np.argmax(actual_output_softmax,
                                                 axis=1)
                if FLAGS.use_product_method:
                    sentence_i_list = range(actual_output_softmax.shape[0])
                else:
                    # Rank by the classifier's confidence, most confident
                    # first. (To exclude the "Neither" relation, take the max
                    # over only the first two columns instead.)
                    sentence_i_list = np.argsort(
                        -np.max(actual_output_softmax, axis=1)).tolist()

                # Use the version with key phrases not replaced so that the
                # sentences print correctly.
                sentence_indices_input = current_unlabeled_data[:, :-2]
                vocab_path = os.path.join(restore_param['data_dir'],
                                          'test_cs_vocab_combined')
                _, rev_vocab = preprocessing_util.initialize_vocabulary(
                    vocab_path)
                sentence_input = preprocessing_util.indices_to_sentences(
                    sentence_indices_input, rev_vocab, ignore_pad=True)
                kp_indices_input = current_unlabeled_data[:, -2:]

                with open(os.path.join(latest_checkpoint_dir,
                                       'added_instances.tsv'),
                          "w") as inferred_instances_f:
                    inferred_instances_f.write(
                        'Type\tSentence\t\t'
                        'Probability [A is-a B, B is-a A, Neither]\n')
                    additional_label_num_positive = 0
                    additional_label_num_negative = 0
                    for sentence_i in sentence_i_list:
                        sentence = sentence_input[sentence_i]
                        # Mark the two key phrases of interest in the current
                        # sentence with *.
                        sentence[kp_indices_input[sentence_i, 1]] += '*'
                        sentence[kp_indices_input[sentence_i, 0]] += '*'
                        if actual_output_argmax[sentence_i] == 2:
                            current_type = 'Neither'
                            if (not FLAGS.use_product_method and
                                    additional_label_num_negative >=
                                    FLAGS.co_training_has_relation_num_label_negative):
                                continue
                            else:
                                additional_label_num_negative += 1
                        if actual_output_argmax[sentence_i] == 0:
                            current_type = 'A is-a B'
                            if (not FLAGS.use_product_method and
                                    additional_label_num_positive >=
                                    FLAGS.co_training_has_relation_num_label_positive):
                                continue
                            else:
                                additional_label_num_positive += 1
                        elif actual_output_argmax[sentence_i] == 1:
                            current_type = 'B is-a A'
                            if (not FLAGS.use_product_method and
                                    additional_label_num_positive >=
                                    FLAGS.co_training_has_relation_num_label_positive):
                                continue
                            else:
                                additional_label_num_positive += 1
                        inferred_instances_f.write(
                            '%s\t%s\t\t%s\n' %
                            (current_type, ' '.join(sentence),
                             str(actual_output_softmax[sentence_i])))
                        if not FLAGS.use_product_method:
                            additional_label_index[classifier_i].append(
                                sentence_i)
                            # When use_product_method is off, the stored
                            # result is a one-hot label.
                            current_additional_label_result = np.zeros((3,))
                            current_additional_label_result[
                                actual_output_argmax[sentence_i]] = 1
                            additional_label_result[classifier_i].append(
                                current_additional_label_result)
                            if (additional_label_num_positive >=
                                    FLAGS.co_training_has_relation_num_label_positive
                                    and additional_label_num_negative >=
                                    FLAGS.co_training_has_relation_num_label_negative):
                                break
                        else:
                            # When use_product_method is on, the stored result
                            # is the output softmax, i.e. a probability
                            # distribution.
                            current_additional_label_result = \
                                actual_output_softmax[sentence_i]
                            additional_label_result[classifier_i].append(
                                current_additional_label_result)
                print("Number of additional data points added through "
                      "co-training classifier %d: %d positives and %d "
                      "negatives out of %d unlabeled instances."
                      % (classifier_i, additional_label_num_positive,
                         additional_label_num_negative, len(sentence_i_list)))

            # Check for conflicts and merge the additional labels produced by
            # the two classifiers.
            if not FLAGS.use_product_method:
                merged_additional_label_index, merged_additional_label_result = \
                    check_conflict_and_merge(additional_label_index,
                                             additional_label_result)
            else:
                merged_additional_label_index, merged_additional_label_result = \
                    compute_product_and_save(
                        additional_label_result, latest_checkpoint_dir,
                        sentence_input, kp_indices_input)
            latest_checkpoint_dir = util.get_latest_checkpoint_dir(
                FLAGS.train_dir)
            save_source_path = os.path.join(
                latest_checkpoint_dir,
                'test_cs_unlabeled_data_combined_round_%d.txt' % (round_i + 1))
            save_target_path = os.path.join(
                latest_checkpoint_dir,
                'test_cs_labels_combined_round_%d.txt' % (round_i + 1))
            # Recover the original indices into the unlabeled data.
            merged_additional_label_index = [
                current_drawn_indices[i]
                for i in merged_additional_label_index
            ]
            # Save the additionally labeled 2p + 2n examples (p positives and
            # n negatives from each of the two classifiers).
            save_additional_label(unlabeled_data,
                                  merged_additional_label_index,
                                  merged_additional_label_result,
                                  labeled_data, labeled_result,
                                  save_source_path, save_target_path)
            # The inferred instances also need to be removed from the whole
            # bag of unlabeled data that is drawn from at each round.
            before_inference_unlabeled_data = util.read_data_unlabeled_part(
                save_source_path, save_target_path, FLAGS.sent_len,
                shuffle=False)
            inferred_additional_label_index, inferred_additional_label_result = \
                infer_from_labeled(save_source_path, save_target_path,
                                   FLAGS.sent_len, vocab_path, do_save=True,
                                   save_source_path=save_source_path,
                                   save_target_path=save_target_path)
            inferred_additional_data = before_inference_unlabeled_data[
                inferred_additional_label_index]
            inferred_additional_sentence_index = \
                inferred_additional_data[:, :-2]
            inferred_additional_kp_index = inferred_additional_data[:, -2:]
            inferred_additional_sentence_input = \
                preprocessing_util.indices_to_sentences(
                    inferred_additional_sentence_index, rev_vocab,
                    ignore_pad=True)
            inferred_additional_label_result_argmax = np.argmax(
                inferred_additional_label_result, axis=1)
            with open(os.path.join(latest_checkpoint_dir,
                                   'inferred_instances.tsv'),
                      "w") as inferred_instances_f:
                inferred_instances_f.write('Type\tSentence\n')
                for sentence_i in range(
                        inferred_additional_kp_index.shape[0]):
                    sentence = inferred_additional_sentence_input[sentence_i]
                    # Mark the two key phrases of interest in the current
                    # sentence with *.
                    sentence[
                        inferred_additional_kp_index[sentence_i, 1]] += '*'
                    sentence[
                        inferred_additional_kp_index[sentence_i, 0]] += '*'
                    if inferred_additional_label_result_argmax[
                            sentence_i] == 2:
                        current_type = 'Neither'
                    elif inferred_additional_label_result_argmax[
                            sentence_i] == 0:
                        current_type = 'A is-a B'
                    elif inferred_additional_label_result_argmax[
                            sentence_i] == 1:
                        current_type = 'B is-a A'
                    inferred_instances_f.write(
                        '%s\t%s\n' % (current_type, ' '.join(sentence)))

            # All that is left is to use the validation dataset to calculate
            # the area under the precision-recall curve.
            val_precision.append([[[] for _ in range(3)] for _ in range(3)])
            val_recall.append([[[] for _ in range(3)] for _ in range(3)])
            val_pr_auc.append([[0.0, 0.0, 0.0] for _ in range(3)])
            # Compute precision and recall for classifier 1, classifier 2,
            # and the combined classifier.
            for classifier_j in range(3):
                if classifier_j == 0:
                    # Use classifier 1 (the sentence classifier).
                    restore_param = util.load_from_dump(
                        os.path.join(latest_sentence_checkpoint_dir,
                                     'flags.cPickle'))
                    restore_param['train_dir'] = \
                        latest_sentence_checkpoint_dir
                    _, val_actual_output = label.label(val_labeled_data,
                                                       restore_param)
                elif classifier_j == 1:
                    # Use classifier 2 (the key phrase pair classifier).
                    restore_param = util.load_from_dump(
                        os.path.join(latest_pair_checkpoint_dir,
                                     'flags.cPickle'))
                    restore_param['train_dir'] = latest_pair_checkpoint_dir
                    _, val_actual_output = train_kp_pair_classifier.label(
                        val_labeled_data, restore_param)
                else:
                    # Use both classifiers and, since precision matters more
                    # than recall here, label an instance as having a
                    # subcategory relation only when both classifiers agree;
                    # otherwise output no relation, i.e. `Neither`.
                    restore_param = util.load_from_dump(
                        os.path.join(latest_sentence_checkpoint_dir,
                                     'flags.cPickle'))
                    restore_param['train_dir'] = \
                        latest_sentence_checkpoint_dir
                    _, val_actual_output_sentence = label.label(
                        val_labeled_data, restore_param)
                    restore_param = util.load_from_dump(
                        os.path.join(latest_pair_checkpoint_dir,
                                     'flags.cPickle'))
                    restore_param['train_dir'] = latest_pair_checkpoint_dir
                    _, val_actual_output_pair = train_kp_pair_classifier.label(
                        val_labeled_data, restore_param)
                    val_actual_output_sentence_argmax = np.argmax(
                        val_actual_output_sentence, axis=1)
                    val_actual_output_pair_argmax = np.argmax(
                        val_actual_output_pair, axis=1)
                    # Output [1, 0, 0] if both classify as "A is-a B",
                    # [0, 1, 0] if both classify as "B is-a A", and [0, 0, 1]
                    # in all other situations.
                    val_actual_output = np.array([
                        [1 if k == val_actual_output_sentence_argmax[j] else 0
                         for k in range(3)]
                        if (val_actual_output_sentence_argmax[j] ==
                            val_actual_output_pair_argmax[j])
                        else [0, 0, 1]
                        for j in range(val_actual_output_sentence.shape[0])
                    ])
                val_actual_output_exp = np.exp(
                    val_actual_output - np.max(val_actual_output, axis=1,
                                               keepdims=True))
                val_actual_output_softmax = val_actual_output_exp / np.sum(
                    val_actual_output_exp, axis=1, keepdims=True)
                for i in range(3):
                    (val_precision[round_i][classifier_j][i],
                     val_recall[round_i][classifier_j][i],
                     _) = precision_recall_curve(
                        val_labeled_result[:, i],
                        val_actual_output_softmax[:, i])
                    val_pr_auc[round_i][classifier_j][i] = \
                        average_precision_score(
                            val_labeled_result[:, i],
                            val_actual_output_softmax[:, i])

        # Lastly, write the precision-recall AUC file for each classifier and
        # each category.
        with open(os.path.join(latest_checkpoint_dir, 'pr_auc.tsv'),
                  "w") as f:
            for classifier_j in range(3):
                for i in range(3):
                    f.write("Classifier%d_%s\t%s\n" %
                            (classifier_j, CATEGORY_NAME[i],
                             "\t".join([str(val_pr_auc[r][classifier_j][i])
                                        for r in range(
                                            FLAGS.max_co_training_rounds)])))
        np.save(os.path.join(latest_checkpoint_dir, 'precision_recall_data'),
                np.array([val_precision, val_recall, val_pr_auc]))
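# `draw_from_unused_unlabeled` is not shown in this section; the sketch below
# is an assumption reconstructed from its call site (draw up to `num_to_draw`
# rows whose key phrase pair was never drawn before; return the drawn rows,
# the updated used-pair set, and the drawn indices). The pair key used here,
# the token ids at the two key phrase positions, is a guess.
import numpy as np

def draw_from_unused_unlabeled(unlabeled_data, used_kp_pair_set, num_to_draw):
    drawn_rows = []
    drawn_indices = []
    for i, row in enumerate(unlabeled_data):
        kp_a, kp_b = row[-2], row[-1]      # Positions of the two key phrases.
        pair_key = (row[kp_a], row[kp_b])  # Assumed identity of the pair.
        if pair_key in used_kp_pair_set:
            continue
        used_kp_pair_set.add(pair_key)
        drawn_rows.append(row)
        drawn_indices.append(i)
        if len(drawn_rows) >= num_to_draw:
            break
    return np.array(drawn_rows), used_kp_pair_set, drawn_indices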
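# `check_conflict_and_merge` is likewise only visible through its call site.
# One plausible behavior, sketched under that assumption: union the instances
# labeled by the two classifiers and drop any instance on which their one-hot
# labels disagree. This may differ from the repository's actual merging rule.
import numpy as np

def check_conflict_and_merge(additional_label_index, additional_label_result):
    merged = {}
    conflicts = set()
    for classifier_i in range(2):
        for idx, result in zip(additional_label_index[classifier_i],
                               additional_label_result[classifier_i]):
            if idx in merged and not np.array_equal(merged[idx], result):
                conflicts.add(idx)  # The two classifiers disagree; drop it.
            else:
                merged[idx] = result
    merged_index = [i for i in sorted(merged) if i not in conflicts]
    merged_result = [merged[i] for i in merged_index]
    return merged_index, merged_result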
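# A minimal, self-contained example of the sklearn metrics used for the
# validation step above, for a single category; the labels and scores here are
# made up.
import numpy as np
from sklearn.metrics import average_precision_score, precision_recall_curve

y_true = np.array([1, 0, 1, 1, 0])              # Membership in one category.
y_score = np.array([0.9, 0.4, 0.35, 0.8, 0.1])  # Predicted probability.
precision, recall, thresholds = precision_recall_curve(y_true, y_score)
pr_auc = average_precision_score(y_true, y_score)  # Area under the PR curve.
print(pr_auc)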
def main(argv=None):
    restore_param = util.load_from_dump(
        os.path.join(FLAGS.train_dir, 'flags.cPickle'))
    restore_param['train_dir'] = FLAGS.train_dir

    source_path = os.path.join(restore_param['data_dir'],
                               'test_cs_unlabeled_data_combined.txt')
    target_path = os.path.join(restore_param['data_dir'],
                               'test_cs_labels_combined.txt')
    vocab_path = os.path.join(restore_param['data_dir'],
                              'test_cs_vocab_combined')
    unlabeled_data = util.read_data_unlabeled_part(
        source_path, target_path, restore_param['sent_len'])
    data_size = unlabeled_data.shape[0]

    x_input, actual_output = label(unlabeled_data, restore_param)
    # Subtract the per-row max before exponentiating for numerical stability;
    # this does not change the softmax result.
    actual_output_exp = np.exp(
        actual_output - np.max(actual_output, axis=1, keepdims=True))
    actual_output_softmax = actual_output_exp / np.sum(
        actual_output_exp, axis=1, keepdims=True)
    actual_output_argmax = np.argmax(actual_output_softmax, axis=1)
    # Entropy is -sum(p * log p), so this is the *negative* entropy; sorting
    # it in ascending order puts the most uncertain (highest entropy)
    # instances first. The epsilon guards against log(0).
    actual_output_entropy = np.sum(
        np.multiply(actual_output_softmax,
                    np.log(actual_output_softmax + 1e-12)), axis=1)
    # Rank which question should be asked first. This ranking uses entropy;
    # in theory it surfaces the most uncertain instances, though in practice
    # the model may simply be too sure of everything. An alternative is to
    # sort by the softmax probability of the first two classes and ask about
    # the most confident instance first:
    # actual_output_entropy_argsort = np.argsort(
    #     -np.max(actual_output_softmax[..., :2], axis=1))
    actual_output_entropy_argsort = np.argsort(actual_output_entropy, axis=0)

    sentence_indices_input = x_input[:, :-2]
    _, rev_vocab = preprocessing_util.initialize_vocabulary(vocab_path)
    sentence_input = preprocessing_util.indices_to_sentences(
        sentence_indices_input, rev_vocab)
    kp_indices_input = x_input[:, -2:]

    user_input = -1
    num_user_labeled = 0
    user_label_results = []
    while user_input != 4 and num_user_labeled < data_size:
        sentence_i = actual_output_entropy_argsort[num_user_labeled]
        sentence = sentence_input[sentence_i]
        print('Key phrase pair\tSentence\t\t'
              'Predicted Score (A is-a B, B is-a A, Neither)\t')
        current_key_phrase_pair = (
            sentence[kp_indices_input[sentence_i, 0]] + ' ' +
            sentence[kp_indices_input[sentence_i, 1]])
        # Mark the two key phrases of interest in the current sentence with *.
        sentence[kp_indices_input[sentence_i, 1]] += '*'
        sentence[kp_indices_input[sentence_i, 0]] += '*'
        print('%s\n%s\t\t%s\t' %
              (current_key_phrase_pair, ' '.join(sentence),
               str(actual_output_softmax[sentence_i])))
        user_input = raw_input(
            'In your opinion, what should be the category of the key phrase '
            'pair? Please enter 1, 2, or 3. Enter 4 to stop answering.\n'
            '1. A is-a B\n2. B is-a A\n3. Neither.')
        user_input = util.get_valid_user_input(user_input, 1, 4)
        if user_input != 4:
            user_label_result = np.array([0, 0, 0])
            user_label_result[user_input - 1] = 1
            user_label_results.append(user_label_result)
            num_user_labeled += 1

    actual_output_entropy_indices = \
        actual_output_entropy_argsort[:num_user_labeled]
    if len(user_label_results) > 0:
        labeled_data, labeled_result = util.read_data_labeled_part(
            source_path, target_path, restore_param['sent_len'],
            shuffle=False)
        user_label_results = np.array(user_label_results)
        save_additional_label(unlabeled_data, actual_output_entropy_indices,
                              user_label_results, labeled_data,
                              labeled_result, source_path, target_path)
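# `util.get_valid_user_input` is not shown in this section. A hedged sketch of
# one plausible implementation, matching the call above: re-prompt until the
# input parses as an integer within [low, high].
def get_valid_user_input(user_input, low, high):
    while True:
        try:
            value = int(user_input)
            if low <= value <= high:
                return value
        except ValueError:
            pass
        user_input = raw_input(
            'Please enter an integer between %d and %d.\n' % (low, high))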
def main(argv):
    restore_param = util.load_from_dump(
        os.path.join(FLAGS.train_dir, 'flags.cPickle'))
    restore_param['train_dir'] = FLAGS.train_dir

    # Optional command line overrides for the data paths. Initializing the
    # paths to None first avoids a NameError when no arguments are given.
    source_path = None
    target_path = None
    if argv is not None and len(argv) > 2:
        source_path = argv[1]
        target_path = argv[2]
    if source_path is None:
        source_path = os.path.join(restore_param['data_dir'],
                                   'test_cs_unlabeled_data_combined.txt')
    if target_path is None:
        target_path = os.path.join(restore_param['data_dir'],
                                   'test_cs_labels_combined.txt')
    vocab_path = os.path.join(restore_param['data_dir'],
                              'test_cs_vocab_combined')

    labeled_data, labeled_result = util.read_data_labeled_part(
        source_path, target_path, restore_param['sent_len'], shuffle=False)
    labeled_data = np.array(labeled_data)
    labeled_result = np.array(labeled_result)
    data_size = labeled_data.shape[0]

    sentence_indices_input = labeled_data[:, :-2]
    _, rev_vocab = preprocessing_util.initialize_vocabulary(vocab_path)
    sentence_input = preprocessing_util.indices_to_sentences(
        sentence_indices_input, rev_vocab)
    kp_indices_input = labeled_data[:, -2:]

    with open(os.path.join(FLAGS.train_dir, 'labeled_dataset_human.csv'),
              'w') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow([
            'Key phrase pair(separated by one space)',
            'Sentence(Key phrase labeled with *)',
            '(Label)A is-a B', 'B is-a A', 'Neither'
        ])
        for sentence_i in range(data_size):
            sentence = sentence_input[sentence_i]
            current_key_phrase_pair = (
                sentence[kp_indices_input[sentence_i, 0]] + ' ' +
                sentence[kp_indices_input[sentence_i, 1]])
            # Mark the two key phrases of interest in the current sentence
            # with *.
            sentence[kp_indices_input[sentence_i, 1]] += '*'
            sentence[kp_indices_input[sentence_i, 0]] += '*'
            csv_writer.writerow(
                [current_key_phrase_pair, ' '.join(sentence)] +
                labeled_result[sentence_i, ...].tolist())
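# Hypothetical usage: a quick sanity check that reads the exported CSV back,
# assuming the script above was run with FLAGS.train_dir set to the current
# directory.
import csv

with open('labeled_dataset_human.csv') as f:
    for row in csv.reader(f):
        # [key phrase pair, sentence, A is-a B, B is-a A, Neither]
        print(row)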