def show_tfrecord(file_path):
    itr = load_record_v2(file_path)
    tokenizer = get_tokenizer()
    name = os.path.basename(file_path)
    html = HtmlVisualizer(name + ".html")
    for features in itr:
        input_ids = take(features["input_ids"])
        alt_emb_mask = take(features["alt_emb_mask"])
        tokens = tokenizer.convert_ids_to_tokens(input_ids)
        p_tokens, h_tokens = split_p_h_with_input_ids(tokens, input_ids)
        p_mask, h_mask = split_p_h_with_input_ids(alt_emb_mask, input_ids)
        p_cells = [Cell(p_tokens[i], 100 if p_mask[i] else 0)
                   for i in range(len(p_tokens))]
        h_cells = [Cell(h_tokens[i], 100 if h_mask[i] else 0)
                   for i in range(len(h_tokens))]
        label = take(features["label_ids"])[0]
        html.write_paragraph("Label : {}".format(label))
        html.write_table([p_cells])
        html.write_table([h_cells])

def convert_to_unpaired(source_path, output_path):
    def feature_transformer(feature):
        new_features_1 = collections.OrderedDict()
        new_features_2 = collections.OrderedDict()

        def put(feature_name):
            return create_int_feature(take(feature[feature_name]))

        new_features_1["input_ids"] = put("input_ids1")
        new_features_1["input_mask"] = put("input_mask1")
        new_features_1["segment_ids"] = put("segment_ids1")
        new_features_1["label_ids"] = create_int_feature([1])
        new_features_2["input_ids"] = put("input_ids2")
        new_features_2["input_mask"] = put("input_mask2")
        new_features_2["segment_ids"] = put("segment_ids2")
        new_features_2["label_ids"] = create_int_feature([0])
        return new_features_1, new_features_2

    writer = RecordWriterWrap(output_path)
    feature_itr = load_record_v2(source_path)
    for feature in feature_itr:
        new_features_1, new_features_2 = feature_transformer(feature)
        writer.write_feature(new_features_1)
        writer.write_feature(new_features_2)
    writer.close()

def tfrecord_convertor(source_path: FilePath,
                       output_path: FilePath,
                       feature_transformer):
    writer = RecordWriterWrap(output_path)
    feature_itr = load_record_v2(source_path)
    for feature in feature_itr:
        new_features = feature_transformer(feature)
        writer.write_feature(new_features)
    writer.close()

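# A minimal usage sketch for tfrecord_convertor (not part of the original code):
# the transformer below copies every feature as an int64 list and overwrites
# "label_ids" with 0. The paths and the all-int64 assumption are hypothetical.
def relabel_to_zero(feature):
    new_features = collections.OrderedDict()
    for key in feature:
        new_features[key] = create_int_feature(take(feature[key]))
    new_features["label_ids"] = create_int_feature([0])
    return new_features

# tfrecord_convertor(FilePath("input.tfrecord"), FilePath("output.tfrecord"), relabel_to_zero)
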
def work(self, job_id):
    file_path = os.path.join(self.lm_dir, str(job_id))
    out_path = os.path.join(self.working_dir, str(job_id))
    lm_itr = load_record_v2(file_path)
    random.shuffle(self.tt_entries)
    idx = 0
    writer = RecordWriterWrap(out_path)
    for lm_entry in lm_itr:
        nli_entry = self.tt_entries[idx]
        new_features = combine_feature(lm_entry, nli_entry)
        writer.write_feature(new_features)
        idx += 1  # pair each LM entry with the next shuffled NLI entry
    writer.close()

def generate_training_data(data_id):
    num_samples_list = open(
        os.path.join(working_path, "entry_prediction_n", data_id), "r").readlines()
    p = os.path.join(working_path, "entry_loss", "entry{}.pickle".format(data_id))
    loss_outputs_list = pickle.load(open(p, "rb"))
    print("Loaded input data")
    loss_outputs = []
    for e in loss_outputs_list:
        loss_outputs.extend(e["masked_lm_example_loss"])
    print("Total of {} loss outputs".format(len(loss_outputs)))
    feature_itr = load_record_v2(
        os.path.join(working_path, "entry_prediction_tf.done", data_id))
    instance_idx = 0
    writer = tf.python_io.TFRecordWriter(
        os.path.join(working_path, "entry_prediction_train", data_id))
    n = len(num_samples_list)
    for i in range(n):
        n_sample = int(num_samples_list[i])
        assert n_sample > 0
        first_inst = feature_itr.__next__()  # the no-dictionary instance
        if instance_idx + n_sample >= len(loss_outputs):
            break
        if n_sample == 1:
            continue
        no_dict_loss = loss_outputs[instance_idx]
        instance_idx += 1
        all_samples = []
        for j in range(1, n_sample):
            feature = feature_itr.__next__()
            loss = loss_outputs[instance_idx]
            # An entry is "useful" if its definition cuts the LM loss by more than 10%.
            if loss < no_dict_loss * 0.9:
                label = 1
            else:
                label = 0
            new_features = collections.OrderedDict()
            for key in feature:
                new_features[key] = btd.create_int_feature(take(feature[key]))
            new_features["useful_entry"] = btd.create_int_feature([label])
            example = tf.train.Example(features=tf.train.Features(feature=new_features))
            writer.write(example.SerializeToString())
            instance_idx += 1  # one loss output per sampled definition
    writer.close()

def visualize(filename, n_item):
    for idx, features in enumerate(load_record_v2(filename)):
        if idx > n_item:
            break
        keys = features.keys()
        for key in keys:
            feature = features[key]
            if feature.int64_list.value:
                values = feature.int64_list.value
            elif feature.float_list.value:
                values = feature.float_list.value
            else:
                continue  # skip empty or bytes-valued features
            print("{} : {}".format(key, values[:50]))

def visualize_prediction_data(data_id):
    tokenizer = get_tokenizer()
    num_samples_list = open(
        os.path.join(working_path, "entry_prediction_n", data_id), "r").readlines()
    p = os.path.join(working_path, "entry_loss", "entry{}.pickle".format(data_id))
    loss_outputs_list = pickle.load(open(p, "rb"))
    print("Loaded input data")
    loss_outputs = []
    for e in loss_outputs_list:
        loss_outputs.extend(e["masked_lm_example_loss"])
    print("Total of {} loss outputs".format(len(loss_outputs)))
    instance_idx = 0
    feature_itr = load_record_v2(
        os.path.join(working_path, "entry_prediction_tf.done", data_id))
    n = len(num_samples_list)
    n = 100  # only visualize the first 100 entries
    html = HtmlVisualizer("entry_prediction.html")
    for i in range(n):
        n_sample = int(num_samples_list[i])
        assert n_sample > 0
        first_inst = feature_itr.__next__()
        feature = Feature2Text(first_inst, tokenizer)
        html.write_headline("Input:")
        html.write_paragraph(feature.get_input_as_text(True, True))
        html.write_headline("Word:" + feature.get_selected_word_text())
        if instance_idx + n_sample >= len(loss_outputs):
            break
        if n_sample == 1:
            continue
        rows = []
        no_dict_loss = loss_outputs[instance_idx]
        row = [Cell(no_dict_loss, 0), Cell("")]
        rows.append(row)
        instance_idx += 1
        for j in range(1, n_sample):
            feature = Feature2Text(feature_itr.__next__(), tokenizer)
            def_cell = Cell(feature.get_def_as_text())
            loss = loss_outputs[instance_idx]
            hl_score = 100 if loss < no_dict_loss * 0.9 else 0
            row = [Cell(loss, hl_score), def_cell]
            rows.append(row)
            instance_idx += 1
        html.write_table(rows)

def do_verify():
    pred_file_name = "1.pickle"
    record_file_name = "C:\\work\\Code\\Chair\\output\\1"
    p = os.path.join(output_path, pred_file_name)
    data = pickle.load(open(p, "rb"))
    data = flatten_batches(data)
    itr1 = load_record_v2(record_file_name)
    itr2 = data["prob1"]
    print("itr2 len", len(itr2))
    # If the record file and the predictions line up, cnt should end at 0.
    cnt = 0
    for _ in itr1:
        cnt += 1
    print(cnt)
    for _ in itr2:
        cnt -= 1
        print(cnt)
    print(cnt)

def count_terms(file_path):
    counter = Counter()
    for feature in load_record_v2(file_path):
        input_ids = take(feature["input_ids"])
        alt_emb_mask = take(feature["alt_emb_mask"])
        cur_words = []
        for i in range(len(input_ids)):
            if alt_emb_mask[i]:
                cur_words.append(input_ids[i])
            else:
                if cur_words:
                    sig = " ".join([str(num) for num in cur_words])
                    counter[sig] += 1
                    cur_words = []
    return counter

def verify_alt_emb(source_path, seq_set: List[List[int]]):
    all_tokens: Set[int] = set(flatten(seq_set))

    def check_feature(feature):
        feature_d = {}
        for key in feature:
            v = take(feature[key])
            feature_d[key] = v
        input_ids = feature_d["input_ids"]
        alt_emb_mask = feature_d["alt_emb_mask"]
        for i in range(len(input_ids)):
            if alt_emb_mask[i] and input_ids[i] not in all_tokens:
                print(i, input_ids[i])

    feature_itr = load_record_v2(source_path)
    for feature in feature_itr:
        check_feature(feature)

def build_word_tf(continuation_tokens: Set[int], file_path):
    feature_itr = load_record_v2(file_path)
    counter = Counter()
    for feature in feature_itr:
        if not is_real_example(feature):
            continue
        input_ids = take(feature["input_ids"])
        cur_word = []
        for idx, token_id in enumerate(input_ids):
            if token_id in continuation_tokens:
                cur_word.append(token_id)
            else:
                if len(cur_word) > 1:
                    word_sig = " ".join([str(t) for t in cur_word])
                    counter[word_sig] += 1
                cur_word = [token_id]
    return counter

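# A small sketch (not in the original code) showing how the token-id "signatures"
# produced by count_terms and build_word_tf can be decoded back into readable
# WordPiece tokens. It relies only on get_tokenizer() / convert_ids_to_tokens(),
# which are already used above; `counter` is assumed to come from either function.
def print_top_words(counter, k=20):
    tokenizer = get_tokenizer()
    for sig, cnt in counter.most_common(k):
        token_ids = [int(t) for t in sig.split()]
        print(cnt, " ".join(tokenizer.convert_ids_to_tokens(token_ids)))
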
def get_correctness(filename, file_path):
    itr = load_record_v2(file_path)
    data = EstimatorPredictionViewerGosford(filename)
    correctness = []
    for entry in data:
        features = itr.__next__()
        input_ids = entry.get_vector("input_ids")
        input_ids2 = take(features["input_ids"])
        assert np.all(input_ids == input_ids2)
        label = take(features["label_ids"])[0]
        logits = entry.get_vector("logits")
        pred = np.argmax(logits)
        if pred == label:
            correctness.append(1)
        else:
            correctness.append(0)
    return correctness

def run(filename, n_item):
    loss_list = []
    loss_list2 = []
    loss_list3 = []
    for idx, features in enumerate(load_record_v2(filename)):
        if idx > n_item:
            break
        loss1 = features["loss_base"].float_list.value
        loss2 = features["loss_target"].float_list.value
        mask = features["masked_lm_weights"].float_list.value
        print(loss1)
        loss_list.append(independent_model(loss1, loss2, mask, proportion_random))
        loss_list2.append(diff_model(loss1, loss2, mask))
        loss_list3.append(independent_model(loss1, loss2, mask, same))
    print("independent (proportion random): ", average(loss_list))
    print("diff : ", average(loss_list2))
    print("independent (same): ", average(loss_list3))

def show_prediction(filename, file_path, correctness_1, correctness_2):
    data = EstimatorPredictionViewerGosford(filename)
    itr = load_record_v2(file_path)
    tokenizer = get_tokenizer()
    name = os.path.basename(filename)
    html = HtmlVisualizer(name + ".html")
    idx = 0
    for entry in data:
        features = itr.__next__()
        input_ids = entry.get_vector("input_ids")
        input_ids2 = take(features["input_ids"])
        assert np.all(input_ids == input_ids2)
        alt_emb_mask = take(features["alt_emb_mask"])
        tokens = tokenizer.convert_ids_to_tokens(input_ids)
        p_tokens, h_tokens = split_p_h_with_input_ids(tokens, input_ids)
        p_mask, h_mask = split_p_h_with_input_ids(alt_emb_mask, input_ids)
        p_cells = [Cell(p_tokens[i], 100 if p_mask[i] else 0)
                   for i in range(len(p_tokens))]
        h_cells = [Cell(h_tokens[i], 100 if h_mask[i] else 0)
                   for i in range(len(h_tokens))]
        label = take(features["label_ids"])[0]
        logits = entry.get_vector("logits")
        pred = np.argmax(logits)
        if not correctness_1[idx] or not correctness_2[idx]:
            html.write_paragraph("Label : {} Correct: {}/{}".format(
                label, correctness_1[idx], correctness_2[idx]))
            html.write_table([p_cells])
            html.write_table([h_cells])
        idx += 1

def __init__(self, working_dir):
    self.working_dir = working_dir
    self.lm_dir = os.path.join(sydney_working_dir, "unmasked_pair_x3")
    tt_path = os.path.join(output_path, "ukp_512", "train_death_penalty")
    self.tt_entries = list(load_record_v2(tt_path))

def get_dir_all_itr(dir_path):
    for file_path in get_dir_files(dir_path):
        one_itr = load_record_v2(file_path)
        for item in one_itr:
            yield item

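# Hypothetical usage sketch for get_dir_all_itr: count the records across all
# TFRecord shards in a directory. The directory path is an assumption.
def count_records(dir_path):
    n = 0
    for _ in get_dir_all_itr(dir_path):
        n += 1
    return n

# print(count_records("/path/to/tfrecord_dir"))
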
def do_fix(source_path, output_path):
    # Re-pack each [CLS]-delimited segment into a fixed 512-token window,
    # padding every segment with zeros and keeping at most max_num_seg windows.
    max_num_seg = 4
    window_size = 512
    seq_length = 512 * max_num_seg
    input_names1 = ["input_ids1", "segment_ids1", "input_mask1"]
    input_names2 = ["input_ids2", "input_mask2", "segment_ids2"]

    def feature_transformer(feature):
        new_features = collections.OrderedDict()

        def put(feature_name):
            return create_int_feature(take(feature[feature_name]))

        for left_right_idx in [1, 2]:
            input_names = [input_names1, input_names2][left_right_idx - 1]
            input_ids = take(feature["input_ids{}".format(left_right_idx)])
            input_masks = take(feature["input_mask{}".format(left_right_idx)])
            cls_loc = []
            last_non_pad = -1
            for i in range(seq_length):
                if input_ids[i] == 101:  # 101 is the BERT [CLS] id; each marks a segment start
                    cls_loc.append(i)
                if input_masks[i]:
                    last_non_pad = i
            assert last_non_pad >= 0
            assert last_non_pad > cls_loc[-1]
            assert len(cls_loc) <= max_num_seg
            num_seg = len(cls_loc)

            input_building = {}
            for name in input_names:
                input_building[name] = []
            for i in range(num_seg):
                st = cls_loc[i]
                ed = cls_loc[i + 1] if i + 1 < num_seg else last_non_pad + 1
                pad_len = window_size - (ed - st)
                for input_name in input_names:
                    arr = take(feature[input_name])
                    seq = arr[st:ed] + pad_len * [0]
                    input_building[input_name].extend(seq)
            n_empty_seg = max_num_seg - num_seg
            for i in range(n_empty_seg):
                for input_name in input_names:
                    input_building[input_name].extend([0] * window_size)

            # The re-packed sequence must preserve the original token content.
            for input_name in input_names:
                checksum1 = sum(input_building[input_name])
                checksum2 = sum(take(feature[input_name]))
                assert checksum1 == checksum2
            for input_name in input_names:
                new_features[input_name] = create_int_feature(input_building[input_name])

        new_features["data_ids"] = put("data_ids")
        return new_features

    writer = RecordWriterWrap(output_path)
    feature_itr = load_record_v2(source_path)
    for feature in feature_itr:
        new_features_1 = feature_transformer(feature)
        writer.write_feature(new_features_1)
    writer.close()

def do():
    todo = [
        ("RLPP_0.pickle", "C:\\work\\Code\\Chair\\output\\unmasked_pair_x3_0", "RLPP_wiki.html"),
        ("ukp_rel.pickle", "C:\\work\\Code\\Chair\\output\\tf_enc", "RLPP_ukp.html"),
    ]
    x = []
    y = []
    for pred_file_name, record_file_name, out_name in todo:
        viewer = EstimatorPredictionViewerGosford(pred_file_name)
        html = HtmlVisualizer(out_name)
        itr1 = load_record_v2(record_file_name)
        itr2 = viewer.__iter__()
        cnt = 0
        for features, entry in zip(itr1, itr2):
            cnt += 1
            if cnt > 200:
                break
            input_ids1 = entry.get_tokens("input_ids")
            prob1 = entry.get_vector("prob1")
            prob2 = entry.get_vector("prob2")
            cells = viewer.cells_from_tokens(input_ids1)
            p1_l = []
            p2_l = []
            useful_l = []
            row1 = []
            row2 = []
            row3 = []
            row4 = []
            for j, cell in enumerate(cells):
                p1 = float(prob1[j])
                p2 = float(prob2[j])
                x.append([p1])
                y.append(p2)
                u = useful(p1, p2)
                score = (1 - u) * 100
                cell.highlight_score = score
                row1.append(cell)
                row2.append(Cell(p1, score))
                row3.append(Cell(p2, score))
                row4.append(Cell(u, score))
                p1_l.append(p1)
                p2_l.append(p2)
                useful_l.append(u)
                if len(row1) > 20:
                    rows = [row1, row2, row3, row4]
                    row1 = []
                    row2 = []
                    row3 = []
                    row4 = []
                    html.write_table(rows)
            html.write_paragraph("p1: {}".format(average(p1_l)))
            html.write_paragraph("p2: {}".format(average(p2_l)))
            html.write_paragraph("useful: {}".format(average(useful_l)))
            if average(useful_l) < 0.4:
                html.write_headline("Low Score")

    # Fit a degree-4 polynomial regression of p2 on p1 over a random subsample.
    l = list(zip(x, y))
    random.shuffle(l)
    l = l[:1000]
    x, y = zip(*l)
    lin = LinearRegression()
    lin.fit(x, y)
    poly = PolynomialFeatures(degree=4)
    X_poly = poly.fit_transform(x)
    poly.fit(X_poly, y)
    lin2 = LinearRegression()
    lin2.fit(X_poly, y)
    plt.scatter(x, y, color='blue')
    plt.plot(x, lin2.predict(poly.fit_transform(x)), color='red')
    plt.title('Polynomial Regression')
    plt.show()