def merge_cnn_dm():
    # Merge the pickled CNN and DailyMail prediction files and score the
    # combined set with ROUGE.
    cnn = "/scratch/cluster/jcxu/exComp/0.327,0.122,0.290-cnnTrue1.0-1True3-1093-cp_0.5"
    dm = "/scratch/cluster/jcxu/exComp/0.427,0.192,0.388-dmTrue1.0-1True3-10397-cp_0.7"
    total_pred = []
    total_ref = []

    with open(cnn, "rb") as f:
        cnn_dict = pickle.load(f)
    # Post-process every predicted sentence before evaluation.
    fine_cnn_pd = [[easy_post_processing(s) for s in x] for x in cnn_dict["pred"]]
    total_pred += fine_cnn_pd
    total_ref += cnn_dict["ref"]

    with open(dm, "rb") as f:
        dm_dict = pickle.load(f)
    fine_dm_pd = [[easy_post_processing(s) for s in x] for x in dm_dict["pred"]]
    total_pred += fine_dm_pd
    total_ref += dm_dict["ref"]

    rouge_metrics = RougeStrEvaluation(name='mine')
    for p, r in zip(total_pred, total_ref):
        rouge_metrics(pred=p, ref=r)
    rouge_metrics.get_metric(True, note='test')
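
# For reference, merge_cnn_dm assumes each pickled result file is a dict with
# parallel "pred" and "ref" lists, where every entry is one document's
# sentences as a list of strings. A minimal sketch of that assumed layout
# (the file name and contents here are illustrative, not real results):
import pickle

example_result = {
    "pred": [["first predicted sentence .", "second predicted sentence ."]],
    "ref": [["first reference sentence .", "second reference sentence ."]],
}
with open("example_result.pkl", "wb") as f:
    pickle.dump(example_result, f)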
def test_easy_post_processing(self):
    inp = [
        "In two years ' time , the Scandinavian nation is slated to become the first in the world to phase out radio entirely .",
        "Digitally , there are four times that number .",
        "Frum : Ukrainians want to enter EU and lessen dependence on Russia ; Putin fighting to stop it .",
        "-LRB- CNN -RRB- He might have just won one of sport 's most prestigious events , but it was n't long before Jordan Spieth 's thoughts turned to his autistic sister in the glow of victory . "
    ]
    for x in inp:
        y = easy_post_processing(x)
        print(y)
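
# The test above exercises PTB-style tokenization artifacts: -LRB-/-RRB-
# bracket tokens, spaced punctuation, and split clitics such as "n't" and
# "'s". A minimal sketch of what such a post-processor might do, assuming it
# only needs to undo these artifacts (this is not the project's actual
# implementation):
import re

def easy_post_processing_sketch(s: str) -> str:
    # Restore PTB bracket tokens to literal parentheses.
    s = s.replace("-LRB-", "(").replace("-RRB-", ")")
    # Reattach clitics such as "n't", "'s", a bare possessive apostrophe.
    s = re.sub(r" (n't|'s|'re|'ve|'ll|'d|'m|')", r"\1", s)
    # Remove spaces before closing punctuation and after opening brackets.
    s = re.sub(r" ([.,;:!?%)])", r"\1", s)
    s = re.sub(r"\( ", "(", s)
    return s.strip()

print(easy_post_processing_sketch(
    "-LRB- CNN -RRB- it was n't long before Jordan Spieth 's thoughts turned ."))
# -> "(CNN) it wasn't long before Jordan Spieth's thoughts turned."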
def run(self) -> List:
    # Walk the full decoding pipeline for one document.
    # First, put all of the candidate compressions on record.
    self.read_sent_record_compressions(self.sent_idxs)

    # Drop compressions whose probability falls under each keep threshold;
    # the per-threshold outputs diverge from here on.
    self.del_under_threshold()

    # Iteratively delete spans whose content is already covered by context.
    processed_words = self.iterative_rep_del()

    # Restore the original sentence order.
    order = np.argsort(self.sort_order)
    for kepidx, kep in enumerate(self.keep_threshold):
        processed_words[kepidx] = [processed_words[kepidx][o] for o in order]

    # Assemble the strings used for evaluation.
    bag_pred_eval = []
    for words in processed_words:
        _tmp = []
        for sent in words:
            sent = [x for x in sent
                    if not x.startswith(sp_tok) and not x.startswith(sp_tok_rep)]
            _tmp.append(easy_post_processing(" ".join(sent)))
        bag_pred_eval.append(_tmp)

    # (Optional) log a small random sample for visualization and append
    # the deletion record to disk.
    if random.random() < 0.005:
        try:
            logger = logging.getLogger()
            logger.info("Prob\t\tType\t\tRatio\t\tRouge\t\tLen\t\tContent")
            for d in self.compressions:
                for key, value in d.items():
                    wt = [value['prob'], value['type'], value['ratio'],
                          value['rouge'], value['len'], key]
                    logger.info("\t\t".join(str(x) for x in wt))
            log_universal(Partition=self.part, Name=self.name, Abs=self.abs_str)
            for idx in range(len(self.keep_threshold)):
                lis_out = [" ".join(x) for x in processed_words[idx]]
                log_universal(Kep=self.keep_threshold[idx],
                              Visual=" | ".join(lis_out))
            with open(self.ser_fname, 'a') as f:
                f.write("\n")
                f.write(json.dumps(self.del_record))
        except ZeroDivisionError:
            pass
    return bag_pred_eval
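
# One step in run() worth calling out: np.argsort(self.sort_order) inverts
# the processing permutation. If sort_order records each processed
# sentence's original position in the document, argsort yields the index
# order that restores document order. A tiny self-contained illustration:
import numpy as np

sort_order = [2, 0, 1]           # processed[0] came from position 2, etc.
processed = ["sent C", "sent A", "sent B"]

order = np.argsort(sort_order)   # -> [1, 2, 0]
restored = [processed[o] for o in order]
print(restored)                  # ['sent A', 'sent B', 'sent C']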
def _dec_compression_one_step(self, predict_compression, sp_meta,
                              word_sent: List[str],
                              keep_threshold: List[float],
                              context: List[List[str]] = None):
    full_set_len = set(range(len(word_sent)))
    # One candidate prediction (set of kept token indices) per threshold.
    preds = [full_set_len.copy() for _ in range(len(keep_threshold))]

    # Collect all of the compression spans with their statistics.
    stat_compression = {}
    for comp_idx, comp_meta in enumerate(sp_meta):
        p = predict_compression[comp_idx][1]
        node_type, sel_idx, rouge, ratio = comp_meta
        if node_type != "BASELINE":
            selected_words = [x for idx, x in enumerate(word_sent) if idx in sel_idx]
            selected_words_str = "_".join(selected_words)
            stat_compression[selected_words_str] = {
                "prob": float("{0:.2f}".format(p)),
                "type": node_type,
                "rouge": float("{0:.2f}".format(rouge)),
                "ratio": float("{0:.2f}".format(ratio)),
                "sel_idx": sel_idx,
                "len": len(sel_idx)
            }
    # Sort spans by deletion probability, highest first (Python 3).
    stat_compression_order = OrderedDict(
        sorted(stat_compression.items(),
               key=lambda item: item[1]["prob"], reverse=True))

    for idx, _keep_thres in enumerate(keep_threshold):
        history: List[str] = context[idx]
        his_set = set((" ".join(history)).split(" "))
        for value in stat_compression_order.values():
            p = value['prob']
            sel_idx = value['sel_idx']
            sel_txt = set(word_sent[x] for x in sel_idx)
            # Drop the span if its words are fully covered by the context ...
            if sel_txt - his_set == set():
                preds[idx] = preds[idx] - set(sel_idx)
                continue
            # ... or if the deletion probability beats the keep threshold.
            if p > _keep_thres:
                preds[idx] = preds[idx] - set(sel_idx)

    preds = [sorted(x) for x in preds]

    # Build the visual output plus the strings used for evaluation.
    visual_outputs: List[str] = []
    words_for_evaluation: List[str] = []
    meta_keep_ratio_word = []
    for idx, compression in enumerate(preds):
        output = [word_sent[jdx] if jdx in compression else '_' + word_sent[jdx] + '_'
                  for jdx in range(len(word_sent))]
        visual_outputs.append(" ".join(output))
        words = [word_sent[x] for x in compression]
        meta_keep_ratio_word.append(float(len(words)) / len(word_sent))
        words_for_evaluation.append(easy_post_processing(" ".join(words)))

    d: List[List] = []
    for kep_th, vis, words_eva, keep_word_ratio in zip(
            keep_threshold, visual_outputs, words_for_evaluation,
            meta_keep_ratio_word):
        d.append([kep_th, vis, words_eva, keep_word_ratio])
    return stat_compression_order, d
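
# The deletion rule above drops a compression span in two cases: its words
# are fully covered by the running context (the set difference is empty), or
# its deletion probability exceeds the keep threshold. A minimal standalone
# sketch of that decision with hypothetical data:
keep_threshold = 0.5
his_set = {"the", "president", "said"}           # words already in context
candidates = [
    {"prob": 0.3, "sel_idx": {0, 1}, "words": {"the", "president"}},
    {"prob": 0.8, "sel_idx": {4, 5}, "words": {"on", "tuesday"}},
]

kept = set(range(8))                             # start from all token indices
for c in candidates:
    if c["words"] - his_set == set():            # fully redundant w.r.t. context
        kept -= c["sel_idx"]
    elif c["prob"] > keep_threshold:             # model says delete
        kept -= c["sel_idx"]
print(sorted(kept))                              # [2, 3, 6, 7]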
import os
import pickle
from random import shuffle

# Repo-local helpers (read_abigail_output, meta_str_surgery,
# easy_post_processing, replace_lrbrrb, fix_vowel, rm_head_cnn, dropword,
# detok) are assumed to be imported from elsewhere in the project.

path = "/scratch/cluster/jcxu/exComp"
file = "0.325-0.120-0.289-cnnTrue1.0-1True-1093-cp_0.6"
see_output = "/scratch/cluster/jcxu/data/cnndm_compar/pointgencov/cnn"

ext_bag, model_bag, ext_dp_bag = [], [], []
see_bag = read_abigail_output(see_output)

with open(os.path.join(path, file), 'rb') as fd:
    x = pickle.load(fd)
pred = x['pred']
ori = x['ori']

for pre, o in zip(pred, ori):
    # Shuffle so that a side-by-side comparison is not biased by order.
    shuffle(pre)
    shuffle(o)
    # Normalize both the system output and the original sentences.
    p = [meta_str_surgery(easy_post_processing(replace_lrbrrb(fix_vowel(x)))).lower()
         for x in pre]
    o = [meta_str_surgery(easy_post_processing(replace_lrbrrb(rm_head_cnn(x)))).lower()
         for x in o]
    # Build a word-dropped variant of the extractive sentences, then
    # detokenize everything.
    o_drop = [dropword(x) for x in o]
    o = [detok(x) for x in o]
    o_drop = [detok(x) for x in o_drop]
    p = [detok(x) for x in p]
    ext_bag += o
    ext_dp_bag += o_drop
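
# dropword appears to build a lexically degraded variant of the extractive
# output for comparison. A minimal sketch of one plausible implementation
# (purely an assumption about its intent, not the repo's code):
import random

def dropword_sketch(sent: str, drop_rate: float = 0.15, seed: int = 0) -> str:
    # Randomly drop a fraction of tokens; fall back to the original
    # sentence if everything would be dropped.
    rng = random.Random(seed)
    toks = sent.split()
    kept = [t for t in toks if rng.random() > drop_rate]
    return " ".join(kept) if kept else sent

print(dropword_sketch("the quick brown fox jumps over the lazy dog"))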