def main(input_file: str, output_file: str, checking_file: str, keep_rate: float):
    """Filter sentences against a checking file, sub-sample them and write the result.

    Args:
        input_file: path of the instances to read.
        output_file: path to write the sampled sentences to; falsy means "do not write".
            An already existing file is never overwritten.
        checking_file: optional path; any input sentence whose normalised text
            also occurs in this file is dropped. Falsy disables the filtering.
        keep_rate: ``< 1`` keeps each sentence with that probability,
            ``> 10`` is read as an absolute count (shuffle, then truncate),
            anything in between keeps everything.

    Raises:
        AssertionError: if ``output_file`` already exists.
    """

    def _sent_key(sent) -> str:
        # Lower-cased text with all whitespace removed, so tokenisation
        # differences between the two files do not prevent a match.
        return ''.join(''.join(sent.seq_word.vals).lower().split())

    keep_rate = float(keep_rate)
    _gen = Random.get_np_generator(12345)  # fixed seed
    rstream = Random.stream(_gen.random_sample)
    # --
    # read input
    stat = {}
    input_sents = list(
        yield_sents(ReaderGetterConf().get_reader(input_path=input_file)))
    stat["input"] = get_stat(input_sents)
    # filter
    if checking_file:
        checking_sents = list(
            yield_sents(
                ReaderGetterConf().get_reader(input_path=checking_file)))
        stat["check"] = get_stat(checking_sents)
        hit_keys = {_sent_key(one_check_sent) for one_check_sent in checking_sents}
        filtered_sents = [
            one_input_sent for one_input_sent in input_sents
            if _sent_key(one_input_sent) not in hit_keys
        ]
    else:
        filtered_sents = input_sents
    stat["filter"] = get_stat(filtered_sents)
    # sample
    if keep_rate < 1.:
        sample_sents = [
            s for r, s in zip(rstream, filtered_sents) if r < keep_rate
        ]
    elif keep_rate > 10:
        sample_sents = list(filtered_sents)  # copy: shuffling is in-place
        for _ in range(10):
            _gen.shuffle(sample_sents)
        sample_sents = sample_sents[:int(keep_rate)]
    else:
        sample_sents = filtered_sents
    stat["sample"] = get_stat(sample_sents)
    # write
    # Raise explicitly instead of `assert False`: assert statements are
    # stripped under `python -O`, which would silently overwrite the file.
    # The truthiness guard also keeps a None output_file from reaching os.path.
    if output_file and os.path.exists(output_file):
        raise AssertionError(
            f"File exists: {output_file}, delete it first!")
    if output_file:
        with WriterGetterConf().get_writer(output_path=output_file) as writer:
            writer.write_insts(sample_sents)
    # stat
    zlog(
        f"Read {input_file}, check {checking_file}, output {output_file}, stat:"
    )
    OtherHelper.printd(stat)
def main(args):
    """Compare dependency-path lengths between gold and predicted parses.

    For every event (frame) in the gold file, the per-token path (as returned
    by ``tree_dep.get_path(token, event_head)``) is reduced to the lengths of
    its parts, once on the gold tree and once on the predicted tree.  Both
    results are stored on the gold event under ``info["dpaths"]`` and the
    fraction of tokens whose path lengths agree is reported.

    Args:
        args: command-line arguments used to update a ``MainConf``.

    Raises:
        AssertionError: if gold and pred do not line up (sentence count,
            sentence length, events, mentions, labels or argument counts).
    """
    conf = MainConf()
    conf.update_from_args(args)
    zlog(f"Ready to evaluate with: {conf.gold} {conf.pred} => {conf.output}")
    # --
    final_insts = list(conf.gold.get_reader())  # to modify inplace!
    stat = Counter()
    gold_sents = list(yield_sents(final_insts))
    pred_sents = list(yield_sents(conf.pred.get_reader()))
    assert len(gold_sents) == len(pred_sents)
    for g_sent, p_sent in zip(gold_sents, pred_sents):
        stat["sent"] += 1
        slen = len(g_sent)
        assert slen == len(p_sent)
        stat["tok"] += slen
        # put features
        assert len(g_sent.events) == len(p_sent.events)
        for g_evt, p_evt in zip(g_sent.events, p_sent.events):
            assert g_evt.mention.is_equal(
                p_evt.mention) and g_evt.label == p_evt.label
            stat["frame"] += 1
            stat["ftok"] += slen
            assert len(g_evt.args) == len(p_evt.args)
            # --
            evt_widx = g_evt.mention.shead_widx
            g_paths = [[
                len(z) for z in g_evt.sent.tree_dep.get_path(ii, evt_widx)
            ] for ii in range(slen)]
            p_paths = [[
                len(z) for z in p_evt.sent.tree_dep.get_path(ii, evt_widx)
            ] for ii in range(slen)]
            stat["ftok_corr"] += sum(a == b for a, b in zip(g_paths, p_paths))
            # assign
            g_evt.info["dpaths"] = [g_paths, p_paths
                                    ]  # [2(g/p), SLEN, 2(word, predicate)]
    # --
    # report
    OtherHelper.printd(stat)
    # Guard the ratio: with no events at all `ftok` is 0 and the division
    # would raise before the output below gets written.
    ftok_acc = stat['ftok_corr'] / stat['ftok'] if stat['ftok'] else 0.0
    zlog(
        f"FtokPathAcc: {stat['ftok_corr']} / {stat['ftok']} = {ftok_acc}"
    )
    # --
    # write
    if conf.output.output_path:
        with conf.output.get_writer() as writer:
            writer.write_insts(final_insts)
def do_eval(self, code: str, mname: str = ""):
    """Evaluate an arbitrary expression and return its value.

    Args:
        code: expression source handed to ``eval``.
        mname: optional module name; when given, the module is imported and
            exposed to the expression as ``m2``.

    Returns:
        Whatever the expression evaluates to.

    NOTE: ``eval`` runs with full privileges -- only pass trusted code.
    """
    # These locals are never read here directly; they exist so the evaluated
    # expression can refer to them, hence the names must stay as they are.
    s, m, vs = self, OtherHelper.get_module(self), self.vars
    if mname:
        import importlib
        m2 = importlib.import_module(mname)
    return eval(code)
def do_sort(self, insts_target: str, kcode: str) -> List:
    """Return the instances of a list variable sorted by an evaluated key.

    Args:
        insts_target: name passed to ``self.get_and_check_type`` to obtain the list.
        kcode: key expression; evaluated once per instance with the instance
            bound to ``d`` (``s``, ``m`` and ``vs`` are available as well).

    Returns:
        A new list in ascending key order (stable for equal keys).

    NOTE: ``kcode`` is passed to ``eval`` -- only use trusted expressions.
    """
    # Locals below are part of the namespace seen by the key expression.
    s, m, vs = self, OtherHelper.get_module(self), self.vars
    _ff = compile(kcode, "", "eval")
    insts = self.get_and_check_type(insts_target, list)
    # Explicit loop on purpose: before Python 3.12 a comprehension runs in its
    # own scope, so an eval() inside it cannot see s/m/vs and key expressions
    # using them fail with NameError.
    tmp_tuples = []
    for d in insts:
        tmp_tuples.append((d, eval(_ff)))
    tmp_tuples.sort(key=lambda x: x[1])
    ret = [x[0] for x in tmp_tuples]
    zlog(f"Sort by key={kcode}: len = {len(ret)}")
    return ret
def do_cal_pd(self, inst_pd: str, scode: str):
    """Evaluate an expression against a stored ``pd.DataFrame``.

    Args:
        inst_pd: name passed to ``self.get_and_check_type`` to obtain the DataFrame.
        scode: expression source; the DataFrame is bound to ``d`` while it runs.

    Returns:
        The value of the expression (also logged in string form).

    NOTE: ``scode`` is passed to ``eval`` -- only use trusted expressions.
    """
    # s/m/vs/d make up the namespace offered to the evaluated expression.
    s, m, vs = self, OtherHelper.get_module(self), self.vars
    _ff = compile(scode, "", "eval")
    d = self.get_and_check_type(inst_pd, pd.DataFrame)
    # --
    ret = eval(_ff)
    message = f"Calculation on pd.DataFrame by {scode}, and get another one as: {str(ret)}"
    zlog(message)
    return ret
def main(*args):
    """Convert aligned source/target sentences pair by pair and write the results.

    Both inputs are read completely, must contain the same number of
    sentences, and are passed pairwise to ``Converter.convert`` together with
    a shared counter that is printed at the end.
    """
    conf: MainConf = init_everything(MainConf(), args)
    # --
    # load both sides fully before converting
    src_sents = list(yield_sents(conf.src_input.get_reader()))
    trg_sents = list(yield_sents(conf.trg_input.get_reader()))
    assert len(src_sents) == len(trg_sents)
    cc = Counter()
    conv = Converter(conf)
    # --
    outputs = [
        conv.convert(one_src, one_trg, cc)
        for one_src, one_trg in zip(src_sents, trg_sents)
    ]
    zlog("Stat:")
    OtherHelper.printd(cc)
    # --
    with conf.output.get_writer() as writer:
        writer.write_insts(outputs)
def do_get_pd(self, insts_target: str, gcode: str) -> pd.DataFrame:
    """Build a ``pd.DataFrame`` with one row per instance.

    Args:
        insts_target: name passed to ``self.get_and_check_type`` to obtain the list.
        gcode: expression evaluated once per instance (bound to ``d``); each
            result becomes one row of the frame.

    Returns:
        The DataFrame built from all evaluated rows.

    NOTE: ``gcode`` is passed to ``eval`` -- only use trusted expressions.
    """
    # Locals below are part of the namespace seen by the row expression.
    s, m, vs = self, OtherHelper.get_module(self), self.vars
    _ff = compile(gcode, "", "eval")
    insts = self.get_and_check_type(insts_target, list)
    # --
    # Explicit loop on purpose: before Python 3.12 a comprehension runs in its
    # own scope, so an eval() inside it cannot see s/m/vs and expressions
    # using them fail with NameError.
    fields = []
    for d in insts:
        fields.append(eval(_ff))
    ret = pd.DataFrame(fields)
    zlog(
        f"Group {len(insts)} instances by {gcode} to pd.DataFrame shape={ret.shape}."
    )
    return ret
def do_ann_attach(self, name: str):
    """Attach the annotation task stored under ``name``, or detach the current one.

    Args:
        name: variable name of an ``AnnotationTask``; the special value
            ``"_detach"`` clears the current attachment instead.
    """
    s, m, vs = self, OtherHelper.get_module(self), self.vars
    # --
    # todo(note): keep this special name for this special purpose
    if name == "_detach":
        self._cur_ann_task = None
        self.set_var("_cur_ann_var_name", None)
        return
    # --
    z = self.get_and_check_type(name, AnnotationTask)
    zlog(f"Attach ann_task: from {self.cur_ann_task} to {z}")
    # Drop the previously held task first, the same way the detach branch and
    # do_ann_new do before they change the special variable.
    self._cur_ann_task = None
    # Store the variable *name*, not the task object: do_ann_new writes a
    # variable name into this slot, so the two must agree.
    # NOTE(review): the reader of "_cur_ann_var_name" is not visible here --
    # confirm it resolves the task from this name.
    self.set_var("_cur_ann_var_name", name)  # set special name!!
def do_join(self, insts_target: str, jcode: str) -> List:
    """Flatten a list variable by concatenating one evaluated iterable per instance.

    Args:
        insts_target: name passed to ``self.get_and_check_type`` to obtain the list.
        jcode: expression evaluated per instance (bound to ``d``); it must
            yield an iterable whose items are appended to the result.

    Returns:
        The concatenation of all per-instance results, in input order.

    NOTE: ``jcode`` is passed to ``eval`` -- only use trusted expressions.
    """
    # s/m/vs/d make up the namespace offered to the evaluated expression.
    s, m, vs = self, OtherHelper.get_module(self), self.vars
    _ff = compile(jcode, "", "eval")
    insts = self.get_and_check_type(insts_target, list)
    ret = []
    # Keep this a plain loop: eval() inside a comprehension would not see the
    # locals above on older interpreters.
    for d in insts:
        ret.extend(eval(_ff))
    zlog(f"Join-list by {jcode}: from {len(insts)} to {len(ret)}")
    return ret
def do_filter(self, insts_target: str, fcode: str) -> List:
    """Keep the instances of a list variable for which an expression is truthy.

    Args:
        insts_target: name passed to ``self.get_and_check_type`` to obtain the list.
        fcode: predicate expression evaluated per instance (bound to ``d``).

    Returns:
        A new list with the surviving instances, in input order.

    NOTE: ``fcode`` is passed to ``eval`` -- only use trusted expressions.
    """
    # s/m/vs/d make up the namespace offered to the evaluated expression.
    s, m, vs = self, OtherHelper.get_module(self), self.vars
    _ff = compile(fcode, "", "eval")
    insts = self.get_and_check_type(insts_target, list)
    ret = []
    # Keep this a plain loop: eval() inside a comprehension would not see the
    # locals above on older interpreters.
    for d in insts:
        if not eval(_ff):
            continue
        ret.append(d)
    # the small epsilon keeps the ratio defined for an empty input list
    zlog(
        f"Filter by {fcode}: from {len(insts)} to {len(ret)}, {len(ret)/(len(insts)+1e-7)}"
    )
    return ret
def main(input_path):
    """Count how many arguments share each head token and whether their spans differ.

    Instances are read from ``input_path``, heads are set with
    ``set_ee_heads`` and every non-verb argument is bucketed by the head-word
    index of its mention.  Per token the number of arguments and of distinct
    spans is counted; tokens with more than one distinct span trigger a
    ``breakpoint()`` for manual inspection.
    """
    insts = list(ReaderGetterConf().get_reader(input_path=input_path))
    all_sents = list(yield_sents(insts))
    set_ee_heads(insts)
    # --
    cc = Counter()
    verb_roles = ("V", "C-V")  # verb arguments are only counted, not bucketed
    for sent in all_sents:
        cc["sent"] += 1
        # one bucket per token position
        arg_maps = [[] for _ in range(len(sent))]
        for evt in sent.events:
            cc["evt"] += 1
            for arg in evt.args:
                if arg.role in verb_roles:
                    cc["argV"] += 1
                    continue
                cc["arg"] += 1
                ef = arg.arg
                mention = ef.mention
                shidx = mention.shead_widx
                span = mention.get_span()
                arg_maps[shidx].append(ZObject(evt=evt, ef=ef, span=span))
        # check for all tokens
        cc["tok"] += len(arg_maps)
        for one_objs in arg_maps:
            n_objs = len(one_objs)
            all_spans = {z.span for z in one_objs}
            n_spans = len(all_spans)
            cc[f"tok_N{n_objs}"] += 1
            cc[f"tok_N{n_objs}S{n_spans}"] += 1
            # --
            if n_objs > 0:
                cc[f"tok_diff={len(all_spans)>1}"] += 1
                if n_spans > 1:
                    breakpoint()
    # --
    OtherHelper.printd(cc)
def do_corr(self, insts_target: str, acode: str, bcode: str):
    """Log Pearson and Spearman correlation between two per-instance values.

    Args:
        insts_target: name passed to ``self.get_and_check_type`` to obtain the list.
        acode: expression giving the first value for an instance (bound to ``d``).
        bcode: expression giving the second value for the same instance.

    Returns:
        None; the results are only logged.

    NOTE: both expressions are passed to ``eval`` -- only use trusted code.
    """
    # s/m/vs/d make up the namespace offered to the evaluated expressions.
    s, m, vs = self, OtherHelper.get_module(self), self.vars
    _ffa = compile(acode, "", "eval")
    _ffb = compile(bcode, "", "eval")
    insts = self.get_and_check_type(insts_target, list)
    a_vals = []
    b_vals = []
    # Keep this a plain loop: eval() inside a comprehension would not see the
    # locals above on older interpreters.
    for d in insts:
        a_vals.append(eval(_ffa))
        b_vals.append(eval(_ffb))
    # --
    # imported here so scipy is only needed when this command is used
    from scipy.stats import pearsonr, spearmanr
    zlog(f"Pearson={pearsonr(a_vals,b_vals)}")
    zlog(f"Spearman={spearmanr(a_vals,b_vals)}")
    return None
def do_break_eval(self,
                  insts_target: str,
                  pcode: str,
                  gcode: str,
                  corr_code="d.pred.label == d.gold.label",
                  sort_key='-1',
                  truncate_items=100,
                  pdb=False):
    """Break precision/recall records down by a prediction key and a gold key.

    Args:
        insts_target: name passed to ``self.get_and_check_type`` to obtain the list.
        pcode: expression giving the bucket key on the prediction side
            (instance bound to ``d``); if it raises, the instance is treated
            as having no prediction.
        gcode: expression giving the bucket key on the gold side; only
            evaluated when ``d.gold`` is not None.
        corr_code: expression telling whether the instance is correct; only
            evaluated when both a prediction and a gold exist.
        sort_key: column index (int or int-like string) used to sort the
            printed rows in descending order.
        truncate_items: maximum number of rows printed.
        pdb: when truthy, drop into the debugger before returning.

    Returns:
        dict mapping each key to its ``F1EvalEntry``.

    NOTE: the three code strings are passed to ``eval`` -- only use trusted code.
    """
    # Locals below are part of the namespace seen by the expressions.
    s, m, vs = self, OtherHelper.get_module(self), self.vars
    sort_key = int(sort_key)
    _fp, _fg = compile(pcode, "", "eval"), compile(gcode, "", "eval")
    _fcorr = compile(corr_code, "", "eval")
    insts = self.get_and_check_type(insts_target, list)
    # --
    res = {}
    for d in insts:
        corr = 0
        # --
        no_pred = False
        try:  # a failing pcode is how "no prediction" is detected
            key_p = eval(_fp)
        except Exception:
            # Not a bare except: Ctrl-C / SystemExit must still propagate.
            no_pred = True
        # --
        if not no_pred and d.gold is not None:
            corr = eval(_fcorr)
        if not no_pred:
            # key_p from the try-block is reused rather than evaluating
            # pcode a second time.
            if key_p not in res:
                res[key_p] = F1EvalEntry()
            res[key_p].record_p(int(corr))
        if d.gold is not None:
            key_g = eval(_fg)
            if key_g not in res:
                res[key_g] = F1EvalEntry()
            res[key_g].record_r(int(corr))
    # final
    details = [(k, ) + v.details for k, v in res.items()]
    details = sorted(details, key=(lambda x: x[sort_key]), reverse=True)
    # --
    pdf = pd.DataFrame(details)
    pdf_str = pdf[:int(truncate_items)].to_string()
    zlog(
        f"Break-eval {len(insts)} instances by {pcode}/{gcode}:\n{pdf_str}"
    )
    if pdb:
        breakpoint()
    return res
def _do_group(self, insts_target: str, gcode: str, sum_key: str,
              visitor: RecordNodeVisitor) -> RecordNode:
    """Group instances into a ``RecordNode`` tree and log a summary table.

    Args:
        insts_target: name passed to ``self.get_and_check_type`` to obtain the list.
        gcode: expression evaluated per instance (bound to ``d``); its value
            is recorded in the tree via ``record_seq``.
        sum_key: passed as ``key`` to ``get_descendants``; used as-is when it
            is a plain identifier, otherwise evaluated first (e.g. a lambda).
        visitor: optional visitor run over the tree; a failure is logged and
            does not abort the grouping.

    Returns:
        The root ``RecordNode``.

    NOTE: ``gcode`` / ``sum_key`` are passed to ``eval`` -- only use trusted code.
    """
    # Locals below are part of the namespace seen by the expressions.
    s, m, vs = self, OtherHelper.get_module(self), self.vars
    _ff = compile(gcode, "", "eval")
    insts = self.get_and_check_type(insts_target, list)
    # collect all seqs
    ret = RecordNode.new_root()
    for d in insts:
        ret.record_seq(eval(_ff), obj=d)
    # visitor
    if visitor is not None:
        try:
            ret.rec_visit(visitor)
        except Exception:
            # Best effort, but not a bare except: Ctrl-C / SystemExit must
            # still propagate.
            zlog(traceback.format_exc())
            zlog("Error of visitor.")
    # some slight summaries here
    all_count = len(insts)
    if not str.isidentifier(sum_key):
        sum_key = eval(sum_key)  # eval the lambda expression
    all_nodes = ret.get_descendants(key=sum_key)
    ss = []
    for z in all_nodes:
        all_parents = z.get_antecedents()
        if len(all_parents) > 0:
            # the first antecedent is expected to have seen every instance
            assert all_parents[0].count == all_count
        # share of this node relative to each antecedent (epsilon avoids /0)
        perc_info = ', '.join(
            [f"{z.count/(zp.count+1e-6):.4f}" for zp in all_parents])
        ss.append([
            '==' * len(z.path),
            str(z.path), f"{z.count}({perc_info})",
            z.get_content()
        ])
    pdf = pd.DataFrame(ss)
    pdf_str = pdf.to_string()
    zlog(
        f"Group {len(insts)} instances by {gcode}, all {len(ss)} nodes:\n{pdf_str}"
    )
    return ret
def do_ann_new(self, insts_target: str, fcode: str = None, try_attach=1):
    """Create a new annotation task over a list variable and optionally attach it.

    Args:
        insts_target: name passed to ``self.get_and_check_type`` to obtain the list.
        fcode: optional expression evaluating to a callable that builds the
            task from the instances; by default the class's ``get_ann_type()``
            is used.
        try_attach: int-like flag; when non-zero the new task becomes the
            current one by recording the command's target variable name.

    Returns:
        The newly created task.

    Raises:
        AssertionError: if the current command has no target variable.

    NOTE: ``fcode`` is passed to ``eval`` -- only use trusted expressions.
    """
    # s/m/vs are exposed to the evaluated factory expression.
    s, m, vs = self, OtherHelper.get_module(self), self.vars
    # --
    assert self._cur_cmd.target is not None, "Should assign this to a var to avoid accidental loss!"
    insts = self.get_and_check_type(insts_target, list)
    if fcode is not None:
        new_task = eval(fcode)(insts)
    else:
        new_task = self.__class__.get_ann_type()(insts)
    # todo(note): here need auto save?
    try_attach = bool(int(try_attach))
    if not try_attach:
        return new_task
    if self.cur_ann_task is not None:
        zlog("Detach current task and try attach the new one!!")
        self._cur_ann_task = None
    # note: directly set name, which will be assigned later
    # todo(+N): maybe source of certain bugs?
    self.set_var("_cur_ann_var_name",
                 self._cur_cmd.target)  # set special name!!
    zlog("New ann task, and ann_var_name set!")
    return new_task