Beispiel #1
0
def main(input_file: str, output_file: str, checking_file: str,
         keep_rate: float):
    """Filter the sentences of ``input_file`` and write a (sub)sample of them.

    Steps: read all sentences, drop those whose normalized text also occurs
    in ``checking_file`` (if given), sub-sample according to ``keep_rate``,
    write the result to ``output_file`` (if given) and log the statistics
    collected after every step.

    Args:
        input_file: path of the input to read sentences from.
        output_file: path to write the kept sentences to; nothing is written
            (and no existence check is done) when it is empty.
        checking_file: optional path whose sentences are excluded from the
            input; no filtering happens when it is empty.
        keep_rate: sampling control (converted with ``float``):
            ``< 1``  keep each sentence with this probability;
            ``> 10`` treat it as a count: shuffle and keep the first
            ``int(keep_rate)`` sentences;
            otherwise keep everything.

    Raises:
        AssertionError: if ``output_file`` already exists.
    """

    def _sent_key(sent) -> str:
        # lowercased concatenation of the words with all whitespace removed
        joined = ''.join(sent.seq_word.vals).lower()
        return ''.join(joined.split())  # split and join again

    keep_rate = float(keep_rate)
    # fixed seed, so repeated runs make the same sampling decisions
    _gen = Random.get_np_generator(12345)
    rstream = Random.stream(_gen.random_sample)
    # --
    # read input
    stat = {}
    input_sents = list(
        yield_sents(ReaderGetterConf().get_reader(input_path=input_file)))
    stat["input"] = get_stat(input_sents)
    if checking_file:
        checking_sents = list(
            yield_sents(
                ReaderGetterConf().get_reader(input_path=checking_file)))
        stat["check"] = get_stat(checking_sents)
        # collect keys
        hit_keys = {_sent_key(z) for z in checking_sents}
        # filter
        filtered_sents = [
            z for z in input_sents if _sent_key(z) not in hit_keys
        ]
    else:
        filtered_sents = input_sents
    stat["filter"] = get_stat(filtered_sents)
    # sample
    if keep_rate < 1.:
        sample_sents = [
            s for r, s in zip(rstream, filtered_sents) if r < keep_rate
        ]
    elif keep_rate > 10:
        sample_sents = list(filtered_sents)
        for _ in range(10):
            _gen.shuffle(sample_sents)
        sample_sents = sample_sents[:int(keep_rate)]
    else:
        sample_sents = filtered_sents
    stat["sample"] = get_stat(sample_sents)
    # write
    if output_file:
        # Raise explicitly instead of `assert False`: asserts are removed
        # under `python -O`, which would silently overwrite the file.
        if os.path.exists(output_file):
            raise AssertionError(
                f"File exists: {output_file}, delete it first!")
        with WriterGetterConf().get_writer(output_path=output_file) as writer:
            writer.write_insts(sample_sents)
    # stat
    zlog(
        f"Read {input_file}, check {checking_file}, output {output_file}, stat:"
    )
    OtherHelper.printd(stat)
Beispiel #2
0
def main(args):
    """Compare dependency-path lengths between gold and predicted parses.

    Reads the gold and predicted inputs configured in ``MainConf``, checks
    that they contain the same sentences/events, and for every event stores
    the per-token path lengths (to the event head word) of both parses in
    ``g_evt.info["dpaths"]``.  Logs the fraction of tokens whose path lengths
    agree and, if an output path is configured, writes the modified gold
    instances.

    Args:
        args: command-line style arguments passed to ``conf.update_from_args``.

    Raises:
        AssertionError: if gold and pred differ in sentence count, sentence
            length, number of events, event mention/label or number of args.
    """
    conf = MainConf()
    conf.update_from_args(args)
    zlog(f"Ready to evaluate with: {conf.gold} {conf.pred} => {conf.output}")
    # --
    final_insts = list(conf.gold.get_reader())  # to modify inplace!
    stat = Counter()
    gold_sents = list(yield_sents(final_insts))
    pred_sents = list(yield_sents(conf.pred.get_reader()))
    assert len(gold_sents) == len(pred_sents)
    for g_sent, p_sent in zip(gold_sents, pred_sents):
        stat["sent"] += 1
        slen = len(g_sent)
        assert slen == len(p_sent)
        stat["tok"] += slen
        # put features
        assert len(g_sent.events) == len(p_sent.events)
        for g_evt, p_evt in zip(g_sent.events, p_sent.events):
            assert g_evt.mention.is_equal(
                p_evt.mention) and g_evt.label == p_evt.label
            stat["frame"] += 1
            stat["ftok"] += slen
            assert len(g_evt.args) == len(p_evt.args)
            # --
            # mentions were asserted equal, so the gold head index is used
            # for both trees
            evt_widx = g_evt.mention.shead_widx
            g_paths = [[
                len(z) for z in g_evt.sent.tree_dep.get_path(ii, evt_widx)
            ] for ii in range(slen)]
            p_paths = [[
                len(z) for z in p_evt.sent.tree_dep.get_path(ii, evt_widx)
            ] for ii in range(slen)]
            # a token counts as correct when all its path lengths agree
            stat["ftok_corr"] += sum(a == b for a, b in zip(g_paths, p_paths))
            # assign
            g_evt.info["dpaths"] = [g_paths, p_paths
                                    ]  # [2(g/p), SLEN, 2(word, predicate)]
        # --
    # --
    # report
    OtherHelper.printd(stat)
    ftok, ftok_corr = stat["ftok"], stat["ftok_corr"]
    # Guard the ratio: without any frame the division would raise and the
    # output below would never be written.
    ftok_acc = ftok_corr / ftok if ftok else 0.0
    zlog(f"FtokPathAcc: {ftok_corr} / {ftok} = {ftok_acc}")
    # --
    # write
    if conf.output.output_path:
        with conf.output.get_writer() as writer:
            writer.write_insts(final_insts)
Beispiel #3
0
 def do_eval(self, code: str, mname: str = ""):
     """Evaluate the expression ``code`` and return its value.

     The expression runs with this method's local names in scope:
     ``s`` (this object), ``m`` (the result of
     ``OtherHelper.get_module(self)``), ``vs`` (``self.vars``) and, when
     ``mname`` is non-empty, ``m2`` (the module imported under that name).
     These short names must not be renamed, since ``code`` may refer to them.

     NOTE(review): ``eval`` executes arbitrary code; only pass trusted input.
     """
     s, m, vs = self, OtherHelper.get_module(
         self), self.vars  # convenient local variable
     if mname:
         # imported lazily; ``m2`` only exists for the evaluated code
         import importlib
         m2 = importlib.import_module(mname)
     ret = eval(code)
     return ret
Beispiel #4
0
 def do_sort(self, insts_target: str, kcode: str) -> List:
     """Return the instances of ``insts_target`` sorted by a computed key.

     Args:
         insts_target: name of a variable holding a ``list`` (checked via
             ``self.get_and_check_type``).
         kcode: expression evaluated once per instance to produce its sort
             key; it may use ``d`` (the instance) as well as ``s`` (this
             object), ``m`` (``OtherHelper.get_module(self)``) and ``vs``
             (``self.vars``).

     Returns:
         A new list with the same instances in ascending key order (the
         sort is stable).

     NOTE(review): ``eval`` executes arbitrary code; only pass trusted input.
     """
     s, m, vs = self, OtherHelper.get_module(self), self.vars
     _ff = compile(kcode, "", "eval")
     insts = self.get_and_check_type(insts_target, list)
     # Evaluate the keys in a plain loop, not in a comprehension: before
     # Python 3.12 a comprehension body runs in its own scope, so `eval`
     # there would only see `d` and not `s`/`m`/`vs`.
     tmp_tuples = []
     for d in insts:
         tmp_tuples.append((d, eval(_ff)))
     tmp_tuples.sort(key=lambda x: x[1])
     ret = [x[0] for x in tmp_tuples]
     zlog(f"Sort by key={kcode}: len = {len(ret)}")
     return ret
Beispiel #5
0
 def do_cal_pd(self, inst_pd: str, scode: str):
     """Evaluate ``scode`` against a stored ``pd.DataFrame`` and return the result.

     Args:
         inst_pd: name of a variable holding a ``pd.DataFrame`` (checked via
             ``self.get_and_check_type``).
         scode: expression to evaluate; it may use ``d`` (the data frame) as
             well as ``s`` (this object), ``m``
             (``OtherHelper.get_module(self)``) and ``vs`` (``self.vars``).

     Returns:
         Whatever ``scode`` evaluates to; its ``str`` form is also logged.

     NOTE(review): ``eval`` executes arbitrary code; only pass trusted input.
     """
     # short names below are the namespace visible to ``scode``
     s, m, vs = self, OtherHelper.get_module(self), self.vars
     _ff = compile(scode, "", "eval")
     d = self.get_and_check_type(inst_pd, pd.DataFrame)
     # --
     ret = eval(_ff)
     zlog(
         f"Calculation on pd.DataFrame by {scode}, and get another one as: {str(ret)}"
     )
     return ret
Beispiel #6
0
def main(*args):
    """Convert paired source/target sentences and write the results.

    Reads all sentences from ``conf.src_input`` and ``conf.trg_input``
    (which must yield the same number of sentences), runs
    ``Converter.convert`` on every pair while accumulating statistics in a
    ``Counter``, logs the statistics and writes the converted results with
    ``conf.output``'s writer.

    Args:
        *args: arguments forwarded to ``init_everything`` to build the conf.
    """
    conf: MainConf = init_everything(MainConf(), args)
    # --
    # load both sides completely before converting
    src_sents = list(yield_sents(conf.src_input.get_reader()))
    trg_sents = list(yield_sents(conf.trg_input.get_reader()))
    assert len(src_sents) == len(trg_sents)
    counts = Counter()
    converter = Converter(conf)
    # --
    # convert pair by pair; the converter updates `counts` along the way
    converted = [
        converter.convert(one_src, one_trg, counts)
        for one_src, one_trg in zip(src_sents, trg_sents)
    ]
    zlog("Stat:")
    OtherHelper.printd(counts)
    # --
    with conf.output.get_writer() as writer:
        writer.write_insts(converted)
Beispiel #7
0
 def do_get_pd(self, insts_target: str, gcode: str) -> pd.DataFrame:
     """Build a ``pd.DataFrame`` with one row per instance.

     Args:
         insts_target: name of a variable holding a ``list`` (checked via
             ``self.get_and_check_type``).
         gcode: expression evaluated once per instance to produce that
             instance's row; it may use ``d`` (the instance) as well as
             ``s`` (this object), ``m`` (``OtherHelper.get_module(self)``)
             and ``vs`` (``self.vars``).

     Returns:
         ``pd.DataFrame`` constructed from the list of evaluated rows.

     NOTE(review): ``eval`` executes arbitrary code; only pass trusted input.
     """
     s, m, vs = self, OtherHelper.get_module(self), self.vars
     _ff = compile(gcode, "", "eval")
     insts = self.get_and_check_type(insts_target, list)
     # --
     # Evaluate in a plain loop, not in a comprehension: before Python 3.12
     # a comprehension body runs in its own scope, so `eval` there would
     # only see `d` and not `s`/`m`/`vs`.
     fields = []
     for d in insts:
         fields.append(eval(_ff))
     ret = pd.DataFrame(fields)
     zlog(
         f"Group {len(insts)} instances by {gcode} to pd.DataFrame shape={ret.shape}."
     )
     return ret
Beispiel #8
0
 def do_ann_attach(self, name: str):
     """Attach the annotation task stored under ``name``, or detach.

     Args:
         name: name of a variable holding an ``AnnotationTask`` (checked via
             ``self.get_and_check_type``), or the special value
             ``"_detach"`` to clear the current attachment.

     The attachment is recorded as the *variable name* under the special
     variable ``_cur_ann_var_name``, the same way ``do_ann_new`` records
     ``self._cur_cmd.target``; ``self._cur_ann_task`` is reset alongside it,
     as the detach branch and ``do_ann_new`` do.
     NOTE(review): previously the task object itself was stored here;
     confirm against the ``cur_ann_task`` property that the name is what it
     expects.
     """
     s, m, vs = self, OtherHelper.get_module(self), self.vars
     # --
     # todo(note): keep this special name for this special purpose
     if name == "_detach":
         self._cur_ann_task = None
         self.set_var("_cur_ann_var_name", None)
         return
     # --
     z = self.get_and_check_type(name, AnnotationTask)
     # log first: `self.cur_ann_task` here still refers to the old task
     zlog(f"Attach ann_task: from {self.cur_ann_task} to {z}")
     self._cur_ann_task = None
     self.set_var("_cur_ann_var_name", name)  # set special name!!
Beispiel #9
0
 def do_join(self, insts_target: str, jcode: str) -> List:
     """Map every instance to an iterable and concatenate the results.

     Args:
         insts_target: name of a variable holding a ``list`` (checked via
             ``self.get_and_check_type``).
         jcode: expression evaluated once per instance; its value is passed
             to ``list.extend``, so it must be iterable.  It may use ``d``
             (the instance) as well as ``s`` (this object), ``m``
             (``OtherHelper.get_module(self)``) and ``vs`` (``self.vars``).

     Returns:
         One flat list with the items of all evaluated results, in order.

     NOTE(review): ``eval`` executes arbitrary code; only pass trusted input.
     """
     # short names below are the namespace visible to ``jcode``
     s, m, vs = self, OtherHelper.get_module(self), self.vars
     _ff = compile(jcode, "", "eval")
     insts = self.get_and_check_type(insts_target, list)
     ret = []
     for d in insts:
         ret0 = eval(_ff)
         ret.extend(ret0)
     # Disabled comprehension form of the loop above.  Note that before
     # Python 3.12 a comprehension body has its own scope, so `eval` inside
     # it would not see `s`/`m`/`vs`:
     # ret0 = [eval(_ff) for d in insts]
     # ret = list(chain.from_iterable(ret0))
     zlog(f"Join-list by {jcode}: from {len(insts)} to {len(ret)}")
     return ret
Beispiel #10
0
 def do_filter(self, insts_target: str, fcode: str) -> List:
     """Keep the instances for which ``fcode`` evaluates to a truthy value.

     Args:
         insts_target: name of a variable holding a ``list`` (checked via
             ``self.get_and_check_type``).
         fcode: predicate expression evaluated once per instance; it may use
             ``d`` (the instance) as well as ``s`` (this object), ``m``
             (``OtherHelper.get_module(self)``) and ``vs`` (``self.vars``).

     Returns:
         A new list with the kept instances in their original order.

     NOTE(review): ``eval`` executes arbitrary code; only pass trusted input.
     """
     # short names below are the namespace visible to ``fcode``
     s, m, vs = self, OtherHelper.get_module(self), self.vars
     _ff = compile(fcode, "", "eval")
     insts = self.get_and_check_type(insts_target, list)
     ret = []
     for d in insts:
         if eval(_ff):
             ret.append(d)
     # Disabled comprehension form of the loop above.  Note that before
     # Python 3.12 a comprehension body has its own scope, so `eval` inside
     # it would not see `s`/`m`/`vs`:
     # ret = [d for d in insts if eval(_ff)]
     # the 1e-7 keeps the logged ratio from dividing by zero on empty input
     zlog(
         f"Filter by {fcode}: from {len(insts)} to {len(ret)}, {len(ret)/(len(insts)+1e-7)}"
     )
     return ret
Beispiel #11
0
def main(input_path):
    """Count, per token, how many argument fillers are headed there.

    Reads all instances from ``input_path``, calls ``set_ee_heads`` on them,
    and for every sentence groups the non-verb event arguments by the head
    word index of their mention.  It then counts how many arguments share
    each token and how many distinct spans those arguments have, drops into
    the debugger whenever one token carries more than one distinct span, and
    finally prints all counters.
    """
    insts = list(ReaderGetterConf().get_reader(input_path=input_path))
    sents = list(yield_sents(insts))
    set_ee_heads(insts)
    # --
    counts = Counter()
    verb_roles = ("V", "C-V")  # roles that are skipped (counted as "argV")
    for one_sent in sents:
        counts["sent"] += 1
        # one bucket of argument records per token position
        per_tok = [[] for _ in range(len(one_sent))]
        for one_evt in one_sent.events:
            counts["evt"] += 1
            for one_arg in one_evt.args:
                if one_arg.role in verb_roles:
                    counts["argV"] += 1
                    continue
                # --
                counts["arg"] += 1
                filler = one_arg.arg
                head_widx = filler.mention.shead_widx
                filler_span = filler.mention.get_span()
                per_tok[head_widx].append(
                    ZObject(evt=one_evt, ef=filler, span=filler_span))
        # check for all tokens
        counts["tok"] += len(per_tok)
        for bucket in per_tok:
            n_objs = len(bucket)
            counts[f"tok_N{n_objs}"] += 1
            distinct_spans = {z.span for z in bucket}
            n_spans = len(distinct_spans)
            counts[f"tok_N{n_objs}S{n_spans}"] += 1
            # --
            multi_span = n_spans > 1
            if n_objs > 0:
                counts[f"tok_diff={multi_span}"] += 1
            if multi_span:
                # stop here for manual inspection of the conflicting spans
                breakpoint()
        # --
    # --
    OtherHelper.printd(counts)
Beispiel #12
0
 def do_corr(self, insts_target: str, acode: str, bcode: str):
     """Log Pearson and Spearman correlation between two per-instance values.

     Args:
         insts_target: name of a variable holding a ``list`` (checked via
             ``self.get_and_check_type``).
         acode: expression giving the first value for an instance.
         bcode: expression giving the second value for an instance.
             Both expressions may use ``d`` (the instance) as well as ``s``
             (this object), ``m`` (``OtherHelper.get_module(self)``) and
             ``vs`` (``self.vars``).

     Returns:
         None; the results of ``pearsonr`` and ``spearmanr`` are only logged.

     NOTE(review): ``eval`` executes arbitrary code; only pass trusted input.
     """
     # short names below are the namespace visible to the two expressions
     s, m, vs = self, OtherHelper.get_module(self), self.vars
     _ffa = compile(acode, "", "eval")
     _ffb = compile(bcode, "", "eval")
     insts = self.get_and_check_type(insts_target, list)
     a_vals, b_vals = [], []
     for d in insts:
         a_vals.append(eval(_ffa))
         b_vals.append(eval(_ffb))
     # --
     # scipy is imported only when this command is actually used
     from scipy.stats import pearsonr, spearmanr
     zlog(f"Pearson={pearsonr(a_vals,b_vals)}")
     zlog(f"Spearman={spearmanr(a_vals,b_vals)}")
     return None
Beispiel #13
0
 def do_break_eval(self,
                   insts_target: str,
                   pcode: str,
                   gcode: str,
                   corr_code="d.pred.label == d.gold.label",
                   sort_key='-1',
                   truncate_items=100,
                   pdb=False):
     """Break precision/recall records down by a prediction key and a gold key.

     For every instance ``d``: if ``pcode`` can be evaluated, a precision
     record is added to the entry of the resulting key; if ``d.gold`` is not
     None, a recall record is added to the entry of the key given by
     ``gcode``.  The recorded value is ``int`` of ``corr_code``'s result when
     both sides are present and 0 otherwise.

     Args:
         insts_target: name of a variable holding a ``list`` (checked via
             ``self.get_and_check_type``).
         pcode: expression giving the prediction-side key; if it raises an
             exception the instance is treated as having no prediction.
         gcode: expression giving the gold-side key.
         corr_code: expression telling whether the instance is correct.
             All three expressions may use ``d`` (the instance) as well as
             ``s`` (this object), ``m`` (``OtherHelper.get_module(self)``)
             and ``vs`` (``self.vars``).
         sort_key: column index (converted with ``int``) of the
             ``(key,) + entry.details`` rows used to sort the printed table
             in descending order.
         truncate_items: maximum number of table rows to print.
         pdb: when truthy, call ``breakpoint()`` before returning.

     Returns:
         Dict mapping each key to its ``F1EvalEntry``.

     NOTE(review): ``eval`` executes arbitrary code; only pass trusted input.
     """
     s, m, vs = self, OtherHelper.get_module(self), self.vars
     sort_key = int(sort_key)
     _fp, _fg = compile(pcode, "", "eval"), compile(gcode, "", "eval")
     _fcorr = compile(corr_code, "", "eval")
     insts = self.get_and_check_type(insts_target, list)
     # --
     res = {}
     for d in insts:
         corr = 0
         # --
         no_pred = False
         try:  # use try/except to set this!
             key_p = eval(_fp)
         except Exception:
             # `Exception` rather than a bare except, so that
             # KeyboardInterrupt/SystemExit are not mistaken for "no pred"
             no_pred = True
         # --
         if not no_pred and d.gold is not None:
             corr = eval(_fcorr)
         if not no_pred:
             # reuse the key computed above instead of evaluating pcode again
             if key_p not in res:
                 res[key_p] = F1EvalEntry()
             res[key_p].record_p(int(corr))
         if d.gold is not None:
             key_g = eval(_fg)
             if key_g not in res:
                 res[key_g] = F1EvalEntry()
             res[key_g].record_r(int(corr))
     # final
     details = [(k, ) + v.details for k, v in res.items()]
     details = sorted(details, key=(lambda x: x[sort_key]), reverse=True)
     # --
     pdf = pd.DataFrame(details)
     pdf_str = pdf[:int(truncate_items)].to_string()
     zlog(
         f"Break-eval {len(insts)} instances by {pcode}/{gcode}:\n{pdf_str}"
     )
     if pdb:
         breakpoint()
     return res
Beispiel #14
0
 def _do_group(self, insts_target: str, gcode: str, sum_key: str,
               visitor: RecordNodeVisitor) -> RecordNode:
     """Group instances into a ``RecordNode`` tree and log a summary table.

     Args:
         insts_target: name of a variable holding a ``list`` (checked via
             ``self.get_and_check_type``).
         gcode: expression evaluated once per instance; its value is passed
             to ``RecordNode.record_seq`` together with the instance.  It may
             use ``d`` (the instance) as well as ``s`` (this object), ``m``
             (``OtherHelper.get_module(self)``) and ``vs`` (``self.vars``).
         sum_key: passed as ``key`` to ``get_descendants``; a plain
             identifier is passed through as a string, anything else is
             ``eval``-ed first (e.g. a lambda expression).
         visitor: optional visitor applied with ``rec_visit``; errors raised
             by it are logged and otherwise ignored.

     Returns:
         The root ``RecordNode`` of the grouping.

     NOTE(review): ``eval`` executes arbitrary code; only pass trusted input.
     """
     # short names below are the namespace visible to the evaluated code
     s, m, vs = self, OtherHelper.get_module(self), self.vars
     _ff = compile(gcode, "", "eval")
     insts = self.get_and_check_type(insts_target, list)
     # collect all seqs
     ret = RecordNode.new_root()
     for d in insts:
         ret.record_seq(eval(_ff), obj=d)
     # visitor
     if visitor is not None:
         # best effort: a failing visitor must not prevent the summary below
         # NOTE(review): the bare except also catches KeyboardInterrupt and
         # SystemExit -- confirm whether that is intended.
         try:
             ret.rec_visit(visitor)
         except:
             zlog(traceback.format_exc())
             zlog("Error of visitor.")
     # some slight summaries here
     all_count = len(insts)
     if not str.isidentifier(sum_key):
         sum_key = eval(sum_key)  # eval the lambda expression
     all_nodes = ret.get_descendants(key=sum_key)
     ss = []
     for z in all_nodes:
         all_parents = z.get_antecedents()
         if len(all_parents) > 0:
             # sanity check: the first antecedent (presumably the root)
             # must have counted every instance
             assert all_parents[0].count == all_count
         # this node's count relative to each antecedent's count; the 1e-6
         # avoids a division by zero
         perc_info = ', '.join(
             [f"{z.count/(zp.count+1e-6):.4f}" for zp in all_parents])
         # one table row per node: depth marker, path, count(ratios), content
         ss.append([
             '==' * len(z.path),
             str(z.path), f"{z.count}({perc_info})",
             z.get_content()
         ])
     # disabled alternatives for rendering / widening the printed table:
     # sstr = "\n".join(ss)
     # sstr = ""
     # pd.set_option('display.width', 1000)
     # pd.set_option('display.max_colwidth', 1000)
     pdf = pd.DataFrame(ss)
     pdf_str = pdf.to_string()
     zlog(
         f"Group {len(insts)} instances by {gcode}, all {len(ss)} nodes:\n{pdf_str}"
     )
     return ret
Beispiel #15
0
 def do_ann_new(self, insts_target: str, fcode: str = None, try_attach=1):
     """Create a new annotation task over a list of instances.

     Args:
         insts_target: name of a variable holding a ``list`` (checked via
             ``self.get_and_check_type``).
         fcode: optional expression evaluating to a callable that builds the
             task from the instances; when None, the type returned by
             ``self.__class__.get_ann_type()`` is used.  The expression may
             use ``s`` (this object), ``m``
             (``OtherHelper.get_module(self)``) and ``vs`` (``self.vars``).
         try_attach: converted with ``bool(int(...))``; when true, the
             current task is detached and the special variable
             ``_cur_ann_var_name`` is set to the current command's target.

     Returns:
         The newly created task.

     Raises:
         AssertionError: if the current command has no target variable.

     NOTE(review): ``eval`` executes arbitrary code; only pass trusted input.
     """
     # short names below are the namespace visible to ``fcode``
     s, m, vs = self, OtherHelper.get_module(self), self.vars
     # --
     assert self._cur_cmd.target is not None, "Should assign this to a var to avoid accidental loss!"
     # NOTE(review): `vs` is already bound to self.vars above; this
     # re-assignment looks redundant.
     vs = self.vars
     insts = self.get_and_check_type(insts_target, list)
     if fcode is None:
         new_task = self.__class__.get_ann_type()(insts)
     else:
         new_task = eval(fcode)(insts)
     # todo(note): here need auto save?
     try_attach = bool(int(try_attach))
     if try_attach:
         if self.cur_ann_task is not None:
             zlog("Detach current task and try attach the new one!!")
             self._cur_ann_task = None
         # note: directly set name, which will be assigned later
         # todo(+N): maybe source of certain bugs?
         self.set_var("_cur_ann_var_name",
                      self._cur_cmd.target)  # set special name!!
         zlog("New ann task, and ann_var_name set!")
     return new_task