Exemple #1
0
 def __init__(self, conf: AnalyzerConf):
     """Start with an empty command history and an empty variable store.

     Args:
         conf: analyzer configuration, kept as ``self.conf``.
     """
     self.conf = conf
     # commands kept so far
     self.history: List[AnalyzerCommand] = []
     # variable store; naming convention: names starting with "_" will not be saved
     self.vars = ZObject()
     # var_name -> (explanation, history-idx)
     self.traces = {}
     # temporary (per-command / per-session) state
     self._cur_cmd: AnalyzerCommand = None
     self._cur_ann_task: AnnotationTask = None
Exemple #2
0
 def _make_repeats(self, repeat: int):
     """Pack the stored expressions (repeated ``repeat`` times) with their slices.

     Reads ``self.expr_main``, ``self.expr_pair``, ``self.input_mask`` and
     ``self.extra_score``. When ``repeat != 1`` each non-None tensor goes
     through ``BK.simple_repeat_interleave(t, repeat, 0)`` first.

     Args:
         repeat: number of repeats, must be >= 1.

     Returns:
         A ZObject with ``expr_main``, ``expr_pair``, ``main_slices``,
         ``mask_slices`` and ``extra_score_slices`` (one entry per main slice;
         all None when there is no extra score).
     """
     assert repeat >= 1
     main_t, pair_t = self.expr_main, self.expr_pair
     mask_t, score_t = self.input_mask, self.extra_score
     if repeat != 1:  # need actual repeat: [*xR, ...]

         def _interleave(t):
             return BK.simple_repeat_interleave(t, repeat, 0)

         main_t = _interleave(main_t)
         if pair_t is not None:
             pair_t = _interleave(pair_t)
         mask_t = _interleave(mask_t)
         if score_t is not None:
             score_t = _interleave(score_t)
     # --
     main_slices = split_at_dim(main_t, -2, False)  # List of [*, Dm']
     mask_slices = split_at_dim(mask_t, -1, False)  # List of [*]
     n_slices = len(main_slices)
     if score_t is None:
         score_slices = [None] * n_slices
     else:
         score_slices = split_at_dim(score_t, -2, False)  # List of [*, L]
         if len(score_slices) < n_slices:  # broadcast a single slice to all positions
             assert len(score_slices) == 1
             score_slices = score_slices * n_slices
     # --
     z = ZObject()
     z.expr_main, z.expr_pair = main_t, pair_t  # [*, slen, Dm'], [*, Dp']
     z.main_slices = main_slices
     z.mask_slices = mask_slices
     z.extra_score_slices = score_slices
     return z
Exemple #3
0
 def _get_dpath_objects(self, fl, max_depth: int):
     """Build one record per word for every fully matched frame group.

     Args:
         fl: iterable of frame groups; each group is iterable over its frames
             and exposes the gold frame as ``.gold``. Groups that contain a
             None entry are skipped.
         max_depth: forwarded to ``self._get_dpaths``.

     Returns:
         A flat list of ZObject(has_arg, ps): ``has_arg`` tells whether the
         word lies inside the span of some gold argument, ``ps`` holds that
         word's entry from each frame's ``_get_dpaths`` result.
     """
     collected = []
     for group in fl:
         # skip the unmatched ones
         if any(frame is None for frame in group):
             continue
         gold = group.gold
         n_words = len(gold.sent)
         head_widx = gold.mention.shead_widx
         # mark the words covered by gold arguments
         covered = [False] * n_words
         for gold_arg in gold.args:
             start, width = gold_arg.mention.get_span()
             covered[start:start + width] = [True] * width
         # dep-paths of every frame in the group (all must share sent length and head)
         paths_per_frame = []
         for frame in group:
             assert len(
                 frame.sent
             ) == n_words and frame.mention.shead_widx == head_widx
             paths_per_frame.append(self._get_dpaths(frame, max_depth))
         # one record per word
         collected.extend(
             ZObject(has_arg=covered[widx],
                     ps=[paths[widx] for paths in paths_per_frame])
             for widx in range(n_words))
     return collected
Exemple #4
0
 def _prep_sent(self, sent: Sent):
     """Turn the frames of one sentence into predicate/argument index arrays.

     Args:
         sent: the sentence; its frames tagged ``conf.evt_ftag`` are used.

     Returns:
         A ZObject with
         ``sent``/``slen``: the sentence and its length,
         ``loss_weight_non``: ``sent._loss_weight_non`` if present, else 1.,
         ``evt_items``: object array [slen], the frame at its predicate position,
         ``evt_arr``: int array [slen], the frame label idx at its predicate position,
         ``arg_arr``: int array [slen, slen], row = predicate position, holding
         argument role idxes (or BIO tag idxes when ``conf.arg_use_bio``).
     """
     conf: MySRLConf = self.conf
     slen = len(sent)
     _loss_weight_non = getattr(sent, "_loss_weight_non", 1.)  # todo(+N): special name; loss_weight_non
     # note: for simplicity, assume no loss_weight_non for args
     # first for events
     # note: use builtin `int`; `np.int` was just an alias of it and is removed in NumPy>=1.24
     evt_arr = np.full([slen], 0, dtype=int)  # [evt]
     arg_arr = np.full([slen, slen], 0, dtype=int)  # [evt, arg]
     evt_items = np.full([slen], None, dtype=object)  # [evt]
     for f in sent.get_frames(conf.evt_ftag):  # note: assume no overlapping
         # predicate
         evt_widx, evt_wlen = self.evt_span_getter(f.mention)
         evt_label = f.label_idx
         assert evt_wlen==1 and evt_label>0, "For simplicity!!"
         evt_items[evt_widx] = f
         evt_arr[evt_widx] = evt_label
         # arguments (optionally only those whose info["rank"] is 1 or missing)
         if conf.arg_only_rank1:
             cur_args = [a for a in f.args if a.info.get("rank", 1) == 1]
         else:
             cur_args = f.args
         # bio or not
         if conf.arg_use_bio:  # special
             arg_spans = [self.arg_span_getter(a.mention) + (a.label_idx,) for a in cur_args]
             tag_layers = self.vocab_arg.spans2tags_idx(arg_spans, slen)
             if len(tag_layers) > 1:
                 zwarn(f"Warning: 'Full args require multiple layers with {arg_spans}")
             arg_arr[evt_widx, :] = tag_layers[0][0]  # only the first layer is kept
         else:  # plain ones: fill the role idx over each argument span
             for a in cur_args:
                 arg_role = a.label_idx
                 arg_widx, arg_wlen = self.arg_span_getter(a.mention)
                 arg_arr[evt_widx, arg_widx:arg_widx+arg_wlen] = arg_role
     return ZObject(sent=sent, slen=slen, loss_weight_non=_loss_weight_non,
                    evt_items=evt_items, evt_arr=evt_arr, arg_arr=arg_arr)
Exemple #5
0
 def _prep_inst(self, inst):
     """Compute per-word depth scores and the label matrix from the dep tree.

     Args:
         inst: instance exposing ``inst.sent`` with a ``tree_dep`` attribute.

     Returns:
         A ZObject with ``slen`` (sentence length), ``depth_arr`` (depth score
         per word) and ``udep_arr`` (``tree.label_matrix`` without its first
         column).
     """
     sentence = inst.sent
     n_words = len(sentence)
     # --
     dep_tree = sentence.tree_dep
     # depth scores, normalized by the max depth
     deepest = max(dep_tree.depths)
     assert deepest >= 1
     depths = np.asarray(dep_tree.depths)
     denom = max(1, deepest - 1)  # real-root's depth==1
     # note: real-root got 1, deepest got 0, others in between
     scores = 1. - (depths - 1) / denom
     scores *= self.conf.depth_nonroot_space  # squeeze non-root!
     scores[depths == 1] = 1.  # reset the root!
     # labels: (m,h) [slen, slen+1] -> drop the arti-root column -> [slen, slen]
     labels_no_root = dep_tree.label_matrix[:, 1:]
     return ZObject(slen=n_words, depth_arr=scores, udep_arr=labels_no_root)
Exemple #6
0
 def _prep_items(self, items: List, par: object, seq_len: int):
     """Encode the items' labeled core spans as one tag sequence.

     Args:
         items: objects with ``mention`` and ``label_idx``.
         par: parent object; an optional ``_loss_weight_non`` attribute is read.
         seq_len: length passed to ``vocab.spans2tags_idx``.

     Returns:
         A ZObject with ``loss_weight_non``, ``tags`` (first layer only) and
         ``len`` (number of tags).
     """
     vocab: SeqVocab = self.vocab
     # --
     core_spans = []
     for one_item in items:
         core_spans.append(
             self.core_span_getter(one_item.mention) + (one_item.label_idx, ))
     # todo(+N): special name; loss_weight_non
     non_weight = getattr(par, "_loss_weight_non", 1.)
     layers = vocab.spans2tags_idx(core_spans, seq_len)
     if len(layers) > 1:  # more than one layer needed, but only layer0 is used
         zwarn(
             f"Warning: '{self.conf.ftag}' only use layer0 but the full needs multiple layers with {core_spans}"
         )
     layer0_tags = layers[0][0]
     return ZObject(loss_weight_non=non_weight,
                    tags=layer0_tags,
                    len=len(layer0_tags))
Exemple #7
0
 def _prep_items(self, items: List, par: object, seq_len: int):
     """Lay the items' core spans over a sequence and group the positions.

     Items are ordered by (-span length, start idx, label idx) and written onto
     the sequence in that order, so later (shorter) spans overwrite earlier ones.

     Args:
         items: objects with ``mention`` and ``label_idx``.
         par: parent object; an optional ``_loss_weight_non`` attribute is read.
         seq_len: sequence length.

     Returns:
         A ZObject with ``items`` (sorted), ``par``, ``len``, ``loss_weight_non``,
         ``seq_iidxes`` (per position: index into ``items`` or -1),
         ``seq_labs`` (per position: label idx or 0) and ``group_widxes``
         (per position: the list of positions in its group; positions of one
         group share the same list object).
     """

     def _sort_key(one_item):
         start, width = self.core_span_getter(one_item.mention)
         return (-width, start, one_item.label_idx)  # larger span first!

     keyed = sorted(((_sort_key(f), f) for f in items), key=lambda kf: kf[0])
     sorted_items = [f for _, f in keyed]  # List[item]
     # paint item index and label over each span
     seq_iidxes = [-1] * seq_len  # List[idx-item]
     seq_labs = [0] * seq_len  # List[lab-idx]
     for rank, ((neg_width, start, lab), _) in enumerate(keyed):
         width = -neg_width
         stop = start + width
         seq_iidxes[start:stop] = [rank] * width
         seq_labs[start:stop] = [lab] * width
     # --
     # group consecutive positions of the same item; uncovered positions stay alone
     # note: needs its own pass since spans can overlap
     group_widxes = []  # List[List[idx-word]]

     def _flush(finished):
         group_widxes.extend(finished for _ in finished)

     run, prev_iidx = [], -1
     for widx, iidx in enumerate(seq_iidxes):
         if iidx < 0 or iidx != prev_iidx:  # break
             _flush(run)
             run = []
         run.append(widx)
         prev_iidx = iidx
     _flush(run)
     assert len(group_widxes) == seq_len
     # --
     # todo(+N): special name; loss_weight_non
     non_weight = getattr(par, "_loss_weight_non", 1.)
     return ZObject(items=sorted_items,
                    par=par,
                    len=seq_len,
                    loss_weight_non=non_weight,
                    seq_iidxes=seq_iidxes,
                    seq_labs=seq_labs,
                    group_widxes=group_widxes)
Exemple #8
0
def main(input_path):
    """Count, per token, how many event arguments are headed there and with how many spans.

    Reads instances from ``input_path``, collects for every token the
    arguments whose mention head (``shead_widx``) is that token (roles "V" and
    "C-V" are skipped), updates a Counter of statistics and prints it. Drops
    into the debugger whenever one token heads arguments with different spans.
    """
    reader = ReaderGetterConf().get_reader(input_path=input_path)
    insts = list(reader)
    all_sents = list(yield_sents(insts))
    set_ee_heads(insts)
    # --
    verb_roles = ["V", "C-V"]
    cc = Counter()
    for sent in all_sents:
        cc["sent"] += 1
        per_token = [[] for _ in range(len(sent))]
        for evt in sent.events:
            cc["evt"] += 1
            for arg in evt.args:
                # no VERB
                if arg.role in verb_roles:
                    cc["argV"] += 1
                    continue
                # --
                cc["arg"] += 1
                filler = arg.arg
                head_widx = filler.mention.shead_widx
                filler_span = filler.mention.get_span()
                per_token[head_widx].append(
                    ZObject(evt=evt, ef=filler, span=filler_span))
        # check for all tokens
        cc["tok"] += len(per_token)
        for token_objs in per_token:
            n_objs = len(token_objs)
            cc[f"tok_N{n_objs}"] += 1
            distinct_spans = {z.span for z in token_objs}
            n_spans = len(distinct_spans)
            cc[f"tok_N{n_objs}S{n_spans}"] += 1
            # --
            if n_objs > 0:
                cc[f"tok_diff={n_spans > 1}"] += 1
            if n_spans > 1:  # same head but different spans: inspect by hand
                breakpoint()
    # --
    OtherHelper.printd(cc)
Exemple #9
0
class Analyzer(Registrable):
    """Command-driven analyzer working on a store of named variables.

    Commands are read by a ``CmdLineParser`` (see ``loop``) and dispatched to a
    ``do_<name>`` method, first of the attached annotation task (if any), then
    of this object. A non-None result is stored under the command's target (if
    given) and under ``conf.last_var``.

    Many ``do_*`` commands ``eval`` user-provided code strings. Inside that
    code the locals ``s`` (this analyzer), ``m`` (``OtherHelper.get_module(self)``),
    ``vs`` (the variable store) and, where applicable, ``d`` (the current
    instance) are available.
    NOTE(security): ``eval`` and pickle loading are only acceptable here for
    trusted, interactive input.
    """

    def __init__(self, conf: AnalyzerConf):
        self.conf = conf
        self.history: List[AnalyzerCommand] = []
        # naming convention: those start with "_" will not be saved
        self.vars = ZObject()  # vars mapping
        self.traces = {}  # var_name -> list of (explanation, history-idx)
        # tmp ones
        self._cur_cmd: AnalyzerCommand = None
        self._cur_ann_task: AnnotationTask = None

    @classmethod
    def get_ann_type(cls):
        """Return the default annotation-task class; to be overridden."""
        raise NotImplementedError()

    @property
    def cur_ann_task(self):
        """The attached annotation task, or None if there is none.

        The task is looked up lazily through the var ``_cur_ann_var_name``
        (which holds the *name* of the task var) and cached afterwards.
        """
        if self._cur_ann_task is None:
            try:
                cur_ann_var_name = self.get_var("_cur_ann_var_name")
                if not isinstance(cur_ann_var_name, str):
                    return None  # detached (None) or nothing usable
                self._cur_ann_task = self.get_var(cur_ann_var_name)
            except Exception:  # best effort: missing name/var simply means "no task"
                return None
        return self._cur_ann_task

    def __del__(self):
        # getattr-guards: __del__ also runs for objects whose __init__ did not finish
        auto_save_name = getattr(getattr(self, "conf", None), "auto_save_name", "")
        if auto_save_name:
            self.do_save(auto_save_name)

    # =====
    # some helper functions
    def set_var(self, target: str, v: object, explanation=None):
        """Store ``v`` as var ``target`` and append a trace entry for it."""
        if hasattr(self.vars, target):
            zlog(f"Overwriting the existing var `{target}'")
        if target not in self.traces:
            self.traces[target] = []
        # (explanation, history-idx)
        self.traces[target].append((explanation, len(self.history)))
        setattr(self.vars, target, v)

    def get_var(self, target: str):
        """Return var ``target``.

        Raises:
            AssertionError: if there is no such var.
        """
        if hasattr(self.vars, target):
            return getattr(self.vars, target)
        # explicit raise instead of `assert False`, so the check survives `python -O`
        raise AssertionError(f"Cannot find var `{target}`")

    def get_history(self, n: int):
        """Return the n-th command of the history, or None if out of range."""
        if 0 <= n < len(self.history):
            return self.history[n]
        return None

    def get_and_check_type(self, target: str, dtype: Type):
        """Return var ``target`` after checking that it is an instance of ``dtype``.

        Raises:
            AssertionError: if the var is missing or has the wrong type.
        """
        ret = self.get_var(target)
        if not isinstance(ret, dtype):
            raise AssertionError(
                f"Wrong typed target, {type(ret)} instead of {dtype}")
        return ret

    # =====
    # some general commands
    def do_history(self, num=-10):
        """Log history: the last ``-num`` commands if ``num < 0``, else entry ``num``."""
        num = int(num)
        cur_len = len(self.history)
        if num < 0:  # listing recent mode
            num = min(-num, cur_len)
            zlog(f"Listing histories: all = {cur_len}")
            # back to front
            for i in range(1, num + 1):
                real_idx = cur_len - i
                zlog(f"[#{i}|{real_idx}] {self.get_history(real_idx)}")
        else:
            zlog(f"Listing histories idx = {num}: {self.get_history(num)}")
        return None

    def do_trace(self, target: str):
        """Log every recorded assignment of var ``target``."""
        self.get_var(target)  # check existence
        # vars that did not come through set_var (e.g. loaded ones) have no traces
        for explanation, history_idx in self.traces.get(target, ()):
            zlog(
                f"Var `{target}`: ({explanation}, [{history_idx}]{self.get_history(history_idx)})"
            )

    # a general runner
    def do_eval(self, code: str, mname: str = ""):
        """Eval ``code`` and return its value; ``mname`` (if given) is imported as ``m2``."""
        s, m, vs = self, OtherHelper.get_module(
            self), self.vars  # convenient local variable
        if mname:
            import importlib
            m2 = importlib.import_module(mname)  # only referenced by the eval'ed code
        return eval(code)

    def do_pdb(self):
        """Drop into pdb."""
        from pdb import set_trace
        set_trace()
        return None

    # note: no load & save history related ones since hard to maintain if loading other's state?
    def do_load(self, file: str):
        """Load vars from ``file`` and merge them into the current ones."""
        zlog(f"Try loading vars from {file}")
        # NOTE(security): pickle-based loading; only load files you trust
        x = default_pickle_serializer.from_file(file)
        self.vars.update(x)  # note: update rather than replace!!

    def do_save(self, file: str):
        """Save the vars to ``file``."""
        zlog(f"Try saving vars to {file}")
        default_pickle_serializer.to_file(self.vars, file)

    # =====
    # some useful ones
    # note: in the eval-based commands below, keep explicit `for` loops around
    #  `eval`: inside a comprehension (before Python 3.12) the eval'ed code
    #  cannot see the helper locals `s`, `m` and `vs`.

    # protocol: target: instances, fcode: d for each instances; return instances
    def do_filter(self, insts_target: str, fcode: str) -> List:
        """Return the instances ``d`` of the list var for which ``fcode`` is true."""
        s, m, vs = self, OtherHelper.get_module(self), self.vars
        _ff = compile(fcode, "", "eval")
        insts = self.get_and_check_type(insts_target, list)
        ret = []
        for d in insts:
            if eval(_ff):
                ret.append(d)
        zlog(
            f"Filter by {fcode}: from {len(insts)} to {len(ret)}, {len(ret)/(len(insts)+1e-7)}"
        )
        return ret

    # protocol: target of instances, jcode return a list for each of them
    def do_join(self, insts_target: str, jcode: str) -> List:
        """Concatenate the lists that ``jcode`` yields for each instance ``d``."""
        s, m, vs = self, OtherHelper.get_module(self), self.vars
        _ff = compile(jcode, "", "eval")
        insts = self.get_and_check_type(insts_target, list)
        ret = []
        for d in insts:
            ret0 = eval(_ff)
            ret.extend(ret0)
        zlog(f"Join-list by {jcode}: from {len(insts)} to {len(ret)}")
        return ret

    # protocol: target: instances, kcode: (key) d for inst: return sorted list
    def do_sort(self, insts_target: str, kcode: str) -> List:
        """Return the instances sorted by the key that ``kcode`` yields for each ``d``."""
        s, m, vs = self, OtherHelper.get_module(self), self.vars
        _ff = compile(kcode, "", "eval")
        insts = self.get_and_check_type(insts_target, list)
        tmp_tuples = []
        for d in insts:  # explicit loop so that kcode can use s/m/vs
            tmp_tuples.append((d, eval(_ff)))
        tmp_tuples.sort(key=lambda x: x[1])
        ret = [x[0] for x in tmp_tuples]
        zlog(f"Sort by key={kcode}: len = {len(ret)}")
        return ret

    # protocol: target: instances, gcode: d for each instances; return one node
    def do_group(self,
                 insts_target: str,
                 gcode: str,
                 sum_key="count") -> RecordNode:
        """Group instances by the sequence ``gcode`` yields (no visitor); see ``_do_group``."""
        return self._do_group(insts_target, gcode, sum_key, None)

    # protocol: target: instances, gcode: d for each instances; return one node
    def _do_group(self, insts_target: str, gcode: str, sum_key: str,
                  visitor: RecordNodeVisitor) -> RecordNode:
        """Record ``gcode``'s sequence of every instance into a tree and log a summary.

        Args:
            insts_target: name of a list var.
            gcode: code evaluated per instance ``d``, passed to ``record_seq``.
            sum_key: key for ``get_descendants``; if it is not an identifier it
                is eval'ed first (e.g. a lambda expression).
            visitor: optional visitor run over the tree; its errors are logged
                and otherwise ignored.

        Returns:
            The root RecordNode.
        """
        s, m, vs = self, OtherHelper.get_module(self), self.vars
        _ff = compile(gcode, "", "eval")
        insts = self.get_and_check_type(insts_target, list)
        # collect all seqs
        ret = RecordNode.new_root()
        for d in insts:
            ret.record_seq(eval(_ff), obj=d)
        # visitor
        if visitor is not None:
            try:
                ret.rec_visit(visitor)
            except Exception:  # best effort: still print the summary
                zlog(traceback.format_exc())
                zlog("Error of visitor.")
        # some slight summaries here
        all_count = len(insts)
        if not str.isidentifier(sum_key):
            sum_key = eval(sum_key)  # eval the lambda expression
        all_nodes = ret.get_descendants(key=sum_key)
        ss = []
        for z in all_nodes:
            all_parents = z.get_antecedents()
            if len(all_parents) > 0:
                assert all_parents[0].count == all_count
            # ratio of this node's count to each antecedent's count
            perc_info = ', '.join(
                [f"{z.count/(zp.count+1e-6):.4f}" for zp in all_parents])
            ss.append([
                '==' * len(z.path),
                str(z.path), f"{z.count}({perc_info})",
                z.get_content()
            ])
        pdf = pd.DataFrame(ss)
        pdf_str = pdf.to_string()
        zlog(
            f"Group {len(insts)} instances by {gcode}, all {len(ss)} nodes:\n{pdf_str}"
        )
        return ret

    # filter + group
    def do_fg(self, insts_target: str, fcode: str, gcode: str, **g_kwargs):
        """Filter by ``fcode``, store the result in ``conf.last_var``, then group it by ``gcode``."""
        f_res = self.do_filter(insts_target, fcode)
        self.set_var(self.conf.last_var, f_res)  # store tmp
        g_res = self.do_group(self.conf.last_var, gcode, **g_kwargs)
        return g_res

    # correlation
    def do_corr(self, insts_target: str, acode: str, bcode: str):
        """Log Pearson and Spearman correlation between ``acode`` and ``bcode`` values."""
        s, m, vs = self, OtherHelper.get_module(self), self.vars
        _ffa = compile(acode, "", "eval")
        _ffb = compile(bcode, "", "eval")
        insts = self.get_and_check_type(insts_target, list)
        a_vals, b_vals = [], []
        for d in insts:
            a_vals.append(eval(_ffa))
            b_vals.append(eval(_ffb))
        # --
        from scipy.stats import pearsonr, spearmanr
        zlog(f"Pearson={pearsonr(a_vals,b_vals)}")
        zlog(f"Spearman={spearmanr(a_vals,b_vals)}")
        return None

    # similar to group, but using pd instead
    def do_get_pd(self, insts_target: str, gcode: str) -> pd.DataFrame:
        """Build a DataFrame from the rows that ``gcode`` yields for each instance ``d``."""
        s, m, vs = self, OtherHelper.get_module(self), self.vars
        _ff = compile(gcode, "", "eval")
        insts = self.get_and_check_type(insts_target, list)
        # --
        fields = []
        for d in insts:  # explicit loop so that gcode can use s/m/vs
            fields.append(eval(_ff))
        ret = pd.DataFrame(fields)
        zlog(
            f"Group {len(insts)} instances by {gcode} to pd.DataFrame shape={ret.shape}."
        )
        return ret

    # calculation and manipulation on pd, also assuming the local name is d
    def do_cal_pd(self, inst_pd: str, scode: str):
        """Eval ``scode`` with ``d`` bound to the DataFrame var ``inst_pd``."""
        s, m, vs = self, OtherHelper.get_module(self), self.vars
        _ff = compile(scode, "", "eval")
        d = self.get_and_check_type(inst_pd, pd.DataFrame)
        # --
        ret = eval(_ff)
        zlog(
            f"Calculation on pd.DataFrame by {scode}, and get another one as: {str(ret)}"
        )
        return ret

    # --
    # do breakdown and eval

    # shortcut!
    def do_break_eval2(self,
                       insts_target: str,
                       pcode: str,
                       gcode: str,
                       corr_code="d.pred.label == d.gold.label",
                       pint=0,
                       **kwargs):
        """Like ``do_break_eval`` with ``d.pred`` rewritten to ``d.preds[pint]`` in the codes."""
        pcode2 = pcode.replace("d.pred", f"d.preds[{pint}]")
        corr_code2 = corr_code.replace("d.pred", f"d.preds[{pint}]")
        return self.do_break_eval(insts_target, pcode2, gcode, corr_code2,
                                  **kwargs)

    # real go!
    def do_break_eval(self,
                      insts_target: str,
                      pcode: str,
                      gcode: str,
                      corr_code="d.pred.label == d.gold.label",
                      sort_key='-1',
                      truncate_items=100,
                      pdb=False):
        """Break down precision/recall by the keys of ``pcode`` (pred) and ``gcode`` (gold).

        For each instance ``d``: if ``pcode`` evaluates without error, its key
        gets a precision record; if ``d.gold`` is not None, the ``gcode`` key
        gets a recall record. The record counts as correct only if both sides
        exist and ``corr_code`` is true.

        Args:
            insts_target: name of a list var.
            pcode, gcode: key codes for the pred / gold side.
            corr_code: code deciding correctness.
            sort_key: column index (as int or str) of the detail rows to sort by, descending.
            truncate_items: max number of rows to print.
            pdb: if true, enter the debugger before returning.

        Returns:
            Dict from key to F1EvalEntry.
        """
        s, m, vs = self, OtherHelper.get_module(self), self.vars
        sort_key = int(sort_key)
        _fp, _fg = compile(pcode, "", "eval"), compile(gcode, "", "eval")
        _fcorr = compile(corr_code, "", "eval")
        insts = self.get_and_check_type(insts_target, list)
        # --
        res = {}
        for d in insts:
            corr = 0
            # --
            no_pred = False
            try:  # a failing pcode means "no prediction" for this instance
                key_p = eval(_fp)
            except Exception:
                no_pred = True
            # --
            if not no_pred and d.gold is not None:
                corr = eval(_fcorr)
            if not no_pred:  # key_p from above is reused (no second eval)
                if key_p not in res:
                    res[key_p] = F1EvalEntry()
                res[key_p].record_p(int(corr))
            if d.gold is not None:
                key_g = eval(_fg)
                if key_g not in res:
                    res[key_g] = F1EvalEntry()
                res[key_g].record_r(int(corr))
        # final
        details = [(k, ) + v.details for k, v in res.items()]
        details = sorted(details, key=(lambda x: x[sort_key]), reverse=True)
        # --
        pdf = pd.DataFrame(details)
        pdf_str = pdf[:int(truncate_items)].to_string()
        zlog(
            f"Break-eval {len(insts)} instances by {pcode}/{gcode}:\n{pdf_str}"
        )
        if pdb:
            breakpoint()
        return res

    # --
    # ann related
    def do_ann_attach(self, name: str):
        """Attach the annotation task stored in var ``name``; ``"_detach"`` detaches."""
        # todo(note): keep this special name for this special purpose
        if name == "_detach":
            self._cur_ann_task = None
            self.set_var("_cur_ann_var_name", None)
            return
        # --
        z = self.get_and_check_type(name, AnnotationTask)
        zlog(f"Attach ann_task: from {self.cur_ann_task} to {z}")
        # store the var *name* (cur_ann_task resolves it by name) and switch the
        # cached task right away, otherwise the previous one would stay active
        self.set_var("_cur_ann_var_name", name)  # set special name!!
        self._cur_ann_task = z

    def do_ann_new(self, insts_target: str, fcode: str = None, try_attach=1):
        """Create a new annotation task over a list var and (by default) attach it.

        Args:
            insts_target: name of a list var.
            fcode: optional code evaluating to the task factory; defaults to
                ``get_ann_type()`` of this class.
            try_attach: if true (as int), make the new task the current one.

        Returns:
            The new task; the command must assign it to a target var.
        """
        s, m, vs = self, OtherHelper.get_module(self), self.vars
        # --
        if self._cur_cmd.target is None:
            raise AssertionError(
                "Should assign this to a var to avoid accidental loss!")
        insts = self.get_and_check_type(insts_target, list)
        if fcode is None:
            new_task = self.__class__.get_ann_type()(insts)
        else:
            new_task = eval(fcode)(insts)
        # todo(note): here need auto save?
        try_attach = bool(int(try_attach))
        if try_attach:
            if self.cur_ann_task is not None:
                zlog("Detach current task and try attach the new one!!")
                self._cur_ann_task = None
            # note: directly set name, which will be assigned later
            # todo(+N): maybe source of certain bugs?
            self.set_var("_cur_ann_var_name",
                         self._cur_cmd.target)  # set special name!!
            zlog("New ann task, and ann_var_name set!")
        return new_task

    # =====
    # main ones

    def process(self, cmd: AnalyzerCommand):
        """Dispatch ``cmd`` to ``do_<name>`` of the attached task or of this object.

        Raises:
            AssertionError: if neither knows the command.
        """
        args = cmd.args
        cmd_name, real_args = args[0], args[1:]
        # similar to the cmd package
        method_name = "do_" + cmd_name
        # first check ann then check self!
        cur_ann_task = self.cur_ann_task
        if cur_ann_task is not None and hasattr(cur_ann_task, method_name):
            zlog(f"Performing annotator's command: {cmd}")
            return getattr(cur_ann_task, method_name)(*real_args, **cmd.kwargs)
        if not hasattr(self, method_name):
            raise AssertionError(f"Unknown command {cmd_name}")
        zlog(f"Performing command: {cmd}")
        return getattr(self, method_name)(*real_args, **cmd.kwargs)

    def loop(self, file: str = None):
        """Read commands (from ``file`` or the pre-set input) and process them one by one.

        Unless ``conf.raise_error`` is set, a failing command is logged and
        skipped; skipped commands are not added to the history.
        """
        if file is None:  # use pre-set
            parser = CmdLineParser(self.conf.cmd_conf)
        else:
            parser = CmdLineParser(self.conf.cmd_conf, cmd_input=file)
        # loop
        for res in parser:
            cmd = AnalyzerCommand(*res)
            self._cur_cmd = cmd
            try:
                # process it
                if self.conf.raise_error:
                    v = self.process(cmd)
                else:
                    try:
                        v = self.process(cmd)
                    except AssertionError as e:
                        zlog("Checking Error: " + str(e))
                        continue
                    # KeyboardInterrupt: Ctrl-C only aborts the current command
                    except (Exception, KeyboardInterrupt):
                        zlog("Command error: " + str(traceback.format_exc()))
                        continue
            finally:
                self._cur_cmd = None  # reset (also after a failure)
            cmd_count = len(self.history)
            if v is not None:  # does not store None
                if cmd.target is not None:
                    self.set_var(cmd.target, v)
                # also store special VAR!
                self.set_var(self.conf.last_var, v)
            self.history.append(cmd)
            zlog(f"Finish command #{cmd_count}: {cmd}")

    def main(self, *args, **kwargs):
        """Entry point: run the command loop (arguments are ignored)."""
        self.loop()
Exemple #10
0
 def lu2feat(self, lu_name: str):
     """Map an fn-styled LU name ("lemma.pos") to a feature via ``self.lex_feat_f``.

     The name is split at its last "."; the pos part is translated through
     ``FN2UD_POS_MAP`` ("X" when unknown).
     """
     lemma, fn_pos = lu_name.rsplit(".", 1)
     upos = FN2UD_POS_MAP.get(fn_pos, "X")
     token_like = ZObject(lemma=lemma, upos=upos)
     return self.lex_feat_f(token_like)
Exemple #11
0
 def _prep_items(self, items: List, par: object):
     """Collect the core-span starts and lengths of the items.

     Args:
         items: objects with a ``mention``.
         par: parent object; an optional ``_loss_weight_non`` attribute is read.

     Returns:
         A ZObject with ``items``, ``par``, ``len`` (number of items),
         ``loss_weight_non``, ``core_widxes`` and ``core_wlens``.
     """
     starts, lengths = [], []
     for one_item in items:
         span = self.core_span_getter(one_item.mention)
         starts.append(span[0])
         lengths.append(span[1])
     # todo(+N): special name; loss_weight_non
     non_weight = getattr(par, "_loss_weight_non", 1.)
     return ZObject(items=items, par=par, len=len(items), loss_weight_non=non_weight,
                    core_widxes=starts, core_wlens=lengths)