Example #1
0
 def _prep_sent(self, sent: Sent):
     """Build the per-sentence event/argument target arrays.

     For every frame returned by ``sent.get_frames(conf.evt_ftag)`` the
     predicate label is written at the predicate's word index and the
     argument labels are written into that predicate's row.

     Returns a ``ZObject`` with:
       - ``sent`` / ``slen``: the sentence and its length
       - ``loss_weight_non``: ``sent._loss_weight_non`` if present, else 1.
       - ``evt_items``: object array [slen], the frame at each predicate position
       - ``evt_arr``: int array [slen], event label idx (0 where no event)
       - ``arg_arr``: int array [slen, slen], indexed [evt, arg]

     Raises AssertionError if a predicate spans more than one word or has
     a non-positive label idx.
     """
     conf: MySRLConf = self.conf
     slen = len(sent)
     _loss_weight_non = getattr(sent, "_loss_weight_non", 1.)  # todo(+N): special name; loss_weight_non
     # note: for simplicity, assume no loss_weight_non for args
     # note: the `np.int` alias was removed in NumPy 1.24; it was the builtin `int`,
     #  so `dtype=int` yields exactly the same arrays on every NumPy version.
     # first for events
     evt_arr = np.full([slen], 0, dtype=int)  # [evt]
     arg_arr = np.full([slen, slen], 0, dtype=int)  # [evt, arg]
     evt_items = np.full([slen], None, dtype=object)  # [evt]
     for f in sent.get_frames(conf.evt_ftag):  # note: assume no overlapping
         # predicate
         evt_widx, evt_wlen = self.evt_span_getter(f.mention)
         evt_label = f.label_idx
         assert evt_wlen==1 and evt_label>0, "For simplicity!!"
         evt_items[evt_widx] = f
         evt_arr[evt_widx] = evt_label
         # arguments
         if conf.arg_only_rank1:
             # an arg without an explicit "rank" entry counts as rank 1
             cur_args = [a for a in f.args if a.info.get("rank", 1) == 1]
         else:
             cur_args = f.args
         # bio or not
         if conf.arg_use_bio:  # special
             arg_spans = [self.arg_span_getter(a.mention) + (a.label_idx,) for a in cur_args]
             tag_layers = self.vocab_arg.spans2tags_idx(arg_spans, slen)
             if len(tag_layers) > 1:
                 # only the first layer is used below; extra layers are dropped
                 zwarn(f"Warning: 'Full args require multiple layers with {arg_spans}")
             arg_arr[evt_widx, :] = tag_layers[0][0]  # directly assign it!
         else:  # plain ones
             for a in cur_args:
                 arg_role = a.label_idx
                 arg_widx, arg_wlen = self.arg_span_getter(a.mention)
                 # every word of the argument span gets the role idx
                 arg_arr[evt_widx, arg_widx:arg_widx+arg_wlen] = arg_role
     return ZObject(sent=sent, slen=slen, loss_weight_non=_loss_weight_non,
                    evt_items=evt_items, evt_arr=evt_arr, arg_arr=arg_arr)
Example #2
0
 def convert(self, src_sent: Sent, trg_sent: Sent, cc: Counter):
     """Project the events/arguments of `src_sent` onto a copy of `trg_sent`.

     The returned sentence carries the words, (optional) UPOS tags and
     dependency tree of `trg_sent`. Each source event is re-created at the
     same word index; each argument is re-created as a one-word entity
     filler at the index computed by ``self.ff``. `cc` is updated in place
     with "sent", "evt" and "arg" counts.

     Raises AssertionError if the two sentences differ in length or if any
     event/argument mention is not exactly one word long.
     """
     cc["sent"] += 1
     assert len(src_sent) == len(trg_sent)
     src_tree, trg_tree = src_sent.tree_dep, trg_sent.tree_dep
     # --
     # start from a fresh copy of the target sentence
     new_sent = Sent.create(trg_sent.seq_word.vals.copy())
     if trg_sent.seq_upos is not None:
         new_sent.build_uposes(trg_sent.seq_upos.vals)
     new_sent.build_dep_tree(trg_tree.seq_head.vals, trg_tree.seq_label.vals)
     # --
     # descendant sets of both trees are needed by the index mapper
     src_desc = self.get_desc(src_tree)
     trg_desc = self.get_desc(trg_tree)
     for old_evt in src_sent.events:
         cc["evt"] += 1
         evt_widx, evt_wlen = old_evt.mention.get_span()
         assert evt_wlen == 1
         new_evt = new_sent.make_event(evt_widx, evt_wlen, type=old_evt.type)
         for old_arg in old_evt.args:
             cc["arg"] += 1
             arg_widx, arg_wlen = old_arg.mention.get_span()
             assert arg_wlen == 1
             mapped_widx = self.ff(evt_widx, arg_widx, src_tree, trg_tree, src_desc, trg_desc, cc)
             new_ef = new_sent.make_entity_filler(mapped_widx, 1, type=old_arg.arg.type)
             new_evt.add_arg(new_ef, old_arg.role)
     # --
     return new_sent
Example #3
0
    def semafor2sent(d: Dict):
        """Turn one parsed dict with "tokens" and "frames" entries into a Sent.

        Every frame becomes an event built from its target span; every
        frame element of the frame's single annotation set becomes a new
        entity filler attached to that event under the element's name.

        Raises AssertionError when a mention has more than one span, when a
        span's text does not match the covered tokens (spaces ignored), or
        when a frame does not have exactly one annotation set with rank 0.
        """
        words = d["tokens"]
        sent = Sent.create(words=words)

        def _span_of(span_list):
            # returns (widx, wlen) of the one and only span
            assert len(span_list) == 1, "Assume single span!"
            only = span_list[0]
            start, end, text = only["start"], only["end"], only["text"]
            # compare with all spaces stripped on both sides
            assert StrHelper.delete_spaces(text) == StrHelper.delete_spaces(''.join(words[start:end]))
            return start, end - start

        for one_frame in d["frames"]:
            target = one_frame["target"]
            anno_sets = one_frame["annotationSets"]
            # predicate
            t_widx, t_wlen = _span_of(target["spans"])
            evt = sent.make_event(t_widx, t_wlen, type=target["name"])
            # roles
            assert len(anno_sets) == 1 and anno_sets[0]["rank"] == 0, "Assume only one rank=0 annotationSets!"
            for one_role in anno_sets[0]["frameElements"]:
                r_widx, r_wlen = _span_of(one_role["spans"])
                # a separate entity filler is created for each argument
                new_ef = sent.make_entity_filler(r_widx, r_wlen)
                evt.add_arg(new_ef, role=one_role["name"])
        return sent
Example #4
0
 def _eval_one(self, gold_inst: Sent, pred_inst: Sent):
     """Pair up gold and predicted tokens and wrap them in a DparEvalResult.

     When ``conf.exclude_punct`` is set, pairs whose gold token has
     ``upos == "PUNCT"`` are left out.
     Raises AssertionError if the two token lists differ in length.
     """
     conf: DparEvalConf = self.conf
     # note: sentence id / text equality between gold and pred is not checked here
     # --
     gold_toks = gold_inst.get_tokens()
     pred_toks = pred_inst.get_tokens()
     assert len(gold_toks) == len(pred_toks)
     tok_pairs = list(zip(gold_toks, pred_toks))
     if conf.exclude_punct:
         # the filter looks at the gold side only
         tok_pairs = [pair for pair in tok_pairs if pair[0].upos != "PUNCT"]
     return DparEvalResult(conf, tok_pairs)
Example #5
0
 def _new_frame(self, s: Sent, one_widx: int, one_wlen: int, one_lab: int, one_score: float, vocab=None):
     """Create a frame on `s` for the span (one_widx, one_wlen) and return it.

     The frame type string is looked up from `one_lab` in `vocab`
     (``self.vocab`` when `vocab` is None); the label idx and the core
     span are then recorded on the new frame.
     """
     label_vocab = self.vocab if vocab is None else vocab
     # --
     type_name = label_vocab.idx2word(one_lab)
     new_f = s.make_frame(one_widx, one_wlen, self.conf.ftag, type=type_name, score=one_score)
     new_f.set_label_idx(one_lab)
     self.core_span_setter(new_f.mention, one_widx, one_wlen)  # core_span
     return new_f
Example #6
0
 def put_doc(self, orig_doc: Doc, nlp_doc):
     """Replace the sentences of `orig_doc` with the ones found in `nlp_doc`.

     Each non-empty sentence of `nlp_doc` becomes a new Sent over the
     matching character slice of the document text, is annotated through
     ``self.put_sent`` and appended to `orig_doc`. The (start, length)
     character positions of all added sentences are stored at the end.

     Raises AssertionError if the two documents' texts differ.
     """
     assert orig_doc.get_text() == nlp_doc.text, "Error: Input & Output text not match!"
     orig_doc.clear_sents()  # drop whatever sentences were there before
     char_positions = []
     for one_nlp_sent in nlp_doc.sentences:
         toks = one_nlp_sent.tokens
         if len(toks) == 0:
             continue  # nothing to add for an empty sentence
         # sentence extent: first token's start to last token's end
         c_start = toks[0].start_char
         c_end = toks[-1].end_char
         cur_sent = Sent.create(text=nlp_doc.text[c_start:c_end])
         self.put_sent(cur_sent, one_nlp_sent)
         orig_doc.add_sent(cur_sent)
         char_positions.append((c_start, c_end - c_start))
     orig_doc.build_sent_positions(char_positions)
Example #7
0
def main(args):
    """Decode sentences read from stdin and write one JSON line per sentence to stdout.

    Sets up the task/data centers and the model from `args`, optionally
    loads a saved model, then reads stdin in batches of ``ZMSP_BATCH_LINE``
    lines (environment variable, default 1000). Every input line is split
    on whitespace into one sentence, run through the model, and dumped as
    JSON.
    """
    # conf
    conf: ZOverallConf = init_everything(ZOverallConf(), args)
    # task
    t_center = TaskCenter(conf.tconf)
    # data
    d_center = DataCenter(conf.dconf,
                          specified_wset=[])  # nothing to load here!
    # load vocab
    t_center.load_vocabs(t_center.conf.vocab_load_dir)
    # prepare datasets
    t_center.prepare_datasets(d_center.get_datasets())
    # build model
    model = ZModel(conf.mconf)
    t_center.build_mods(model)
    model.finish_sr()  # note: build sr before possible loading in testing!!
    # run
    r_center = RunCenter(conf.rconf, model, t_center, d_center)
    if conf.rconf.model_load_name != "":
        r_center.load(conf.rconf.model_load_name)
    else:
        zwarn("No model to load, Debugging mode??")
    # =====
    cc = Counter()
    # note: environment values are always strings, so convert explicitly;
    #  otherwise the batch size is an int by default but a str when overridden
    BATCH_LINE = int(os.environ.get('ZMSP_BATCH_LINE', 1000))  # 1000 sents once time
    test_dataset = ZDataset(d_center.conf.testM,
                            'testM',
                            'decode',
                            _no_load=True)  # use testM for other options!
    for lines in yield_lines(sys.stdin, BATCH_LINE):
        insts = [Sent.create(one.split())
                 for one in lines]  # note: simply split as sentence!!
        test_dataset.set_insts(insts)  # directly set it!
        cc["sent"] += len(insts)
        if cc["sent"] % 50000 == 0:
            zlog(f"Decode for {cc}")
        # --
        t_center.prepare_datasets([test_dataset])  # re-prepare!!
        for ibatch in test_dataset.yield_batches(loop=False):
            # the return value is not used; output is read back from `insts` below
            model.predict_on_batch(ibatch)
        # --
        for inst in insts:
            sys.stdout.write(
                json.dumps(inst.to_json(), ensure_ascii=False) + "\n")
    # =====
    zlog(f"The end of Decoding: {cc}")
Example #8
0
 def span2feat(self, sent: Sent, widx: int, wlen: int):
     """Map the span (widx, wlen) of `sent` to a lexical feature.

     With ``conf.use_fn_style`` the feature comes from ``self.lu2feat`` on
     "<lower-cased lemmas joined by space>.<pos>", where the pos is the
     head word's UPOS mapped through ``UD2FN_POS_MAP`` (falling back to the
     lower-cased UPOS). Otherwise it is the space-joined result of
     ``self.lex_feat_f`` over the span's tokens.
     """
     conf: LexConstrainerConf = self.conf
     head_widx = self.hf.find_shead(sent, widx, wlen)  # head word of the span
     if not conf.use_fn_style:
         # own-style feature: one piece per token
         span_toks = sent.get_tokens(widx, widx + wlen)
         return " ".join([self.lex_feat_f(t) for t in span_toks])
     # fn-style lexical unit name
     head_pos = sent.seq_upos.vals[head_widx]
     lemma_part = " ".join(sent.seq_lemma.vals[widx:widx+wlen]).lower()
     lu_name = lemma_part + "." + UD2FN_POS_MAP.get(head_pos, head_pos.lower())
     # note: a retry on the head word alone for unknown features is disabled
     return self.lu2feat(lu_name)
Example #9
0
 def from_obj(self, s: str) -> DataInstance:
     """Parse one CoNLL-style block of text into a Sent.

     `s` is split into lines and each line into columns by ``conf.sep_in``.
     Which column holds what is given by the ``conf.f_*`` options; an
     option left as None means the field is not read. Columns are read in
     this order: doc id, part id, word idx (which also filters lines),
     words, predicates with their arguments, extra per-token info, UPOS,
     dependency heads/labels, and finally any ``conf.f_others`` columns.

     Returns the populated Sent. A warning (not an error) is issued when
     lines have differing column counts or when the number of argument
     columns does not match the number of predicates.
     """
     conf: ConllFormatorConf = self.conf
     # --
     lines = s.rstrip().split("\n")
     all_fields = [line.split(conf.sep_in) for line in lines]
     num_col = 0
     if len(all_fields) > 0:
         # the first line defines the expected column count
         num_col = len(all_fields[0])
         # assert all(len(z)<=num_col for z in all_fields)
         for z in all_fields:
             if len(z) != num_col:
                 zwarn(f"Line length not match ({len(z)} vs {num_col})")
     # --
     sent = Sent.create()  # make an empty one!!
     # -> read in conll fields
     # doc id
     if conf.f_doc is not None:
         f_doc = int(conf.f_doc)
         doc_id = ConllHelper.get_f_doc([z[f_doc] for z in all_fields])
         sent.info["doc_id"] = doc_id  # temporarily put it here!
     # part id
     if conf.f_part is not None:
         f_part = int(conf.f_part)
         # NOTE(review): the part column is also read with get_f_doc -- confirm this is intended
         part_id = ConllHelper.get_f_doc([z[f_part] for z in all_fields])
         sent.info["part_id"] = part_id
     # word idx
     if conf.f_widx is not None:
         f_widx = int(conf.f_widx)
         valids = ConllHelper.get_f_widx([z[f_widx] for z in all_fields],
                                         conf.widx_start)
         # note: filtering lines!! (everything read below only sees the kept lines)
         all_fields = [z for z, v in zip(all_fields, valids) if v]
     # words
     if conf.f_word is not None:
         f_word = int(conf.f_word)
         words = [z[f_word] for z in all_fields]
         sent.build_words(words)
     # pred + predid + args
     if conf.f_pred is not None:  # frames
         f_pred = int(conf.f_pred)
         # note: f_pred_id is converted unconditionally, so it must be set whenever f_pred is
         f_pred_id = int(conf.f_pred_id)
         preds = ConllHelper.get_preds([z[f_pred] for z in all_fields],
                                       [z[f_pred_id] for z in all_fields],
                                       conf.combine_lemma_id,
                                       nil_vals=conf.pred_nil_vals)
         new_frames = [
             sent.make_event(p_widx, 1, type=p_lab)
             for p_widx, p_lab in preds
         ]  # note: wlen==1
         # args?
         if conf.f_arg_start is not None:
             f_arg_start = int(conf.f_arg_start)
             # --
             # expect one argument column per predicate after f_arg_start
             if num_col - conf.num_extra_field - f_arg_start != len(
                     new_frames):
                 zwarn(
                     f"Unequal num of args: {num_col - conf.num_extra_field - f_arg_start} vs {len(new_frames)}"
                 )
             # --
             for one_new_frame in new_frames:
                 # read args
                 _get_f = ConllHelper.get_f_args_dep if conf.arg_is_dep else ConllHelper.get_f_args
                 args = _get_f(one_new_frame.mention.widx,
                               [z[f_arg_start] for z in all_fields])
                 for a_widx, a_wlen, a_lab in args:
                     # every argument gets its own entity filler with a placeholder type
                     new_ef = sent.make_entity_filler(a_widx,
                                                      a_wlen,
                                                      type="UNK")
                     one_new_frame.add_arg(new_ef, a_lab)
                 # add one field further: the next predicate reads the next column
                 f_arg_start += 1
     # todo(+W): currently putting others at info
     for info_name, f_field in zip(
         ["xpos", "parse", "sense", "speaker", "ne", "coref"], [
             conf.f_xpos, conf.f_parse, conf.f_sense, conf.f_speaker,
             conf.f_ne, conf.f_coref
         ]):
         if f_field is not None:
             _tmp_idx = int(f_field)
             _tmp_items = [z[_tmp_idx] for z in all_fields]
             sent.info[info_name] = _tmp_items
     # --
     # finally UD related fields
     # upos
     if conf.f_upos is not None:
         f_upos = int(conf.f_upos)
         upos = [z[f_upos] for z in all_fields]
         sent.build_uposes(upos)
     # dep
     if conf.f_dep_head is not None:
         f_dep_head = int(conf.f_dep_head)
         dep_head = [int(z[f_dep_head]) for z in all_fields]
         if conf.f_dep_label is not None:
             f_dep_label = int(conf.f_dep_label)
             dep_label = [z[f_dep_label] for z in all_fields]
         else:
             dep_label = None
         sent.build_dep_tree(dep_head, dep_label)
     # --
     # other info
     for f_idx in conf.f_others:
         f_idx = int(f_idx)
         # note: these entries are keyed by the integer column index, not by a name
         sent.info[f_idx] = [z[f_idx]
                             for z in all_fields]  # simply put it at info!
     # --
     return sent
Example #10
0
def _approx_prev_next(insts: List):
    if len(insts) > 0 and isinstance(insts[0], Sent):
        for ii in range(len(insts) - 1):
            Sent.assign_prev_next(insts[ii], insts[ii + 1])
    return insts
Example #11
0
 def from_obj(self, s: str) -> Doc:
     """Parse one JSON-serialized document string into a Doc.

     Reads, in order: the sentences (with optional positions, lemmas, UPOS
     and dependency tree), the entity mentions and fillers, the event
     mentions, and finally the event arguments, which link events to
     previously read entities/fillers/events by id.

     A missing key and an explicit JSON ``null`` are treated the same for
     "entity_mentions", "fillers" and "event_mentions". Items whose mention
     cannot be parsed, and arguments pointing at unknown ids, are collected
     and reported in a single warning rather than raising.

     Raises AssertionError on a repeated entity/event id, or when an event
     was skipped although its trigger has a non-None "posi".
     """
     d = json.loads(s)
     doc = Doc.create(id=d["doc_id"])
     doc.info.update({k: d.get(k) for k in ZDocDataFormator._OTHER_DOC_FIELDS})
     # add sents
     for one_sent in d["sents"]:
         sent = Sent.create(one_sent["text"], id=one_sent.get("id"))
         if "positions" in one_sent:
             sent.build_word_positions(one_sent["positions"])
         if "lemma" in one_sent:
             sent.build_lemmas(one_sent["lemma"])
         if "upos" in one_sent:
             sent.build_uposes(one_sent["upos"])
         # a dep tree needs both heads and labels
         if "governor" in one_sent and "dependency_relation" in one_sent:
             sent.build_dep_tree(one_sent["governor"], one_sent["dependency_relation"])
         doc.add_sent(sent)
     # --
     failed_items = {"ef": [], "evt": [], "arg": []}
     args_maps = {}  # id -> Frame
     # entities and fillers
     entity_items, filler_items = d.get("entity_mentions"), d.get("fillers")
     if entity_items is None and filler_items is None:
         # no entities info
         for sent in doc.sents:
             sent.mark_no_entity_fillers()
     else:
         # note: one of the two may still be None (absent or null) here, so
         #  fall back to an empty list instead of failing on `None + list`
         ef_items = (entity_items or []) + (filler_items or [])
         for one_ef_item in ef_items:
             mention = self._parse_mention(one_ef_item, doc)
             if mention is None:
                 failed_items["ef"].append(one_ef_item)
             else:
                 ef = Frame.create(mention, type=one_ef_item["type"], score=one_ef_item.get("score", 0.), id=one_ef_item["id"])
                 ef.info.update({k: one_ef_item[k] for k in ["extra_info", "gid"] if k in one_ef_item})
                 # todo(note): no checking for possibly repeat efs
                 assert ef.id not in args_maps
                 args_maps[ef.id] = ef
                 mention.sent.add_entity_filler(ef)
     # events
     event_items = d.get("event_mentions")
     if event_items is None:
         # no events info
         for sent in doc.sents:
             sent.mark_no_events()
         event_items = []  # nothing to link in the args pass below
     else:
         for one_evt_item in event_items:
             mention = self._parse_mention(one_evt_item["trigger"], doc)
             if mention is None:
                 failed_items["evt"].append(one_evt_item)
             else:
                 evt = Frame.create(mention, type=one_evt_item["type"], score=one_evt_item.get("score", 0.), id=one_evt_item["id"])
                 evt.info.update({k: one_evt_item[k] for k in ["extra_info", "gid", "realis", "realis_score"] if k in one_evt_item})
                 assert evt.id not in args_maps
                 args_maps[evt.id] = evt
                 mention.sent.add_event(evt)
     # args: done in a second pass so that every possible target is already in args_maps
     for one_evt_item in event_items:
         if one_evt_item["id"] not in args_maps:
             # the event itself failed to parse above
             assert one_evt_item["trigger"]["posi"] is None
             continue
         evt = args_maps[one_evt_item["id"]]  # must be there
         em_args = one_evt_item.get("em_arg", None)
         if em_args is None:
             evt.mark_no_args()
         else:
             for one_arg in em_args:
                 aid, role = one_arg["aid"], one_arg["role"]
                 if aid not in args_maps:
                     failed_items["arg"].append(one_arg)
                 else:
                     arg_arg = args_maps[aid]
                     arglink = evt.add_arg(arg_arg, role, score=one_arg.get("score", 0.))
                     arglink.info.update({k: one_arg[k] for k in ["is_aug", "extra_info"] if k in one_arg})
     # --
     if any(len(v)>0 for v in failed_items.values()):
         zwarn(f"Failed when reading Doc({doc.id}): {[(k,len(v)) for k,v in failed_items.items()]}")
     return doc
Example #12
0
 def _get_frames(self, s: Sent):
     """Return the frames of `s` selected by this object's ``conf.ftag``."""
     frame_tag = self.conf.ftag
     return s.get_frames(frame_tag)
Example #13
0
 def from_obj(self, s: str):
     """Create a Sent from the raw text `s`.

     When ``self.do_tok_sep`` is set, the text is additionally split on
     ``self.tok_sep`` and the pieces are stored as the sentence's words.
     """
     new_sent = Sent.create(text=s)
     if not self.do_tok_sep:
         return new_sent
     # simple split
     new_sent.build_words(s.split(self.tok_sep))
     return new_sent
Example #14
0
 def _align_sents(self, sent, cand, align_res):
     """Project UPOS tags and dependencies from `cand` onto the words of `sent`.

     `align_res[0]` / `align_res[1]` are zipped into (idx-in-sent,
     idx-in-cand) pairs, with None marking an unaligned side; the lengths
     of `align_res[2]` / `align_res[3]` serve as the end-of-sequence
     sentinel pair (presumably they are the two aligned token sequences --
     TODO confirm against the caller).

     Words between two matched pairs are sub-aligned group by group through
     their heads. The result is a new Sent over `sent`'s words: a word with
     a mapping gets `cand`'s UPOS, and gets `cand`'s head/label when the
     head can be mapped back; otherwise it gets "X" as UPOS and keeps its
     original head with a backed-off label.

     Raises AssertionError if a word would be mapped twice.
     """
     _dels = self.delete_char_set
     # keep only the positions where both sides are aligned
     matched_pairs0 = [(a, b) for a, b in zip(align_res[0], align_res[1])
                       if (a is not None and b is not None)]
     matched_pairs = self._delete_single_match(matched_pairs0)
     # --
     map1to2, map2to1 = {}, {}  # word idx maps
     words1, words2 = sent.seq_word.vals, cand.seq_word.vals
     tree1, tree2 = sent.tree_dep, cand.tree_dep
     lp1, lp2 = -1, -1  # last matched pair
     for p1, p2 in matched_pairs + [(len(align_res[2]), len(align_res[3]))
                                    ]:  # aligned at end
         # check the mismatched one: the unmatched words between the last pair and this one
         idxes1, idxes2 = list(range(lp1 + 1, p1)), list(range(lp2 + 1, p2))
         if len(idxes1) > 0 or len(idxes2) > 0:
             # compare the pieces with the ignorable characters removed
             _toks1 = [
                 "".join([c for c in words1[z] if c not in _dels])
                 for z in idxes1
             ]
             _toks2 = [
                 "".join([c for c in words2[z] if c not in _dels])
                 for z in idxes2
             ]
             if ''.join(_toks1) != ''.join(_toks2):
                 zwarn(f"Piece mismatched: {_toks1} vs {_toks2}")
                 # breakpoint()
             # sub align
             _subaligns = self._sub_align_toks(_toks1, _toks2)
             for _iis1, _iis2 in _subaligns:
                 # translate the local piece indexes back to sentence indexes
                 _cur_idxes1, _cur_idxes2 = [idxes1[z] for z in _iis1
                                             ], [idxes2[z] for z in _iis2]
                 if len(_cur_idxes1) > 0 and len(
                         _cur_idxes2
                 ) > 0:  # only possible to align if both have words
                     h1, h2 = self._get_head(_cur_idxes1,
                                             tree1), self._get_head(
                                                 _cur_idxes2, tree2)
                     assert h1 not in map1to2 and h2 not in map2to1
                     map1to2[h1] = h2
                     # note: specifically map more from 2 to 1
                     for _hh in _cur_idxes2:
                         if h2 in tree2.get_spine(_hh):
                             map2to1[_hh] = h1
         # add the matched one (for the final sentinel pair both indexes are the lengths)
         assert p1 not in map1to2 and p2 not in map2to1
         map1to2[p1] = p2
         map2to1[p2] = p1
         # next
         lp1, lp2 = p1, p2
     # --
     # assign deps
     _backoff_labmap = MyIndexer2._BACKOFF_LABMAP
     _res_heads, _res_labs, _res_poses = [], [], []
     for i1 in range(len(words1)):
         mapped_i2 = map1to2.get(i1)
         # upos
         if mapped_i2 is None:
             _res_poses.append("X")  # todo(+N): simply put an "X" here!
         else:
             _res_poses.append(cand.seq_upos.vals[mapped_i2])
         # dep: head values are shifted by one, so after -1 a value of -1 marks the root
         mapped_i2_hidx = tree2.seq_head.vals[
             mapped_i2] - 1 if mapped_i2 is not None else None
         # stays None when there is no mapping or the head cannot be mapped back
         back_i1_hidx = -1 if mapped_i2_hidx == -1 else map2to1.get(
             mapped_i2_hidx)
         # --
         if back_i1_hidx is None:  # no map: directly put original ones!
             _res_heads.append(tree1.seq_head.vals[i1])
             _old_lab = tree1.seq_label.vals[i1]
             if _old_lab not in _backoff_labmap:
                 zwarn(f"Unknown old label: {_old_lab}")
                 # breakpoint()
             _res_labs.append(_backoff_labmap.get(
                 _old_lab, "dep"))  # by default "dep"
         else:
             _res_heads.append(back_i1_hidx + 1)  # note: remember +1
             _res_labs.append(tree2.seq_label.vals[mapped_i2])
     # --
     # get a new sent!
     res = Sent.create(words1)
     res.build_uposes(_res_poses)
     res.build_dep_tree(_res_heads, _res_labs)
     # note: the cycle check runs after the tree has already been built
     self._check_no_cycle(_res_heads)
     return res
Example #15
0
 def feat_toks(self, s: Sent):
     """Return ``(tokens, feats)`` for `s`, with one ``self.feat_tok_f`` result per token."""
     toks = s.get_tokens()
     return toks, [self.feat_tok_f(one_tok) for one_tok in toks]
Example #16
0
 def put_sent(self, orig_sent: Sent, nlp_sent):
     """Copy the word-level annotations of `nlp_sent` onto `orig_sent`.

     Words are always stored. UPOS tags, lemmas and the dependency tree are
     stored only when ``self.pred_upos`` / ``self.pred_lemma`` /
     ``self.pred_dep`` are set. Word positions (start inside the sentence
     text, length) are stored only if every word's parent token could be
     located in the text; locating them is best-effort and never raises.
     """
     text = orig_sent.get_text()
     # here we process the words!
     list_words = []
     list_uposes = []
     list_lemmas = []
     list_dep_heads = []
     list_dep_labels = []
     list_word_positions = []  # set to None once any word cannot be located
     cur_word_start = 0
     # find them!!
     for w in nlp_sent.words:
         list_words.append(w.text)
         list_uposes.append(w.upos)
         list_lemmas.append(w.lemma)
         list_dep_heads.append(w.head)
         list_dep_labels.append(w.deprel)
         if list_word_positions is None:
             continue  # positions were already given up for this sentence
         try:
             # todo(+N): some words can map to the same token if using MWT!
             t = w.parent
             tok_start = text.index(t.text, cur_word_start)  # idx inside the sentence
             tok_len = t.end_char - t.start_char
         except Exception:
             # note: deliberately broad to keep this best-effort, but unlike a bare
             #  `except:` it lets KeyboardInterrupt/SystemExit through
             list_word_positions = None
         else:
             list_word_positions.append((tok_start, tok_len))  # [widx, wlen]
             cur_word_start = tok_start + tok_len  # start with next one
     # add them
     orig_sent.build_words(list_words)
     if self.pred_upos:
         orig_sent.build_uposes(list_uposes)
     if self.pred_lemma:
         orig_sent.build_lemmas(list_lemmas)
     if self.pred_dep:
         orig_sent.build_dep_tree(list_dep_heads, list_dep_labels)
     if list_word_positions is not None:
         orig_sent.build_word_positions(list_word_positions)
Example #17
0
 def to_obj(self, inst: Sent) -> str:
     """Serialize `inst` to a single string.

     With ``self.do_tok_sep`` the words are joined by ``self.tok_sep``
     (a single space when that is None); otherwise the sentence text is
     returned as is.
     """
     if not self.do_tok_sep:
         return inst.get_text()
     joiner = self.tok_sep
     if joiner is None:
         joiner = " "
     return joiner.join(inst.seq_word.vals)