Example #1
def main(args):
    conf = MainConf()
    conf.update_from_args(args)
    stat = Counter()
    if conf.domain_f in DOMAIN_FS:
        domain_f = DOMAIN_FS[conf.domain_f]
    else:
        domain_f = eval(conf.domain_f)  # note: fall back to eval (assumes a trusted expression)
    # --
    all_insts = {}  # domain -> List[inst]
    with zopen(conf.input) as fin:
        for line in fin:
            inst = json.loads(line)
            domain = domain_f(inst)
            all_insts.setdefault(domain, []).append(inst)
            stat["inst"] += 1
            stat[f"inst_{domain}"] += 1
    # --
    # write
    input_name = os.path.basename(conf.input)
    for domain, insts in all_insts.items():
        output_name_fields = input_name.split(".")
        output_name_fields.insert(conf.output_insert_place, domain)
        output_name = os.path.join(conf.output_dir,
                                   ".".join(output_name_fields))
        zlog(f"Write to {output_name} {len(insts)}")
        with zopen(output_name, 'w') as fout:
            default_json_serializer.save_iter(insts, fout)
    # --
    zlog(f"Read from {fin}, stat=\n{OtherHelper.printd_str(stat)}")
Example #2
def main(args):
    conf = MainConf()
    conf.update_from_args(args)
    # input
    with zopen(conf.input) as fd:
        lines = list(fd)
        if conf.skip_blank:
            lines = [z for z in lines if not str.isspace(z)]
    # shuffle?
    origin_len = len(lines)
    if conf.shuffle_times > 0 or conf.shuffle:
        _t = max(1, conf.shuffle_times)  # at least once!
        _gen = Random.get_generator('')
        for _ in range(_t):
            _gen.shuffle(lines)
    # sample?
    # rate <= 1: keep this fraction (rounded up via the 0.999 trick); rate > 1: keep this many
    final_size = int(0.999 + ((conf.rate * origin_len) if conf.rate <= 1. else conf.rate))
    out_lines = lines[:final_size]
    # output
    if conf.output:
        with zopen(conf.output, 'w') as fd2:
            for line in out_lines:
                fd2.write(line)
    # --
    zlog(
        f"Sample({conf.rate}) {conf.input}=>{conf.output}: {origin_len}=>{len(out_lines)}"
    )
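The final_size formula in Example #2 packs two behaviors into one expression: conf.rate <= 1 means "keep this fraction, rounded up", and conf.rate > 1 means "keep this many lines". Restated directly, as a sketch with hypothetical names:

import math

def target_size(rate: float, n: int) -> int:
    # rate <= 1: a fraction of n, rounded up; rate > 1: an absolute count
    return math.ceil(rate * n) if rate <= 1. else int(rate)

assert target_size(0.25, 10) == 3     # ceil(2.5)
assert target_size(100.0, 10) == 100  # a count; slicing then caps it at n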
Example #3
def main(args):
    conf: MainConf = init_everything(MainConf(), args, add_nn=False)
    zlog(f"** Run with {conf.to_json()}")
    # --
    if conf.input_table_file:
        with zopen(conf.input_table_file) as fd:
            s = fd.read()
            table = eval(s)  # note: assumes a trusted table file
            mm = table[conf.name]  # read the table from file
    else:
        mm = globals()[conf.name]
    # --
    workers = Worker.get_gpu_workers([int(z) for z in conf.gpus],
                                     ncore=int(conf.ncore))
    # --
    x = TuneDriver(conf.tune_conf, workers, conf.name, extra_info=mm)
    task_iter = [
        MyTask(gid, conf.name, sel_idxes,
               [a[i] for a, i in zip(mm, sel_idxes)], conf.task_conf)
        for gid, sel_idxes in enumerate(
            iter_arg_choices(mm,
                             repeat=conf.repeat,
                             shuffle=conf.shuffle,
                             max_num=conf.max_count))
    ]
    x.main(task_iter)
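Examples #3 and #14 both treat mm as a list of alternative-argument lists and draw index tuples from it via iter_arg_choices. The following is a guess at that function's contract, not the library's actual code: enumerate (or sample) one index per slot, optionally shuffled and truncated.

import itertools
import random

def iter_arg_choices(mm, repeat=False, shuffle=True, max_num=-1):
    # mm: a list of alternative-argument lists; yield one index per slot, e.g. [0, 2, 1]
    if repeat:  # sample indices with replacement, up to max_num draws
        n = 0
        while max_num < 0 or n < max_num:
            yield [random.randrange(len(a)) for a in mm]
            n += 1
    else:  # enumerate the full grid, optionally shuffled/truncated
        combos = [list(t) for t in itertools.product(*(range(len(a)) for a in mm))]
        if shuffle:
            random.shuffle(combos)
        yield from (combos if max_num < 0 else combos[:max_num])

Under this reading, [a[i] for a, i in zip(mm, sel_idxes)] in Example #3 picks one concrete argument per slot.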
Example #4
 def __init__(self, fd_or_path: Union[IO, str], end='\n'):
     super().__init__()
     self.fd_or_path = fd_or_path
     self.end = end
     if isinstance(fd_or_path, str):
         self.should_close = True
         self.fd = zopen(fd_or_path, 'w')
     else:
         self.should_close = False
         self.fd = fd_or_path
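Example #4 is the common "path or open handle" constructor: the object closes the handle only if it opened it. A self-contained sketch of the full pattern with the matching close (the class name and methods are mine, not from the source):

from typing import IO, Union

class LineWriter:
    def __init__(self, fd_or_path: Union[IO, str], end: str = "\n"):
        self.end = end
        self.should_close = isinstance(fd_or_path, str)  # own it only if we open it
        self.fd = open(fd_or_path, "w") if self.should_close else fd_or_path

    def write_line(self, s: str):
        self.fd.write(s + self.end)

    def close(self):
        if self.should_close:  # never close a handle we merely borrowed
            self.fd.close()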
Example #5
 def eval(self, gold_insts: List, pred_insts: List):
     # --
     conf: PbEvalConf = self.conf
      tmp_gold = conf.tmp_file_prefix + ".gold.props"
      tmp_pred = conf.tmp_file_prefix + ".pred.props"
     with zopen(tmp_gold, 'w') as fd:
         fd.write(insts2props(gold_insts))
     with zopen(tmp_pred, 'w') as fd:
         fd.write(insts2props(pred_insts))
     # --
      # run the scorer in both directions: one argument order yields precision, the swapped order recall
      precision_output = system(f"{self.cmd} {tmp_pred} {tmp_gold} 2>/dev/null",
                                ass=True,
                                popen=True)
      recall_output = system(f"{self.cmd} {tmp_gold} {tmp_pred} 2>/dev/null",
                             ass=True,
                             popen=True)
     # parse output
     res = PbEvalResult(precision_output, recall_output)
     return res
Example #6
 def save_txt(fname: str, words: List[str], vecs: List, sep: str):
     num_words = len(words)
     embed_size = len(vecs[0])
     zlog(
         f"Saving w2v (in txt) num_words={num_words}, embed_size={embed_size} to {fname}."
     )
     assert num_words == len(vecs), "Unmatched size!"
     with zopen(fname, "w") as fd:
         fd.write(f"{num_words}{sep}{embed_size}\n")
         for w, vec in zip(words, vecs):
             assert len(vec) == embed_size, "Unmatched dim!"
             print_list = [w] + ["%.6f" % float(z) for z in vec]
             fd.write(sep.join(print_list) + "\n")
Example #7
def main(args):
    conf = MainConf()
    conf.update_from_args(args)
    stat = Counter()
    # --
    with zopen(conf.input) as fin, zopen(conf.output, 'w') as fout:
        cur_lines = []
        for line in fin:
            line = line.rstrip()
            if line.lstrip().startswith("#"): continue  # ignore comments!
            if len(line) == 0:
                if len(cur_lines) > 0:
                    lines2 = process_one(cur_lines, conf, stat)
                    fout.write("".join([z + '\n' for z in lines2]) + "\n")
                cur_lines.clear()
            else:
                cur_lines.append(line)
        if len(cur_lines) > 0:
            lines2 = process_one(cur_lines, conf, stat)
            fout.write("".join([z + '\n' for z in lines2]) + "\n")
    # --
    zlog(
        f"Read from {conf.input}, write to {conf.output}, stat=\n{OtherHelper.printd_str(stat)}"
    )
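Example #7 is the standard CoNLL-style block loop: accumulate non-blank lines, flush on a blank line, and flush once more after the loop so a file without a trailing blank line still yields its last block. The same logic as a reusable generator (a sketch, not part of the source library):

from typing import Iterable, Iterator, List

def iter_blocks(lines: Iterable[str]) -> Iterator[List[str]]:
    cur: List[str] = []
    for line in lines:
        line = line.rstrip()
        if line.lstrip().startswith("#"):
            continue  # ignore comment lines, as in Example #7
        if not line:
            if cur:
                yield cur
                cur = []  # start a fresh list; the caller may keep the old one
        else:
            cur.append(line)
    if cur:  # flush the final block at EOF
        yield cur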
Example #8
 def annotate(self, insts: List):
     conf: AnnotatorP2DConf = self.conf
     # --
     # get all sentences and run in batch
     all_sents = list(yield_sents(insts))
     tmp_input = os.path.join(conf.p2d_tmp_dir, "_input.penn")
     with zopen(tmp_input, 'w') as fd:
         for sent in all_sents:
             fd.write(sent2tree(sent) + "\n")
     # run
     tmp_output = os.path.join(conf.p2d_tmp_dir, "_output.conllu")
     log_cmd = f'2>{conf.p2d_log}' if conf.p2d_log else ''
     system(f"{self.cmd} -treeFile {tmp_input} >{tmp_output} {log_cmd}")
     # read output and add back
     conll_reader_conf = ReaderGetterConf()
     conll_reader_conf.input_conf.use_multiline = True
     conll_reader_conf.input_conf.mtl_ignore_f = "'ignore_#'"
     conll_reader_conf.input_format = "conllu"
     conll_reader_conf.input_path = tmp_output
     conll_reader = get_reader(conll_reader_conf)
     new_sents = list(conll_reader)
     # --
     assert len(all_sents) == len(new_sents)
     for s0, s1 in zip(all_sents, new_sents):
         assert len(s0) == len(s1)
         mismatched_tokens = [
             (v1, v2) for v1, v2 in zip(s0.seq_word.vals, s1.seq_word.vals)
             if v1 != v2
         ]
         if len(mismatched_tokens) > 0:
             zwarn(
                 f"Mismatch token NUM={len(mismatched_tokens)}: {mismatched_tokens}"
             )
             if conf.p2d_change_words:
                 s0.build_words(s1.seq_word.vals)  # use the other one!!
             # breakpoint()
         # note: build again!
         s0.build_dep_tree(s1.tree_dep.seq_head.vals, [
             self.p2d_udp_converter(z) for z in s1.tree_dep.seq_label.vals
         ])
         if conf.p2d_use_xpos:
          trg_pos_list = s1.info.get("xpos")
         else:
             trg_pos_list = s1.seq_upos.vals
         s0.build_uposes([self.p2d_upos_converter(z) for z in trg_pos_list])
Example #9
 def _load_txt(fname: str, sep=" "):
     zlog(f"Going to load pre-trained (txt) w2v from {fname} ...")
     repeated_count = 0
     words, vecs = [], []
     word_set = set()
     num_words, embed_size = None, None
     with zopen(fname) as fd:
         # first line
         line = fd.readline()
         try:
             num_words, embed_size = [int(x) for x in line.split(sep)]
             zlog(
                 f"Reading w2v num_words={num_words}, embed_size={embed_size}."
             )
             line = fd.readline()
          except ValueError:  # no "num_words embed_size" header line
             zlog("Reading w2v.")
         # the rest
         while len(line) > 0:
             fields = line.rstrip().split(sep)
             word, vec = fields[0], [float(x) for x in fields[1:]]
             if word in word_set:
                 repeated_count += 1
                 zwarn(f"Repeat key {word}")
             else:  # only add the first one
                 words.append(word)
                 vecs.append(vec)
                 word_set.add(word)
             # put embed_size
             if embed_size is None:
                 embed_size = len(vec)
             else:
                 assert len(vec) == embed_size, "Unmatched embed dimension."
             line = fd.readline()
     if num_words is not None:
         assert num_words == len(vecs) + repeated_count
     num_words = len(vecs)
     # final
     zlog(
         f"Read ok: w2v num_words={num_words}, embed_size={embed_size}, repeat={repeated_count}"
     )
     return WordVectors(words, vecs)
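Examples #6 and #9 read and write the same plain-text word-vector layout: an optional "num_words embed_size" header line, then one "word v1 v2 ..." row per entry. A compact stdlib-only round-trip under that assumption (function names are mine):

def save_vectors_txt(path, words, vecs, sep=" "):
    with open(path, "w") as fd:
        fd.write(f"{len(words)}{sep}{len(vecs[0])}\n")  # header line
        for w, vec in zip(words, vecs):
            fd.write(sep.join([w] + ["%.6f" % float(x) for x in vec]) + "\n")

def load_vectors_txt(path, sep=" "):
    words, vecs = [], []
    with open(path) as fd:
        first = fd.readline()
        fields = first.rstrip().split(sep)
        if first and not (len(fields) == 2 and all(f.isdigit() for f in fields)):
            # no header: the first line was already a word row
            words.append(fields[0])
            vecs.append([float(x) for x in fields[1:]])
        for line in fd:
            fields = line.rstrip().split(sep)
            words.append(fields[0])
            vecs.append([float(x) for x in fields[1:]])
    return words, vecs

# round-trip check on made-up data:
# save_vectors_txt("tiny.vec", ["a", "b"], [[0.1, 0.2], [0.3, 0.4]])
# assert load_vectors_txt("tiny.vec")[0] == ["a", "b"]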
Example #10
def main(*args):
    conf = MainConf()
    conf.update_from_args(args)
    # --
    if conf.load_pkl:
        collection = default_pickle_serializer.from_file(conf.load_pkl)
    else:
        reader = FrameReader()
        collection = reader.read_all(conf.dir, conf.onto)
    if conf.save_pkl:
        default_pickle_serializer.to_file(collection, conf.save_pkl)
    if conf.save_txt:
        with zopen(conf.save_txt, 'w') as fd:
            for f in collection.frames:
                fd.write("#--\n" + f.to_string() + "\n")
    # --
    if conf.debug:
        breakpoint()
    if conf.query:
        map_frame = {f.name: f for f in collection.frames}
        map_lu = ZFrameCollectionHelper.build_lu_map(
            collection, split_lu={"pb": "_", "fn": None}[conf.onto])
        map_role = ZFrameCollectionHelper.build_role_map(collection)
        while True:
            line = input(">> ")
            fields = sh_split(line.strip())
            if len(fields) == 0:
                continue
            try:
                query0, query1 = fields
                _map = {'frame': map_frame, 'lu': map_lu, 'role': map_role}[query0]
                answer = _map.get(query1, None)
                if isinstance(answer, ZFrame):
                    zlog(answer.to_string())
                else:
                    zlog(answer)
            except (KeyError, ValueError):
                zlog(f"Wrong cmd: {fields}")
Example #11
def main(evaluator: str, *args):
    # find evaluator
    conf = MainConf()
    e_res = Evaluator.try_load_and_lookup(evaluator)
    one_conf, one_type = e_res.conf, e_res.T
    conf.econf = one_conf()
    # --
    conf = init_everything(conf, args)
    zlog(f"Ready to evaluate with {evaluator}: {conf.gold} {conf.pred}")
    # --
    gold_insts = list(conf.gold.get_reader())
    pred_insts = list(conf.pred.get_reader())
    evaler: Evaluator = one_type(conf.econf)
    res = evaler.eval(gold_insts, pred_insts)
    if conf.result_file:
        with zopen(conf.result_file, 'a') as fd:  # note: append mode
            fd.write(
                f"# Eval with {args}:\n{res.get_brief_str()}\n{res.get_detailed_str()}\n"
            )
    zlog(f"Eval on {conf.gold} vs. {conf.pred}; RESULT = {res}")
    if conf.print_details:
        zlog(f"#-- details:\n{res.get_detailed_str()}")
Example #12
 def annotate(self, insts: List[DataInstance]):
     conf: AnnotatorSemaforConf = self.conf
     # --
     # get all sentences and run in batch
     all_sents = list(yield_sents(insts))
     # run all in batch
     # step 1: prepare input
     tmp_input = os.path.join(f"{conf.semafor_tmp_dir}", "_input.txt")
     tmp_input = os.path.abspath(tmp_input)  # require absolute path
     tmp_input = self.delete_and_get_file(tmp_input)
     with zopen(tmp_input, 'w') as fd:
         for sent in all_sents:  # write one line per sent
             fd.write(" ".join(sent.seq_word.vals) + "\n")
     # step 2: run semafor
     tmp_output = os.path.join(f"{conf.semafor_tmp_dir}", "_output.json")
     tmp_output = os.path.abspath(tmp_output)  # require absolute path
     tmp_output = self.delete_and_get_file(
         tmp_output, delete=(not conf.semafor_use_cached))
     if not conf.semafor_use_cached:  # otherwise simply skip running
         _semafor_log = conf.semafor_log if conf.semafor_log else "/dev/null"  # append to log!
         system(
             f"bash {self.semafor_sh} {tmp_input} {tmp_output} {conf.semafor_num_threads} >>{_semafor_log} 2>&1",
             ass=True)
     # step 3: read output and put them in sents
     semafor_results = default_json_serializer.load_list(tmp_output)
      assert len(semafor_results) == len(all_sents), "Error: predicted inst number mismatch!"
     for one_res, one_sent in zip(semafor_results, all_sents):
         one_semafor_sent: Sent = SemaforHelper.semafor2sent(one_res)
         one_idx_map = SemaforHelper.find_sent_map(one_semafor_sent,
                                                   one_sent)
         # put them back
         one_sent.clear_events()
         one_sent.clear_entity_fillers()
         # add them all
         for evt in one_semafor_sent.events:
             evt_widx, evt_wlen = evt.mention.widx, evt.mention.wlen
             mapped_posi = SemaforHelper.map_span(evt_widx, evt_wlen,
                                                  one_idx_map)
             if mapped_posi is None:
                 zwarn(
                     f"Failed mapping evt of {evt}: {evt.mention} to {one_sent.seq_word}"
                 )
                 continue
             evt2 = one_sent.make_event(mapped_posi[0],
                                        mapped_posi[1],
                                        type=evt.type)
             for alink in evt.args:
                 ef = alink.arg
                 ef_widx, ef_wlen = ef.mention.widx, ef.mention.wlen
                 mapped_posi = SemaforHelper.map_span(
                     ef_widx, ef_wlen, one_idx_map)
                 if mapped_posi is None:
                     zwarn(
                         f"Failed mapping arg of {alink}: {ef.mention} to {one_sent.seq_word}"
                     )
                     continue
                 ef2 = one_sent.make_entity_filler(
                     mapped_posi[0],
                     mapped_posi[1])  # make new ef for each arg
                 evt2.add_arg(ef2, role=alink.role)
     # --
     self.count += 1
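The one non-obvious step in Example #12 is map_span: projecting a (widx, wlen) token span through a token-alignment map between the SEMAFOR tokenization and the original sentence. A minimal sketch of what such a function could look like (my reconstruction, not SemaforHelper's actual code):

from typing import Dict, Optional, Tuple

def map_span(widx: int, wlen: int, idx_map: Dict[int, int]) -> Optional[Tuple[int, int]]:
    # idx_map: token index in the source tokenization -> index in the target sentence
    mapped = [idx_map.get(i) for i in range(widx, widx + wlen)]
    if any(m is None for m in mapped):
        return None  # some token has no counterpart: give up on this span
    start, end = min(mapped), max(mapped)
    return start, end - start + 1  # back to (widx, wlen) form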
Example #13
 def dump_one(self, obj: object):
     new_file = os.path.join(self.path,
                             self.dir_file_f(obj) + self.dir_file_suffix)
     with zopen(new_file, 'w') as fd:
         fd.write(f"{obj}{self.end}")
Example #14
def main(tconf: MyTaskConf, args):
    conf = MainConf()
    conf.task_conf = tconf
    conf: MainConf = init_everything(conf, args, add_nn=False)
    zlog(f"** Run with {conf.to_json()}")
    # --
    if conf.tune_table_file:
        with zopen(conf.tune_table_file) as fd:
            s = fd.read()
            table = eval(s)
            mm = table.get(conf.tune_name)  # read the table from file
    else:
        mm = globals().get(conf.tune_name)
    # --
    if mm is not None:  # if we can read it!
        # note: if tuning, we want it to be quiet
        conf.task_conf.quite = True
        # --
        workers = te.Worker.get_gpu_workers([int(z) for z in conf.gpus],
                                            ncore=int(conf.ncore))
        x = te.TuneDriver(conf.tune_conf,
                          workers,
                          conf.tune_name,
                          extra_info=mm)
        task_iter = []
        # --
        all_runs = enumerate(
            te.iter_arg_choices(mm,
                                repeat=conf.repeat,
                                shuffle=conf.shuffle,
                                max_num=conf.max_count))
        if not conf.repeat:
            all_runs = list(all_runs)
            orig_all_runs = list(all_runs)
        else:
            orig_all_runs = None
        if len(conf.task_sels) > 0:
            assert not conf.repeat and not conf.shuffle
            _sels = [int(z) for z in conf.task_sels]
            all_runs = list(all_runs)
            zlog(f"Select {len(_sels)}/{len(all_runs)}: {_sels}")
            all_runs = [all_runs[z] for z in _sels]
        # --
        if not conf.repeat:
            _max_gid = 0 if len(orig_all_runs) == 0 else max(
                z[0] for z in orig_all_runs)
            _padn = len(str(_max_gid))
            _pads = f"%0{_padn}d"
        else:
            _pads = "%d"
        # --
        for gid, sel_idxes in all_runs:
            # note: override run_dir!!
            s_gid = _pads % gid
            one = conf.task_conf.make_task(
                run_dir=f"run_{conf.tune_name}_{s_gid}",
                _id_str=f"{s_gid}:{sel_idxes}",
                _train_extras=" ".join([a[i] for a, i in zip(mm, sel_idxes)]))
            task_iter.append(one)
        x.main(task_iter)
    else:  # otherwise single run
        assert not conf.tune_name
        task = conf.task_conf.make_task()  # no setting here!!
        task.execute(
            f"CUDA_VISIBLE_DEVICES={','.join(conf.gpus)} OMP_NUM_THREADS={conf.ncore} MKL_NUM_THREADS={conf.ncore}"
        )
Example #15
def main(res_file="res.json", known_set="", *extra_names):
    # --
    with zopen(res_file) as fd:
        res = json.load(fd)
    # --
    def _get_data(_name: str):
        _keys = [k for k in res if _name in k]
        assert len(_keys) == 1
        return res[_keys[0]]

    # --
    def _show_entry(_d: dict):
        return MyCounter(_d).summary_str(30, 130)[9:]

    # -----
    all_datasets = OrderedDict([
        ("conll05",
         [f"conll05/{z}" for z in ["train", "dev", "test.wsj", "test.brown"]]),
        ("conll12", [f"conll12b/{z}" for z in ["train", "dev", "test"]] +
         ["conll12/train", "pb/ontonotes.train"]),
        ("ewt", [f"ewt.{z}" for z in ["train", "dev", "test"]]),
        ("fn15", [f"fn15_fulltext.{z}"
                  for z in ["train", "dev", "test."]] + ["fn15_exemplars"]),
        ("fn17", [f"fn17_fulltext.{z}"
                  for z in ["train", "dev", "test."]] + ["fn17_exemplars"]),
    ])
    all_groups = OrderedDict([
        ("basic", [
            "sent", "tok", "frame", "f/s", "f/t", "arg", "a/f", "a/(f*t/s)",
            "AO", "AO1"
        ]),
        ("frame_wlen", ["frame_wlen"]),
        ("frame_trigger_pos", ["frame_trigger_pos"]),
        ("frame_type", ["frame_type"]),
        ("frame_type0", ["frame_type0"]),
        ("arg_wlen_m30", ["arg_wlen_m30"]),
        ("arg_role", ["arg_role"]),
        ("arg_repeat", ["arg_repeat"]),
        ("arg_repeatR", ["arg_repeatR"]),
    ])
    cc = OrderedDict()
    # --
    # first collect all
    if known_set:
        all_data_names = all_datasets.get(known_set, [])
    else:
        all_data_names = sum(all_datasets.values(), [])
    all_data_names = all_data_names + list(extra_names)
    all_data = [_get_data(z) for z in all_data_names]
    # basic
    cc["sent"] = [z["sent"] for z in all_data]
    cc["tok"] = [z["tok"] for z in all_data]
    cc["frame"] = [z["frame"] for z in all_data]
    cc["f/s"] = [z["frame"] / z["sent"] for z in all_data]
    cc["f/t"] = [z["frame"] / z["tok"] for z in all_data]
    cc["arg"] = [z["arg"] for z in all_data]
    cc["a/f"] = [z["arg"] / z["frame"] for z in all_data]
    cc["a/(f*t/s)"] = [
        z["arg"] / (z["frame"] * z["tok"] / z["sent"]) for z in all_data
    ]
    cc["AO"] = [z["arg_overlapped"] / z["arg"] for z in all_data]
    cc["AO1"] = [z["arg_overlapped_R1"] / z["arg_R1"] for z in all_data]
    # others
    cc["frame_wlen"] = [_show_entry(z["frame_wlen"]) for z in all_data]
    cc["frame_trigger_pos"] = [
        _show_entry(z["frame_trigger_pos"]) for z in all_data
    ]
    cc["frame_type"] = [_show_entry(z["frame_type"]) for z in all_data]
    cc["frame_type0"] = [_show_entry(z["frame_type0"]) for z in all_data]
    cc["arg_wlen_m30"] = [_show_entry(z["arg_wlen_m30"]) for z in all_data]
    cc["arg_role"] = [_show_entry(z["arg_role"]) for z in all_data]
    cc["arg_repeat"] = [_show_entry(z["arg_repeat"]) for z in all_data]
    cc["arg_repeatR"] = [_show_entry(z["arg_repeatR"]) for z in all_data]

    # --
    def left_justified(df):
        try:
            formatters = {}
            for li in list(df.columns):
                max_len = df[li].str.len().max()  # widest cell in this column
                form = "{{:<{}s}}".format(max_len)
                formatters[li] = functools.partial(str.format, form)
            return df.to_string(formatters=formatters)
        except Exception:  # e.g. non-string columns: fall back to default formatting
            return df.to_string()

    # --
    dd = pd.DataFrame(cc, index=all_data_names)
    for group_name, group_keys in all_groups.items():
        zlog(f"#== GROUP {group_name}\n" + left_justified(dd[group_keys]))
Example #16
 def __init__(self, file: str):
     self.file = file
     self.fd = zopen(file, 'wb')
     zlog(f"Open file for OutputHelper writing: {file}({self.fd})")