def main(args):
    conf = MainConf()
    conf.update_from_args(args)
    stat = Counter()
    if conf.domain_f in DOMAIN_FS:
        domain_f = DOMAIN_FS[conf.domain_f]
    else:
        domain_f = eval(conf.domain_f)  # fall back to evaluating the expression itself
    # --
    # bucket instances by domain
    all_insts = {}  # ID -> List[inst]
    with zopen(conf.input) as fin:
        for line in fin:
            inst = json.loads(line)
            domain = domain_f(inst)
            if domain not in all_insts:
                all_insts[domain] = []
            all_insts[domain].append(inst)
            stat["inst"] += 1
            stat[f"inst_{domain}"] += 1
    # --
    # write: one output file per domain, splicing the domain into the input name
    input_name = os.path.basename(conf.input)
    for domain, insts in all_insts.items():
        output_name_fields = input_name.split(".")
        output_name_fields.insert(conf.output_insert_place, domain)
        output_name = os.path.join(conf.output_dir, ".".join(output_name_fields))
        zlog(f"Write to {output_name} {len(insts)}")
        with zopen(output_name, 'w') as fout:
            default_json_serializer.save_iter(insts, fout)
    # --
    zlog(f"Read from {conf.input}, stat=\n{OtherHelper.printd_str(stat)}")
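# A minimal sketch of the output naming above (the helper below is
# illustrative only, not part of the repo): the domain is spliced into the
# dot-separated input filename at `output_insert_place`.
def _demo_output_name(input_name: str, domain: str, insert_place: int) -> str:
    fields = input_name.split(".")
    fields.insert(insert_place, domain)
    return ".".join(fields)

# e.g. _demo_output_name("data.train.json", "nw", 2) == "data.train.nw.json"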
def main(args):
    conf = MainConf()
    conf.update_from_args(args)
    # input
    with zopen(conf.input) as fd:
        lines = list(fd)
    if conf.skip_blank:  # drop whitespace-only lines
        lines = [z for z in lines if not str.isspace(z)]
    # shuffle?
    origin_len = len(lines)
    if conf.shuffle_times > 0 or conf.shuffle:
        _t = max(1, conf.shuffle_times)  # at least once!
        _gen = Random.get_generator('')
        for _ in range(_t):
            _gen.shuffle(lines)
    # sample? rate<=1 means a fraction of the input, otherwise an absolute count
    final_size = int(0.999 + (conf.rate * origin_len if conf.rate <= 1. else conf.rate))
    out_lines = lines[:final_size]
    # output
    if conf.output:
        with zopen(conf.output, 'w') as fd2:
            for line in out_lines:
                fd2.write(line)
    # --
    zlog(f"Sample({conf.rate}) {conf.input}=>{conf.output}: {origin_len}=>{len(out_lines)}")
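# Sketch of the size computation above (a hypothetical helper mirroring the
# expression in main): rate<=1 selects a fraction of the input, rate>1 an
# absolute line count; adding 0.999 before int() acts as a cheap ceiling.
def _demo_final_size(rate: float, origin_len: int) -> int:
    return int(0.999 + (rate * origin_len if rate <= 1. else rate))

assert _demo_final_size(0.5, 3) == 2    # ceil(1.5): fractions round up
assert _demo_final_size(1.0, 7) == 7    # rate=1 keeps the whole input
assert _demo_final_size(100, 7) == 100  # absolute count; slicing then clips to 7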
def main(args):
    conf: MainConf = init_everything(MainConf(), args, add_nn=False)
    zlog(f"** Run with {conf.to_json()}")
    # --
    if conf.input_table_file:  # read the table from file
        with zopen(conf.input_table_file) as fd:
            s = fd.read()
        table = eval(s)
        mm = table[conf.name]
    else:
        mm = globals()[conf.name]
    # --
    workers = Worker.get_gpu_workers([int(z) for z in conf.gpus], ncore=int(conf.ncore))
    # --
    x = TuneDriver(conf.tune_conf, workers, conf.name, extra_info=mm)
    task_iter = [
        MyTask(gid, conf.name, sel_idxes, [a[i] for a, i in zip(mm, sel_idxes)], conf.task_conf)
        for gid, sel_idxes in enumerate(
            iter_arg_choices(mm, repeat=conf.repeat, shuffle=conf.shuffle, max_num=conf.max_count))
    ]
    x.main(task_iter)
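# Sketch of the expected table format (an assumption inferred from how `mm`
# is indexed above: a list of option groups, with one command-line fragment
# chosen from each group per task by `sel_idxes`):
_demo_mm = [
    ["lr:0.001", "lr:0.0001"],           # option group 0
    ["batch_size:16", "batch_size:32"],  # option group 1
]
# a selection sel_idxes == (1, 0) would yield ["lr:0.0001", "batch_size:16"]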
def __init__(self, fd_or_path: Union[IO, str], end='\n'):
    super().__init__()
    self.fd_or_path = fd_or_path
    self.end = end
    if isinstance(fd_or_path, str):  # a path: open (and later close) the file ourselves
        self.should_close = True
        self.fd = zopen(fd_or_path, 'w')
    else:  # an already-opened fd: the caller owns it
        self.should_close = False
        self.fd = fd_or_path
def eval(self, gold_insts: List, pred_insts: List):
    # --
    conf: PbEvalConf = self.conf
    tmp_gold = conf.tmp_file_prefix + ".gold.props"
    tmp_pred = conf.tmp_file_prefix + ".pred.props"
    with zopen(tmp_gold, 'w') as fd:
        fd.write(insts2props(gold_insts))
    with zopen(tmp_pred, 'w') as fd:
        fd.write(insts2props(pred_insts))
    # --
    # run the scorer in both directions to obtain precision and recall
    precision_output = system(f"{self.cmd} {tmp_pred} {tmp_gold} 2>/dev/null", ass=True, popen=True)
    recall_output = system(f"{self.cmd} {tmp_gold} {tmp_pred} 2>/dev/null", ass=True, popen=True)
    # parse output
    res = PbEvalResult(precision_output, recall_output)
    return res
def save_txt(fname: str, words: List[str], vecs: List, sep: str):
    num_words = len(words)
    embed_size = len(vecs[0])
    zlog(f"Saving w2v (in txt) num_words={num_words}, embed_size={embed_size} to {fname}.")
    assert num_words == len(vecs), "Unmatched size!"
    with zopen(fname, "w") as fd:
        fd.write(f"{num_words}{sep}{embed_size}\n")  # header line
        for w, vec in zip(words, vecs):
            assert len(vec) == embed_size, "Unmatched dim!"
            print_list = [w] + ["%.6f" % float(z) for z in vec]
            fd.write(sep.join(print_list) + "\n")
def main(args):
    conf = MainConf()
    conf.update_from_args(args)
    stat = Counter()
    # --
    with zopen(conf.input) as fin, zopen(conf.output, 'w') as fout:
        cur_lines = []
        for line in fin:
            line = line.rstrip()
            if line.lstrip().startswith("#"):
                continue  # ignore comments!
            if len(line) == 0:  # a blank line closes the current block
                if len(cur_lines) > 0:
                    lines2 = process_one(cur_lines, conf, stat)
                    fout.write("".join([z + '\n' for z in lines2]) + "\n")
                    cur_lines.clear()
            else:
                cur_lines.append(line)
        if len(cur_lines) > 0:  # flush the last block (no trailing blank line)
            lines2 = process_one(cur_lines, conf, stat)
            fout.write("".join([z + '\n' for z in lines2]) + "\n")
    # --
    zlog(f"Read from {conf.input}, write to {conf.output}, stat=\n{OtherHelper.printd_str(stat)}")
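# Sketch of the block framing handled above (an illustrative stand-in for
# process_one, which is defined elsewhere): the input is groups of
# non-comment lines separated by blank lines, and each group is rewritten
# as a unit and re-emitted with a trailing blank line.
def _demo_process_one(cur_lines, conf=None, stat=None):
    return cur_lines  # identity; a real process_one transforms the block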
def annotate(self, insts: List):
    conf: AnnotatorP2DConf = self.conf
    # --
    # get all sentences and run in batch
    all_sents = list(yield_sents(insts))
    tmp_input = os.path.join(conf.p2d_tmp_dir, "_input.penn")
    with zopen(tmp_input, 'w') as fd:
        for sent in all_sents:
            fd.write(sent2tree(sent) + "\n")
    # run
    tmp_output = os.path.join(conf.p2d_tmp_dir, "_output.conllu")
    log_cmd = f'2>{conf.p2d_log}' if conf.p2d_log else ''
    system(f"{self.cmd} -treeFile {tmp_input} >{tmp_output} {log_cmd}")
    # read output and add back
    conll_reader_conf = ReaderGetterConf()
    conll_reader_conf.input_conf.use_multiline = True
    conll_reader_conf.input_conf.mtl_ignore_f = "'ignore_#'"
    conll_reader_conf.input_format = "conllu"
    conll_reader_conf.input_path = tmp_output
    conll_reader = get_reader(conll_reader_conf)
    new_sents = list(conll_reader)
    # --
    assert len(all_sents) == len(new_sents)
    for s0, s1 in zip(all_sents, new_sents):
        assert len(s0) == len(s1)
        mismatched_tokens = [(v1, v2) for v1, v2 in zip(s0.seq_word.vals, s1.seq_word.vals) if v1 != v2]
        if len(mismatched_tokens) > 0:
            zwarn(f"Mismatch token NUM={len(mismatched_tokens)}: {mismatched_tokens}")
            if conf.p2d_change_words:
                s0.build_words(s1.seq_word.vals)  # use the other one!!
            # breakpoint()
        # note: build again!
        s0.build_dep_tree(s1.tree_dep.seq_head.vals,
                          [self.p2d_udp_converter(z) for z in s1.tree_dep.seq_label.vals])
        if conf.p2d_use_xpos:
            trg_pos_list = s1.info.get("xpos")
        else:
            trg_pos_list = s1.seq_upos.vals
        s0.build_uposes([self.p2d_upos_converter(z) for z in trg_pos_list])
def _load_txt(fname: str, sep=" "):
    zlog(f"Going to load pre-trained (txt) w2v from {fname} ...")
    repeated_count = 0
    words, vecs = [], []
    word_set = set()
    num_words, embed_size = None, None
    with zopen(fname) as fd:
        # first line: an optional "num_words embed_size" header
        line = fd.readline()
        try:
            num_words, embed_size = [int(x) for x in line.split(sep)]
            zlog(f"Reading w2v num_words={num_words}, embed_size={embed_size}.")
            line = fd.readline()
        except ValueError:  # no header line
            zlog("Reading w2v.")
        # the rest
        while len(line) > 0:
            fields = line.rstrip().split(sep)
            word, vec = fields[0], [float(x) for x in fields[1:]]
            if word in word_set:
                repeated_count += 1
                zwarn(f"Repeat key {word}")
            else:  # only add the first one
                words.append(word)
                vecs.append(vec)
                word_set.add(word)
                # put embed_size
                if embed_size is None:
                    embed_size = len(vec)
                else:
                    assert len(vec) == embed_size, "Unmatched embed dimension."
            line = fd.readline()
    if num_words is not None:
        assert num_words == len(vecs) + repeated_count
    num_words = len(vecs)
    # final
    zlog(f"Read ok: w2v num_words={num_words}, embed_size={embed_size}, repeat={repeated_count}")
    return WordVectors(words, vecs)
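# Round-trip sketch (a hypothetical driver; assumes save_txt and _load_txt
# above live in the same module): write two toy vectors, then read them back
# via the "num_words embed_size" header path.
def _demo_w2v_roundtrip(tmp_path: str = "_demo_w2v.txt"):
    save_txt(tmp_path, ["hello", "world"], [[0.1, 0.2], [0.3, 0.4]], sep=" ")
    return _load_txt(tmp_path)  # -> WordVectors with num_words=2, embed_size=2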
def main(*args):
    conf = MainConf()
    conf.update_from_args(args)
    # --
    if conf.load_pkl:
        collection = default_pickle_serializer.from_file(conf.load_pkl)
    else:
        reader = FrameReader()
        collection = reader.read_all(conf.dir, conf.onto)
    if conf.save_pkl:
        default_pickle_serializer.to_file(collection, conf.save_pkl)
    if conf.save_txt:
        with zopen(conf.save_txt, 'w') as fd:
            for f in collection.frames:
                fd.write("#--\n" + f.to_string() + "\n")
    # --
    if conf.debug:
        breakpoint()
    if conf.query:
        map_frame = {f.name: f for f in collection.frames}
        map_lu = ZFrameCollectionHelper.build_lu_map(collection, split_lu={"pb": "_", "fn": None}[conf.onto])
        map_role = ZFrameCollectionHelper.build_role_map(collection)
        while True:  # simple interactive query loop
            line = input(">> ")
            fields = sh_split(line.strip())
            if len(fields) == 0:
                continue
            try:
                query0, query1 = fields
                _map = {'frame': map_frame, 'lu': map_lu, 'role': map_role}[query0]
                answer = _map.get(query1, None)
                if isinstance(answer, ZFrame):
                    zlog(answer.to_string())
                else:
                    zlog(answer)
            except Exception:
                zlog(f"Wrong cmd: {fields}")
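# Interactive usage sketch (a hypothetical session; the command grammar
# follows the parsing above, i.e. "<frame|lu|role> <key>"):
#   >> frame Arrest
#   ... prints the ZFrame via to_string() ...
#   >> lu arrest
#   ... prints the lexical-unit map entry, or None if absent ...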
def main(evaluator: str, *args):
    # find evaluator
    conf = MainConf()
    e_res = Evaluator.try_load_and_lookup(evaluator)
    one_conf, one_type = e_res.conf, e_res.T
    conf.econf = one_conf()
    # --
    conf = init_everything(conf, args)
    zlog(f"Ready to evaluate with {evaluator}: {conf.gold} {conf.pred}")
    # --
    gold_insts = list(conf.gold.get_reader())
    pred_insts = list(conf.pred.get_reader())
    evaler: Evaluator = one_type(conf.econf)
    res = evaler.eval(gold_insts, pred_insts)
    if conf.result_file:
        with zopen(conf.result_file, 'a') as fd:  # note: here we use append mode
            fd.write(f"# Eval with {args}:\n{res.get_brief_str()}\n{res.get_detailed_str()}\n")
    zlog(f"Eval on {conf.gold} vs. {conf.pred}; RESULT = {res}")
    if conf.print_details:
        zlog(f"#-- details:\n{res.get_detailed_str()}")
def annotate(self, insts: List[DataInstance]):
    conf: AnnotatorSemaforConf = self.conf
    # --
    # get all sentences and run in batch
    all_sents = list(yield_sents(insts))
    # step 1: prepare input
    tmp_input = os.path.join(f"{conf.semafor_tmp_dir}", "_input.txt")
    tmp_input = os.path.abspath(tmp_input)  # require absolute path
    tmp_input = self.delete_and_get_file(tmp_input)
    with zopen(tmp_input, 'w') as fd:
        for sent in all_sents:  # write one line per sent
            fd.write(" ".join(sent.seq_word.vals) + "\n")
    # step 2: run semafor
    tmp_output = os.path.join(f"{conf.semafor_tmp_dir}", "_output.json")
    tmp_output = os.path.abspath(tmp_output)  # require absolute path
    tmp_output = self.delete_and_get_file(tmp_output, delete=(not conf.semafor_use_cached))
    if not conf.semafor_use_cached:  # otherwise simply skip running
        _semafor_log = conf.semafor_log if conf.semafor_log else "/dev/null"
        # append to log!
        system(f"bash {self.semafor_sh} {tmp_input} {tmp_output} {conf.semafor_num_threads} >>{_semafor_log} 2>&1", ass=True)
    # step 3: read output and put them in sents
    semafor_results = default_json_serializer.load_list(tmp_output)
    assert len(semafor_results) == len(all_sents), "Error: predict inst number mismatch!"
    for one_res, one_sent in zip(semafor_results, all_sents):
        one_semafor_sent: Sent = SemaforHelper.semafor2sent(one_res)
        one_idx_map = SemaforHelper.find_sent_map(one_semafor_sent, one_sent)
        # put them back: first clear, then add them all
        one_sent.clear_events()
        one_sent.clear_entity_fillers()
        for evt in one_semafor_sent.events:
            evt_widx, evt_wlen = evt.mention.widx, evt.mention.wlen
            mapped_posi = SemaforHelper.map_span(evt_widx, evt_wlen, one_idx_map)
            if mapped_posi is None:
                zwarn(f"Failed mapping evt of {evt}: {evt.mention} to {one_sent.seq_word}")
                continue
            evt2 = one_sent.make_event(mapped_posi[0], mapped_posi[1], type=evt.type)
            for alink in evt.args:
                ef = alink.arg
                ef_widx, ef_wlen = ef.mention.widx, ef.mention.wlen
                mapped_posi = SemaforHelper.map_span(ef_widx, ef_wlen, one_idx_map)
                if mapped_posi is None:
                    zwarn(f"Failed mapping arg of {alink}: {ef.mention} to {one_sent.seq_word}")
                    continue
                ef2 = one_sent.make_entity_filler(mapped_posi[0], mapped_posi[1])  # make new ef for each arg
                evt2.add_arg(ef2, role=alink.role)
    # --
    self.count += 1
def dump_one(self, obj: object):
    new_file = os.path.join(self.path, self.dir_file_f(obj) + self.dir_file_suffix)
    with zopen(new_file, 'w') as fd:
        fd.write(f"{obj}{self.end}")
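# Usage sketch (hypothetical: the owning class's constructor is assumed from
# the fields dump_one reads, i.e. path, dir_file_f, dir_file_suffix, end):
#   dumper = SomeDirDumper(path="out", dir_file_f=lambda o: str(o), dir_file_suffix=".txt")
#   dumper.dump_one(42)  # writes "42\n" to out/42.txt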
def main(tconf: MyTaskConf, args):
    conf = MainConf()
    conf.task_conf = tconf
    conf: MainConf = init_everything(conf, args, add_nn=False)
    zlog(f"** Run with {conf.to_json()}")
    # --
    if conf.tune_table_file:  # read the table from file
        with zopen(conf.tune_table_file) as fd:
            s = fd.read()
        table = eval(s)
        mm = table.get(conf.tune_name)
    else:
        mm = globals().get(conf.tune_name)
    # --
    if mm is not None:  # if we can read it!
        # note: if tuning, we want it to be quiet
        conf.task_conf.quite = True
        # --
        workers = te.Worker.get_gpu_workers([int(z) for z in conf.gpus], ncore=int(conf.ncore))
        x = te.TuneDriver(conf.tune_conf, workers, conf.tune_name, extra_info=mm)
        task_iter = []
        # --
        all_runs = enumerate(te.iter_arg_choices(mm, repeat=conf.repeat, shuffle=conf.shuffle, max_num=conf.max_count))
        if not conf.repeat:
            all_runs = list(all_runs)
            orig_all_runs = list(all_runs)
        else:
            orig_all_runs = None
        if len(conf.task_sels) > 0:
            assert not conf.repeat and not conf.shuffle
            _sels = [int(z) for z in conf.task_sels]
            all_runs = list(all_runs)
            zlog(f"Select {len(_sels)}/{len(all_runs)}: {_sels}")
            all_runs = [all_runs[z] for z in _sels]
        # --
        # pad gids to a fixed width so that run dirs sort nicely
        if not conf.repeat:
            _max_gid = 0 if len(orig_all_runs) == 0 else max(z[0] for z in orig_all_runs)
            _padn = len(str(_max_gid))
            _pads = f"%0{_padn}d"
        else:
            _pads = "%d"
        # --
        for gid, sel_idxes in all_runs:
            # note: override run_dir!!
            s_gid = _pads % gid
            one = conf.task_conf.make_task(
                run_dir=f"run_{conf.tune_name}_{s_gid}", _id_str=f"{s_gid}:{sel_idxes}",
                _train_extras=" ".join([a[i] for a, i in zip(mm, sel_idxes)]))
            task_iter.append(one)
        x.main(task_iter)
    else:  # otherwise single run
        assert not conf.tune_name
        task = conf.task_conf.make_task()  # no setting here!!
        task.execute(f"CUDA_VISIBLE_DEVICES={','.join(conf.gpus)} OMP_NUM_THREADS={conf.ncore} MKL_NUM_THREADS={conf.ncore}")
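# Minimal sketch of the gid padding above (an illustrative helper): the width
# comes from the largest id so that run directories sort lexicographically.
def _demo_pads(max_gid: int) -> str:
    return f"%0{len(str(max_gid))}d"

assert _demo_pads(11) % 3 == "03"  # 12 runs -> two-digit ids: run_x_00..run_x_11
assert _demo_pads(7) % 3 == "3"    # a single digit needs no padding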
def main(res_file="res.json", known_set="", *extra_names): # -- with zopen(res_file) as fd: res = json.load(fd) # -- def _get_data(_name: str): _keys = [k for k in res if _name in k] assert len(_keys) == 1 return res[_keys[0]] # -- def _show_entry(_d: dict): return MyCounter(_d).summary_str(30, 130)[9:] # ----- all_datasets = OrderedDict([ ("conll05", [f"conll05/{z}" for z in ["train", "dev", "test.wsj", "test.brown"]]), ("conll12", [f"conll12b/{z}" for z in ["train", "dev", "test"]] + ["conll12/train", "pb/ontonotes.train"]), ("ewt", [f"ewt.{z}" for z in ["train", "dev", "test"]]), ("fn15", [f"fn15_fulltext.{z}" for z in ["train", "dev", "test."]] + ["fn15_exemplars"]), ("fn17", [f"fn17_fulltext.{z}" for z in ["train", "dev", "test."]] + ["fn17_exemplars"]), ]) all_groups = OrderedDict([ ("basic", [ "sent", "tok", "frame", "f/s", "f/t", "arg", "a/f", "a/(f*t/s)", "AO", "AO1" ]), ("frame_wlen", ["frame_wlen"]), ("frame_trigger_pos", ["frame_trigger_pos"]), ("frame_type", ["frame_type"]), ("frame_type0", ["frame_type0"]), ("arg_wlen_m30", ["arg_wlen_m30"]), ("arg_role", ["arg_role"]), ("arg_repeat", ["arg_repeat"]), ("arg_repeatR", ["arg_repeatR"]), ]) cc = OrderedDict() # -- # first collect all all_data_names = sum(all_datasets.values(), []) if not known_set else all_datasets.get( known_set, []) all_data_names = all_data_names + list(extra_names) all_data = [_get_data(z) for z in all_data_names] # basic cc["sent"] = [z["sent"] for z in all_data] cc["tok"] = [z["tok"] for z in all_data] cc["frame"] = [z["frame"] for z in all_data] cc["f/s"] = [z["frame"] / z["sent"] for z in all_data] cc["f/t"] = [z["frame"] / z["tok"] for z in all_data] cc["arg"] = [z["arg"] for z in all_data] cc["a/f"] = [z["arg"] / z["frame"] for z in all_data] cc["a/(f*t/s)"] = [ z["arg"] / (z["frame"] * z["tok"] / z["sent"]) for z in all_data ] cc["AO"] = [z["arg_overlapped"] / z["arg"] for z in all_data] cc["AO1"] = [z["arg_overlapped_R1"] / z["arg_R1"] for z in all_data] # others cc["frame_wlen"] = [_show_entry(z["frame_wlen"]) for z in all_data] cc["frame_trigger_pos"] = [ _show_entry(z["frame_trigger_pos"]) for z in all_data ] cc["frame_type"] = [_show_entry(z["frame_type"]) for z in all_data] cc["frame_type0"] = [_show_entry(z["frame_type0"]) for z in all_data] cc["arg_wlen_m30"] = [_show_entry(z["arg_wlen_m30"]) for z in all_data] cc["arg_role"] = [_show_entry(z["arg_role"]) for z in all_data] cc["arg_repeat"] = [_show_entry(z["arg_repeat"]) for z in all_data] cc["arg_repeatR"] = [_show_entry(z["arg_repeatR"]) for z in all_data] # -- def left_justified(df): try: formatters = {} for li in list(df.columns): max = df[li].str.len().max() form = "{{:<{}s}}".format(max) formatters[li] = functools.partial(str.format, form) return df.to_string(formatters=formatters) except: return df.to_string() # -- dd = pd.DataFrame(cc, index=all_data_names) for group_name, group_keys in all_groups.items(): zlog(f"#== GROUP {group_name}\n" + left_justified(dd[group_keys]))
def __init__(self, file: str):
    self.file = file
    self.fd = zopen(file, 'wb')
    zlog(f"Open file for OutputHelper writing: {file}({self.fd})")