import warnings
from pathlib import PurePath

from tqdm import tqdm

# NOTE: the project-local helpers used below (loading, train_test, kfold,
# as_list, get_by_key, _merge, print_time, token_to_idx) and the constants
# (PATH_TYPE, CHAR, RADICAL) are assumed to be importable from the
# surrounding package; their import lines are not part of these fragments.


def test_train_test(tmp_path, xy_path, ratio):
    x_path, y_path = xy_path
    if isinstance(ratio, tuple):
        train_test(x_path, y_path, train_size=ratio[0], test_size=ratio[1],
                   prefix=str(tmp_path) + "/")
    else:
        train_test(x_path, y_path, prefix=str(tmp_path) + "/", ratio=ratio)
    # the jsonl features and the plain-text labels of the train split
    # should stay aligned after splitting
    for x, y in zip(loading(tmp_path / "x.train.jsonl"),
                    loading(tmp_path / "y.train.txt", src_type="text")):
        assert x["z"] == int(y.strip())
    assert len(list(loading(tmp_path / "x.test.jsonl"))) == 2


def get_best(src: (PATH_TYPE, list), *keys, with_keys: (str, None) = None,
             with_all=False, cmp=lambda x, y: x > y, merge=True):
    # scan the records in src and keep, for each requested key, the value
    # that wins under cmp (by default, the maximum)
    keys = as_list(keys)
    with_keys = [] if with_keys is None else with_keys.split(";")
    result = {key: None for key in keys}
    result_appendix = {key: None for key in keys}
    for data in loading(src):
        for key in result:
            _data = get_by_key(data, parsed_key=key)
            if result[key] is None or cmp(_data, result[key]):
                result[key] = _data
                if with_all:
                    # remember the whole winning record
                    result_appendix[key] = data
                elif with_keys:
                    # remember only the requested companion fields
                    result_appendix[key] = {
                        _key: get_by_key(data, _key) for _key in with_keys
                    }
    appendix = result_appendix if with_all or with_keys else None
    if merge:
        return _merge(result, appendix)
    else:
        return result, appendix
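

# Usage sketch for get_best (hedged): assumes a results file "result.jsonl"
# holding one record per epoch with an "accuracy" field, e.g.
# {"Epoch": 3, "accuracy": 0.92}. The file and field names are placeholders,
# and the merged return shape depends on the project's _merge helper, which
# is not shown in these fragments.
def demo_get_best():
    return get_best("result.jsonl", "accuracy", with_keys="Epoch")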


def reset(self):
    # after the first pass, subsequent iterations read from the finished
    # cache file instead of the original source
    self._reset = lambda: loading(self.cache_file, "jsonl")
    if self.cache_thread is not None:
        self.cache_thread.join()
    self.cache_thread = None
    self.cache_queue = None
    super(CacheAsyncLoopIter, self).reset()


def to_board(src, board_dir, global_step_field, *scalar_fields):
    # tensorboardX can emit noisy warnings at import time; capture them
    with warnings.catch_warnings(record=True):
        from tensorboardX import SummaryWriter

    with SummaryWriter(board_dir) as sw, print_time(
            "to_board: %s -> %s\n step field: %s, fields: %s" % (
                src, board_dir, global_step_field, scalar_fields)):
        for line in loading(src):
            for scalar_field in scalar_fields:
                sw.add_scalar(
                    tag=scalar_field,
                    scalar_value=get_by_key(line, scalar_field),
                    global_step=int(get_by_key(line, global_step_field))
                )
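

# Usage sketch for to_board (hedged): assumes a training log "train.jsonl"
# whose records look like {"Epoch": 1, "train_loss": 0.5, "accuracy": 0.8};
# the paths and field names are placeholders, not taken from the source.
def demo_to_board():
    to_board("train.jsonl", "board_dir/", "Epoch", "train_loss", "accuracy")
    # inspect the result with: tensorboard --logdir board_dir/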


def extract(data_src, embeddings):
    word_feature = []
    word_radical_feature = []
    char_feature = []
    char_radical_feature = []
    features = [
        word_feature, word_radical_feature, char_feature, char_radical_feature
    ]
    labels = []
    for ds in tqdm(loading(data_src), "loading data from %s" % data_src):
        label = ds['label']
        w = token_to_idx(embeddings["w"], ds["w"]) \
            if embeddings.get("w") and "w" in ds else []
        rw = token_to_idx(embeddings["rw"], ds["rw"]) \
            if embeddings.get("rw") and "rw" in ds else []
        c = token_to_idx(embeddings["c"], ds["c"]) \
            if embeddings.get("c") and "c" in ds else []
        # bug fix: the original looked up the "rw" embedding and field here
        # even though the guard checks "rc"; use the char-radical ones instead
        rc = token_to_idx(embeddings["rc"], ds["rc"]) \
            if embeddings.get("rc") and "rc" in ds else []
        if len(w) < 1:
            continue
        try:
            if rw:
                assert len(w) == len(rw), "some words miss radicals"
            if rc:
                assert len(c) == len(rc), "some chars miss radicals"
        except AssertionError as e:
            warnings.warn("%s" % e)
            continue
        word_feature.append(w)
        word_radical_feature.append(rw)
        char_feature.append(c)
        char_radical_feature.append(rc)
        labels.append(label)
    return features, labels
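

# Usage sketch for extract (hedged): embeddings is assumed to map the keys
# "w"/"rw"/"c"/"rc" to token->index vocabularies accepted by token_to_idx;
# the file name and vocabulary arguments are placeholders.
def demo_extract(w_vocab, rw_vocab, c_vocab, rc_vocab):
    embeddings = {"w": w_vocab, "rw": rw_vocab, "c": c_vocab, "rc": rc_vocab}
    (w_f, rw_f, c_f, rc_f), labels = extract("train.jsonl", embeddings)
    return (w_f, rw_f, c_f, rc_f), labels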


def test_kfold(tmp_path, xy_path):
    x_path, y_path = xy_path

    kfold(x_path, y_path, prefix=str(tmp_path) + "/")
    for x, y in zip(loading(tmp_path / "x.0.train.jsonl"),
                    loading(tmp_path / "y.0.train.txt")):
        assert x["z"] == int(y.strip())
    assert len(list(loading(tmp_path / "x.0.test.jsonl"))) == 2

    kfold(x_path, y_path)
    x_dir = PurePath(x_path).parent
    y_dir = PurePath(y_path).parent
    for x, y in zip(loading(x_dir / "x.0.train.jsonl"),
                    loading(y_dir / "y.0.train.txt")):
        assert x["z"] == int(y.strip())
    assert len(list(loading(x_dir / "x.0.test.jsonl"))) == 2


def load_stroke_dict(dict_path) -> dict:
    # map each character ("c") to its stroke sequence ("s")
    stroke_dict = {}
    for line in tqdm(loading(dict_path), "loading from %s" % dict_path):
        stroke_dict[line["c"]] = line["s"]
    return stroke_dict
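

# Usage sketch (hedged): assumes a jsonl dictionary whose lines look like
# {"c": "你", "s": "3254"}; the character and stroke encoding are invented
# examples, not taken from the source.
def demo_load_stroke_dict():
    strokes = load_stroke_dict("stroke_dict.jsonl")
    return strokes.get("你")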


def load_dict(dict_path):
    # map each character to a record carrying its radical
    cdict = {}
    for line in loading(dict_path):
        cdict[line[CHAR]] = {RADICAL: line[RADICAL]}
    return cdict
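

# Usage sketch (hedged): assumes CHAR and RADICAL are field-name constants,
# e.g. CHAR = "char" and RADICAL = "radical", with dictionary lines like
# {"char": "好", "radical": "女"}; the sample values are invented.
def demo_load_dict():
    cdict = load_dict("radical_dict.jsonl")
    return cdict["好"][RADICAL]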


def src():
    # closure over `self`: this helper is defined inside a method of the
    # caching iterator and re-reads the completed jsonl cache file
    return loading(self.cache_file, "jsonl")