@pytest.fixture  # tmp_path_factory is a pytest fixture, so xy_path must itself be one
def xy_path(tmp_path_factory):
    tmp_dir = tmp_path_factory.mktemp("data")
    _x_path = tmp_dir / "x.jsonl"
    _y_path = tmp_dir / "y.txt"
    with as_out_io(_x_path) as xf, as_out_io(_y_path) as yf:
        for i in range(10):
            print(json.dumps({"x": [j for j in range(i, i + 5)], "z": i}), file=xf)
            print(i, file=yf)
    return _x_path, _y_path
def test_loading(tmpdir):
    csv_src = path_append(tmpdir, "test.csv")
    json_src = path_append(tmpdir, "test.json")
    text_to_csv(csv_src)
    csv2jsonl(csv_src, json_src)
    jsonl2csv(json_src, csv_src)
    for src in [csv_src, json_src, load_jsonl(json_src)]:
        for i, line in enumerate(loading(src)):
            assert int(line["id"]) == i, line
            if i == 0:
                assert line["name"] == "Tom", line
            elif i == 1:
                assert line["name"] == "Jerry", line
    src = path_append(tmpdir, "test")
    with as_out_io(src) as wf:
        print(DEMO_TEXT.strip(), file=wf)
    assert [line.strip() for line in loading(src)] == DEMO_TEXT.strip().split("\n")
    with as_io(src) as f:
        assert [line.strip() for line in loading(f)] == DEMO_TEXT.strip().split("\n")
    assert "hello world" == loading(lambda: "hello world")
def get_s_indices(*files, ratio: (str, list) = None, index_file=None,
                  s_indices: (PATH_TYPE, list) = None, shuffle=True, random_state=None):
    if s_indices is not None:
        if isinstance(s_indices, PATH_TYPE):
            s_indices = [set(_s_indices) for _s_indices in json_load(s_indices)]
    elif ratio is not None:
        ratio = _ratio_list(ratio)
        src = zip(*[loading(_file) for _file in files])
        indices = [i for i, _ in enumerate(src)]
        if shuffle:
            indices = shuffle_indices(indices, random_state)
        s_indices = _get_s_indices(indices, ratio)
    else:
        raise ValueError("ratio or s_indices should be specified")
    if index_file is not None:
        with as_out_io(index_file) as wf:
            json.dump([list(_s_indices) for _s_indices in s_indices], wf)
    return s_indices
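# A minimal usage sketch for get_s_indices; the file names, the "8:2" ratio
# string, and this demo wrapper are illustrative assumptions, not part of the
# code above:
def _demo_get_s_indices():
    # split the aligned records of x.jsonl / y.txt roughly 80% / 20%,
    # shuffling with a fixed seed and persisting the chosen indices
    train_idx, test_idx = get_s_indices(
        "x.jsonl", "y.txt",
        ratio="8:2",
        index_file="index.json",
        shuffle=True,
        random_state=10,
    )
    return train_idx, test_idx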
def test_copy(tmpdir):
    src_dir = path_append(tmpdir, "src")
    tar_dir = path_append(tmpdir, "tar")
    src = path_append(src_dir, "src.txt")
    tar = path_append(tar_dir, "tar.txt")
    with as_out_io(src) as wf:
        print("hello world", file=wf)
    config.OVERRIDE = False
    copytree(src_dir, tar_dir)
    copytree(src_dir, tar_dir)
    copyfile(src, tar)
    template_copy(src, tar)
    config.OVERRIDE = True
    copytree(src_dir, tar_dir)
    copyfile(src, tar)
    config.OVERRIDE = None
    with simulate_stdin("y", "y"):
        copytree(src_dir, tar_dir)
        copyfile(src, tar)
    with simulate_stdin("n", "n"):
        copytree(src_dir, tar_dir)
        copyfile(src, tar)
    with simulate_stdin("unk", "y"):
        default_legal_input("", __legal_input={"y"})
def prepare_ranking_file(src, tar, item_num, threshold=None, sampling_num=None,
                         unified_num=False, excluded_files=None):
    user_items = {}
    with as_io(src) as f:
        for line in tqdm(f, "preparing ranking file"):
            user, item, rating = json.loads(line)
            user = int(user)
            item = int(item)
            if user not in user_items:
                user_items[user] = [[], [], []]  # like, unlabeled, dislike
            rating = float(rating)
            if threshold is not None:
                rating = 0 if rating <= threshold else 1
                pos = 0 if rating == 1 else 2
            else:
                pos = 0
            user_items[user][pos].append(item)
    excluded_user_items = defaultdict(set)
    if excluded_files:
        with as_io(excluded_files) as f:
            for line in f:
                user, item, _ = json.loads(line)
                user = int(user)
                item = int(item)
                excluded_user_items[user].add(item)
    for user, items in tqdm(user_items.items(), "sampling"):
        current_items = set(items[0]) | set(items[1]) | set(items[2])
        unlabeled = set(range(item_num)) - current_items - excluded_user_items.get(user, set())
        if sampling_num:
            if unified_num:
                _sampling_num = sampling_num - len(current_items)
            else:
                _sampling_num = sampling_num
            # random.sample requires a sequence, not a set, since Python 3.11
            items[1].extend(random.sample(list(unlabeled), _sampling_num))
        else:
            items[1].extend(list(unlabeled))
    with as_out_io(tar) as wf:
        for user, items in tqdm(user_items.items(), "write to %s" % tar):
            _data = [user] + items
            print(json.dumps(_data), file=wf)
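# A hedged usage sketch for prepare_ranking_file; the file names and parameter
# values are assumptions. src is expected to hold one json triplet per line,
# e.g. [user, item, rating]; the output maps each user to three item lists
# (like, unlabeled, dislike) over an item space of size item_num:
def _demo_prepare_ranking_file():
    prepare_ranking_file(
        "rating.jsonl", "ranking.jsonl",
        item_num=1000,     # items are assumed to be indexed 0 .. item_num - 1
        threshold=3,       # ratings <= 3 become dislike, > 3 become like
        sampling_num=100,  # draw 100 unlabeled items per user
    )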
def jsonl2csv(src: PATH_IO_TYPE, tar: PATH_IO_TYPE = None, delimiter=",", **kwargs):
    """
    Transfer a jsonl file or io stream into a csv file or io stream.

    Parameters
    ----------
    src: PATH_IO_TYPE
        the path to the source file, or an io stream
    tar: PATH_IO_TYPE
        the path to the target file, or an io stream
    delimiter: str
        the delimiter used in csv; commonly used delimiters are "," and " "
    kwargs: dict
        options passed to csv.DictWriter

    Examples
    --------
    Assume such content is written in demo.jsonl:

    .. code-block::

        {'column1': 'hello', 'column2': 'world'}
        {'column1': 'hello', 'column2': 'you'}

    use the following code to convert it

    .. code-block:: python

        jsonl2csv("demo.jsonl", "demo.csv")

    and get

    .. code-block::

        column1,column2
        hello,world
        hello,you
    """
    with as_out_io(tar) as wf:
        csv_writer = None
        for line in tqdm(load_jsonl(src), "jsonl2csv: %s --> %s" % (src, tar)):
            if csv_writer is None:
                csv_writer = csv.DictWriter(wf, line.keys(), delimiter=delimiter, **kwargs)
                csv_writer.writeheader()
            csv_writer.writerow(line)
def test_tl_json(tmpdir):
    tl_file = str(tmpdir / "demo.tl")
    json_file = str(tmpdir / "demo.json")
    with as_out_io(tl_file) as wf:
        print(TL_STR, file=wf, end='')
    tl2json(tl_file, json_file)
    json2tl(json_file, tl_file)
    with open(tl_file) as f:
        assert f.read() == TL_STR
    tl2json(tl_file, json_file, left_shift=True)
def csv2jsonl(src: PATH_IO_TYPE, tar: PATH_IO_TYPE = None, delimiter=",", **kwargs):
    """
    Transfer a csv file or io stream into a jsonl file or io stream.

    Parameters
    ----------
    src: PATH_IO_TYPE
        the path to the source file, or an io stream
    tar: PATH_IO_TYPE
        the path to the target file, or an io stream
    delimiter: str
        the delimiter used in csv; commonly used delimiters are "," and " "
    kwargs: dict
        options passed to load_csv

    Examples
    --------
    Assume such content is written in demo.csv:

    .. code-block::

        column1,column2
        hello,world
        hello,you

    use the following code to convert it

    .. code-block:: python

        csv2jsonl("demo.csv", "demo.jsonl")

    and get

    .. code-block::

        {'column1': 'hello', 'column2': 'world'}
        {'column1': 'hello', 'column2': 'you'}
    """
    with as_out_io(tar) as wf:
        for line in tqdm(load_csv(src, delimiter=delimiter, **kwargs),
                         "csv2jsonl: %s --> %s" % (src, tar)):
            print(json.dumps(line, ensure_ascii=False), file=wf)
def test_graph(shared_data_dir, tmpdir):
    demo_response = [
        [[0, 1], [1, 0], [1, 1], [2, 0]],
        [[0, 0], [0, 0], [0, 1], [2, 0]],
        [[1, 1], [2, 0], [2, 1], [3, 1]],
        [[0, 1], [1, 1], [2, 0], [2, 1]],
        [[2, 0], [1, 0], [0, 1], [1, 1]],
    ]
    tmpfile = path_append(tmpdir, "demo.json", to_str=True)
    with as_out_io(tmpfile) as wf:
        for seq in demo_response:
            print(json.dumps(seq), file=wf)
    dense_graph_path = path_append(tmpdir, "dense_graph.json", to_str=True)
    _dense_graph = dense_graph(4, dense_graph_path)
    assert len(_dense_graph) == 12
    trans_graph = path_append(tmpdir, "transition_graph", to_str=True)
    transition_graph(4, tmpfile, tar=trans_graph)
    ctrans_count_graph = path_append(tmpdir, "correct_transition_count_graph", to_str=True)
    correct_transition_count_graph(4, tmpfile, tar=ctrans_count_graph)
    ctrans_graph = path_append(tmpdir, "correct_transition_graph", to_str=True)
    correct_transition_graph(4, tmpfile, tar=ctrans_graph)
    pcp_graph = path_append(tmpdir, "posterior_correct_probability_graph", to_str=True)
    posterior_correct_probability_graph(4, tmpfile, tar=pcp_graph)
    ctrans_sim = path_append(shared_data_dir, "correct_transition_sim_graph", to_str=True)
    similarity_graph(4, ctrans_graph, ctrans_sim)
def text_to_csv(path):
    with as_out_io(path) as wf:
        print(DEMO_TEXT.strip(), file=wf)
def movielens(src, tar, separator):
    with as_io(src) as f, as_out_io(tar) as wf:
        for line in tqdm(f, "reformatting from %s to %s" % (src, tar)):
            user, item, rating, _ = line.strip().split(separator)
            print(json.dumps([int(user), int(item), int(rating)]), file=wf)
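# A hedged usage sketch for movielens; the paths are assumptions. The classic
# MovieLens-1M ratings.dat separates fields with "::", one record per line,
# e.g. "1::1193::5::978300760" (user::item::rating::timestamp):
def _demo_movielens():
    movielens("ratings.dat", "rating.jsonl", separator="::")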