Code example #1
File: test_split.py (project: tswsxk/longling)
def xy_path(tmp_path_factory):
    """Create paired sample files (x.jsonl / y.txt) in a temp dir and return both paths."""
    data_dir = tmp_path_factory.mktemp("data")
    x_file = data_dir / "x.jsonl"
    y_file = data_dir / "y.txt"

    with as_out_io(x_file) as x_out, as_out_io(y_file) as y_out:
        for idx in range(10):
            # each x record: a 5-long integer window starting at idx, plus its label z
            record = {"x": list(range(idx, idx + 5)), "z": idx}
            x_out.write(json.dumps(record) + "\n")
            y_out.write("%s\n" % idx)

    return x_file, y_file
Code example #2
def test_loading(tmpdir):
    """Round-trip csv <-> jsonl conversions, then check loading() on paths, streams and callables."""
    csv_src = path_append(tmpdir, "test.csv")
    json_src = path_append(tmpdir, "test.json")

    text_to_csv(csv_src)
    csv2jsonl(csv_src, json_src)
    jsonl2csv(json_src, csv_src)

    expected_names = {0: "Tom", 1: "Jerry"}
    for src in (csv_src, json_src, load_jsonl(json_src)):
        for idx, record in enumerate(loading(src)):
            assert int(record["id"]) == idx, record
            if idx in expected_names:
                assert record["name"] == expected_names[idx], record

    # loading() on a plain text file yields its lines
    src = path_append(tmpdir, "test")
    with as_out_io(src) as wf:
        print(DEMO_TEXT.strip(), file=wf)

    expected_lines = DEMO_TEXT.strip().split("\n")
    assert [ln.strip() for ln in loading(src)] == expected_lines
    # loading() also accepts an already-open stream
    with as_io(src) as f:
        assert [ln.strip() for ln in loading(f)] == expected_lines
    # and a zero-argument callable, whose return value is passed through
    assert loading(lambda: "hello world") == "hello world"
Code example #3
File: split.py (project: tswsxk/longling)
def get_s_indices(*files,
                  ratio: (str, list) = None,
                  index_file=None,
                  s_indices: (PATH_TYPE, list) = None,
                  shuffle=True,
                  random_state=None):
    """Resolve per-split index sets, either from an explicit spec or by splitting by ratio.

    An explicit ``s_indices`` (a path to a JSON file, or a list of index
    collections) takes precedence; otherwise ``ratio`` drives the split of the
    row indices of ``files``. Raises ValueError when neither is given. When
    ``index_file`` is set, the resolved split is also dumped there as JSON.
    """
    if s_indices is not None:
        # explicit spec wins; load from disk when a path is given
        if isinstance(s_indices, PATH_TYPE):
            s_indices = [set(chunk) for chunk in json_load(s_indices)]
    elif ratio is not None:
        ratio = _ratio_list(ratio)

        # zip the sources in lockstep so the row count is the shortest file's
        rows = zip(*(loading(one_file) for one_file in files))
        indices = [pos for pos, _ in enumerate(rows)]

        if shuffle:
            indices = shuffle_indices(indices, random_state)

        s_indices = _get_s_indices(indices, ratio)
    else:
        raise ValueError("ratio or s_indices should be specified")

    if index_file is not None:
        # persist as lists: sets are not JSON serializable
        with as_out_io(index_file) as wf:
            json.dump([list(chunk) for chunk in s_indices], wf)

    return s_indices
Code example #4
def test_copy(tmpdir):
    """Exercise the copy helpers under each OVERRIDE policy (never / always / interactive)."""
    src_dir = path_append(tmpdir, "src")
    tar_dir = path_append(tmpdir, "tar")

    src = path_append(src_dir, "src.txt")
    tar = path_append(tar_dir, "tar.txt")

    with as_out_io(src) as wf:
        print("hello world", file=wf)

    # OVERRIDE = False: repeated copies must not clobber the target
    config.OVERRIDE = False
    copytree(src_dir, tar_dir)
    copytree(src_dir, tar_dir)
    copyfile(src, tar)
    template_copy(src, tar)

    # OVERRIDE = True: copies always proceed
    config.OVERRIDE = True
    copytree(src_dir, tar_dir)
    copyfile(src, tar)

    # OVERRIDE = None: the answer comes from stdin
    config.OVERRIDE = None
    for answers in (("y", "y"), ("n", "n")):
        with simulate_stdin(*answers):
            copytree(src_dir, tar_dir)
            copyfile(src, tar)

    # an illegal first answer should trigger a re-prompt
    with simulate_stdin("unk", "y"):
        default_legal_input("", __legal_input={"y"})
Code example #5
File: prepare_ranking.py (project: tswsxk/xrec)
def prepare_ranking_file(src,
                         tar,
                         item_num,
                         threshold=None,
                         sampling_num=None,
                         unified_num=False,
                         excluded_files=None):
    """Build a per-user ranking file from a jsonl file of ``[user, item, rating]`` triplets.

    Parameters
    ----------
    src:
        source file/io stream; one json line ``[user, item, rating]`` per interaction
    tar:
        target file/io stream; one json line ``[user, likes, unlabeled, dislikes]`` per user
    item_num: int
        total number of items; unlabeled candidates come from ``range(item_num)``
    threshold: float or None
        when given, ratings <= threshold count as dislikes and the rest as likes;
        when None every interaction counts as a like
    sampling_num: int or None
        number of unlabeled items to sample per user; when falsy, all unlabeled
        items are kept
    unified_num: bool
        when True, sample ``sampling_num - len(labeled)`` items so each user ends
        up with roughly the same total (clamped at 0 if a user already exceeds it)
    excluded_files:
        optional triplet file(s) whose (user, item) pairs must never appear among
        the sampled unlabeled items
    """
    user_items = {}
    with as_io(src) as f:
        for line in tqdm(f, "preparing ranking file"):
            user, item, rating = json.loads(line)
            user = int(user)
            item = int(item)
            if user not in user_items:
                user_items[user] = [[], [], []]  # like, unlabeled, dislike
            if threshold is not None:
                # binarize: above threshold -> like (slot 0), otherwise dislike (slot 2)
                pos = 2 if float(rating) <= threshold else 0
            else:
                pos = 0

            user_items[user][pos].append(item)

    excluded_user_items = defaultdict(set)
    if excluded_files:
        with as_io(excluded_files) as f:
            for line in f:
                user, item, _ = json.loads(line)
                excluded_user_items[int(user)].add(int(item))

    for user, items in tqdm(user_items.items(), "sampling"):
        labeled = set(items[0]) | set(items[2]) | set(items[1])
        # .get (not []) avoids inserting empty sets into the defaultdict
        unlabeled = set(
            range(item_num)) - labeled - excluded_user_items.get(user, set())
        if sampling_num:
            if unified_num:
                # clamp: a user may already hold more labeled items than sampling_num,
                # which would otherwise make random.sample raise ValueError
                _sampling_num = max(0, sampling_num - len(labeled))
            else:
                _sampling_num = sampling_num
            # random.sample requires a sequence (a set raises TypeError on Python >= 3.11);
            # sorting also makes sampling reproducible under a fixed random seed.
            # Cap at the population size so oversampling does not raise.
            _sampling_num = min(_sampling_num, len(unlabeled))
            items[1].extend(random.sample(sorted(unlabeled), _sampling_num))
        else:
            items[1].extend(unlabeled)

    with as_out_io(tar) as wf:
        for user, items in tqdm(user_items.items(), "write to %s" % tar):
            print(json.dumps([user] + items), file=wf)
Code example #6
File: loading.py (project: tswsxk/longling)
def jsonl2csv(src: PATH_IO_TYPE, tar: PATH_IO_TYPE = None, delimiter=",", **kwargs):
    """
    Transfer a jsonl file or io stream into a csv file or io stream.

    The csv header is taken from the keys of the first json line; each
    subsequent line is written as one csv row.

    Parameters
    ----------
    src: PATH_IO_TYPE
        the path to the source file, or an io stream
    tar: PATH_IO_TYPE
        the path to the target file, or an io stream
    delimiter: str
        the delimiter used in csv; commonly used delimiters are "," and " "
    kwargs: dict
        options passed to csv.DictWriter

    Examples
    --------
    Assume such content is written in demo.jsonl:

    .. code-block::

        {"column1": "hello", "column2": "world"}
        {"column1": "hello", "column2": "you"}

    use the following code to convert it

    .. code-block:: python

        jsonl2csv("demo.jsonl", "demo.csv")

    and get

    .. code-block::

        column1,column2
        hello,world
        hello,you
    """
    with as_out_io(tar) as wf:
        csv_writer = None
        for line in tqdm(load_jsonl(src), "json2csv: %s --> %s" % (src, tar)):
            if csv_writer is None:
                # lazily create the writer so the header comes from the first record
                csv_writer = csv.DictWriter(wf, line.keys(), delimiter=delimiter, **kwargs)
                csv_writer.writeheader()
            csv_writer.writerow(line)
Code example #7
File: test_format.py (project: yhdzx123/EduData)
def test_tl_json(tmpdir):
    """tl -> json -> tl must round-trip byte-identically."""
    tl_file = str(tmpdir / "demo.tl")
    json_file = str(tmpdir / "demo.json")

    with as_out_io(tl_file) as wf:
        wf.write(TL_STR)

    tl2json(tl_file, json_file)
    json2tl(json_file, tl_file)

    with open(tl_file) as f:
        round_tripped = f.read()
    assert round_tripped == TL_STR

    # also exercise the left_shift option
    tl2json(tl_file, json_file, left_shift=True)
Code example #8
File: loading.py (project: tswsxk/longling)
def csv2jsonl(src: PATH_IO_TYPE, tar: PATH_IO_TYPE = None, delimiter=",", **kwargs):
    """
    Transfer a csv file or io stream into a jsonl file or io stream.

    Each csv row becomes one json line keyed by the csv header columns.

    Parameters
    ----------
    src: PATH_IO_TYPE
        the path to the source file, or an io stream
    tar: PATH_IO_TYPE
        the path to the target file, or an io stream
    delimiter: str
        the delimiter used in csv; commonly used delimiters are "," and " "
    kwargs: dict
        options passed through to load_csv

    Examples
    --------
    Assume such content is written in demo.csv:

    .. code-block::

        column1,column2
        hello,world
        hello,you

    use the following code to convert it

    .. code-block:: python

        csv2jsonl("demo.csv", "demo.jsonl")

    and get

    .. code-block::

        {"column1": "hello", "column2": "world"}
        {"column1": "hello", "column2": "you"}
    """
    with as_out_io(tar) as wf:
        for line in tqdm(load_csv(src, delimiter=delimiter, **kwargs), "csv2json: %s --> %s" % (src, tar)):
            # ensure_ascii=False keeps non-ASCII text readable in the output
            print(json.dumps(line, ensure_ascii=False), file=wf)
Code example #9
def test_graph(shared_data_dir, tmpdir):
    """Build every graph variant from a small demo response log of [knowledge, correct] pairs."""
    demo_response = [
        [[0, 1], [1, 0], [1, 1], [2, 0]],
        [[0, 0], [0, 0], [0, 1], [2, 0]],
        [[1, 1], [2, 0], [2, 1], [3, 1]],
        [[0, 1], [1, 1], [2, 0], [2, 1]],
        [[2, 0], [1, 0], [0, 1], [1, 1]],
    ]
    ku_num = 4

    # one json-encoded sequence per line
    tmpfile = path_append(tmpdir, "demo.json", to_str=True)
    with as_out_io(tmpfile) as wf:
        wf.write("\n".join(json.dumps(seq) for seq in demo_response) + "\n")

    dense_graph_path = path_append(tmpdir, "dense_graph.json", to_str=True)
    # a dense graph on 4 nodes has 4 * 3 = 12 directed edges
    assert len(dense_graph(ku_num, dense_graph_path)) == 12

    transition_graph(
        ku_num, tmpfile,
        tar=path_append(tmpdir, "transition_graph", to_str=True))

    correct_transition_count_graph(
        ku_num, tmpfile,
        tar=path_append(tmpdir, "correct_transition_count_graph", to_str=True))

    ctrans_graph = path_append(tmpdir, "correct_transition_graph", to_str=True)
    correct_transition_graph(ku_num, tmpfile, tar=ctrans_graph)

    posterior_correct_probability_graph(
        ku_num, tmpfile,
        tar=path_append(tmpdir, "posterior_correct_probability_graph", to_str=True))

    ctrans_sim = path_append(shared_data_dir,
                             "correct_transition_sim_graph",
                             to_str=True)
    similarity_graph(ku_num, ctrans_graph, ctrans_sim)
Code example #10
def text_to_csv(path):
    """Write the stripped demo text (DEMO_TEXT) to *path*, followed by a trailing newline."""
    with as_out_io(path) as out:
        out.write(DEMO_TEXT.strip() + "\n")
Code example #11
File: movielens.py (project: tswsxk/xrec)
def movielens(src, tar, separator):
    """Reformat a movielens-style rating file into json triplet lines ``[user, item, rating]``."""
    with as_io(src) as fin, as_out_io(tar) as fout:
        for record in tqdm(fin, "reformatting from %s to %s" % (src, tar)):
            # each input line: user<sep>item<sep>rating<sep>timestamp (timestamp dropped)
            user, item, rating, _ = record.strip().split(separator)
            # NOTE(review): int(rating) assumes whole-number ratings; half-star
            # datasets (e.g. ml-20m's 3.5) would raise ValueError — confirm the data source
            triplet = [int(user), int(item), int(rating)]
            print(json.dumps(triplet), file=fout)