Example #1
0
def prepare_ranking_file(src,
                         tar,
                         item_num,
                         threshold=None,
                         sampling_num=None,
                         unified_num=False,
                         excluded_files=None):
    """
    Group (user, item, rating) records by user and add unlabeled items.

    Every line of ``src`` is decoded with ``json.loads`` into a
    ``[user, item, rating]`` triple. Items are collected per user into three
    lists ``[like, unlabeled, dislike]``; the ``unlabeled`` list is then
    filled with items from ``range(item_num)`` that the user has not
    interacted with. One JSON line ``[user, like, unlabeled, dislike]`` is
    written to ``tar`` per user.

    Parameters
    ----------
    src:
        Source opened with ``as_io``; one JSON triple per line.
    tar:
        Target opened with ``as_out_io``.
    item_num: int
        Size of the candidate pool; candidates are ``range(item_num)``.
    threshold: float or None
        When given, an item whose rating is ``<= threshold`` goes to
        ``dislike`` and any other item goes to ``like``. When ``None``,
        every item goes to ``like``.
    sampling_num: int or None
        Number of unlabeled items to draw per user. When falsy, all
        remaining candidates are kept.
    unified_num: bool
        When true, ``sampling_num`` is the total list size per user, so only
        ``sampling_num - len(already seen items)`` unlabeled items are drawn.
    excluded_files:
        Optional source in the same format as ``src``; its (user, item)
        pairs are never offered as unlabeled candidates.

    Notes
    -----
    The number of sampled items is clamped to ``[0, len(candidates)]`` so a
    user with few remaining candidates (or more seen items than
    ``sampling_num``) no longer aborts the whole run with ``ValueError``.
    """
    user_items = {}
    with as_io(src) as f:
        for line in tqdm(f, "preparing ranking file"):
            user, item, rating = json.loads(line)
            user = int(user)
            item = int(item)
            if user not in user_items:
                user_items[user] = [[], [], []]  # like, unlabeled, dislike
            rating = float(rating)
            if threshold is not None and rating <= threshold:
                pos = 2
            else:
                pos = 0

            user_items[user][pos].append(item)

    excluded_user_items = defaultdict(set)
    if excluded_files:
        with as_io(excluded_files) as f:
            for line in f:
                user, item, _ = json.loads(line)
                excluded_user_items[int(user)].add(int(item))

    # The candidate pool is the same for every user: build it once.
    all_items = set(range(item_num))
    no_exclusion = set()

    for user, items in tqdm(user_items.items(), "sampling"):
        current_items = set(items[0]) | set(items[2]) | set(items[1])
        # .get() avoids inserting empty entries into the defaultdict.
        unlabeled = (all_items - current_items
                     - excluded_user_items.get(user, no_exclusion))
        if sampling_num:
            if unified_num:
                _sampling_num = sampling_num - len(current_items)
            else:
                _sampling_num = sampling_num
            _sampling_num = max(0, min(_sampling_num, len(unlabeled)))
            # random.sample() rejects sets since Python 3.11; a sorted list
            # is a valid population and keeps seeded runs reproducible.
            items[1].extend(random.sample(sorted(unlabeled), _sampling_num))
        else:
            items[1].extend(unlabeled)

    with as_out_io(tar) as wf:
        for user, items in tqdm(user_items.items(), "write to %s" % tar):
            _data = [user] + items
            print(json.dumps(_data), file=wf)
Example #2
0
def load_jsonl(src: PATH_IO_TYPE):
    """
    Lazily read a jsonl source, yielding one decoded object per line.

    Examples
    --------

    Suppose demo.jsonl contains:

    .. code-block::

        {"a": 1}
        {"a": 2}


    Iterating over the loader:

    .. code-block:: python

        for line in load_jsonl('demo.jsonl'):
            print(line)


    prints the decoded objects:

    .. code-block::

        {"a": 1}
        {"a": 2}
    """
    with as_io(src) as f:
        # The generator keeps the source open until it is exhausted.
        yield from (json.loads(raw) for raw in f)
Example #3
0
def load_csv(src: PATH_IO_TYPE, delimiter=",", **kwargs):
    """
    Lazily read a csv source, yielding one dict per data row.

    The first row is used as the header; each following row is yielded as
    a mapping from header name to cell value. Values are not converted:
    the ``csv`` module returns every cell as a string.

    Parameters
    ----------
    src:
        Source opened with ``as_io``.
    delimiter: str
        Field delimiter forwarded to ``csv.DictReader``.
    kwargs:
        Extra formatting parameters forwarded to ``csv.DictReader``.

    Examples
    --------

    Assume the following content is written in demo.csv:

    .. code-block::

        a,b,c
        1,2,3
        2,4,6


    .. code-block:: python

        for line in load_csv('demo.csv'):
            print(line)


    .. code-block::

        {'a': '1', 'b': '2', 'c': '3'}
        {'a': '2', 'b': '4', 'c': '6'}
    """
    with as_io(src) as f:
        # Let DictReader consume the header row itself: unlike a manual
        # f.readline(), the csv parser copes with quoted header fields that
        # span several physical lines and skips leading blank rows.
        yield from csv.DictReader(f, delimiter=delimiter, **kwargs)
Example #4
0
def load_file(src: PATH_IO_TYPE):
    """
    Lazily yield the raw lines of a text source.

    Lines are yielded exactly as the underlying file object produces them.

    Examples
    --------
    Suppose demo.txt contains:

    .. code-block::

        hello
        world

    Reading it with:

    .. code-block:: python

        for line in load_file('demo.txt'):
            print(line, end="")

    prints:

    .. code-block::

        hello
        world
    """
    with as_io(src) as f:
        yield from f
Example #5
0
File: etl.py Project: tswsxk/xrec
def extract_eval(src):
    """
    Load evaluation records from a json-lines source.

    Each line must decode to exactly four values
    ``user, like, unlabeled, dislike``; they are returned as a list of
    four-element lists in file order.
    """

    def _parse(raw):
        # The unpacking enforces the four-field layout of every record.
        user, like, unlabeled, dislike = json.loads(raw)
        return [user, like, unlabeled, dislike]

    with as_io(src) as f:
        return [_parse(raw) for raw in f]
Example #6
0
def test_loading(tmpdir):
    """
    Check ``loading`` against several kinds of sources.

    The sources are a csv path, a json-lines path, a ``load_jsonl``
    generator, a plain text path, an open file object and a callable.
    """
    csv_path = path_append(tmpdir, "test.csv")
    jsonl_path = path_append(tmpdir, "test.json")

    # Create the csv, then convert csv -> jsonl -> csv.
    text_to_csv(csv_path)
    csv2jsonl(csv_path, jsonl_path)
    jsonl2csv(jsonl_path, csv_path)

    expected_names = {0: "Tom", 1: "Jerry"}
    for source in (csv_path, jsonl_path, load_jsonl(jsonl_path)):
        for index, record in enumerate(loading(source)):
            assert int(record["id"]) == index, record
            if index in expected_names:
                assert record["name"] == expected_names[index], record

    text_path = path_append(tmpdir, "test")
    with as_out_io(text_path) as wf:
        print(DEMO_TEXT.strip(), file=wf)

    expected_lines = DEMO_TEXT.strip().split("\n")
    assert [row.strip() for row in loading(text_path)] == expected_lines
    with as_io(text_path) as f:
        assert [row.strip() for row in loading(f)] == expected_lines
    assert "hello world" == loading(lambda: "hello world")
Example #7
0
File: etl.py Project: tswsxk/xrec
def extract(data_src):
    """
    Read (user, item, rating) triples and binarise the rating.

    Each line of ``data_src`` is decoded with ``json.loads`` into
    ``user_id, item_id, rating``. The result is a list of
    ``[int(user_id), int(item_id), label]`` where ``label`` is ``0`` when
    ``int(rating) <= 3`` and ``1`` otherwise.
    """
    triples = []
    with as_io(data_src) as f:
        for raw in tqdm(f, "extracting file"):
            user_id, item_id, rating = json.loads(raw)
            user = int(user_id)
            item = int(item_id)
            label = 1 if int(rating) > 3 else 0
            triples.append([user, item, label])
    return triples
Example #8
0
def load_ks_from_csv(edges):
    """Lazily yield each comma-separated row of ``edges`` as a list of strings."""
    with as_io(edges) as f:
        rows = csv.reader(f, delimiter=",")
        yield from rows
Example #9
0
def movielens(src, tar, separator):
    """
    Convert four-field delimited rating lines into json lines.

    Every line of ``src`` is stripped and split on ``separator`` into
    exactly four fields; the first three are converted to ``int`` and
    written to ``tar`` as a JSON list ``[user, item, rating]``. The fourth
    field is discarded.
    """
    progress_title = f"reformatting from {src!s} to {tar!s}"
    with as_io(src) as f, as_out_io(tar) as wf:
        for raw in tqdm(f, progress_title):
            fields = raw.strip().split(separator)
            user, item, rating, _ = fields
            record = [int(user), int(item), int(rating)]
            print(json.dumps(record), file=wf)
Example #10
0
 def iter_from_file():
     """Lazily yield one decoded JSON object per line of ``filename``."""
     with as_io(filename) as f:
         yield from (json.loads(raw) for raw in f)