コード例 #1
0
ファイル: test_split.py プロジェクト: tswsxk/longling
def test_train_test(tmp_path, xy_path, ratio):
    """Check that ``train_test`` writes aligned x/y train files and a test file.

    ``ratio`` is either a tuple, whose first two items are forwarded as
    ``train_size`` and ``test_size``, or any other value, which is forwarded
    unchanged as ``ratio``.
    """
    x_path, y_path = xy_path
    out_prefix = str(tmp_path) + "/"

    if not isinstance(ratio, tuple):
        train_test(x_path, y_path, prefix=out_prefix, ratio=ratio)
    else:
        train_test(x_path,
                   y_path,
                   train_size=ratio[0],
                   test_size=ratio[1],
                   prefix=out_prefix)

    # Every x record must still be paired with its matching y line.
    train_x = loading(tmp_path / "x.train.jsonl")
    train_y = loading(tmp_path / "y.train.txt", src_type="text")
    for record, label in zip(train_x, train_y):
        assert record["z"] == int(label.strip())

    test_records = list(loading(tmp_path / "x.test.jsonl"))
    assert len(test_records) == 2
コード例 #2
0
def get_best(src: (PATH_TYPE, list),
             *keys,
             with_keys: (str, None) = None,
             with_all=False,
             cmp=lambda x, y: x > y,
             merge=True):
    """Find, for each key, the best value among the records loaded from ``src``.

    Parameters
    ----------
    src:
        Source handed to ``loading``; each yielded item is one record.
    keys:
        Keys to track; each is looked up via ``get_by_key(record, parsed_key=key)``.
    with_keys:
        Optional ``";"``-separated string of extra keys whose values are
        captured from the record that produced each best value.
    with_all:
        When true, the whole winning record is captured instead of the
        ``with_keys`` subset.
    cmp:
        ``cmp(candidate, current)`` returns true when the candidate should
        replace the current best. The default keeps the greater value.
    merge:
        When true, return ``_merge(best, appendix)``; otherwise return the
        pair ``(best, appendix)``.

    The appendix is ``None`` unless ``with_all`` or ``with_keys`` is given.
    """
    keys = as_list(keys)
    extra_keys = with_keys.split(";") if with_keys is not None else []

    best = dict.fromkeys(keys)
    best_context = dict.fromkeys(keys)

    for record in loading(src):
        for key, current in best.items():
            candidate = get_by_key(record, parsed_key=key)
            if current is not None and not cmp(candidate, current):
                continue

            best[key] = candidate
            if with_all:
                best_context[key] = record
            elif extra_keys:
                best_context[key] = {
                    extra: get_by_key(record, extra) for extra in extra_keys
                }

    appendix = best_context if (with_all or extra_keys) else None
    if not merge:
        return best, appendix
    return _merge(best, appendix)
コード例 #3
0
ファイル: iterator.py プロジェクト: tswsxk/longling
 def reset(self):
     """Point ``_reset`` at the cache file and wait for any caching thread.

     ``_reset`` becomes a callable that loads ``self.cache_file`` as
     ``"jsonl"``. If a cache thread is set, it is joined and both the thread
     and the queue references are cleared before delegating to the parent
     ``reset``.
     """
     self._reset = lambda: loading(self.cache_file, "jsonl")

     worker = self.cache_thread
     if worker is not None:
         worker.join()
         self.cache_thread = None
         self.cache_queue = None

     super(CacheAsyncLoopIter, self).reset()
コード例 #4
0
def to_board(src, board_dir, global_step_field, *scalar_fields):
    """Write scalar fields of every record in ``src`` to a tensorboardX board.

    Parameters
    ----------
    src:
        Source handed to ``loading``; each yielded item is one record.
    board_dir:
        Directory passed to ``SummaryWriter``.
    global_step_field:
        Key whose value (converted with ``int``) is used as the global step.
    scalar_fields:
        Keys whose values are written as scalars, tagged by the key itself.
    """
    # Imported lazily, with warnings raised during the import captured.
    with warnings.catch_warnings(record=True):
        from tensorboardX import SummaryWriter

    with SummaryWriter(board_dir) as writer:
        banner = "to_board: %s -> %s\n step field: %s, fields: %s" % (
            src, board_dir, global_step_field, scalar_fields)
        with print_time(banner):
            for record in loading(src):
                for field in scalar_fields:
                    value = get_by_key(record, field)
                    step = int(get_by_key(record, global_step_field))
                    writer.add_scalar(tag=field,
                                      scalar_value=value,
                                      global_step=step)
コード例 #5
0
ファイル: etl.py プロジェクト: tswsxk/CangJie
def _encode_field(embeddings, ds, key):
    """Return ``ds[key]`` mapped through ``embeddings[key]``, or ``[]``.

    The empty list is returned when the embedding for ``key`` is missing or
    falsy, or when the record does not contain ``key``.
    """
    if embeddings.get(key) and key in ds:
        return token_to_idx(embeddings[key], ds[key])
    return []


def extract(data_src, embeddings):
    """Load records from ``data_src`` and convert them to index features.

    Parameters
    ----------
    data_src:
        Source handed to ``loading``; each record must provide ``"label"``
        and may provide ``"w"``, ``"rw"``, ``"c"`` and ``"rc"``.
    embeddings:
        Mapping from the same field names to the embedding used by
        ``token_to_idx`` for that field.

    Returns
    -------
    (features, labels)
        ``features`` is the list ``[word, word_radical, char, char_radical]``
        of per-record feature lists; ``labels`` holds the matching labels.

    Records without any word feature are skipped silently. Records whose
    radical feature length differs from its base feature length are skipped
    with a warning.
    """
    word_feature = []
    word_radical_feature = []
    char_feature = []
    char_radical_feature = []
    features = [
        word_feature, word_radical_feature, char_feature, char_radical_feature
    ]
    labels = []
    for ds in tqdm(loading(data_src), "loading data from %s" % data_src):
        label = ds['label']

        w = _encode_field(embeddings, ds, "w")
        rw = _encode_field(embeddings, ds, "rw")
        c = _encode_field(embeddings, ds, "c")
        # Previously this was built from the "rw" embedding and field, so the
        # char-radical feature actually held word radicals.
        rc = _encode_field(embeddings, ds, "rc")

        if len(w) < 1:
            continue

        # Explicit checks instead of ``assert`` so the validation is not
        # stripped when Python runs with ``-O``.
        if rw and len(w) != len(rw):
            warnings.warn("some word miss radical")
            continue
        if rc and len(c) != len(rc):
            warnings.warn("some char miss radical")
            continue

        word_feature.append(w)
        word_radical_feature.append(rw)
        char_feature.append(c)
        char_radical_feature.append(rc)
        labels.append(label)

    return features, labels
コード例 #6
0
ファイル: test_split.py プロジェクト: tswsxk/longling
def test_kfold(tmp_path, xy_path):
    """Check fold 0 of ``kfold`` output, with and without an explicit prefix.

    With ``prefix`` the files are expected under ``tmp_path``; without it
    they are expected next to the respective input files.
    """
    x_path, y_path = xy_path

    def check_first_fold(x_base, y_base):
        # Train x/y must stay aligned, and the test split must hold 2 records.
        pairs = zip(loading(x_base / "x.0.train.jsonl"),
                    loading(y_base / "y.0.train.txt"))
        for record, label in pairs:
            assert record["z"] == int(label.strip())
        assert len(list(loading(x_base / "x.0.test.jsonl"))) == 2

    kfold(x_path, y_path, prefix=str(tmp_path) + "/")
    check_first_fold(tmp_path, tmp_path)

    kfold(x_path, y_path)
    check_first_fold(PurePath(x_path).parent, PurePath(y_path).parent)
コード例 #7
0
ファイル: stroke.py プロジェクト: tswsxk/CangJie
def load_stroke_dict(dict_path) -> dict:
    """Build a mapping from each record's ``"c"`` value to its ``"s"`` value.

    Records are read from ``dict_path`` via ``loading``, wrapped in a
    ``tqdm`` progress bar. Later records overwrite earlier ones that share
    the same ``"c"``.
    """
    records = tqdm(loading(dict_path), "loading from %s" % dict_path)
    return {record["c"]: record["s"] for record in records}
コード例 #8
0
ファイル: features.py プロジェクト: tswsxk/CangJie
def load_dict(dict_path):
    """Build ``{record[CHAR]: {RADICAL: record[RADICAL]}}`` from ``dict_path``.

    Records are read via ``loading``. Later records overwrite earlier ones
    that share the same ``CHAR`` value.
    """
    return {
        record[CHAR]: {RADICAL: record[RADICAL]}
        for record in loading(dict_path)
    }
コード例 #9
0
ファイル: iterator.py プロジェクト: tswsxk/longling
 def src():
     """Return the result of loading ``self.cache_file`` as ``"jsonl"``.

     ``self`` is taken from the enclosing scope, which is not visible here.
     """
     return loading(self.cache_file, "jsonl")