コード例 #1
0
ファイル: JsonFromFiles.py プロジェクト: haoxizhong/plm_scm
    def __init__(self, config, mode, encoding="utf8", *args, **params):
        """Load every line-delimited JSON record for ``mode`` into memory.

        Args:
            config: Parser-like object; ``get``, ``getboolean`` and
                ``getfloat`` are called on its ``"data"`` section.
            mode: Split name used to build the ``"<mode>_data_path"`` and
                ``"<mode>_file_list"`` option names. ``"train"`` additionally
                shuffles the data and is the only mode that may keep
                ``reduce`` enabled.
            encoding: Text encoding used when opening each data file.
            *args: Accepted but not used by this constructor.
            **params: Accepted but not used by this constructor.
        """
        self.config = config
        self.mode = mode
        self.file_list = []
        self.data_path = config.get("data", "%s_data_path" % mode)
        self.encoding = encoding

        # The file list option is comma separated; spaces are stripped out.
        filename_list = config.get("data", "%s_file_list" % mode).replace(
            " ", "").split(",")
        recursive = False

        # dfs_search is defined elsewhere; its result is concatenated as a list.
        for name in filename_list:
            self.file_list = self.file_list + dfs_search(
                os.path.join(self.data_path, name), recursive)
        self.file_list.sort()

        self.data = []
        for filename in self.file_list:
            # Context manager guarantees the handle is closed even when a
            # line fails to parse.
            with open(filename, "r", encoding=encoding) as f:
                for line in f:
                    self.data.append(json.loads(line))

        if mode == "train":
            random.shuffle(self.data)

        # "reduce" is always read from the config, but only honoured in
        # training mode.
        self.reduce = config.getboolean("data", "reduce")
        if mode != "train":
            self.reduce = False
        if self.reduce:
            self.reduce_ratio = config.getfloat("data", "reduce_ratio")
コード例 #2
0
    def __init__(self, config, mode, encoding="utf8", *args, **params):
        """Count the records accepted by ``self.check`` across all data files.

        Args:
            config: Parser-like object; ``get`` and ``getboolean`` are called
                on its ``"data"`` section.
            mode: Split name used to build the ``"<mode>_data_path"`` and
                ``"<mode>_file_list"`` option names.
            encoding: Text encoding used when opening each data file.
            *args: Accepted but not used by this constructor.
            **params: Accepted but not used by this constructor.
        """
        self.config = config
        self.mode = mode
        self.file_list = []
        self.data_path = config.get("data", "%s_data_path" % mode)
        self.encoding = encoding

        # The file list option is comma separated; spaces are stripped out.
        filename_list = config.get("data", "%s_file_list" % mode).replace(
            " ", "").split(",")
        recursive = config.getboolean("data", "recursive")

        # dfs_search is defined elsewhere; its result is concatenated as a list.
        for name in filename_list:
            self.file_list = self.file_list + dfs_search(
                os.path.join(self.data_path, name), recursive)
        self.file_list.sort()

        self.json_format = config.get("data", "json_format")

        self.total = 0

        for filename in self.file_list:
            # One context manager covers both formats so the handle is
            # always released (the "single" branch used to leak it).
            with open(filename, "r", encoding=encoding) as f:
                if self.json_format == "single":
                    # Whole file is one JSON document holding the records.
                    for item in json.load(f):
                        if self.check(item):
                            self.total += 1
                else:
                    # Any other format value: one JSON document per line.
                    for line in f:
                        if self.check(json.loads(line)):
                            self.total += 1

        self.init_zero()
コード例 #3
0
ファイル: JsonFromFiles.py プロジェクト: thunlp/QAJudge
    def __init__(self, config, mode, encoding="utf8", *args, **params):
        """Load all records for ``mode`` into ``self.data`` and filter them.

        Args:
            config: Parser-like object; ``get`` and ``getboolean`` are called
                on its ``"data"`` section.
            mode: Split name used to build the ``"<mode>_data_path"`` and
                ``"<mode>_file_list"`` option names.
            encoding: Text encoding used when opening each data file.
            *args: Accepted but not used by this constructor.
            **params: Accepted but not used by this constructor.
        """
        self.config = config
        self.mode = mode
        self.file_list = []
        self.data_path = config.get("data", "%s_data_path" % mode)
        self.encoding = encoding

        # The file list option is comma separated; spaces are stripped out.
        filename_list = config.get("data", "%s_file_list" % mode).replace(
            " ", "").split(",")
        recursive = config.getboolean("data", "recursive")

        # dfs_search is defined elsewhere; its result is concatenated as a list.
        for name in filename_list:
            self.file_list = self.file_list + dfs_search(
                os.path.join(self.data_path, name), recursive)
        self.file_list.sort()

        self.load_mem = config.getboolean("data", "load_into_mem")
        self.json_format = config.get("data", "json_format")

        self.data = []
        for filename in self.file_list:
            # Context manager closes the handle in both branches; previously
            # neither branch closed it.
            with open(filename, "r", encoding=encoding) as f:
                if self.json_format == "single":
                    # List concatenation is kept so a non-list top-level
                    # document still raises TypeError as before.
                    self.data = self.data + json.load(f)
                else:
                    for line in f:
                        self.data.append(json.loads(line))

        self.filter_data()
        print(len(self.data))
コード例 #4
0
    def __init__(self, config, mode, encoding="utf8", *args, **params):
        """Prepare the dataset either fully in memory or for streamed access.

        When ``load_into_mem`` is true every record is appended to
        ``self.data``. Otherwise only per-file record counts are collected
        (as a running prefix sum in ``self.prefix_file_cnt``) and
        ``self.total`` is set to the overall record count.

        Args:
            config: Parser-like object; ``get`` and ``getboolean`` are called
                on its ``"data"`` section.
            mode: Split name used to build the ``"<mode>_data_path"`` and
                ``"<mode>_file_list"`` option names.
            encoding: Text encoding used when opening each data file.
            *args: Accepted but not used by this constructor.
            **params: Accepted but not used by this constructor.

        Raises:
            IndexError: In the non-memory path when no data file was found
                (unchanged from the previous behaviour).
        """
        self.config = config
        self.mode = mode
        self.file_list = []
        self.data_path = config.get("data", "%s_data_path" % mode)
        self.encoding = encoding

        # The file list option is comma separated; spaces are stripped out.
        filename_list = config.get("data", "%s_file_list" % mode).replace(" ", "").split(",")
        recursive = config.getboolean("data", "recursive")

        # dfs_search is defined elsewhere; its result is concatenated as a list.
        for name in filename_list:
            self.file_list = self.file_list + dfs_search(os.path.join(self.data_path, name), recursive)
        self.file_list.sort()

        self.load_mem = config.getboolean("data", "load_into_mem")
        self.json_format = config.get("data", "json_format")

        if self.load_mem:
            self.data = []
            for filename in self.file_list:
                # Handle is closed as soon as the file has been consumed.
                with open(filename, "r", encoding=encoding) as f:
                    if self.json_format == "single":
                        self.data = self.data + json.load(f)
                    else:
                        for line in f:
                            self.data.append(json.loads(line))

        else:
            self.total = 0
            self.prefix_file_cnt = []

            if self.json_format == "single":
                # Keep the first file's contents loaded, tagged with its index.
                with open(self.file_list[0], "r", encoding=encoding) as f:
                    first_file_data = json.load(f)
                self.temp_data = {
                    "data": first_file_data,
                    "file_id": 0
                }
            else:
                self.temp_file_list = []

            for filename in self.file_list:
                if self.json_format == "single":
                    with open(filename, "r", encoding=encoding) as f:
                        data = json.load(f)
                    self.prefix_file_cnt.append(len(data))
                else:
                    # First pass only counts lines; this handle is closed.
                    with open(filename, "r", encoding=encoding) as f:
                        cnt = sum(1 for _ in f)
                    # A second handle is deliberately left open and stored
                    # on the instance together with a zeroed counter.
                    self.temp_file_list.append({
                        "file": open(filename, "r", encoding=encoding),
                        "cnt": 0
                    })
                    self.prefix_file_cnt.append(cnt)

            # Turn the per-file counts into a running (prefix) sum in place.
            for a in range(1, len(self.prefix_file_cnt)):
                self.prefix_file_cnt[a] += self.prefix_file_cnt[a - 1]
            self.total = self.prefix_file_cnt[-1]
コード例 #5
0
    def __init__(self, config, mode, *args, **params):
        """Collect and sort the list of files configured for ``mode``.

        Args:
            config: Parser-like object; ``get`` and ``getboolean`` are called
                on its ``"data"`` section.
            mode: Split name used to build the ``"<mode>_data_path"`` and
                ``"<mode>_file_list"`` option names.
            *args: Accepted but not used by this constructor.
            **params: Accepted but not used by this constructor.
        """
        self.config = config
        self.mode = mode
        self.file_list = []
        self.data_path = config.get("data", "%s_data_path" % mode)

        # Comma-separated option with all spaces removed before splitting.
        raw_entries = config.get("data", "%s_file_list" % mode)
        entries = raw_entries.replace(" ", "").split(",")
        search_subdirs = config.getboolean("data", "recursive")

        # Each entry is handed to dfs_search as-is (data_path is not joined
        # in here), and the results are concatenated in order.
        collected = self.file_list
        for entry in entries:
            collected = collected + dfs_search(entry, search_subdirs)
        self.file_list = collected
        self.file_list.sort()
コード例 #6
0
    def __init__(self, config, mode, encoding="utf8", *args, **params):
        """Load every JSON document found for ``mode`` into ``self.data``.

        Each file is parsed as one JSON document whose contents are
        concatenated onto ``self.data``.

        Args:
            config: Parser-like object; ``get`` is called on its ``"data"``
                section.
            mode: Split name used to build the ``"<mode>_data_path"`` and
                ``"<mode>_file_list"`` option names.
            encoding: Text encoding used when opening each data file.
            *args: Accepted but not used by this constructor.
            **params: Accepted but not used by this constructor.
        """
        self.config = config
        self.mode = mode
        self.file_list = []
        self.data_path = config.get("data", "%s_data_path" % mode)
        self.encoding = encoding

        # The file list option is comma separated; spaces are stripped out.
        filename_list = config.get("data", "%s_file_list" % mode).replace(" ", "").split(",")

        # dfs_search is always called with True as its second argument here.
        for name in filename_list:
            self.file_list = self.file_list + dfs_search(os.path.join(self.data_path, name), True)
        self.file_list.sort()

        self.data = []
        for filename in self.file_list:
            # Close each file right after parsing instead of leaking the handle.
            with open(filename, "r", encoding=encoding) as f:
                self.data = self.data + json.load(f)
コード例 #7
0
ファイル: JsonFromFiles.py プロジェクト: awesome-archive/CAIL
    def __init__(self, config, mode, encoding="utf8", *args, **params):
        """Load line-delimited JSON records for ``mode`` into ``self.data``.

        Outside test mode, every record's ``"answer"`` list is cleaned of
        ``"。"`` entries before the record is stored.

        Args:
            config: Parser-like object; ``get``, ``getboolean`` and
                ``getfloat`` are called on its ``"data"`` section.
            mode: Split name. ``"test"`` reads from ``/input/`` instead of
                the configured file list; ``"train"`` shuffles the data and
                is the only mode that may keep ``reduce`` enabled.
            encoding: Text encoding used when opening each data file.
            *args: Accepted but not used by this constructor.
            **params: Accepted but not used by this constructor.

        Raises:
            KeyError: Outside test mode, if a record has no ``"answer"`` key.
        """
        self.config = config
        self.mode = mode
        self.file_list = []
        self.data_path = config.get("data", "%s_data_path" % mode)
        self.encoding = encoding
        # self.siglemulti = SingleMulti('gbt/statement_tfidf.model', 'gbt/statement_som_gbt.model')

        if mode != "test":
            filename_list = config.get("data", "%s_file_list" % mode).replace(
                " ", "").split(",")
        else:
            # Must be a list: iterating the bare string would feed its
            # individual characters ("/", "i", "n", ...) to dfs_search.
            filename_list = ["/input/"]

        recursive = False

        # dfs_search is defined elsewhere; its result is concatenated as a list.
        for name in filename_list:
            self.file_list = self.file_list + dfs_search(
                os.path.join(self.data_path, name), recursive)
        self.file_list.sort()

        self.data = []
        for filename in self.file_list:
            # Context manager closes the handle even if a line fails to parse.
            with open(filename, "r", encoding=encoding) as f:
                for line in f:
                    data = json.loads(line)
                    if mode != "test":
                        # Clean up answers: drop stray "。" entries.
                        data["answer"] = [
                            a for a in data["answer"] if a != "。"]
                    # Store the parsed (and, outside test mode, cleaned)
                    # record; re-parsing the line here would discard the
                    # cleaning above.
                    self.data.append(data)

        if mode == "train":
            random.shuffle(self.data)

        # "reduce" is always read from the config, but only honoured in
        # training mode.
        self.reduce = config.getboolean("data", "reduce")
        if mode != "train":
            self.reduce = False
        if self.reduce:
            self.reduce_ratio = config.getfloat("data", "reduce_ratio")
コード例 #8
0
ファイル: preprocess.py プロジェクト: yueyedeai/CAIL
import json
import os
import pandas

from tools.dataset_tool import dfs_search

data_path = "../input/"
recursive = False
# Keep only the discovered paths whose name contains 'train', sorted.
file_list = []
file_list = file_list + dfs_search(os.path.join(data_path, ''), recursive)
file_list = [file for file in file_list if 'train' in file]
file_list.sort()

# Parse every line of every training file as one JSON record.
rawinput = []
for filename in file_list:
    # Context manager closes the handle even if a line fails to parse.
    with open(filename, "r", encoding='utf8') as f:
        for line in f:
            data = json.loads(line)
            # Clean up answers: drop stray "。" entries.
            data["answer"] = [a for a in data["answer"] if a != "。"]
            # Append the cleaned record; re-parsing the line here would
            # throw the cleaning away.
            rawinput.append(data)

# Build one row per (record, option) pair: "q" is the statement, "a" the
# option text and "r" whether that option letter is among the answers.
# Rows are collected first and the frame is built once: DataFrame.append
# copied the whole frame on every call and no longer exists in pandas 2.x.
# NOTE(review): column "r" may now get bool dtype instead of object — confirm
# nothing downstream relies on the object dtype.
rows = []
for item in rawinput:
    for option in "ABCD":
        rows.append({
            "q": item['statement'],
            "a": item['option_list'][option],
            "r": option in item['answer'],
        })
df = pandas.DataFrame(rows, columns=["q", "a", "r"])