Example #1
    def _from_json(self):
        # Paths to the vocabulary and dataset serialized during preprocessing.
        vocab_file = "qna_data/{}_vocab.json".format(self.method)
        dataset_file = "qna_data/{}_dataset.json".format(self.method)

        self.vocab = VocabEntry.from_json(vocab_file)

        dataset_json = read_json_data(dataset_file)

        # Attach each stored column listed in self.train_keys as an attribute.
        for key in self.train_keys:
            setattr(self, key, dataset_json[key])

        # Convert the freshly loaded lists to NumPy arrays.
        self._to_numpy()
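Both examples lean on small I/O helpers (read_json_data, write_json_data, create_folder) that are not shown. A minimal sketch of what they might look like, assuming plain UTF-8 JSON files on disk; the implementations are inferred from the call sites, not taken from the source:

import json
import os


def read_json_data(file_path):
    # Load and return the parsed contents of a JSON file.
    with open(file_path, encoding="utf-8") as f:
        return json.load(f)


def write_json_data(file_path, data):
    # Serialize data to file_path as JSON, keeping non-ASCII text readable.
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False)


def create_folder(folder_path):
    # Create the folder (and any parents) if it does not already exist.
    os.makedirs(folder_path, exist_ok=True)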
Example #2
    def preprocess_qna_data(
        self, method, cased, dataset_types,
    ):
        folder_name = "{}_{}".format(method, cased)
        folder_path = "qna_data/pre_data/vi_{}".format(folder_name)
        create_folder(folder_path)

        # preprocess fields
        dataset_features_columns = {}
        for dataset_type in dataset_types:
            data_file = "qna_data/vi_{}.json".format(dataset_type)

            # Initialize feature columns; only needed when building training
            # data, and every later use is guarded by the same check.
            if self.for_train:
                features_columns = {
                    "id": [],
                    "question": [],
                    "text": [],
                    "label": [],
                    "pid": [],
                }

            json_samples = read_json_data(data_file)

            for json_sample in json_samples:

                if self.for_train:
                    features_columns["id"].append(json_sample["id"])
                    features_columns["label"].append(1 if json_sample["label"] else 0)
                    features_columns["pid"].append(json_sample["pid"])

                for key in ["question", "text"]:
                    pre_key = "{}_{}_{}".format(method, cased, key)
                    pre_text, tokens = self.pre_process_text(
                        json_sample[key], method, cased, self.for_train, key
                    )
                    json_sample[pre_key] = pre_text

                    if self.for_train:
                        features_columns[key].append(tokens)

            # samples with preprocessed keys
            write_json_data(data_file, json_samples)
            print("{}. Length {}. Done writing to file {}".format(
                dataset_type, len(json_samples), data_file
            ))

            # save for writing later when we have vocab
            if self.for_train:
                dataset_features_columns[dataset_type] = features_columns

        # build vocab
        vocab_file = "{}/vocab.json".format(folder_path)
        if self.build_vocab:
            self._build_vocab(vocab_file, method, cased)
        else:
            self.vocab = VocabEntry.from_json(vocab_file)

        # write configs
        configs = {
            "vocab_size": len(self.vocab),
            "question_size": self.question_size,
            "text_size": self.text_size,
        }
        configs_file = "{}/configs.json".format(folder_path)
        write_json_data(configs_file, configs)
        print("Done writing config file {}".format(configs_file))

        # write features columns to generate the featured dataset
        if self.for_train:
            for dataset_type, features_columns in dataset_features_columns.items():
                self.write_features_columns(
                    features_columns, folder_name, dataset_type
                )
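A hypothetical call to the preprocessor above. The wrapping class (here QnADataset) and its constructor arguments are assumptions made for illustration; the method itself only requires the attributes it reads (for_train, build_vocab, question_size, text_size) to be set:

# Hypothetical usage; QnADataset and its constructor are assumed, not shown in the source.
dataset = QnADataset(for_train=True, build_vocab=True)

# Preprocess the train and dev splits; "word" and "cased" are illustrative
# values for the method/cased naming scheme used in the folder paths.
dataset.preprocess_qna_data(
    method="word", cased="cased", dataset_types=["train", "dev"],
)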