def _from_json(self):
    """Restore vocab and the cached dataset for ``self.method`` from JSON.

    Loads ``qna_data/{method}_vocab.json`` into ``self.vocab`` and copies
    every key listed in ``self.train_keys`` from
    ``qna_data/{method}_dataset.json`` onto ``self`` as attributes, then
    converts the loaded data to numpy via ``self._to_numpy()``.
    """
    # Paths are derived from the preprocessing method name.
    path_vocab = "qna_data/{}_vocab.json".format(self.method)
    path_dataset = "qna_data/{}_dataset.json".format(self.method)

    self.vocab = VocabEntry.from_json(path_vocab)

    loaded = read_json_data(path_dataset)
    # Mirror each persisted training field back onto the instance.
    for field in self.train_keys:
        setattr(self, field, loaded[field])

    self._to_numpy()
def preprocess_qna_data(
    self,
    method,
    cased,
    dataset_types,
):
    """Preprocess the Vietnamese QnA JSON datasets for ``method``/``cased``.

    For each dataset type in ``dataset_types``:
      - reads ``qna_data/vi_{dataset_type}.json``,
      - preprocesses the ``question`` and ``text`` fields via
        ``self.pre_process_text`` and stores the result back into each
        sample under a ``{method}_{cased}_{key}`` key,
      - rewrites the (augmented) samples to the same file,
      - when ``self.for_train``, collects per-sample feature columns
        (id / question / text / label / pid) for later writing.

    Afterwards it builds (or loads) the vocab at
    ``qna_data/pre_data/vi_{method}_{cased}/vocab.json``, writes a
    ``configs.json`` with vocab/question/text sizes, and — when training —
    writes the collected feature columns per dataset type.

    Args:
        method: preprocessing method name (used in file/folder names).
        cased: casing variant name (used in file/folder names).
        dataset_types: iterable of dataset split names (e.g. train/dev/test).
    """
    folder_name = "{}_{}".format(method, cased)
    folder_path = "qna_data/pre_data/vi_{}".format(folder_name)
    create_folder(folder_path)

    # Collected feature columns per dataset type; written only after the
    # vocab exists (see below).
    dataset_features_columns = {}
    for dataset_type in dataset_types:
        data_file = "qna_data/vi_{}.json".format(dataset_type)

        # Init feature columns for this split (training mode only).
        if self.for_train:
            features_columns = {
                "id": [],
                "question": [],
                "text": [],
                "label": [],
                "pid": [],
            }

        json_samples = read_json_data(data_file)
        for json_sample in json_samples:
            if self.for_train:
                features_columns["id"].append(json_sample["id"])
                # Normalize truthy labels to 1/0.
                features_columns["label"].append(1 if json_sample["label"] else 0)
                features_columns["pid"].append(json_sample["pid"])

            for key in ["question", "text"]:
                pre_key = "{}_{}_{}".format(method, cased, key)
                pre_text, tokens = self.pre_process_text(
                    json_sample[key], method, cased, self.for_train, key
                )
                # Attach the preprocessed text under a method-specific key.
                json_sample[pre_key] = pre_text
                if self.for_train:
                    features_columns[key].append(tokens)

        # Persist samples augmented with the preprocessed keys.
        write_json_data(data_file, json_samples)
        print("{}. Length {}. Done write to file {}".format(
            dataset_type, len(json_samples), data_file
        ))

        # Save for writing later, once the vocab is available.
        if self.for_train:
            dataset_features_columns[dataset_type] = features_columns

    # Build vocab (or load an existing one).
    # BUGFIX: the original passed a spurious, ignored second argument
    # (a stale loop variable) to str.format here.
    vocab_file = "{}/vocab.json".format(folder_path)
    if self.build_vocab:
        self._build_vocab(vocab_file, method, cased)
    else:
        self.vocab = VocabEntry.from_json(vocab_file)

    # Write configs describing the preprocessed dataset.
    configs = {
        "vocab_size": len(self.vocab),
        "question_size": self.question_size,
        "text_size": self.text_size,
    }
    configs_file = "{}/configs.json".format(folder_path)
    write_json_data(configs_file, configs)
    # BUGFIX: fixed "wirte" typo in the user-facing message.
    print("Done write config file {}".format(configs_file))

    # Write the collected feature columns now that the vocab exists.
    if self.for_train:
        for dataset_type, features_columns in dataset_features_columns.items():
            self.write_features_columns(
                features_columns, folder_name, dataset_type
            )