def __init__(self):
    """Initialize the run environment for this object.

    Steps, in order: parse the arguments with ``self.init_args()``, validate
    them, call ``prepare_dir``, install the logger, seed the numpy / random /
    torch generators from ``args.seed``, call ``init_device``, persist the
    arguments and the commit id, and create a ``TensorBoard`` object on
    ``args.model_dir``.

    Raises:
        Exception: if ``--continue-train`` is set while ``--model-dir`` is
            missing.
    """
    # Don't use the common init for the moment
    # common_init(self)
    opts = self.init_args()
    self.args = opts

    resume_without_dir = opts.continue_train and opts.model_dir is None
    if resume_without_dir:
        raise Exception(
            "'--model-dir' must be specified when using '--continue-train'"
        )

    prepare_dir(opts)

    log = get_logger(opts)
    self.logger = log
    set_utils_logger(log)

    # Same seed for every random number generator in use.
    np.random.seed(opts.seed)
    random.seed(opts.seed)
    torch.manual_seed(opts.seed)

    init_device(opts)
    save_args(opts)
    save_commit_id(opts)

    self.tb = TensorBoard(opts.model_dir)
# NOTE(review): this line holds the tail of a function whose header lies outside the
# visible source, followed by the script entry point. The tail loads JSON from `f`,
# pairs every item of each entry's "tagged_sentences" with that entry's "id", and writes
# the resulting DataFrame to `to_fn` with `to_excel`. The entry point calls `build_data`
# once for the "classification" output and once for the "ner" output.
# TODO confirm the original nesting (which statements sit inside the `with`/`def`).
data = json.load(f) for idx, ex in enumerate(data): case_id = ex["id"] ex_sentences = ex["tagged_sentences"] for sentence in ex_sentences: sentences.append(sentence) case_ids.append(case_id) df = {"ID": case_ids, "sentences": sentences} df = pd.DataFrame(df) df.to_excel(to_fn) if __name__ == '__main__': to_folder = os.path.join("./", "temp") prepare_dir(to_folder) fns = { "input": os.path.join("./", "run", "law_ner_tag.json"), "output": { "classification": os.path.join(to_folder, "class_sentences.xlsx"), "ner": os.path.join(to_folder, "ner_sentences.xlsx") } # "output": os.path.join(to_folder, "sentences.xlsx") # "output" : os.path.join(to_folder, "ner_sentences.xlsx") } build_data(fns, "classification") # for relation tagging build_data(fns, "ner") # for ner tagging
def common_init(that):
    """Common initialization of our models.

    Here is the check list:
    - [√] Parse the input arguments
    - [√] Create necessary folders to save data
    - [√] Set a logger to be used and save the output
    - [√] Set manual seeds to make results reproducible
    - [√] Init the correct device to be used by pytorch: cpu or cuda:id
    - [√] Save the input arguments used
    - [√] Save the git infos: commit id, repo origin
    - [√] Set a tensorboard object to record stats
    - [√] Set a DataSelector object which handles data samples
    - [√] Set a StatKeeper object which can save arbitrary stats
    - [√] Perform specific initializations based on input params

    Args:
        that: the object to initialize. It must provide ``init_args()``; this
            function sets ``args``, ``logger``, ``tb``, ``ds``, ``sk`` and
            ``train_seq`` on it.

    Raises:
        Exception: if ``--continue-train`` is used without ``--model-dir``,
            if ``--init-seq`` / ``--train-seq`` hold an unknown value, or if
            the overlap encoded in ``--init-seq`` is incompatible with
            ``--bptt``.
        ValueError: if the numeric suffix of ``--init-seq`` / ``--train-seq``
            is not an integer.
    """
    that.args = that.init_args()
    if that.args.continue_train and that.args.model_dir is None:
        raise Exception("'--model-dir' must be specified when using "
                        "'--continue-train'")

    prepare_dir(that.args)
    that.logger = get_logger(that.args)
    set_utils_logger(that.logger)

    np.random.seed(that.args.seed)
    random.seed(that.args.seed)
    torch.manual_seed(that.args.seed)

    init_device(that.args)
    save_args(that.args)
    save_commit_id(that.args)

    that.tb = TensorBoard(that.args.model_dir)
    that.ds = DataSelector(that.args)
    that.sk = StatsKeeper(that.args, that.args.stat_folder)

    # Init seq
    _init_current_seq(that)

    # Type of train_seq
    train_seq = that.args.train_seq
    if train_seq == "original":
        that.train_seq = that.ds.train_seq
    elif train_seq.startswith("repeat_"):
        n = int(train_seq.split("_")[1])
        # Looked up lazily so a later change of `that.ds` is honoured.
        that.train_seq = lambda: that.ds.repeated_train_seq(n)
    else:
        raise Exception(f"train-seq unkown: {train_seq}")

    # Shuffling of the train_seq
    if that.args.shuffle_row_seq:
        that.ds.shuffle_row_train_seq()
    if that.args.shuffle_col_seq:
        that.ds.shuffle_col_train_seq()
    if that.args.shuffle_each_row_seq:
        that.ds.shuffle_each_row_train_seq()
    if that.args.shuffle_full_seq:
        that.ds.shuffle_full_train_seq()


# Maps the part of an overlap-style ``--init-seq`` value that precedes the
# first "_" to the name of the DataSelector method building that sequence.
_OVERLAP_SEQ_BUILDERS = {
    "overlap": "overlap_seq",
    "overlapC": "overlap_c_seq",
    "overlapCN": "overlap_cn_seq",
    "overlapCNX": "overlap_cnx_seq",
    "overlapCX": "overlap_cx_seq",
    "overlapCNF": "overlap_cnf_seq",
}


def _init_current_seq(that):
    """Set ``that.ds.current_seq`` according to ``that.args.init_seq``."""
    init_seq = that.args.init_seq
    if init_seq == "original":
        # Done by default in DataSelector initialization
        return

    kind, *suffix = init_seq.split("_")
    if not suffix or kind not in _OVERLAP_SEQ_BUILDERS:
        raise Exception(f"init-seq unkown: {init_seq}")

    overlap = int(suffix[0])
    bptt = that.args.bptt
    if kind == "overlapCNF":
        # This variant only needs the overlap to fit inside bptt.
        if overlap > bptt:
            raise Exception(
                f"overlapCNF must be lower than '--bptt' (found {overlap})")
    elif overlap == 0 or bptt % overlap != 0:
        # Zero is reported through the same error rather than letting the
        # modulo fail with a bare ZeroDivisionError.
        raise Exception(f"{kind} must divide '--bptt' (found {overlap})")

    build_seq = getattr(that.ds, _OVERLAP_SEQ_BUILDERS[kind])
    that.ds.current_seq = build_seq(that.args.batch_size, overlap)