Beispiel #1
0
    def __init__(self):
        """Run the model's start-up sequence without going through common_init.

        In order: parse the arguments, validate them, call ``prepare_dir``,
        install the logger, seed numpy / random / torch, initialise the
        device, save the arguments and the commit id, and create the
        TensorBoard writer.

        Raises:
            Exception: if '--continue-train' is set while '--model-dir' is
                missing.
        """
        # common_init(self) is deliberately not used for the moment; the
        # steps below are spelled out inline instead.
        self.args = self.init_args()
        args = self.args

        # Resuming a run only makes sense when a model directory was given.
        resuming_without_dir = args.continue_train and args.model_dir is None
        if resuming_without_dir:
            raise Exception(
                "'--model-dir' must be specified when using '--continue-train'")

        prepare_dir(args)

        logger = get_logger(args)
        self.logger = logger
        set_utils_logger(logger)

        # Feed the same seed to every random number generator in use.
        seed = args.seed
        for seed_fn in (np.random.seed, random.seed, torch.manual_seed):
            seed_fn(seed)

        init_device(args)

        # Persist the run configuration next to the results.
        save_args(args)
        save_commit_id(args)

        self.tb = TensorBoard(args.model_dir)
Beispiel #2
0
        data = json.load(f)

        for idx, ex in enumerate(data):
            case_id = ex["id"]
            ex_sentences = ex["tagged_sentences"]

            for sentence in ex_sentences:
                sentences.append(sentence)
                case_ids.append(case_id)

    df = {"ID": case_ids, "sentences": sentences}
    df = pd.DataFrame(df)
    df.to_excel(to_fn)


if __name__ == '__main__':
    # Folder under which every generated spreadsheet path is built.
    to_folder = os.path.join("./", "temp")
    prepare_dir(to_folder)

    # One output spreadsheet per task, keyed by the task name given to
    # build_data. (A previous layout used a single "output" path pointing at
    # "sentences.xlsx" or "ner_sentences.xlsx".)
    output_names = (
        ("classification", "class_sentences.xlsx"),  # for relation tagging
        ("ner", "ner_sentences.xlsx"),  # for NER tagging
    )

    fns = {
        "input": os.path.join("./", "run", "law_ner_tag.json"),
        "output": {
            task: os.path.join(to_folder, file_name)
            for task, file_name in output_names
        },
    }

    # Build the spreadsheets in the order the tasks are declared above.
    for task, _ in output_names:
        build_data(fns, task)
Beispiel #3
0
def common_init(that):
    """Common initialization of our models. Here is the check list:

        - [√] Parse the input arguments
        - [√] Create necessary folders to save data
        - [√] Set a logger to be used and save the output
        - [√] Set manual seeds to make results reproducible
        - [√] Init the correct device to be used by pytorch: cpu or cuda:id
        - [√] Save the input arguments used
        - [√] Save the git infos: commit id, repo origin
        - [√] Set a tensorboard object to record stats
        - [√] Set a DataSelector object which handles data samples
        - [√] Set a StatKeeper object which can save arbitrary stats
        - [√] Perform specific initializations based on input params

    Args:
        that: the model object being initialised. It must provide
            ``init_args()``; this function sets ``args``, ``logger``, ``tb``,
            ``ds``, ``sk`` and ``train_seq`` on it.

    Raises:
        Exception: if '--continue-train' is used without '--model-dir', if
            the overlap encoded in '--init-seq' is incompatible with
            '--bptt', or if '--init-seq' / '--train-seq' is not recognised.
    """
    that.args = that.init_args()

    if that.args.continue_train and that.args.model_dir is None:
        raise Exception("'--model-dir' must be specified when using "
                        "'--continue-train'")

    prepare_dir(that.args)
    that.logger = get_logger(that.args)
    set_utils_logger(that.logger)
    np.random.seed(that.args.seed)
    random.seed(that.args.seed)
    torch.manual_seed(that.args.seed)
    init_device(that.args)
    save_args(that.args)
    save_commit_id(that.args)
    that.tb = TensorBoard(that.args.model_dir)
    that.ds = DataSelector(that.args)
    that.sk = StatsKeeper(that.args, that.args.stat_folder)

    # Init seq
    # "<kind>_<overlap>" values whose overlap has to divide '--bptt', paired
    # with the name of the DataSelector method building the sequence. The
    # method is looked up lazily so only the selected one is ever accessed.
    dividing_kinds = (
        ("overlap", "overlap_seq"),
        ("overlapC", "overlap_c_seq"),
        ("overlapCN", "overlap_cn_seq"),
        ("overlapCNX", "overlap_cnx_seq"),
        ("overlapCX", "overlap_cx_seq"),
    )
    init_seq = that.args.init_seq
    if init_seq == "original":
        # Done by default in DataSelector initialization
        pass
    elif init_seq.startswith("overlapCNF_"):
        # Unlike the other kinds, the overlap only has to fit within bptt.
        overlap = int(init_seq.split("_")[1])
        if overlap > that.args.bptt:
            # This message was missing its f-prefix and used to print the
            # literal text "{overlap}" instead of the offending value.
            raise Exception(
                f"overlapCNF must be lower than '--bptt' (found {overlap})")
        that.ds.current_seq = that.ds.overlap_cnf_seq(
            that.args.batch_size, overlap)
    else:
        for kind, builder_name in dividing_kinds:
            # The trailing "_" keeps e.g. "overlapC_" from matching
            # "overlapCN_4".
            if not init_seq.startswith(kind + "_"):
                continue
            overlap = int(init_seq.split("_")[1])
            if that.args.bptt % overlap != 0:
                raise Exception(
                    f"{kind} must divide '--bptt' (found {overlap})")
            that.ds.current_seq = getattr(that.ds, builder_name)(
                that.args.batch_size, overlap)
            break
        else:
            # No known kind matched.
            raise Exception(f"init-seq unkown: {that.args.init_seq}")

    # Type of train_seq
    if that.args.train_seq == "original":
        that.train_seq = that.ds.train_seq
    elif that.args.train_seq.startswith("repeat_"):
        n = int(that.args.train_seq.split("_")[1])
        that.train_seq = lambda: that.ds.repeated_train_seq(n)
    else:
        raise Exception(f"train-seq unkown: {that.args.train_seq}")

    # Shuffling of the train_seq (the options are independent of each other
    # and are applied in this fixed order)
    if that.args.shuffle_row_seq:
        that.ds.shuffle_row_train_seq()
    if that.args.shuffle_col_seq:
        that.ds.shuffle_col_train_seq()
    if that.args.shuffle_each_row_seq:
        that.ds.shuffle_each_row_train_seq()
    if that.args.shuffle_full_seq:
        that.ds.shuffle_full_train_seq()