コード例 #1
0
    def back_up(self, conf, problem):
        """Export ``problem`` to ``<conf.save_base_dir>/necessary_cache/problem.pkl``.

        Args:
            conf: configuration object; ``save_base_dir`` and ``problem_path``
                are read from it.
            problem: object exposing ``export_problem(path)``.

        The backup directory is handed to ``prepare_dir`` with
        ``clear_dir_if_exist=True`` before the export (presumably this empties
        an existing directory -- confirm against ``prepare_dir``).
        """
        backup_dir = os.path.join(conf.save_base_dir, 'necessary_cache/')
        logging.debug('Prepare dir: %s', backup_dir)
        prepare_dir(backup_dir, True, allow_overwrite=True, clear_dir_if_exist=True)

        # backup_dir already ends with a separator, so plain concatenation
        # produces the path of the exported file.
        exported_file = backup_dir + 'problem.pkl'
        problem.export_problem(exported_file)
        logging.debug("Problem %s is backed up to %s", conf.problem_path, backup_dir)
コード例 #2
0
        def configurate_logger(self):
            """Resolve the log directory and log file paths, then set up logging.

            Nothing is done in the 'cache' phase.  Otherwise:

            * ``self.log_dir`` becomes ``self.params.log_dir`` when that is set
              (and is passed to ``prepare_dir``), else ``self.save_base_dir``;
            * ``self.train_log_path``, ``self.test_log_path`` and
              ``self.predict_log_path`` are built inside ``self.log_dir``;
            * ``log_set`` is called with the path belonging to ``self.phase``.

            A phase without an associated log path is reported through
            ``self.raise_configuration_error('<phase>_log_name')``.
            """
            if self.phase == 'cache':
                return

            # dir
            if hasattr(self.params, 'log_dir') and self.params.log_dir:
                self.log_dir = self.params.log_dir
                prepare_dir(self.log_dir, True, allow_overwrite=True)
            else:
                self.log_dir = self.save_base_dir

            # path
            self.train_log_path = os.path.join(self.log_dir,
                                               self.train_log_name)
            self.test_log_path = os.path.join(self.log_dir, self.test_log_name)
            self.predict_log_path = os.path.join(self.log_dir,
                                                 self.predict_log_name)
            phase_log_paths = {
                'train': self.train_log_path,
                'test': self.test_log_path,
                'predict': self.predict_log_path,
            }
            # .get() yields None for an unrecognised phase, so the check below
            # reports a configuration error instead of failing on an
            # unassigned local variable.
            log_path = phase_log_paths.get(self.phase)
            if log_path is None:
                self.raise_configuration_error(self.phase + '_log_name')

            # log level: philly mode and --debug get detailed DEBUG console output
            log_kwargs = {'disable_log_file': self.params.disable_log_file}
            if self.mode == 'philly' or self.params.debug:
                log_kwargs.update(console_level='DEBUG', console_detailed=True)
            log_set(log_path, **log_kwargs)
コード例 #3
0
    def _prepare_encoding_cache(self, conf, problem, build=False):
        """Set up the encoding-cache locations on ``conf`` and optionally build the cache.

        Args:
            conf: configuration object; the cache directory, index file paths,
                ``problem_md5`` and (when the cache is valid)
                ``encoding_file_index`` are written onto it.
            problem: object exposing ``build_encode_cache(conf)``.
            build: when true, the cache directory is passed to ``prepare_dir``
                with ``clear_dir_if_exist=True``, the cache is rebuilt and
                ``self.encoding_invalid`` is reset to ``False``.
        """
        # encoding cache dir: named after the md5 of the training data plus the
        # md5 of the problem file in use (the saved one when fine-tuning)
        if conf.pretrained_model_path:
            md5_source = conf.saved_problem_path
        else:
            md5_source = conf.problem_path
        conf.problem_md5 = md5([md5_source])
        cache_dir_name = conf.train_data_md5 + conf.problem_md5
        conf.encoding_cache_dir = os.path.join(conf.cache_dir, cache_dir_name)
        if not os.path.exists(conf.encoding_cache_dir):
            os.makedirs(conf.encoding_cache_dir)

        # encoding cache files
        index_path = os.path.join(conf.encoding_cache_dir, st.cencodig_index_file_name)
        conf.encoding_cache_index_file_path = index_path
        index_md5_path = os.path.join(conf.encoding_cache_dir, st.cencoding_index_md5_file_name)
        conf.encoding_cache_index_file_md5_path = index_md5_path
        conf.load_encoding_cache_generator = self._load_encoding_cache_generator

        if build:
            prepare_dir(conf.encoding_cache_dir, True, allow_overwrite=True, clear_dir_if_exist=True)
            problem.build_encode_cache(conf)
            self.encoding_invalid = False

        # the index is only read when the cache is considered valid
        if self.encoding_invalid:
            return
        index_content = load_from_json(conf.encoding_cache_index_file_path)
        conf.encoding_file_index = index_content[st.cencoding_key_index]
コード例 #4
0
ファイル: ModelConf.py プロジェクト: zeta1999/NeuronBlocks
    def configurate_outputs(self):
        """Resolve every output location (save dir, logs, predictions, model file).

        Steps, in order:

        1. ``self.save_base_dir`` is overridden by ``self.params.model_save_dir``
           when given; a missing value is reported via
           ``self.raise_configuration_error('save_base_dir')``.
        2. Outside the 'cache' phase the save directory is passed to
           ``prepare_dir``.
        3. Logging is configured (skipped in the 'cache' phase).
        4. Outside the 'cache' phase ``self.predict_output_path`` is resolved
           and passed to ``prepare_dir``.
        5. ``self.predict_fields`` falls back to the default fields of the
           problem type, and ``self.model_save_path`` is set.
        """
        def configurate_logger(self):
            """Resolve log directory and log paths, then call ``log_set`` for the phase."""
            if self.phase == 'cache':
                return

            # dir
            if hasattr(self.params, 'log_dir') and self.params.log_dir:
                self.log_dir = self.params.log_dir
                prepare_dir(self.log_dir, True, allow_overwrite=True)
            else:
                self.log_dir = self.save_base_dir

            # path
            self.train_log_path = os.path.join(self.log_dir, self.train_log_name)
            self.test_log_path = os.path.join(self.log_dir, self.test_log_name)
            self.predict_log_path = os.path.join(self.log_dir, self.predict_log_name)
            phase_log_paths = {
                'train': self.train_log_path,
                'test': self.test_log_path,
                'predict': self.predict_log_path,
            }
            # .get() yields None for an unrecognised phase, so the check below
            # reports a configuration error instead of failing on an
            # unassigned local variable.
            log_path = phase_log_paths.get(self.phase)
            if log_path is None:
                self.raise_configuration_error(self.phase + '_log_name')

            # log level: philly mode and --debug get detailed DEBUG console output
            log_kwargs = {'disable_log_file': self.params.disable_log_file}
            if self.mode == 'philly' or self.params.debug:
                log_kwargs.update(console_level='DEBUG', console_detailed=True)
            log_set(log_path, **log_kwargs)

        # save base dir
        if hasattr(self.params, 'model_save_dir') and self.params.model_save_dir:
            self.save_base_dir = self.params.model_save_dir
        elif self.save_base_dir is None:
            self.raise_configuration_error('save_base_dir')

        # prepare save base dir
        if self.phase != 'cache':
            if self.phase == 'train':
                extra_info = 'will overwrite model file and train.log'
            else:
                extra_info = 'will add %s.log and predict file' % self.phase
            prepare_dir(self.save_base_dir, True,
                        allow_overwrite=self.params.force or self.mode == 'philly',
                        extra_info=extra_info)

        # logger
        configurate_logger(self)

        # predict output path
        if self.phase != 'cache':
            if self.params.predict_output_path:
                self.predict_output_path = self.params.predict_output_path
            else:
                self.predict_output_path = os.path.join(self.save_base_dir, self.predict_output_name)
            logging.debug('Prepare dir for: %s' % self.predict_output_path)
            # second positional argument is False here but True for the
            # directories above -- presumably it marks the path as a file
            # rather than a directory; confirm against prepare_dir
            prepare_dir(self.predict_output_path, False,
                        allow_overwrite=self.params.force or self.mode == 'philly')

        if self.predict_fields is None:
            self.predict_fields = DefaultPredictionFields[ProblemTypes[self.problem_type]]

        self.model_save_path = os.path.join(self.save_base_dir, self.model_name)
コード例 #5
0
ファイル: train.py プロジェクト: ljshou/NeuronBlocks-1
def _create_problem(conf):
    """Build the ``Problem`` instance that matches ``conf.problem_type``.

    The problem types differ in whether target sequences get
    start/end/unk/pad symbols, whether source and target must have the same
    length, and in the ``with_bos_eos`` / ``tagging_scheme`` settings.

    Raises:
        ValueError: if the problem type is not one of the types handled here.
    """
    problem_type = ProblemTypes[conf.problem_type]
    if problem_type == ProblemTypes.sequence_tagging:
        return Problem(conf.problem_type, conf.input_types, conf.answer_column_name,
            source_with_start=True, source_with_end=True, source_with_unk=True, source_with_pad=True,
            target_with_start=True, target_with_end=True, target_with_unk=True, target_with_pad=True, same_length=True,
            with_bos_eos=conf.add_start_end_for_seq, tagging_scheme=conf.tagging_scheme,
            remove_stopwords=conf.remove_stopwords, DBC2SBC=conf.DBC2SBC, unicode_fix=conf.unicode_fix)
    if problem_type == ProblemTypes.classification or problem_type == ProblemTypes.regression:
        return Problem(conf.problem_type, conf.input_types, conf.answer_column_name,
            source_with_start=True, source_with_end=True, source_with_unk=True, source_with_pad=True,
            target_with_start=False, target_with_end=False, target_with_unk=False, target_with_pad=False,
            same_length=False, with_bos_eos=conf.add_start_end_for_seq, remove_stopwords=conf.remove_stopwords,
            DBC2SBC=conf.DBC2SBC, unicode_fix=conf.unicode_fix)
    if problem_type == ProblemTypes.mrc:
        return Problem(conf.problem_type, conf.input_types, conf.answer_column_name,
            source_with_start=True, source_with_end=True, source_with_unk=True, source_with_pad=True,
            target_with_start=False, target_with_end=False, target_with_unk=False, target_with_pad=False,
            same_length=False, with_bos_eos=False, remove_stopwords=conf.remove_stopwords,
            DBC2SBC=conf.DBC2SBC, unicode_fix=conf.unicode_fix)
    raise ValueError('Unsupported problem type: %s' % conf.problem_type)


def _renew_stale_cache(params, cache_dir):
    """Delete the stale cache directory, asking for confirmation when interactive.

    In philly mode or with ``params.force`` the cache is renewed without
    prompting; otherwise any answer other than "yes" terminates the process
    with exit status 0.
    """
    if params.mode == 'philly' or params.force is True:
        renew_option = 'yes'
    else:
        renew_option = input('There exists ineffective cache %s for old models. Input "yes" to renew cache and "no" to exit. (default:no): ' % os.path.abspath(cache_dir))
    if renew_option.lower() != 'yes':
        raise SystemExit(0)
    shutil.rmtree(cache_dir)
    # Kept from the original code, which waits because the deletion is said
    # to complete asynchronously.
    time.sleep(2)
    logging.info('Old cache is deleted')


def main(params):
    """Run the training pipeline described by ``params``.

    Steps:
        1. Load the configuration and back up the configuration file to
           ``conf.save_base_dir``.
        2. Without a pretrained model: validate/renew the cache, load the
           problem (and embedding matrix) from cache or build them from the
           training data, back up ``problem.pkl`` next to the model, and
           create a freshly initialised ``LearningMachine``.  With
           ``params.make_cache_only`` the function returns after this step.
           With a pretrained model: load the previously saved problem and
           create the ``LearningMachine`` without initialisation.
        3. Train, reload the model stored at ``conf.model_save_path`` and
           evaluate it on the test data (or the validation data when no test
           data is configured).

    Args:
        params: parsed command-line arguments; ``conf_path``, ``mode``,
            ``force`` and ``make_cache_only`` are read here, the rest is
            consumed by ``ModelConf``.
    """
    conf = ModelConf("train", params.conf_path, version, params, mode=params.mode)

    shutil.copy(params.conf_path, conf.save_base_dir)
    logging.info('Configuration file is backed up to %s' % (conf.save_base_dir))

    problem = _create_problem(conf)

    cache_load_flag = False
    if not conf.pretrained_model_path:
        # first time training, load cache if appliable
        if conf.use_cache:
            cache_conf_path = os.path.join(conf.cache_dir, 'conf_cache.json')
            if os.path.isfile(cache_conf_path):
                params_cache = copy.deepcopy(params)
                try:
                    cache_conf = ModelConf('cache', cache_conf_path, version, params_cache)
                except Exception as e:
                    # An unreadable cached configuration is treated as an
                    # ineffective cache, but the reason is no longer hidden.
                    logging.warning('Failed to load the cached configuration %s: %s' % (cache_conf_path, e))
                    cache_conf = None
                if cache_conf is None or verify_cache(cache_conf, conf) is not True:
                    logging.info('Found cache that is ineffective')
                    _renew_stale_cache(params, conf.cache_dir)
                else:
                    logging.info('Found cache that is appliable to current configuration...')

            elif os.path.isdir(conf.cache_dir):
                # A cache directory without conf_cache.json is stale as well;
                # philly/force runs renew it without blocking on input().
                _renew_stale_cache(params, conf.cache_dir)

            if not os.path.exists(conf.cache_dir):
                os.makedirs(conf.cache_dir)
                shutil.copy(params.conf_path, os.path.join(conf.cache_dir, 'conf_cache.json'))

        # first time training, load problem from cache, and then backup the cache to save_base_dir/necessary_cache/
        if conf.use_cache and os.path.isfile(conf.problem_path):
            problem.load_problem(conf.problem_path)
            if conf.emb_pkl_path is not None:
                if os.path.isfile(conf.emb_pkl_path):
                    emb_matrix = np.array(load_from_pkl(conf.emb_pkl_path))
                    cache_load_flag = True
                elif params.mode == 'normal':
                    renew_option = input('The cache is invalid because the embedding matrix does not exist in the cache directory. Input "yes" to renew cache and "no" to exit. (default:no): ')
                    if renew_option.lower() != 'yes':
                        raise SystemExit(0)
                # in any other mode the cache is renewed by default
            else:
                emb_matrix = None
                cache_load_flag = True
            if cache_load_flag:
                logging.info("Cache loaded!")

        if not cache_load_flag:
            logging.info("Preprocessing... Depending on your corpus size, this step may take a while.")
            # pretrained-embedding settings are only forwarded when a pretrained embedding is configured
            use_pretrained_emb = bool(conf.pretrained_emb_path)
            emb_matrix = problem.build(conf.train_data_path, conf.file_columns, conf.input_types, conf.file_with_col_header,
                                       conf.answer_column_name,
                                       word2vec_path=conf.pretrained_emb_path if use_pretrained_emb else None,
                                       word_emb_dim=conf.pretrained_emb_dim if use_pretrained_emb else None,
                                       format=conf.pretrained_emb_type if use_pretrained_emb else None,
                                       file_type=conf.pretrained_emb_binary_or_text if use_pretrained_emb else None,
                                       involve_all_words=conf.involve_all_words_in_pretrained_emb,
                                       show_progress=params.mode == 'normal',
                                       max_vocabulary=conf.max_vocabulary, word_frequency=conf.min_word_frequency)

            # The HDFS check looks at the path that is actually written here;
            # conf.emb_pkl_path may be None when no pretrained embedding is used.
            if conf.mode == 'philly' and conf.problem_path.startswith('/hdfs/'):
                with HDFSDirectTransferer(conf.problem_path, with_hdfs_command=True) as transferer:
                    transferer.pkl_dump(problem.export_problem(conf.problem_path, ret_without_save=True))
            else:
                problem.export_problem(conf.problem_path)
            if conf.use_cache:
                logging.info("Cache saved to %s" % conf.problem_path)
                if emb_matrix is not None and conf.emb_pkl_path is not None:
                    if conf.mode == 'philly' and conf.emb_pkl_path.startswith('/hdfs/'):
                        with HDFSDirectTransferer(conf.emb_pkl_path, with_hdfs_command=True) as transferer:
                            transferer.pkl_dump(emb_matrix)
                    else:
                        dump_to_pkl(emb_matrix, conf.emb_pkl_path)
                    logging.info("Embedding matrix saved to %s" % conf.emb_pkl_path)
            else:
                logging.debug("Cache saved to %s" % conf.problem_path)

        # Back up problem.pkl to save_base_dir/necessary_cache/ so that it stays next to the trained model.
        cache_bakup_path = os.path.join(conf.save_base_dir, 'necessary_cache/')
        logging.debug('Prepare dir: %s' % cache_bakup_path)
        prepare_dir(cache_bakup_path, True, allow_overwrite=True, clear_dir_if_exist=True)

        shutil.copy(conf.problem_path, cache_bakup_path)
        logging.debug("Problem %s is backed up to %s" % (conf.problem_path, cache_bakup_path))
        if problem.output_dict:
            logging.debug("Problem target cell dict: %s" % (problem.output_dict.cell_id_map))

        if params.make_cache_only:
            logging.info("Finish building cache!")
            return

        # per input cluster: vocabulary size, extra char settings and initial embedding weights
        vocab_info = dict()
        vocab_sizes = problem.get_vocab_sizes()
        for input_cluster in vocab_sizes:
            cluster_info = {'vocab_size': vocab_sizes[input_cluster]}
            # add extra info for char_emb
            if input_cluster.lower() == 'char':
                for key, value in conf.input_types[input_cluster].items():
                    if key != 'cols':
                        cluster_info[key] = value
            if input_cluster == 'word' and emb_matrix is not None:
                cluster_info['init_weights'] = emb_matrix
            else:
                cluster_info['init_weights'] = None
            vocab_info[input_cluster] = cluster_info

        lm = LearningMachine('train', conf, problem, vocab_info=vocab_info, initialize=True, use_gpu=conf.use_gpu)
    else:
        # when finetuning, load previous saved problem
        problem.load_problem(conf.saved_problem_path)
        lm = LearningMachine('train', conf, problem, vocab_info=None, initialize=False, use_gpu=conf.use_gpu)

    # every "<metric>@<target>" entry must refer to a target present in the training data
    for metric_to_chk in conf.metrics_post_check:
        metric, target = metric_to_chk.split('@')
        if not problem.output_dict.has_cell(target):
            raise Exception("The target %s of %s does not exist in the training data." % (target, metric_to_chk))

    if conf.pretrained_model_path:
        logging.info('Loading the pretrained model: %s...' % conf.pretrained_model_path)
        lm.load_model(conf.pretrained_model_path)

    loss_conf = conf.loss
    loss_conf['output_layer_id'] = conf.output_layer_id
    loss_conf['answer_column_name'] = conf.answer_column_name
    loss_fn = Loss(**loss_conf)
    if conf.use_gpu is True:
        loss_fn.cuda()

    # NOTE(review): eval() resolves the optimizer class from a name taken from
    # the configuration file; only run this with trusted configuration files.
    optimizer = eval(conf.optimizer_name)(lm.model.parameters(), **conf.optimizer_params)

    lm.train(optimizer, loss_fn)

    # test the best model with the best model saved
    lm.load_model(conf.model_save_path)
    if conf.test_data_path is not None:
        test_path = conf.test_data_path
    elif conf.valid_data_path is not None:
        test_path = conf.valid_data_path
    else:
        logging.warning('Neither test data nor validation data is configured; skip testing the best model.')
        return
    logging.info('Testing the best model saved at %s, with %s' % (conf.model_save_path, test_path))
    if not test_path.endswith('pkl'):
        lm.test(loss_fn, test_path, predict_output_path=conf.predict_output_path)
    else:
        lm.test(loss_fn, test_path)
コード例 #6
0
    def load_from_file(self, conf_path):
        with codecs.open(conf_path, 'r', encoding='utf-8') as fin:
            try:
                self.conf = json.load(fin)
            except Exception as e:
                raise ConfigurationError(
                    "%s is not a legal JSON file, please check your JSON format!"
                    % conf_path)

        self.tool_version = self.get_item(['tool_version'])
        self.language = self.get_item(['language'], default='english').lower()
        self.problem_type = self.get_item(['inputs', 'dataset_type']).lower()
        #if ProblemTypes[self.problem_type] == ProblemTypes.sequence_tagging:
        self.tagging_scheme = self.get_item(['inputs', 'tagging_scheme'],
                                            default=None,
                                            use_default=True)

        if self.mode == 'normal':
            self.use_cache = self.get_item(['inputs', 'use_cache'], True)
        elif self.mode == 'philly':
            self.use_cache = True

        # OUTPUTS
        if hasattr(self.params,
                   'model_save_dir') and self.params.model_save_dir:
            self.save_base_dir = self.params.model_save_dir
        else:
            self.save_base_dir = self.get_item(['outputs', 'save_base_dir'])

        if self.phase == 'train':
            # in train.py, it is called pretrained_model_path
            if hasattr(self.params, 'pretrained_model_path'
                       ) and self.params.pretrained_model_path:
                self.pretrained_model_path = self.previous_model_path = self.params.pretrained_model_path
            else:
                self.pretrained_model_path = self.previous_model_path = self.get_item(
                    ['inputs', 'data_paths', 'pretrained_model_path'],
                    default=None,
                    use_default=True)
        elif self.phase == 'test' or self.phase == 'predict':
            # in test.py and predict.py, it is called pretrained_model_path
            if hasattr(
                    self.params,
                    'previous_model_path') and self.params.previous_model_path:
                self.previous_model_path = self.pretrained_model_path = self.params.previous_model_path
            else:
                self.previous_model_path = self.pretrained_model_path = os.path.join(
                    self.save_base_dir,
                    self.get_item(['outputs', 'model_name'
                                   ]))  # namely, the model_save_path

        if hasattr(
                self, 'pretrained_model_path'
        ) and self.pretrained_model_path:  # namely self.previous_model_path
            tmp_saved_problem_path = os.path.join(
                os.path.dirname(self.pretrained_model_path),
                '.necessary_cache', 'problem.pkl')
            self.saved_problem_path = tmp_saved_problem_path if os.path.isfile(tmp_saved_problem_path) \
                else os.path.join(os.path.dirname(self.pretrained_model_path), 'necessary_cache', 'problem.pkl')
            if not (os.path.isfile(self.pretrained_model_path)
                    and os.path.isfile(self.saved_problem_path)):
                raise Exception(
                    'Previous trained model %s or its dictionaries %s does not exist!'
                    % (self.pretrained_model_path, self.saved_problem_path))

        if self.phase != 'cache':
            prepare_dir(
                self.save_base_dir,
                True,
                allow_overwrite=self.params.force or self.mode == 'philly',
                extra_info='will overwrite model file and train.log' if
                self.phase == 'train' else 'will add %s.log and predict file' %
                self.phase)

        if hasattr(self.params, 'log_dir') and self.params.log_dir:
            self.log_dir = self.params.log_dir
            if self.phase != 'cache':
                prepare_dir(self.log_dir, True, allow_overwrite=True)
        else:
            self.log_dir = self.save_base_dir

        if self.phase == 'train':
            self.train_log_path = os.path.join(
                self.log_dir, self.get_item(['outputs', 'train_log_name']))
            if self.mode == 'philly' or self.params.debug:
                log_set(self.train_log_path,
                        console_level='DEBUG',
                        console_detailed=True,
                        disable_log_file=self.params.disable_log_file)
            else:
                log_set(self.train_log_path,
                        disable_log_file=self.params.disable_log_file)
        elif self.phase == 'test':
            self.test_log_path = os.path.join(
                self.log_dir, self.get_item(['outputs', 'test_log_name']))
            if self.mode == 'philly' or self.params.debug:
                log_set(self.test_log_path,
                        console_level='DEBUG',
                        console_detailed=True,
                        disable_log_file=self.params.disable_log_file)
            else:
                log_set(self.test_log_path,
                        disable_log_file=self.params.disable_log_file)
        elif self.phase == 'predict':
            self.predict_log_path = os.path.join(
                self.log_dir, self.get_item(['outputs', 'predict_log_name']))
            if self.mode == 'philly' or self.params.debug:
                log_set(self.predict_log_path,
                        console_level='DEBUG',
                        console_detailed=True,
                        disable_log_file=self.params.disable_log_file)
            else:
                log_set(self.predict_log_path,
                        disable_log_file=self.params.disable_log_file)
        if self.phase != 'cache':
            self.predict_output_path = self.params.predict_output_path if self.params.predict_output_path else os.path.join(
                self.save_base_dir,
                self.get_item(['outputs', 'predict_output_name'],
                              default='predict.tsv'))
            logging.debug('Prepare dir for: %s' % self.predict_output_path)
            prepare_dir(self.predict_output_path,
                        False,
                        allow_overwrite=self.params.force
                        or self.mode == 'philly')
        self.predict_fields = self.get_item(
            ['outputs', 'predict_fields'],
            default=DefaultPredictionFields[ProblemTypes[self.problem_type]])

        self.model_save_path = os.path.join(
            self.save_base_dir, self.get_item(['outputs', 'model_name']))

        # INPUTS
        if hasattr(self.params,
                   'train_data_path') and self.params.train_data_path:
            self.train_data_path = self.params.train_data_path
        else:
            if self.mode == 'normal':
                self.train_data_path = self.get_item(
                    ['inputs', 'data_paths', 'train_data_path'],
                    default=None,
                    use_default=True)
            else:
                self.train_data_path = None
        if hasattr(self.params,
                   'valid_data_path') and self.params.valid_data_path:
            self.valid_data_path = self.params.valid_data_path
        else:
            if self.mode == 'normal':
                self.valid_data_path = self.get_item(
                    ['inputs', 'data_paths', 'valid_data_path'],
                    default=None,
                    use_default=True)
            else:
                self.valid_data_path = None
        if hasattr(self.params,
                   'test_data_path') and self.params.test_data_path:
            self.test_data_path = self.params.test_data_path
        else:
            if self.mode == 'normal':
                self.test_data_path = self.get_item(
                    ['inputs', 'data_paths', 'test_data_path'],
                    default=None,
                    use_default=True)
            else:
                self.test_data_path = None

        if self.phase == 'predict':
            if self.params.predict_data_path:
                self.predict_data_path = self.params.predict_data_path
            else:
                if self.mode == 'normal':
                    self.predict_data_path = self.get_item(
                        ['inputs', 'data_paths', 'predict_data_path'],
                        default=None,
                        use_default=True)
                else:
                    self.predict_data_path = None

        if self.phase == 'train' or self.phase == 'cache':
            if self.valid_data_path is None and self.test_data_path is not None:
                # We support test_data_path == None, if someone set valid_data_path to None while test_data_path is not None,
                # swap the valid_data_path and test_data_path
                self.valid_data_path = self.test_data_path
                self.test_data_path = None
        elif self.phase == 'predict':
            if self.predict_data_path is None and self.test_data_path is not None:
                self.predict_data_path = self.test_data_path
                self.test_data_path = None

        if self.phase == 'train' or self.phase == 'test' or self.phase == 'cache':
            self.file_columns = self.get_item(['inputs', 'file_header'])
        else:
            self.file_columns = self.get_item(['inputs', 'file_header'],
                                              default=None,
                                              use_default=True)

        if self.phase == 'predict':
            if self.file_columns is None:
                self.predict_file_columns = self.get_item(
                    ['inputs', 'predict_file_header'])
            else:
                self.predict_file_columns = self.get_item(
                    ['inputs', 'predict_file_header'],
                    default=None,
                    use_default=True)
                if self.predict_file_columns is None:
                    self.predict_file_columns = self.file_columns

        if self.phase != 'predict':
            if self.phase == 'cache':
                self.answer_column_name = self.get_item(['inputs', 'target'],
                                                        default=None,
                                                        use_default=True)
            else:
                self.answer_column_name = self.get_item(['inputs', 'target'])
        self.input_types = self.get_item(['architecture', 0, 'conf'])
        # add extra feature
        feature_all = set([_.lower() for _ in self.input_types.keys()])
        formal_feature = set(['word', 'char'])
        self.extra_feature = len(feature_all - formal_feature) != 0

        # add char embedding config
        # char_emb_type = None
        # char_emb_type_cols = None
        # for single_type in self.input_types:
        #     if single_type.lower() == 'char':
        #         char_emb_type = single_type
        #         char_emb_type_cols = [single_col.lower() for single_col in self.input_types[single_type]['cols']]
        #         break
        self.object_inputs = self.get_item(['inputs', 'model_inputs'])
        # if char_emb_type and char_emb_type_cols:
        #     for single_input in self.object_inputs:
        #         for single_col in char_emb_type_cols:
        #             if single_input.lower() in single_col:
        #                 self.object_inputs[single_input].append(single_col)

        self.object_inputs_names = [name for name in self.object_inputs]

        # vocabulary setting
        self.max_vocabulary = self.get_item(
            ['training_params', 'vocabulary', 'max_vocabulary'],
            default=800000,
            use_default=True)
        self.min_word_frequency = self.get_item(
            ['training_params', 'vocabulary', 'min_word_frequency'],
            default=3,
            use_default=True)

        # file column header setting
        self.file_with_col_header = self.get_item(
            ['inputs', 'file_with_col_header'],
            default=False,
            use_default=True)

        if ProblemTypes[self.problem_type] == ProblemTypes.sequence_tagging:
            self.add_start_end_for_seq = self.get_item(
                ['inputs', 'add_start_end_for_seq'], default=True)
        else:
            self.add_start_end_for_seq = self.get_item(
                ['inputs', 'add_start_end_for_seq'], default=False)

        if hasattr(self.params,
                   'pretrained_emb_path') and self.params.pretrained_emb_path:
            self.pretrained_emb_path = self.params.pretrained_emb_path
        else:
            if self.mode == 'normal':
                self.pretrained_emb_path = self.get_item(
                    ['inputs', 'data_paths', 'pre_trained_emb'],
                    default=None,
                    use_default=True)
            else:
                self.pretrained_emb_path = None

        if 'word' in self.get_item(['architecture', 0, 'conf'
                                    ]) and self.pretrained_emb_path:
            if hasattr(self.params, 'involve_all_words_in_pretrained_emb'
                       ) and self.params.involve_all_words_in_pretrained_emb:
                self.involve_all_words_in_pretrained_emb = self.params.involve_all_words_in_pretrained_emb
            else:
                self.involve_all_words_in_pretrained_emb = self.get_item(
                    ['inputs', 'involve_all_words_in_pretrained_emb'],
                    default=False)
            if hasattr(
                    self.params,
                    'pretrained_emb_type') and self.params.pretrained_emb_type:
                self.pretrained_emb_type = self.params.pretrained_emb_type
            else:
                self.pretrained_emb_type = self.get_item(
                    ['inputs', 'pretrained_emb_type'], default='glove')
            if hasattr(self.params, 'pretrained_emb_binary_or_text'
                       ) and self.params.pretrained_emb_binary_or_text:
                self.pretrained_emb_binary_or_text = self.params.pretrained_emb_binary_or_text
            else:
                self.pretrained_emb_binary_or_text = self.get_item(
                    ['inputs', 'pretrained_emb_binary_or_text'],
                    default='text')
            self.pretrained_emb_dim = self.get_item(
                ['architecture', 0, 'conf', 'word', 'dim'])
        else:
            self.pretrained_emb_path = None
            self.involve_all_words_in_pretrained_emb = None
            self.pretrained_emb_binary_or_text = None
            self.pretrained_emb_dim = None
            self.pretrained_emb_type = None

        if self.phase == 'train':
            if hasattr(self.params, 'cache_dir') and self.params.cache_dir:
                # for aether
                self.cache_dir = self.params.cache_dir
            else:
                if self.mode == 'normal':
                    if self.use_cache:
                        self.cache_dir = self.get_item(
                            ['outputs', 'cache_dir'])
                    else:
                        self.cache_dir = os.path.join(
                            tempfile.gettempdir(), 'neuron_blocks', ''.join(
                                random.sample(
                                    string.ascii_letters + string.digits, 16)))
                else:
                    # for philly mode, we can only save files in model_path or scratch_path
                    self.cache_dir = os.path.join(self.save_base_dir, 'cache')

            self.problem_path = os.path.join(self.cache_dir, 'problem.pkl')
            if self.pretrained_emb_path is not None:
                self.emb_pkl_path = os.path.join(self.cache_dir, 'emb.pkl')
            else:
                self.emb_pkl_path = None
        else:
            tmp_problem_path = os.path.join(self.save_base_dir,
                                            '.necessary_cache', 'problem.pkl')
            self.problem_path = tmp_problem_path if os.path.isfile(
                tmp_problem_path) else os.path.join(
                    self.save_base_dir, 'necessary_cache', 'problem.pkl')

        # training params
        self.training_params = self.get_item(['training_params'])

        if self.phase == 'train':
            self.optimizer_name = self.get_item(
                ['training_params', 'optimizer', 'name'])
            self.optimizer_params = self.get_item(
                ['training_params', 'optimizer', 'params'])
            self.clip_grad_norm_max_norm = self.get_item(
                ['training_params', 'clip_grad_norm_max_norm'], default=5)

            if hasattr(self.params,
                       'learning_rate') and self.params.learning_rate:
                self.optimizer_params['lr'] = self.params.learning_rate

        if hasattr(self.params, 'batch_size') and self.params.batch_size:
            self.batch_size_each_gpu = self.params.batch_size
        else:
            self.batch_size_each_gpu = self.get_item([
                'training_params', 'batch_size'
            ])  #the batch_size in conf file is the batch_size on each GPU
        self.lr_decay = self.get_item(['training_params', 'lr_decay'],
                                      default=1)  # by default, no decay
        self.minimum_lr = self.get_item(['training_params', 'minimum_lr'],
                                        default=0)
        self.epoch_start_lr_decay = self.get_item(
            ['training_params', 'epoch_start_lr_decay'], default=1)
        if hasattr(self.params, 'max_epoch') and self.params.max_epoch:
            self.max_epoch = self.params.max_epoch
        else:
            self.max_epoch = self.get_item(['training_params', 'max_epoch'],
                                           default=float('inf'))
        self.valid_times_per_epoch = self.get_item(
            ['training_params', 'valid_times_per_epoch'], default=1)
        self.batch_num_to_show_results = self.get_item(
            ['training_params', 'batch_num_to_show_results'], default=10)
        self.max_lengths = self.get_item(['training_params', 'max_lengths'],
                                         default=None,
                                         use_default=True)
        self.fixed_lengths = self.get_item(
            ['training_params', 'fixed_lengths'],
            default=None,
            use_default=True)
        if self.fixed_lengths:
            self.max_lengths = None

        if torch.cuda.device_count() > 1:
            self.batch_size_total = torch.cuda.device_count(
            ) * self.training_params['batch_size']
            self.batch_num_to_show_results = self.batch_num_to_show_results // torch.cuda.device_count(
            )
        else:
            self.batch_size_total = self.batch_size_each_gpu

        self.cpu_num_workers = self.get_item(
            ['training_params', 'cpu_num_workers'],
            default=-1)  #by default, use all workers cpu supports

        # text preprocessing
        self.__text_preprocessing = self.get_item(
            ['training_params', 'text_preprocessing'], default=list())
        self.DBC2SBC = True if 'DBC2SBC' in self.__text_preprocessing else False
        self.unicode_fix = True if 'unicode_fix' in self.__text_preprocessing else False
        self.remove_stopwords = True if 'remove_stopwords' in self.__text_preprocessing else False

        # tokenizer
        if self.language == 'chinese':
            self.tokenizer = self.get_item(['training_params', 'tokenizer'],
                                           default='jieba')
        else:
            self.tokenizer = self.get_item(['training_params', 'tokenizer'],
                                           default='nltk')

        if self.extra_feature:
            if self.DBC2SBC:
                logging.warning(
                    "Detect the extra feature %s, set the DBC2sbc is False." %
                    ''.join(list(feature_all - formal_feature)))
            if self.unicode_fix:
                logging.warning(
                    "Detect the extra feature %s, set the unicode_fix is False."
                    % ''.join(list(feature_all - formal_feature)))
            if self.remove_stopwords:
                logging.warning(
                    "Detect the extra feature %s, set the remove_stopwords is False."
                    % ''.join(list(feature_all - formal_feature)))

        if ProblemTypes[self.problem_type] == ProblemTypes.sequence_tagging:
            if self.unicode_fix:
                logging.warning(
                    'For sequence tagging task, unicode_fix may change the number of words.'
                )
            if self.remove_stopwords:
                self.remove_stopwords = True
                logging.warning(
                    'For sequence tagging task, remove stopwords is forbidden! It is disabled now.'
                )

        if self.phase != 'cache':
            if torch.cuda.is_available(
            ) and torch.cuda.device_count() > 0 and self.training_params.get(
                    'use_gpu', True):
                self.use_gpu = True
                logging.info(
                    "Activating GPU mode, there are %d GPUs available" %
                    torch.cuda.device_count())
            else:
                self.use_gpu = False
                logging.info("Activating CPU mode")

        self.architecture = self.get_item(['architecture'])
        self.output_layer_id = []
        for single_layer in self.architecture:
            if 'output_layer_flag' in single_layer and single_layer[
                    'output_layer_flag']:
                self.output_layer_id.append(single_layer['layer_id'])

        # check CNN layer & change min sentence length
        cnn_rele_layers = ['Conv', 'ConvPooling']
        self.min_sentence_len = 0
        for layer_index, single_layer in enumerate(self.architecture):
            if layer_index == 0:
                continue
            if sum([_ == single_layer['layer'] for _ in cnn_rele_layers]):
                # get window_size conf: the value may be an int or a list
                for single_conf, single_conf_value in single_layer[
                        'conf'].items():
                    if 'window' in single_conf.lower():
                        self.min_sentence_len = max(
                            self.min_sentence_len,
                            np.max(np.array([single_conf_value])))
                        break

        if self.phase == 'train' or self.phase == 'test':
            self.loss = BaseLossConf.get_conf(**self.get_item(['loss']))
            self.metrics = self.get_item(['metrics'])
            if 'auc' in self.metrics and ProblemTypes[
                    self.problem_type] == ProblemTypes.classification:
                self.pos_label = self.get_item(['inputs', 'positive_label'],
                                               default=None,
                                               use_default=True)