Example #1
0
    def _check_encoding(self, conf):
        self.encoding_invalid = True
        if not conf.pretrained_model_path and self.dictionary_invalid:
            return False

        # Calculate the MD5 of problem
        problem_path = conf.problem_path if not conf.pretrained_model_path else conf.saved_problem_path
        try:
            conf.problem_md5 = md5([problem_path])
        except Exception as e:
            conf.problem_md5 = None
            logging.info('Can not calculate md5 of problem.pkl from %s' %
                         (problem_path))
            return False

        # check the valid of encoding cache
        ## encoding cache dir
        conf.encoding_cache_dir = os.path.join(
            conf.cache_dir, conf.train_data_md5 + conf.problem_md5)
        logging.debug('[Cache] conf.encoding_cache_dir %s' %
                      (conf.encoding_cache_dir))
        if not os.path.exists(conf.encoding_cache_dir):
            return False

        ## encoding cache index
        conf.encoding_cache_index_file_path = os.path.join(
            conf.encoding_cache_dir, st.cencodig_index_file_name)
        conf.encoding_cache_index_file_md5_path = os.path.join(
            conf.encoding_cache_dir, st.cencoding_index_md5_file_name)
        if not os.path.exists(
                conf.encoding_cache_index_file_path) or not os.path.exists(
                    conf.encoding_cache_index_file_md5_path):
            return False
        if md5([conf.encoding_cache_index_file_path]) != load_from_json(
                conf.encoding_cache_index_file_md5_path):
            return False
        cache_index = load_from_json(conf.encoding_cache_index_file_path)

        ## encoding cache content
        for index in cache_index[st.cencoding_key_index]:
            file_name, file_md5 = index[0], index[1]
            if file_md5 != md5(
                [os.path.join(conf.encoding_cache_dir, file_name)]):
                return False

        if (st.cencoding_key_legal_cnt
                in cache_index) and (st.cencoding_key_illegal_cnt
                                     in cache_index):
            conf.encoding_cache_legal_line_cnt = cache_index[
                st.cencoding_key_legal_cnt]
            conf.encoding_cache_illegal_line_cnt = cache_index[
                st.cencoding_key_illegal_cnt]

        self.encoding_invalid = False
        logging.info('[Cache] encoding found')
        logging.info('%s: %d legal samples, %d illegal samples' %
                     (conf.train_data_path, conf.encoding_cache_legal_line_cnt,
                      conf.encoding_cache_illegal_line_cnt))
        return True
Example #2
0
    def _prepare_encoding_cache(self, conf, problem, build=False):
        """Point ``conf`` at the encoding cache and optionally (re)build it.

        The cache directory is named after the concatenation of
        ``conf.train_data_md5`` and the md5 of the problem file, and is
        created under ``conf.cache_dir`` if it does not exist yet.

        Args:
            conf: configuration object; receives ``problem_md5``,
                ``encoding_cache_dir``, the index-file paths,
                ``load_encoding_cache_generator`` and, when the cache is
                valid, ``encoding_file_index``.
            problem: object whose ``build_encode_cache(conf)`` is called when
                ``build`` is true.
            build: when true, ``prepare_dir`` is invoked on the cache
                directory (with overwrite / clear flags set), the cache is
                built and ``self.encoding_invalid`` is cleared.
        """
        # Fingerprint the problem file (the saved one when a pretrained
        # model is in use).
        if conf.pretrained_model_path:
            problem_file = conf.saved_problem_path
        else:
            problem_file = conf.problem_path
        conf.problem_md5 = md5([problem_file])

        # Cache directory keyed by training data + problem digests.
        cache_dir = os.path.join(conf.cache_dir,
                                 conf.train_data_md5 + conf.problem_md5)
        conf.encoding_cache_dir = cache_dir
        if not os.path.exists(conf.encoding_cache_dir):
            os.makedirs(conf.encoding_cache_dir)

        # Index file and its md5 companion live inside the cache directory.
        conf.encoding_cache_index_file_path = os.path.join(
            conf.encoding_cache_dir, st.cencodig_index_file_name)
        conf.encoding_cache_index_file_md5_path = os.path.join(
            conf.encoding_cache_dir, st.cencoding_index_md5_file_name)
        conf.load_encoding_cache_generator = self._load_encoding_cache_generator

        if build:
            prepare_dir(conf.encoding_cache_dir,
                        True,
                        allow_overwrite=True,
                        clear_dir_if_exist=True)
            problem.build_encode_cache(conf)
            self.encoding_invalid = False

        # Nothing to load while the cache is still flagged invalid.
        if self.encoding_invalid:
            return

        index_content = load_from_json(conf.encoding_cache_index_file_path)
        conf.encoding_file_index = index_content[st.cencoding_key_index]
Example #3
0
    def configurate_cache(self):
        """Resolve cache-related paths and reset the encoding-cache fields.

        In the ``train`` phase ``self.cache_dir`` is settled (explicit
        ``params.cache_dir`` wins; otherwise it depends on ``self.mode`` and
        ``self.use_cache``) and ``self.problem_path`` / ``self.emb_pkl_path``
        are derived from it. In any other phase only ``self.problem_path`` is
        resolved, under ``self.save_base_dir``. Finally the md5 of the
        training data is computed (train phase with a training data path
        only) and every encoding-cache attribute is reset to its default.
        """
        # philly mode forces use_cache on
        if self.mode == 'philly':
            self.use_cache = True

        if self.phase == 'train':
            explicit_dir = getattr(self.params, 'cache_dir', None)
            if explicit_dir:
                self.cache_dir = explicit_dir
            elif self.mode != 'normal':
                # for philly mode, we can only save files in model_path or scratch_path
                self.cache_dir = os.path.join(self.save_base_dir, 'cache')
            elif self.use_cache is False:
                # cache disabled: use a randomly named directory under the
                # system temp dir
                tmp_root = tempfile.gettempdir()
                alphabet = string.ascii_letters + string.digits
                random_name = ''.join(random.sample(alphabet, 16))
                self.cache_dir = os.path.join(tmp_root, 'neuron_blocks',
                                              random_name)

            self.problem_path = os.path.join(self.cache_dir, 'problem.pkl')
            self.emb_pkl_path = (
                os.path.join(self.cache_dir, 'emb.pkl')
                if self.pretrained_emb_path is not None else None)
        else:
            # Prefer the hidden '.necessary_cache' location when its
            # problem.pkl exists, else fall back to 'necessary_cache'.
            hidden_path = os.path.join(self.save_base_dir, '.necessary_cache',
                                       'problem.pkl')
            if os.path.isfile(hidden_path):
                self.problem_path = hidden_path
            else:
                self.problem_path = os.path.join(
                    self.save_base_dir, 'necessary_cache', 'problem.pkl')

        # md5 of training data and problem
        self.train_data_md5 = None
        if self.phase == 'train' and self.train_data_path:
            logging.info("Calculating the md5 of traing data ...")
            self.train_data_md5 = md5([self.train_data_path])
            logging.info("the md5 of traing data is %s" %
                         (self.train_data_md5))

        # problem digest and encoding-cache fields start from a clean slate
        defaults = (
            ('problem_md5', None),
            ('encoding_cache_dir', None),
            ('encoding_cache_index_file_path', None),
            ('encoding_cache_index_file_md5_path', None),
            ('encoding_file_index', None),
            ('encoding_cache_legal_line_cnt', 0),
            ('encoding_cache_illegal_line_cnt', 0),
            ('load_encoding_cache_generator', None),
        )
        for attr_name, attr_value in defaults:
            setattr(self, attr_name, attr_value)
Example #4
0
    def _load_cache_config_from_conf(self):
        # training data
        self.train_data_md5 = None
        if self.phase == 'train' and self.train_data_path:
            logging.info("Calculating the md5 of traing data ...")
            self.train_data_md5 = md5([self.train_data_path])
            logging.info("the md5 of traing data is %s" %
                         (self.train_data_md5))

        # problem
        self.problem_md5 = None

        # encoding
        self.encoding_cache_dir = None
        self.encoding_cache_index_file_path = None
        self.encoding_cache_index_file_md5_path = None
        self.encoding_file_index = None
        self.encoding_cache_legal_line_cnt = 0
        self.encoding_cache_illegal_line_cnt = 0
        self.load_encoding_cache_generator = None