def _xlearning_submit(self, job_cmd, worker_num, data_names):
    # prepare result hdfs data path
    for name in data_names:
        path = os.path.join(self._hdfs_dir, f'{name}_pred')
        if hdfs.exists(path):
            hdfs.rm(path)
        hdfs.mkdir(path)

    inputs = [f'--input {self._hdfs_dir}/{i}#{i}' for i in data_names]
    cmd = ' '.join([
        f'{xlearning.XL_SUBMIT}',
        f'--app-type "tensorflow"',
        f'--app-name "prediction-{self._job_id}"',
        f'--cacheArchive {HDFS_CODE_CACHE}#libs,{PYTHON_ENV_CACHE}#python3',
        f'--launch-cmd "{job_cmd}"',
        f'--worker-memory {xlearning.WORKER_MEMORY}',
        f'--worker-num {worker_num}',
        # f'--worker-cores {XLearningConfig["worker_cores"]}',
        f'--ps-num {xlearning.PS_NUM}',
        f'--queue default',
        f'--user-path ./python3/bin',
        f'--board-enable false',
        f'--jars {xlearning.JARS}',
    ] + inputs)
    logger.info(cmd)
    run_cmd(cmd)
def readDays(self, start_date, end_date, prop, session=None, **kwargs):
    sqlList = self._feature.get_day_sql_list(start_date, end_date, **kwargs)
    retDf = None
    for s, d in sqlList:
        kwargs[self._feature._data_date_col] = d
        output_file = self._feature.get_output_name(d, **kwargs)
        output_path = hadoop_conf.HDFS_FEATURE_ROOT + '/' + self._feature._name + '/' + output_file
        df = None
        if hdfs.exists(output_path):
            logger.info(
                "feature {name} file {path} already exists, reusing it.".format(
                    name=self._feature._name, path=output_path))
            df = session.read.parquet(output_path)
        else:
            logger.info(
                "feature {name} file {path} does not exist, fetching data from clickhouse.".format(
                    name=self._feature._name, path=output_path))
            s = self.jdbc_sql(s)
            df = session.read.jdbc(self._url, s, properties=prop)
            df.write.parquet(path=output_path, mode='overwrite')
        if retDf is None:
            retDf = df
        else:
            retDf = retDf.union(df)
    return retDf
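# `jdbc_sql` is not shown in this excerpt. The sketch below is an assumption about
# what it likely does: wrap a raw query into the parenthesized-subquery-with-alias
# form that Spark's DataFrameReader.jdbc() accepts in place of a table name.
# The method name matches the call sites above; the alias is hypothetical.
def jdbc_sql(self, sql):
    # session.read.jdbc(url, table, ...) expects a table name or "(query) alias".
    return f'({sql}) AS tmp'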
def pack_libs(overwrite=True):
    path = os.path.join(os.getcwd(), LIB_NAME)
    zip_path = f'{LIB_NAME}.zip'
    zip_dir(path, zip_path)
    from conf.hadoop import HDFS_CODE_CACHE
    if hdfs.exists(HDFS_CODE_CACHE):
        if not overwrite:
            logger.info(f'{zip_path} already exists at {HDFS_CODE_CACHE}, skip upload.')
            return
        hdfs.rm(HDFS_CODE_CACHE)
    hdfs.mkdir(os.path.dirname(HDFS_CODE_CACHE))
    hdfs.put(zip_path, HDFS_CODE_CACHE)
    logger.info(f'successfully uploaded {zip_path} to {HDFS_CODE_CACHE}')
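# `zip_dir` is not defined in this excerpt. A minimal sketch, assuming it only needs
# to zip a directory tree into a single archive using the standard library
# (implementation details here are assumptions, not taken from the repo):
import os
import zipfile

def zip_dir(src_dir, zip_path):
    # Store each file with a path relative to src_dir's parent, so the archive
    # unpacks into a single top-level folder named after the library directory.
    base = os.path.dirname(src_dir.rstrip(os.sep))
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zf:
        for root, _, files in os.walk(src_dir):
            for name in files:
                full = os.path.join(root, name)
                zf.write(full, arcname=os.path.relpath(full, base))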
def _xlearning_submit(self, epoch, batch_size, worker_num, input_dim, data_name):
    # Recreate the HDFS output path for the trained model.
    output_path = os.path.join(self._hdfs_dir, self.get_model_name())
    if hdfs.exists(output_path):
        hdfs.rm(output_path)
    entrance = self.get_worker_entrance()
    logger.info('[%s] start xlearning submit ...', self._job_id)
    # Command executed on each XLearning worker.
    worker_cmd = ' '.join([
        f'{WORKER_PYTHON} {entrance}',
        f'--job_id={self._job_id}',
        f'--hdfs_dir={self._hdfs_dir}',
        f'--data={data_name}',
        f'--model={self.get_model_name()}',
        f'--log_dir={_training_log_dir}',
        f'--training_epochs={epoch}',
        f'--input_dim={input_dim}',
        f'--learning_rate={self._learning_rate}',
        f'--batch_size={batch_size}',
        f'--l2={self._l2}',
    ])
    # Submission command that launches the distributed training job.
    driver_cmd = ' '.join([
        f'{xlearning.XL_SUBMIT}',
        f'--app-type "tensorflow"',
        f'--app-name "CTR-{self._job_id}"',
        f'--launch-cmd "{worker_cmd}"',
        f'--input {self._hdfs_dir}/{data_name}#{data_name}',
        f'--output {self._hdfs_dir}/{self.get_model_name()}#{self.get_model_name()}',
        f'--board-logdir {_training_log_dir}',
        f'--cacheArchive {HDFS_CODE_CACHE}#libs,{PYTHON_ENV_CACHE}#python3',
        f'--worker-memory {xlearning.WORKER_MEMORY}',
        f'--worker-num {worker_num}',
        f'--worker-cores {xlearning.WORKER_CORES}',
        f'--ps-memory {xlearning.PS_MEMORY}',
        f'--ps-num {xlearning.PS_NUM}',
        f'--ps-cores {xlearning.PS_CORES}',
        f'--queue default',
        f'--user-path ./python3/bin',
        f'--jars {xlearning.JARS}',
        # '-Duser.timezone=UTC+0800',
    ])
    logger.info(driver_cmd)
    run_cmd(driver_cmd)
    logger.info('finished training process successfully.')
def init_hdfs_dir(hdfs_dir, clean_old=True):
    if hdfs.exists(hdfs_dir) and clean_old:
        hdfs.rm(hdfs_dir)
def clean_task_dir(runtime):
    if hdfs.exists(runtime.hdfs_dir):
        hdfs.rm(runtime.hdfs_dir)
    if os.path.exists(runtime.local_dir):
        shutil.rmtree(runtime.local_dir)
def readDaysWithSql(self, start_date, end_date, sql_template, output_template,
                    prop, batch_cond=None, use_jdbc=True, session=None,
                    suffix="", **kwargs):
    # Expand the date range (and optional batch conditions) into (sql, day, cond) triples.
    sqlList = []
    hdfs_files_list = []
    if batch_cond:
        for cond in batch_cond:
            kwargs.update(cond)
            sl = self._feature.get_day_sql_list(start_date, end_date,
                                                sql_template, **kwargs)
            for sql, day in sl:
                sqlList.append((sql, day, cond))
    else:
        sl = self._feature.get_day_sql_list(start_date, end_date, sql_template,
                                            **kwargs)
        for sql, day in sl:
            sqlList.append((sql, day, {}))

    ret_df = None
    for s, d, cond in sqlList:
        kwargs[self._feature._data_date_col] = d
        kwargs.update(cond)
        output_file = output_template.format(**kwargs) + suffix
        output_path = hadoop_conf.HDFS_FEATURE_ROOT + '/' + self._feature._name + '/' + output_file
        if hdfs.exists(output_path):
            logger.info(
                "feature {name} file {path} already exists, reusing it.".format(
                    name=self._feature._name, path=output_path))
            # Collect existing parquet files and read them in one pass at the end.
            hdfs_files_list.append(output_path)
            df = None
        else:
            logger.info(
                "feature {name} file {path} does not exist, fetching data from clickhouse.".format(
                    name=self._feature._name, path=output_path))
            if use_jdbc:
                s = self.jdbc_sql(s)
                df = session.read.jdbc(self._url, s, properties=prop)
            else:
                df = session.sql(s)
            df.write.parquet(path=output_path, mode='overwrite')
        if ret_df is None:
            ret_df = df
        elif df is not None:
            ret_df = ret_df.union(df)

    # Merge the freshly queried data with any parquet files that already existed.
    if hdfs_files_list:
        cached_df = session.read.parquet(*hdfs_files_list)
        ret_df = cached_df if ret_df is None else ret_df.union(cached_df)
    return ret_df
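# A hypothetical call sketch for readDaysWithSql. All parameter values, template
# placeholder names, and the `reader`/`spark` objects below are assumptions made
# for illustration, not taken from the repo: read a week of a feature split per
# country, falling back to ClickHouse over JDBC for days not yet cached on HDFS.
df = reader.readDaysWithSql(
    start_date='2020-01-01',
    end_date='2020-01-07',
    sql_template="select * from events where day = '{day}' and country = '{country}'",
    output_template='events_{day}_{country}',
    prop={'driver': 'ru.yandex.clickhouse.ClickHouseDriver'},
    batch_cond=[{'country': 'US'}, {'country': 'CN'}],
    use_jdbc=True,
    session=spark,
)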