def convetText2Parquet(self):
    jobName = self.app.pargs.job
    log = self.app.log
    log.info("begin to convert hive table from textfile to parquet ...")
    start_time = time.time()
    script = op.join(op.abspath(op.join(op.abspath(__file__), "../../")),
                     "core/kafka/KafkaConverter.py")
    # run the ETL job
    if self.app.pargs.batch:
        # batch mode (run day by day)
        for dt in pd.date_range(self.app.pargs.start_date,
                                self.app.pargs.end_date, freq="D"):
            dt = dt.strftime("%Y%m%d")
            # cluster's python is python2.6.6
            bash = "python {main} --job={job} --date={dt}".format(
                main=script, job=jobName, dt=dt)
            # run pyspark through a shell command: shipping too many dependency
            # files to the cluster is error-prone
            retval, msg = runShell(bash)
            if retval != 0:
                log.error("batch mode ETL job={job} dt={dt} failed msg={msg}".format(
                    job=jobName, dt=dt, msg=msg))
                exit(-1)
            log.info("batch mode ETL job={job} dt={dt} is complete".format(
                job=jobName, dt=dt))
    else:
        # single-day mode
        dt = self.app.pargs.execute_date
        # cluster's python is python2.6.6
        bash = "python {main} --job={job} --date={dt}".format(
            main=script, job=jobName, dt=dt)
        # run pyspark through a shell command: shipping too many dependency
        # files to the cluster is error-prone
        retval, msg = runShell(bash)
        if retval != 0:
            log.error("single mode ETL job={job} dt={dt} failed msg={msg}".format(
                job=jobName, dt=dt, msg=msg))
            exit(-1)
    log.info("convert hive table from textfile to parquet is done, elapsed: {} mins"
             .format((time.time() - start_time) / 60.0))
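All of these blocks shell out through runShell, which is defined elsewhere in the repo; the call sites only fix its contract to (returncode, output). A minimal sketch of what it is assumed to look like, as a thin subprocess wrapper kept python2.6-compatible (subprocess.check_output does not exist there):

import subprocess

def runShell(cmd):
    # assumed implementation, not the verified original: run the command in a
    # shell and return (returncode, combined stdout+stderr)
    proc = subprocess.Popen(cmd, shell=True,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)
    out, _ = proc.communicate()
    return proc.returncode, out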
def loadTsv2Hive(self):
    jobName = self.app.pargs.job
    hdfsDir = self.app.pargs.hdfs_dir
    sparkMaster = self.app.cfg['spark_master']
    log = self.app.log
    log.info("begin to load tsv file to hive table ...")
    start_time = time.time()
    script = op.join(op.abspath(op.join(op.abspath(__file__), "../../")),
                     "core/bdp/BdpImportManager.py")
    # run the ETL job
    if self.app.pargs.batch:
        # batch mode (run day by day)
        for dt in pd.date_range(self.app.pargs.start_date,
                                self.app.pargs.end_date, freq="D"):
            dt = dt.strftime("%Y%m%d")
            # cluster's python is python2.6.6
            bash = "python {main} --master={master} --job={job} --hdfs_dir={hdfs} --date={dt}" \
                .format(main=script, master=sparkMaster, job=jobName, hdfs=hdfsDir, dt=dt)
            # run pyspark through a shell command: shipping too many dependency
            # files to the cluster is error-prone
            retval, msg = runShell(bash)
            if retval != 0:
                log.error("batch mode ETL job={job} dt={dt} failed msg={msg}".format(
                    job=jobName, dt=dt, msg=msg))
                exit(-1)
            log.info("batch mode ETL job={job} dt={dt} is complete".format(
                job=jobName, dt=dt))
    else:
        # single-day mode
        dt = self.app.pargs.execute_date
        # cluster's python is python2.6.6
        bash = "python {main} --master={master} --job={job} --hdfs_dir={hdfs} --date={dt}" \
            .format(main=script, master=sparkMaster, job=jobName, hdfs=hdfsDir, dt=dt)
        # run pyspark through a shell command: shipping too many dependency
        # files to the cluster is error-prone
        retval, msg = runShell(bash)
        if retval != 0:
            log.error("single mode ETL job={job} dt={dt} failed msg={msg}".format(
                job=jobName, dt=dt, msg=msg))
            exit(-1)
    log.info("load tsv file to hive is done, elapsed: {} mins".format(
        (time.time() - start_time) / 60.0))
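For reference, the command line loadTsv2Hive assembles has this shape (the path and parameter values below are illustrative, not taken from a real config):

# python /path/to/core/bdp/BdpImportManager.py --master=yarn \
#     --job=service_order --hdfs_dir=/tmp/bdp_tsv --date=20170323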
def generateBiddingTraining(self):
    city = self.app.pargs.city
    start_date = self.app.pargs.start_date
    end_date = self.app.pargs.end_date
    log = self.app.log
    log.info("begin to generate bidding training data, city={city} start={start} end={end} ..."
             .format(city=city, start=start_date, end=end_date))
    start_time = time.time()
    script = op.join(op.abspath(op.join(op.abspath(__file__), "../../")),
                     "core/preprocess/OrderBasedBiddingMerger.py")
    # run the data conversion job
    bash = "python {main} --city={city} --start_date={start_date} --end_date={end_date}" \
        .format(main=script, city=city, start_date=start_date, end_date=end_date)
    # run pyspark through a shell command: shipping too many dependency
    # files to the cluster is error-prone
    retval, msg = runShell(bash)
    if retval != 0:
        log.error("generate bidding training data failed, msg={msg}".format(msg=msg))
        exit(-1)
    log.info("generate bidding training data is done, elapsed: {} mins".format(
        (time.time() - start_time) / 60.0))
def text2parquet(job, date, hour=None):
    '''
    Convert a hive textfile table to parquet. Steps:
    0. make sure the target parquet table has already been created
    1. fetch the job's metadata from the JobMeta dict, mainly the column desc and the number of partition keys
    2. add the partition for the conversion date to the flume external table
    3. optional: for sources that need deduplication (mainly mysql binlog sources such as order and order_ext), keep the latest record via row_number()
    4. convert the data via "insert overwrite table XXX select COL1, COL2, ... COLn from XXX"
    :param job: name of the conversion job, must be a key of the JobMeta dict
    :param date: conversion date, e.g. 20170323
    :param hour: conversion hour, optional, currently only used by the dispatch detail job, e.g. 13
    :return:
    '''
    if job is None or len(job) == 0:
        raise ValueError("job is null or empty")
    if date is None or len(date) == 0:
        raise ValueError("date is null or empty")
    log = get_logger(NAME)
    t = time.time()
    table_from = JobMeta[job]['from']
    partitions = JobMeta[job]['patitons']
    table_to = JobMeta[job]['to']
    cols = getTitles(JobMeta[job]['desc'], JobMeta[job]['patitons'])
    where = JobMeta[job]['where']
    dropDuplicates = JobMeta[job]['dropDuplicates']
    log.info("hive textfile to parquet is starting ...")
    # deduplicate or not
    if dropDuplicates:
        if job in ["service_order", "service_order_ext"]:
            # flume source external table
            addPartitionSQL = "ALTER TABLE {table_from} ADD IF NOT EXISTS PARTITION (dt={dt})" \
                .format(table_from=table_from, dt=date)
            # target parquet table
            where = where.format(dt=date)
            distinctData = "(select *, ROW_NUMBER() over (distribute by service_order_id sort by update_time desc) as rank from {table_from} {where}) tmp where rank=1 and operation<>'DELETE'" \
                .format(table_from=table_from, where=where)
            convertSQL = "insert overwrite table {table_to} partition (dt={dt}) select {cols} from {distinctData}" \
                .format(table_to=table_to, dt=date, cols=cols, distinctData=distinctData)
        else:
            raise ValueError("unknown job name for dropDuplicates=True")
    else:
        if partitions > 1:
            if hour is None:
                raise ValueError("hour is null for job={job}".format(job=job))
            if job == "dispatch_detail":
                # flume sinks into both hour=01 and hour=1 directories; the actual
                # data lives under hour=01, so move it over manually
                if hour < 10:
                    cmd = "hadoop fs -mv /tmp/kafka/dispatch_detail/dt={dt}/hour=0{hour}/* /tmp/kafka/dispatch_detail/dt={dt}/hour={hour}/ ".format(dt=date, hour=hour)
                    log.info("mv flume sink data output -> {cmd}".format(cmd=cmd))
                    subprocess.call(cmd, shell=True)
                # flume source external table
                addPartitionSQL = "ALTER TABLE {table_from} ADD IF NOT EXISTS PARTITION (dt={dt}, hour={hour})".format(
                    table_from=table_from, dt=date, hour=hour)
            else:
                raise ValueError("unknown job name for partitions>1")
            where = where.format(dt=date, hour=hour)
            convertSQL = "insert overwrite table {table_to} partition (dt={dt}, hour={hour}) select {cols} from {table_from} {where}" \
                .format(table_to=table_to, dt=date, hour=hour, cols=cols, table_from=table_from, where=where)
        else:
            # flume source external table
            addPartitionSQL = "ALTER TABLE {table_from} ADD IF NOT EXISTS PARTITION (dt={dt})" \
                .format(table_from=table_from, dt=date)
            where = where.format(dt=date)
            convertSQL = "insert overwrite table {table_to} partition (dt={dt}) select {cols} from {table_from} {where}" \
                .format(table_to=table_to, dt=date, cols=cols, table_from=table_from, where=where)
    # assemble the final SQL
    auxlibSQL = "add jar hdfs:/libs/hive_auxlib/hive-contrib.jar"
    etlSQL = "{addAuxlib}; {addPartition}; {convert};".format(
        addAuxlib=auxlibSQL, addPartition=addPartitionSQL, convert=convertSQL)
    # textfile -> parquet
    retval, msg = runShell("hive -e \"{sql}\"".format(sql=etlSQL))
    if retval != 0:
        log.error("convert hive table from textfile to parquet failed, job={job} dt={dt} msg={msg}"
                  .format(job=job, dt=date, msg=msg))
        exit(-1)
    log.info("etl for {job} is complete! elapsed: {time}s".format(job=job, time=(time.time() - t)))
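getTitles is defined elsewhere in the repo; the call site only tells us it turns JobMeta's column desc plus the partition-key count into the projection list used in the INSERT ... SELECT. A minimal sketch under the assumption that desc is an ordered list of column names with the partition keys at the end (that layout is an assumption, not confirmed by the source):

def getTitles(desc, partitions):
    # hypothetical sketch: drop the trailing partition-key columns (they are
    # supplied by the PARTITION clause, not the SELECT list) and join the rest
    cols = desc[:-partitions] if partitions > 0 else desc
    return ", ".join(cols)

# e.g. getTitles(["service_order_id", "update_time", "dt"], 1)
#      -> "service_order_id, update_time"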
if sys.argv[1] == "hook":
    print "[TRACE] execute_date set by azkaban global.properties hook"
    # by default run the job for the previous day
    execute_date = arrow.get().to("Asia/Shanghai").replace(days=-1).format("YYYYMMDD")
else:
    print "[TRACE] execute_date set by azkaban input flow parameters"
    execute_date = sys.argv[1]
job = sys.argv[2]
print "[INFO] execute_date = {dt} job={job}".format(dt=execute_date, job=job)

# step 1. fetch the tsv file exported by bdp from dm1
filename = "{name}_{dt}.tsv".format(name=job, dt=execute_date)
src_file_path = op.join(SRC_DIR, filename)
dest_file_path = op.join(LOCAL_TSV_DIR, filename)
bash = "sshpass -p {psw} scp {user}@{host}:{src} {dest}" \
    .format(user=SCP_USER, psw=SCP_PSW, host=SRC_HOST, src=src_file_path, dest=dest_file_path)
retval, msg = runShell(bash)
if retval != 0:
    print "[ERROR] scp tsv file from dm1 failed, msg={}".format(msg)
    exit(-1)

# step 2. upload the file to hdfs, then delete the local copy
runShell("hadoop fs -rm -f {}".format(op.join(HDFS_TSV_DIR, filename)))
bash = "hadoop fs -put {file} {hdfs}/".format(file=dest_file_path, hdfs=HDFS_TSV_DIR)
retval, msg = runShell(bash)
if retval != 0:
    print "[ERROR] upload tsv file to hdfs failed, msg={}".format(msg)
    exit(-1)
runShell("rm -f {}".format(dest_file_path))

# step 3. run the import script (pyspark), then delete the temporary file on hdfs
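The excerpt cuts off at step 3. A hedged sketch of how it presumably continues, modeled on the azkaban wrapper in the next section: the driver entry point (UPLOAD_SCRIPT), the controller name BdpImporter, and the exact flags are assumptions, not the verified original.

# assumed continuation: invoke the loadTsv2Hive command shown earlier through
# the driver, then clean up the tsv staged on hdfs in step 2
bash = "python {main} BdpImporter loadTsv2Hive -E prod --job {job} --hdfs_dir {hdfs} --execute_date {dt}" \
    .format(main=UPLOAD_SCRIPT, job=job, hdfs=HDFS_TSV_DIR, dt=execute_date)
retval, msg = runShell(bash)
if retval != 0:
    print "[ERROR] load tsv into hive failed, msg={}".format(msg)
    exit(-1)
runShell("hadoop fs -rm -f {}".format(op.join(HDFS_TSV_DIR, filename)))
print "[NOTICE] all is done"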
UPLOAD_SCRIPT = op.join(PROJECT_ROOT_DIR, "dispatchETL/main.py")

if __name__ == "__main__":
    # validate arguments
    if len(sys.argv) != (2 + 1):
        print "[ERROR] args is invalid, inputs={}, e.g. python hiveText2Parquet.py 20170323 service_order".format(
            sys.argv)
        exit(-1)

    # step 0. resolve the execution date; execute_date in azkaban's global
    # properties acts as a hook so a backfill can be run from the web UI by
    # passing an explicit execute_date
    if sys.argv[1] == "hook":
        print "[TRACE] execute_date set by azkaban global.properties hook"
        # by default run the job for the previous day
        execute_date = arrow.get().to("Asia/Shanghai").replace(days=-1).format("YYYYMMDD")
    else:
        print "[TRACE] execute_date set by azkaban input flow parameters"
        execute_date = sys.argv[1]
    job = sys.argv[2]
    print "[INFO] execute_date = {dt} job={job}".format(dt=execute_date, job=job)

    bash = "python {main} KafkaHiveConverter convetText2Parquet -E prod --job {job} --execute_date {dt}" \
        .format(main=UPLOAD_SCRIPT, job=job, dt=execute_date)
    retval, msg = runShell(bash)
    if retval != 0:
        print "[ERROR] convert hive from kafka textfile to parquet failed, msg={}".format(msg)
        exit(-1)
    print "[NOTICE] all is done"
    exit(0)
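Both invocation modes of this wrapper follow from the argument check above; for reference:

# scheduled run driven by the azkaban hook (converts yesterday's partition):
#   python hiveText2Parquet.py hook service_order
# manual backfill with an explicit date:
#   python hiveText2Parquet.py 20170323 service_order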
filename = "{name}_{dt}.txt".format(name=job, dt=execute_date) src_file_path = op.join(SRC_DIR_RC, filename) remote_file = "{user}@{host}:{src}".format(user=SCP_USER, host=SRC_HOST_RC, src=src_file_path) else: # 设备数据是推送到DM1 filename = "{name}_{dt}.tsv".format(name=job, dt=execute_date) src_file_path = op.join(SRC_DIR, filename) remote_file = "{user}@{host}:{src}".format(user=SCP_USER, host=SRC_HOST, src=src_file_path) print "[INFO] execute_date = {dt} job={job}".format(dt=execute_date, job=job) # step 1. 从dm1远程获取bdp导出的tsv文件 dest_file_path = op.join(LOCAL_TSV_DIR, filename) bash = "sshpass -p {psw} scp {remote_file} {dest}".format(psw=SCP_PSW, remote_file=remote_file, dest=dest_file_path) retval, msg = runShell(bash) if retval != 0: print "[ERROR] scp tsv file from dm1 failed, msg={}".format(msg) exit(-1) # step 2. 执行导入脚本导入mysql # cluster's python is python2.6.6 bash = "python {main} MysqlImportor {cmd} -E prod --job {job} --execute_date {dt}" \ .format(main=UPLOAD_SCRIPT, cmd=cmd, job=job, dt=execute_date) retval, msg = runShell(bash) if retval != 0: print "[ERROR] python script to load tsv into mysql failed, msg={}".format(msg) exit(-1) runShell("rm -f {}".format(dest_file_path)) print "[NOTICE] all is done"