Example #1
    def convetText2Parquet(self):

        jobName = self.app.pargs.job

        log = self.app.log

        log.info("begin to convert hive table from textfile to parquet ...")
        start_time = time.time()
        script = op.join(op.abspath(op.join(op.abspath(__file__), "../../")),
                         "core/kafka/KafkaConverter.py")

        # run the ETL job
        if self.app.pargs.batch:
            # batch mode: run day by day
            for dt in pd.date_range(self.app.pargs.start_date,
                                    self.app.pargs.end_date,
                                    freq="D"):
                dt = dt.strftime("%Y%m%d")
                # cluster's python is python2.6.6
                bash = "python {main} --job={job} --date={dt}".format(
                    main=script, job=jobName, dt=dt)
                # run pyspark via a shell command to avoid cluster failures caused by shipping too many dependency files
                retval, msg = runShell(bash)
                if retval != 0:
                    log.error(
                        "batch mode ETL job={job} dt={dt} failed msg={msg}".
                        format(job=self.app.pargs.job, dt=dt, msg=msg))
                    exit(-1)

                log.info("batch mode ETL job={job} dt={dt} is complete".format(
                    job=self.app.pargs.job, dt=dt))
        else:
            # single-day run
            dt = self.app.pargs.execute_date
            # cluster's python is python2.6.6
            bash = "python {main} --job={job} --date={dt}".format(main=script,
                                                                  job=jobName,
                                                                  dt=dt)
            # run pyspark via a shell command to avoid cluster failures caused by shipping too many dependency files
            retval, msg = runShell(bash)
            if retval != 0:
                log.error(
                    "batch mode ETL job={job} dt={dt} failed msg={msg}".format(
                        job=self.app.pargs.job, dt=dt, msg=msg))
                exit(-1)

        log.info(
            "convert hive table from textfile to parquet is done, elasped:{}mins"
            .format((time.time() - start_time) / 60.0))
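
These examples shell out through a runShell helper that is assumed to return a (returncode, output) tuple; a minimal sketch of such a helper, built on the standard subprocess module, could look like the following (an assumption, not the project's actual implementation):

import subprocess


def runShell(cmd):
    # Hypothetical sketch of the runShell helper assumed by the examples:
    # run a shell command and return (returncode, combined stdout/stderr).
    proc = subprocess.Popen(cmd, shell=True,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)
    output, _ = proc.communicate()
    return proc.returncode, output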
Example #2
    def loadTsv2Hive(self):

        jobName = self.app.pargs.job
        hdfsDir = self.app.pargs.hdfs_dir
        sparkMaster = self.app.cfg['spark_master']

        log = self.app.log

        log.info("begin to load tsv file to hive table ...")
        start_time = time.time()
        script = op.join(op.abspath(op.join(op.abspath(__file__), "../../")), "core/bdp/BdpImportManager.py")

        # run the ETL job
        if self.app.pargs.batch:
            # batch mode: run day by day
            for dt in pd.date_range(self.app.pargs.start_date, self.app.pargs.end_date, freq="D"):
                dt = dt.strftime("%Y%m%d")
                # cluster's python is python2.6.6
                bash = "python {main} --master={master} --job={job} --hdfs_dir={hdfs} --date={dt}" \
                    .format(main=script, master=sparkMaster, job=jobName, hdfs=hdfsDir, dt=dt)
                # run pyspark via a shell command to avoid cluster failures caused by shipping too many dependency files
                retval, msg = runShell(bash)
                if retval != 0:
                    log.error("batch mode ETL job={job} dt={dt} failed msg={msg}".format(job=self.app.pargs.job, dt=dt, msg=msg))
                    exit(-1)

                log.info("batch mode ETL job={job} dt={dt} is complete".format(job=self.app.pargs.job, dt=dt))
        else:
            # single-day run
            dt = self.app.pargs.execute_date
            # cluster's python is python2.6.6
            bash = "python {main} --master={master} --job={job} --hdfs_dir={hdfs} --date={dt}" \
                .format(main=script, master=sparkMaster, job=jobName, hdfs=hdfsDir, dt=dt)
            # run pyspark via a shell command to avoid cluster failures caused by shipping too many dependency files
            retval, msg = runShell(bash)
            if retval != 0:
                log.error("batch mode ETL job={job} dt={dt} failed msg={msg}".format(job=self.app.pargs.job, dt=dt, msg=msg))
                exit(-1)

        log.info("load tsv file to hive is done, elasped:{}mins".format((time.time() - start_time) / 60.0))
Example #3
    def generateBiddingTraining(self):

        city = self.app.pargs.city
        start_date = self.app.pargs.start_date
        end_date = self.app.pargs.end_date

        log = self.app.log

        log.info("begin to generate bidding training data, city={city} start={start} end={end} ..."
                 .format(city=city, start=start_date, end=end_date))
        start_time = time.time()
        script = op.join(op.abspath(op.join(op.abspath(__file__), "../../")), "core/preprocess/OrderBasedBiddingMerger.py")

        # run the data conversion task
        bash = "python {main} --city={city} --start_date={start_date} --end_date={end_date}" \
                .format(main=script, city=city, start_date=start_date, end_date=end_date)
        # run pyspark via a shell command to avoid cluster failures caused by shipping too many dependency files
        retval, msg = runShell(bash)
        if retval != 0:
            log.error("generate bidding training data failed, msg={msg}".format(msg=msg))
            exit(-1)
Example #4
def text2parquet(job, date, hour=None):
    '''
    Convert a hive textfile table to parquet. Steps:
    0. Check that the parquet table has already been created.
    1. Read the job's metadata from the JobMeta dict, mainly the column desc and the number of partitions.
    2. Add the partition for the conversion date to the flume external table.
    3. Optional: for sources that need deduplication (mainly mysql binlog sources such as order and order_ext), keep only the latest row via row_number().
    4. Convert the data with "insert overwrite table XXX select COL1, COL2, ... COLn from XXX".

    :param job: name of the conversion job; must be a key of the JobMeta dict
    :param date: conversion date, e.g. 20170323
    :param hour: conversion hour, optional; currently only used by the dispatch detail job, e.g. 13
    :return:
    '''
    if job is None or len(job) == 0:
        raise ValueError("job is null or empty")

    if date is None or len(date) == 0:
        raise ValueError("date is null or empty")

    log = get_logger(NAME)
    t = time.time()

    table_from = JobMeta[job]['from']
    patitons = JobMeta[job]['patitons']
    table_to = JobMeta[job]['to']
    cols = getTitles(JobMeta[job]['desc'], JobMeta[job]['patitons'])
    where = JobMeta[job]['where']
    dropDuplicates = JobMeta[job]['dropDuplicates']

    log.info("hive textfile to parquet is start...")

    # decide whether to deduplicate
    if dropDuplicates:

        if job in ["service_order", "service_order_ext"]:
            # flume source external table
            addPartitionSQL = "ALTER TABLE {table_from} ADD IF NOT EXISTS PARTITION (dt={dt})". \
                format(table_from=table_from, dt=date)

            # target parquet
            where = where.format(dt=date)
            distinctData = "(select *, ROW_NUMBER() over (distribute by service_order_id sort by update_time desc) as rank from {table_from} {where}) tmp where rank=1 and operation<>'DELETE'" \
                .format(table_from=table_from, where=where)
            convertSQL = "insert overwrite table {table_to} partition (dt={dt}) select {cols} from {distinctData}" \
                .format(table_to=table_to, dt=date, cols=cols, distinctData=distinctData)
        else:
            raise ValueError("unknow job name for dropDuplicates=True")
    else:

        if patitons > 1:

            if hour is None:
                raise ValueError("job=dispatch_detail hour is null")

            if job == "dispatch_detail":

                # flume-ingested data creates two directories, hour=01 and hour=1; the real data sits under hour=01, so move it manually
                if hour < 10:
                    cmd = "hadoop fs -mv /tmp/kafka/dispatch_detail/dt={dt}/hour=0{hour}/* /tmp/kafka/dispatch_detail/dt={dt}/hour={hour}/ ".format(dt=date, hour=hour)
                    log.info("mv flume sink data output -> {cmd}".format(cmd=cmd))
                    subprocess.call(cmd, shell=True)

                # flume source external table
                addPartitionSQL = "ALTER TABLE {table_from} ADD IF NOT EXISTS PARTITION (dt={dt}, hour={hour})".format(
                    table_from=table_from, dt=date, hour=hour)
            else:
                raise ValueError("unknow job name for patitons>1")

            where = where.format(dt=date, hour=hour)
            convertSQL = "insert overwrite table {table_to} partition (dt={dt}, hour={hour}) select {cols} from {table_from} {where}" \
                .format(table_to=table_to, dt=date, hour=hour, cols=cols, table_from=table_from, where=where)
        else:
            # flume source external table
            addPartitionSQL = "ALTER TABLE {table_from} ADD IF NOT EXISTS PARTITION (dt={dt})". \
                format(table_from=table_from, dt=date)

            where = where.format(dt=date)
            convertSQL = " insert overwrite table {table_to} partition (dt={dt}) select {cols} from {table_from} {where}" \
                .format(table_to=table_to, dt=date, cols=cols, table_from=table_from, where=where)

    # assemble the final SQL
    auxlibSQL = "add jar hdfs:/libs/hive_auxlib/hive-contrib.jar"
    etlSQL = "{addAuxlib}; {addPartition}; {convert};".format(addAuxlib=auxlibSQL, addPartition=addPartitionSQL,
                                                              convert=convertSQL)
    # textfile -> parquet
    retval, msg = runShell("hive -e \"{sql}\"".format(sql=etlSQL))
    if retval != 0:
        log.error("convet hive from text to parquet failed, job={job} dt={dt} failed msg={msg}"
                  .format(job=job, dt=date, msg=msg))
        exit(-1)

    log.info("etl for {job} is complete! elasped: {time}".format(job=job, time=(time.time() - t)))
Example #5
        execute_date = arrow.get().to("Asia/Shanghai").replace(
            days=-1).format("YYYYMMDD")
    else:
        print "[TRACE] execute_date set by azkaban input flow parameters"
        execute_date = sys.argv[1]
    job = sys.argv[2]
    print "[INFO] execute_date = {dt} job={job}".format(dt=execute_date,
                                                        job=job)

    # step 1. fetch the bdp-exported tsv file from dm1 over scp
    filename = "{name}_{dt}.tsv".format(name=job, dt=execute_date)
    src_file_path = op.join(SRC_DIR, filename)
    dest_file_path = op.join(LOCAL_TSV_DIR, filename)
    bash = "sshpass -p {psw} scp {user}@{host}:{src} {dest}" \
        .format(user=SCP_USER, psw=SCP_PSW, host=SRC_HOST, src=src_file_path, dest=dest_file_path)
    retval, msg = runShell(bash)
    if retval != 0:
        print "[ERROR] scp tsv file from dm1 failed, msg={}".format(msg)
        exit(-1)

    # step 2. upload the file to hdfs, then delete the local copy
    runShell("hadoop fs -rm -f {}".format(op.join(HDFS_TSV_DIR, filename)))
    bash = "hadoop fs -put {file} {hdfs}/".format(file=dest_file_path,
                                                  hdfs=HDFS_TSV_DIR)
    retval, msg = runShell(bash)
    if retval != 0:
        print "[ERROR] upload tsv file to hdfs failed, msg={}".format(msg)
        exit(-1)
    runShell("rm -f {}".format(dest_file_path))

    # step 3. run the import script (pyspark), then delete the temporary file on hdfs
Example #6
UPLOAD_SCRIPT = op.join(PROJECT_ROOT_DIR, "dispatchETL/main.py")

if __name__ == "__main__":

    # validate the command-line arguments
    if len(sys.argv) != (2 + 1):
        print "[ERROR] args is invalid, inputs={},  e.g. python hiveText2Parquet.py 20170323 service_order".format(
            sys.argv)
        exit(-1)

    # step 0. resolve the execution date; execute_date in azkaban's global variables acts as a hook so that a backfill can be run from the web UI by passing an explicit execute_date
    if sys.argv[1] == "hook":
        print "[TRACE] execute_date set by azkaban global.properties hook"
        # by default, run the job for the previous day
        execute_date = arrow.get().to("Asia/Shanghai").replace(days=-1).format("YYYYMMDD")
    else:
        print "[TRACE] execute_date set by azkaban input flow parameters"
        execute_date = sys.argv[1]
    job = sys.argv[2]
    print "[INFO] execute_date = {dt} job={job}".format(dt=execute_date, job=job)

    bash = "python {main} KafkaHiveConverter convetText2Parquet -E prod --job {job} --execute_date {dt}" \
        .format(main=UPLOAD_SCRIPT, job=job, dt=execute_date)
    retval, msg = runShell(bash)
    if retval != 0:
        print "[ERROR] convert hive from kafka textfile to parquet failed, msg={}".format(msg)
        exit(-1)

    print "[NOTICE] all is done"
    exit(0)
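
The azkaban hook branch defaults the run to the previous calendar day; a minimal standard-library sketch of that default (the script above uses arrow pinned to Asia/Shanghai instead):

from datetime import datetime, timedelta

# Previous day in YYYYMMDD form; note this uses the server's local time,
# whereas the script above fixes the timezone to Asia/Shanghai via arrow.
execute_date = (datetime.now() - timedelta(days=1)).strftime("%Y%m%d")
print(execute_date)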
Example #7
        filename = "{name}_{dt}.txt".format(name=job, dt=execute_date)
        src_file_path = op.join(SRC_DIR_RC, filename)
        remote_file = "{user}@{host}:{src}".format(user=SCP_USER, host=SRC_HOST_RC, src=src_file_path)
    else:
        # device data is pushed to DM1
        filename = "{name}_{dt}.tsv".format(name=job, dt=execute_date)
        src_file_path = op.join(SRC_DIR, filename)
        remote_file = "{user}@{host}:{src}".format(user=SCP_USER, host=SRC_HOST, src=src_file_path)

    print "[INFO] execute_date = {dt} job={job}".format(dt=execute_date, job=job)

    # step 1. fetch the bdp-exported tsv file from dm1 over scp

    dest_file_path = op.join(LOCAL_TSV_DIR, filename)
    bash = "sshpass -p {psw} scp {remote_file} {dest}".format(psw=SCP_PSW, remote_file=remote_file, dest=dest_file_path)
    retval, msg = runShell(bash)
    if retval != 0:
        print "[ERROR] scp tsv file from dm1 failed, msg={}".format(msg)
        exit(-1)

    # step 2. run the import script to load the data into mysql
    # cluster's python is python2.6.6
    bash = "python {main} MysqlImportor {cmd} -E prod --job {job} --execute_date {dt}" \
        .format(main=UPLOAD_SCRIPT, cmd=cmd, job=job, dt=execute_date)
    retval, msg = runShell(bash)
    if retval != 0:
        print "[ERROR] python script to load tsv into mysql failed, msg={}".format(msg)
        exit(-1)
    runShell("rm -f {}".format(dest_file_path))

    print "[NOTICE] all is done"