def get_driver_num(**op_kwargs):
    """Count active drivers per driver attribute pair over a 10-minute window.

    Steps:
      1. Page through ``query_driver_city_serv`` (keyset pagination on the
         first column) and build ``driver_id -> "<col1>,<col2>"``
         (presumably city id and service type -- confirm against the query).
      2. Union the members of the ``active_a_driver`` and ``active_no_driver``
         Redis sets for the ten consecutive minutes starting at
         ``op_kwargs['ts']``.
      3. Tally members per lookup value (unknown drivers fall into ``'0,0'``)
         and insert one row per value with ``insert_driver_num``.

    Args:
        **op_kwargs: must contain ``ts`` shaped like
            ``YYYY-MM-DDTHH:MM:SS+offset``. The ``+offset`` suffix is dropped
            and the rest is converted with ``time.mktime``, i.e. interpreted
            in the process-local time zone.

    Raises:
        KeyError: if ``ts`` is missing from ``op_kwargs``.
    """
    # --- 1. driver lookup table -------------------------------------------
    driver_dic = {}
    conn = get_db_conn('mysql_oride_data_readonly')
    mcursor = conn.cursor()
    try:
        driver_id = -1
        while True:
            sql = query_driver_city_serv.format(id=driver_id)
            logging.info(sql)
            mcursor.execute(sql)
            conn.commit()
            page = mcursor.fetchall()
            if not page:
                break
            # Consume each page immediately instead of re-concatenating an
            # ever-growing tuple.
            for row in page:
                driver_dic[row[0]] = ",".join([str(row[1]), str(row[2])])
            # Resume after the last id seen on this page.
            driver_id = page[-1][0]
    finally:
        mcursor.close()
        conn.close()

    # --- 2. active members for the 10 minutes starting at ts --------------
    redis_conn = RedisHook(redis_conn_id='pika_85').get_conn()
    day_part, clock_part = op_kwargs['ts'].split('T')
    start_struct = time.strptime(
        day_part + ' ' + clock_part.split('+')[0], "%Y-%m-%d %H:%M:%S")
    timestamp = int(time.mktime(start_struct))
    dt_start = time.strftime('%Y%m%d%H%M', time.localtime(timestamp))

    a_member = set()
    no_member = set()
    for i in range(10):
        minute = time.strftime(
            '%Y%m%d%H%M', time.localtime(timestamp + i * 60))
        a_member.update(redis_conn.smembers(active_a_driver % minute))
        no_member.update(redis_conn.smembers(active_no_driver % minute))

    # --- 3. tally per lookup value ------------------------------------------
    driver_num = {}
    for field, members in (("a_mem", a_member), ("no_mem", no_member)):
        for mem in members:
            key = driver_dic.get(int(mem), '0,0')
            counters = driver_num.setdefault(key, {"a_mem": 0, "no_mem": 0})
            counters[field] += 1

    res = []
    for key, counters in driver_num.items():
        info = key.split(",")
        res.append([int(info[0]), int(info[1]), dt_start + '00',
                    counters["a_mem"], counters["no_mem"]])

    # --- 4. persist ------------------------------------------------------------
    conn = get_db_conn('mysql_bi')
    mcursor = conn.cursor()
    try:
        mcursor.executemany(insert_driver_num, res)
        logging.info('insert num %s, data %s', len(res), str(res))
        conn.commit()
    finally:
        mcursor.close()
        conn.close()
def not_pay_push(**op_kwargs):
    """Send a "not_pay" push to every user returned for unpaid orders.

    Repairs the Hive partitions of the order and whitelist tables, runs
    ``not_pay_hql`` for the given day to obtain ids, resolves them to user ids
    in MySQL with ``not_pay_sql`` in batches of 100, and calls ``send_push``
    once per distinct user id.

    Args:
        **op_kwargs:
            ds: partition date passed to ``not_pay_hql`` and
                ``get_lagos_timestamp``.
            env: ``'prod'`` (default) or ``'test'``; ``'test'`` switches to the
                ``*_dev`` Hive tables and the ``sqoop_db_test`` connection.
    """
    dt = op_kwargs.get('ds')
    env = op_kwargs.get('env', 'prod')
    lagos_9_clock_timestamp = get_lagos_timestamp(dt)

    table_name = 'data_order'
    table_name2 = 'data_user_whitelist'
    db_name = 'sqoop_db'
    if env == 'test':
        table_name += '_dev'
        table_name2 += '_dev'
        db_name += '_test'

    cursor = get_hive_cursor()
    try:
        cursor.execute("msck repair table oride_db.%s" % table_name)
        cursor.execute("msck repair table oride_db.%s" % table_name2)
        cursor.execute(
            not_pay_hql.format(
                table_name=table_name, table_name2=table_name2, dt=dt))
        res = [x[0] for x in cursor.fetchall()]
    finally:
        cursor.close()
    print("not pay order ids: %d" % len(res))

    step = 100
    uids = set()
    # Keep a handle on the connection so it can be closed afterwards.
    mysql_conn = get_db_conn(db_name)
    mysql_cursor = mysql_conn.cursor()
    try:
        for i in range(0, len(res), step):
            batch = [str(x) for x in res[i:i + step]]
            mysql_cursor.execute(not_pay_sql.format(ids=','.join(batch)))
            uids.update(rec[0] for rec in mysql_cursor.fetchall())
    finally:
        mysql_cursor.close()
        mysql_conn.close()

    print("not pay user ids: %d" % len(uids))
    print(uids)
    for uid in uids:
        send_push(env, 1, uid, lagos_9_clock_timestamp, "not_pay")
def get_driver_online_time(ds, **op_kwargs):
    """Collect per-driver rows in parallel and overwrite a Hive partition.

    Reads the upper driver id with the ``get_driver_id`` query, splits the id
    range ``1..max`` into chunks of 1000, runs ``get_driver_timerange`` for
    each chunk in its own process (all sharing one manager list), and, when
    any rows were produced, runs an ``INSERT OVERWRITE`` into
    ``oride_dw_ods.<table_name>`` partition ``dt=<ds>``.

    Args:
        ds: partition value used in the Hive statement.
        **op_kwargs: must contain ``ds_nodash``, forwarded to the workers.

    Raises:
        KeyError: if ``ds_nodash`` is missing.
    """
    dt = op_kwargs["ds_nodash"]

    conn = get_db_conn('timerange_conn_db')
    mcursor = conn.cursor()
    try:
        mcursor.execute(get_driver_id)
        result = mcursor.fetchone()
        conn.commit()
    finally:
        mcursor.close()
        conn.close()

    # No row, or a NULL value, means there is nothing to fan out over.
    if not result or result[0] is None:
        logging.info('max driver id not available, nothing to import')
        return
    max_driver_id = result[0]
    logging.info('max driver id %d', max_driver_id)

    id_list = list(range(1, max_driver_id + 1))
    part_size = 1000
    manager = Manager()
    rows = manager.list([])

    processes = []
    for index in range(0, max_driver_id, part_size):
        p = Process(
            target=get_driver_timerange,
            args=(id_list[index:index + part_size], dt, rows))
        processes.append(p)
        p.start()
    for p in processes:
        p.join()

    # Snapshot the proxy once rather than reading it through IPC repeatedly.
    collected = list(rows)
    if collected:
        query = """
            INSERT OVERWRITE TABLE oride_dw_ods.{tab_name} PARTITION (dt='{dt}')
            VALUES {value}
        """.format(dt=ds, value=','.join(collected), tab_name=table_name)
        logging.info('import_driver_online_time run sql:%s' % query)
        hive_hook = HiveCliHook()
        hive_hook.run_cli(query)
def write_email(**op_kwargs):
    """Build the daily HTML report and e-mail it with two CSV attachments.

    Queries ``QUERY_EMAIL_DATA`` for the range ``n_days_ago(ds,
    QUERY_DATA_RANGE)`` .. ``ds``; returns without sending when the query
    yields no rows. Otherwise renders one HTML row per entry of
    ``col_meaning`` (skipping indexes in ``not_show_indexs``), attaches
    ``/tmp/<ds>_driver_data.csv`` and ``/tmp/<ds>_online_driver_num.csv`` and
    sends the message through ``mail.opay-inc.com:25`` using STARTTLS.

    SMTP errors are printed, not raised. A missing attachment file still
    raises from ``open``.

    Args:
        **op_kwargs: ``ds`` is the report date.
    """
    dt = op_kwargs.get('ds')
    init_day = n_days_ago(dt, QUERY_DATA_RANGE)

    sql_conn = get_db_conn()
    sql_cursor = sql_conn.cursor()
    try:
        sql_cursor.execute(QUERY_EMAIL_DATA % (init_day, dt))
        res = sql_cursor.fetchall()
    finally:
        sql_cursor.close()
        sql_conn.close()
    if not res:
        return

    # Column 1 holds the date; render it as text so rows sort and print
    # consistently.
    arr = []
    for rec in res:
        rec = list(rec)
        rec[1] = rec[1].strftime('%Y-%m-%d')
        arr.append(rec)
    arr.sort(key=lambda x: x[1], reverse=True)

    h = mail_msg_header.format(dt1=arr[0][1], dt2=arr[-1][1])
    for x, meaning in enumerate(col_meaning):
        if x in not_show_indexs:
            continue
        h += part_html1.format(key=meaning)
        is_ratio = "ratio" in meaning or "rate" in meaning or "/" in meaning
        # The first metric row uses a different cell template.
        cell_tpl = part_html2 if x > 0 else part_html2_1
        for row in arr:
            tmp_val = row[x + 1]
            if tmp_val is None:
                tmp_val = "-"
            elif is_ratio:
                tmp_val = "%.2f%%" % (tmp_val * 100)
            h += cell_tpl.format(val=tmp_val)
        h += part_html3
    h += mail_msg_tail
    h += css_style

    message = MIMEMultipart()
    subject = 'Oride {dt1} -- {dt2} Daily Report'.format(
        dt1=arr[0][1], dt2=arr[-1][1])
    message['Subject'] = Header(subject, 'utf-8')
    message.attach(MIMEText(h, 'html', 'utf-8'))

    def _attach_csv(path, download_name):
        # Read the file inside a context manager so the handle is released.
        with open(path, 'r') as csv_file:
            part = MIMEText(csv_file.read(), 'plain', 'utf-8')
        part["Content-Type"] = 'application/octet-stream'
        part["Content-Disposition"] = (
            'attachment; filename="%s"' % download_name)
        message.attach(part)

    _attach_csv("/tmp/%s_driver_data.csv" % dt, "driver_stat_%s.csv" % dt)
    _attach_csv("/tmp/%s_online_driver_num.csv" % dt,
                "%s_online_driver_num.csv" % dt)

    server = None
    try:
        server = smtplib.SMTP('mail.opay-inc.com', 25)
        server.ehlo()
        server.starttls()
        server.login(sender, password)
        server.sendmail(sender, receivers, message.as_string())
        print("邮件发送成功")
    except smtplib.SMTPException as e:
        # Exceptions have no ``.message`` attribute on Python 3.
        print(e)
    finally:
        if server is not None:
            try:
                server.quit()
            except smtplib.SMTPException:
                # The session is already broken; nothing left to release.
                pass
def hiveresult_to_channel_mysql(ds, **kwargs):
    """Copy a 4-column Hive result into MySQL with multi-row INSERTs.

    Runs ``kwargs['sql']`` (formatted with ``ds``) on Hive, then writes the
    ``(day, channel, driver_type, drivers)`` rows to the
    ``opay_spread_mysql`` connection in statements of at most 1000 rows, each
    assembled as ``kwargs['sql_insert'] + values + kwargs['sql_ext']``. A
    single commit is issued at the end.

    Args:
        ds: date substituted into the Hive query.
        **kwargs: must provide ``sql``, ``sql_insert`` and ``sql_ext``.
    """
    hive_cur = get_hive_cursor()
    hql = kwargs['sql'].format(ds=ds)
    logging.info(hql)
    hive_cur.execute(hql)
    rows = hive_cur.fetchall()

    mysql_conn = get_db_conn('opay_spread_mysql')
    mysql_cur = mysql_conn.cursor()
    head = kwargs['sql_insert']
    tail = kwargs['sql_ext']

    row_tpl = "('{day}', '{channel}', '{driver_type}', '{dirvers}')"
    batch = []
    for day, channel, driver_type, drivers in rows:
        batch.append(row_tpl.format(
            day=day, channel=channel, driver_type=driver_type,
            dirvers=drivers))
        # Flush every 1000 rows to keep each statement bounded.
        if len(batch) >= 1000:
            mysql_cur.execute(head + ' ' + ','.join(batch) + ' ' + tail)
            batch = []
    if batch:
        mysql_cur.execute(head + ' ' + ','.join(batch) + ' ' + tail)

    mysql_conn.commit()
    hive_cur.close()
    mysql_cur.close()
    mysql_conn.close()
def __get_mysql_table_schema(self, mysql_db, mysql_table, mysql_conn):
    """Return the column layout of a MySQL table, mapped to Hive types.

    A cursor is created once per connection id and cached in
    ``self.mysql_cursor``. Column metadata is read from
    ``information_schema.COLUMNS`` ordered by ordinal position.

    Args:
        mysql_db: schema (database) name of the table.
        mysql_table: table name.
        mysql_conn: connection id passed to ``get_db_conn`` and used as the
            cursor cache key.

    Returns:
        list[dict]: one dict per column with keys ``column`` (name),
        ``column_info`` (Hive column definition with comment) and
        ``column_type`` (upper-cased MySQL data type). Types missing from
        ``self.mysql_type_to_hive`` map to ``string``.
    """
    mcursor = self.mysql_cursor.get(mysql_conn, None)
    if not mcursor:
        sqlconn = get_db_conn(mysql_conn)
        mcursor = self.mysql_cursor[mysql_conn] = sqlconn.cursor()

    # Bind the identifiers as parameters (the ``%s`` paramstyle is already
    # used with these connections) instead of formatting them into the SQL.
    sql = '''
        SELECT COLUMN_NAME, DATA_TYPE, COLUMN_COMMENT, COLUMN_TYPE
        FROM information_schema.COLUMNS
        WHERE TABLE_SCHEMA=%s AND TABLE_NAME=%s
        ORDER BY ORDINAL_POSITION
    '''
    params = (mysql_db, mysql_table)
    logging.info('%s params=%s', sql, params)
    mcursor.execute(sql, params)
    res = mcursor.fetchall()

    mysql_schema = []
    for (column_name, data_type, column_comment, column_type) in res:
        upper_type = data_type.upper()
        mysql_schema.append({
            'column': column_name,
            'column_info': "`%s` %s comment '%s'" % (
                column_name,
                self.mysql_type_to_hive.get(upper_type, 'string'),
                column_comment),
            'column_type': upper_type.strip()
        })
    logging.info(mysql_schema)
    return mysql_schema
def get_data_from_hive(ds, execution_date, **op_kwargs):
    """Load the last two hourly NG partitions of the CICO summary into MySQL.

    Queries ``opay_dw.app_opay_cico_sum_ng_h`` for rows whose ``dt hour``
    lies between the previous and the current local hour (computed by the
    ``default.localTime`` UDF from ``execution_date``), then calls
    ``__data_only_mysql`` followed by ``__data_to_mysql`` on a ``mysql_bi``
    cursor.

    Args:
        ds: Airflow date string; not used by the query.
        execution_date: datetime-like object; formatted as
            ``%Y-%m-%d %H:%M:%S`` for the UDF.
        **op_kwargs: ignored.
    """
    hql = '''
        SELECT
            create_date_hour,
            sub_service_type,
            state,
            region,
            order_status,
            order_cnt,
            order_amt,
            country_code,
            dt,
            hour
        from opay_dw.app_opay_cico_sum_ng_h
        where country_code = 'NG'
            and concat(dt,' ',hour) >= date_format(default.localTime("{config}", 'NG', '{v_date}', -1), 'yyyy-MM-dd HH')
            and concat(dt,' ',hour) <= date_format(default.localTime("{config}", 'NG', '{v_date}', 0), 'yyyy-MM-dd HH')
    '''.format(
        v_date=execution_date.strftime("%Y-%m-%d %H:%M:%S"),
        config=config
    )
    logging.info(hql)

    hive_cursor = get_hive_cursor()
    try:
        hive_cursor.execute(hql)
        hive_data = hive_cursor.fetchall()
    finally:
        hive_cursor.close()

    mysql_conn = get_db_conn('mysql_bi')
    mcursor = mysql_conn.cursor()
    try:
        # NOTE(review): no commit is issued here; presumably the helpers or
        # the connection's autocommit setting handle it -- confirm.
        __data_only_mysql(mcursor, execution_date)
        __data_to_mysql(
            mcursor,
            hive_data,
            [
                'create_date_hour', 'sub_service_type', 'state', 'region',
                'order_status', 'order_cnt', 'order_amt', 'country_code',
                'dt', 'hour'
            ]
        )
    finally:
        mcursor.close()
        mysql_conn.close()
def csresult_channel_to_mysql(ds, **kwargs):
    """Copy the ``cssql`` Hive result into ``promoter_order_day``.

    Runs ``cssql`` (formatted with ``ds``) on Hive and inserts the 16-column
    rows into MySQL (``opay_spread_mysql``) in multi-row statements of at
    most 1000 rows, followed by one commit.

    Backslashes are stripped from ``name`` and single quotes escaped before
    it is embedded in the statement.

    Args:
        ds: date substituted into ``cssql``.
        **kwargs: ignored.
    """
    # NOTE(review): the original declared an unfinished
    # ``ON DUPLICATE KEY UPDATE`` suffix that was never appended to the
    # statement, so this remains a plain INSERT. Re-running a day may fail or
    # duplicate rows depending on the table's keys -- confirm intent.
    sql_insert = '''
        INSERT INTO promoter_order_day (
            dt, driver_id, driver_type, name, mobile, city_id, distance,
            income, online_paid, online_total, total_orders, arrived_orders,
            total_comments, bad_comments, total_score, online_time
        ) VALUES
    '''
    row_tpl = (
        "('{dt}', '{driver_id}', '{driver_type}', '{name}', '{mobile}', "
        "'{city_id}', '{distance}', '{income}', '{online_paid}', "
        "'{online_total}', '{total_orders}', '{arrived_orders}', "
        "'{total_comments}', '{bad_comments}', '{total_score}', "
        "'{online_time}')"
    )

    hql = cssql.format(ds=ds)
    logging.info(hql)
    cursor = get_hive_cursor()
    try:
        cursor.execute(hql)
        results = cursor.fetchall()
    finally:
        cursor.close()

    mysql_conn = get_db_conn('opay_spread_mysql')
    mcursor = mysql_conn.cursor()
    try:
        batch = []
        for (driver_id, dt, name, phone, city, driver_type, distance, income,
             online_settlement, online_total, total_orders, arrived_orders,
             comment, badcomments_num, score, onlinetime) in results:
            batch.append(row_tpl.format(
                dt=dt,
                driver_id=driver_id,
                driver_type=driver_type,
                name=name.replace("\\", "").replace("'", "\\'"),
                mobile=phone,
                city_id=city,
                distance=distance,
                income=income,
                online_paid=online_settlement,
                online_total=online_total,
                total_orders=total_orders,
                arrived_orders=arrived_orders,
                total_comments=comment,
                bad_comments=badcomments_num,
                total_score=score,
                online_time=onlinetime))
            # Flush every 1000 rows to keep each statement bounded.
            if len(batch) >= 1000:
                mcursor.execute(sql_insert + ' ' + ','.join(batch))
                batch = []
        if batch:
            mcursor.execute(sql_insert + ' ' + ','.join(batch))
        mysql_conn.commit()
    finally:
        mcursor.close()
        mysql_conn.close()
def __getOpaySpreadDrivers():
    """Load each signed-up driver's team, group and city as a DataFrame.

    City ids are translated to names through ``__getcityList()``; ids absent
    from that mapping become ``'other'``.

    Returns:
        pandas.DataFrame with columns ``team_id``, ``city``, ``team_name``,
        ``group_name`` and ``driver_id``, or ``None`` when any step fails
        (the error is logged with its traceback).
    """
    try:
        citys = __getcityList()
        logging.info(citys)
        mysql_conn = get_db_conn('opay_spread_mysql')
        try:
            spread_db = mysql_conn.cursor()
            # NOTE: an alternative filter ``where g.del = 0 and t.del = 0``
            # was left commented out next to this query in the original.
            msql = '''
                select
                    min(if(isnull(gt.team_id), 0, gt.team_id)),
                    min(if(isnull(gt.city), 0, gt.city)),
                    min(if(isnull(gt.team_name), 'other', gt.team_name)),
                    min(if(isnull(gt.group_name), 'other', gt.group_name)),
                    r.driver_id
                from rider_signups r
                left join (select
                        t.id as team_id,
                        t.city,
                        t.name as team_name,
                        g.name as group_name
                    from driver_group g
                    left join driver_team t on g.id = t.group_id
                    where g.del = 0
                ) gt on gt.team_id = r.team_id
                where r.driver_id > 0
                group by r.driver_id
            '''
            logging.info(msql)
            spread_db.execute(msql)
            results = spread_db.fetchall()
            spread_db.close()
        finally:
            # Release the connection even when the query fails.
            mysql_conn.close()

        group_info = {
            'team_id': [],
            'city': [],
            'team_name': [],
            'group_name': [],
            'driver_id': []
        }
        for (team_id, city, team_name, group_name, driver_id) in results:
            group_info['team_id'].append(team_id)
            group_info['city'].append(citys.get(int(city), 'other'))
            group_info['team_name'].append(team_name)
            group_info['group_name'].append(group_name)
            group_info['driver_id'].append(driver_id)
        return pandas.DataFrame(group_info)
    except Exception:
        # Catch Exception rather than BaseException so that
        # KeyboardInterrupt / SystemExit still propagate.
        logging.exception('failed to load opay spread drivers')
        return None
def __getOrideOrders(st, ed):
    """Aggregate per-driver order counters for the window ``[st, ed)``.

    Only orders with ``driver_serv_type = 2`` whose take or arrive time falls
    inside the window are considered. The session time zone is set to
    ``+1:00`` before the query runs.

    Args:
        st: window start, substituted into comparisons against
            ``take_time`` / ``finish_time`` / ``arrive_time`` (presumably a
            unix timestamp -- confirm against the caller).
        ed: window end (exclusive), same unit as ``st``.

    Returns:
        pandas.DataFrame with columns ``driver_id``, ``ordertakes``,
        ``orderfinishs``, ``orderarrives``, ``drivertakes``,
        ``driverfinishs``, ``driverarrives`` and ``driver5arrives`` (all
        counters cast to ``int``), or ``None`` when any step fails (the error
        is logged with its traceback).
    """
    counter_names = [
        'ordertakes', 'orderfinishs', 'orderarrives', 'drivertakes',
        'driverfinishs', 'driverarrives', 'driver5arrives'
    ]
    try:
        mysql_conn = get_db_conn('sqoop_db')
        try:
            oride_db = mysql_conn.cursor()
            msql = '''
                select
                    driver_id,
                    count(distinct if(take_time>={st} and take_time<{ed}, id, null)) as ordertakes,
                    count(distinct if(finish_time>={st} and finish_time<{ed}, id, null)) as orderfinishs,
                    count(distinct if(arrive_time>={st} and arrive_time<{ed}, id, null)) as orderarrives,
                    if(count(distinct if(take_time>={st} and take_time<{ed}, id, null))>0, 1, 0) as drivertakes,
                    if(count(distinct if(finish_time>={st} and finish_time<{ed}, id, null))>0, 1, 0) as driverfinishs,
                    if(count(distinct if(arrive_time>={st} and arrive_time<{ed}, id, null))>0, 1, 0) as driverarrives,
                    if(count(distinct if(arrive_time>={st} and arrive_time<{ed}, id, null))>=5, 1, 0) as driver5arrives
                from data_order
                where ((arrive_time >= {st} and arrive_time < {ed}) or
                       (take_time >= {st} and take_time < {ed}))
                    and driver_serv_type = 2
                group by driver_id
            '''.format(st=st, ed=ed)
            logging.info(msql)
            oride_db.execute("set time_zone = '+1:00'")
            oride_db.execute(msql)
            results = oride_db.fetchall()
            oride_db.close()
        finally:
            # Release the connection even when the query fails.
            mysql_conn.close()

        driver_info = {'driver_id': []}
        for counter in counter_names:
            driver_info[counter] = []
        for row in results:
            driver_info['driver_id'].append(row[0])
            # Columns 1..7 follow the order of ``counter_names``.
            for counter, value in zip(counter_names, row[1:]):
                driver_info[counter].append(int(value))
        return pandas.DataFrame(driver_info)
    except Exception:
        # Catch Exception rather than BaseException so that
        # KeyboardInterrupt / SystemExit still propagate.
        logging.exception('failed to load oride orders')
        return None
def first_user_data(**op_kwargs):
    """Upsert per-referral-code first-order user counts for one day.

    Runs a Hive query that joins referral bindings with each passenger's
    first order (status 4 or 5) arriving on ``ds``, then writes the
    ``(code, day, pft, create_time)`` rows to ``promoter_data_day`` in
    batches of up to 1000 rows with ``ON DUPLICATE KEY UPDATE``.

    Args:
        **op_kwargs: ``ds`` is the partition date.
    """
    dt = op_kwargs.get('ds')
    hql = """
        SELECT
            uc.code,
            from_unixtime(unix_timestamp(uo.dt,'yyyy-MM-dd'), 'yyyyMMdd') AS day,
            COUNT(DISTINCT uo.user_id) AS u,
            unix_timestamp()
        FROM (SELECT
                user_id,
                get_json_object(event_value, '$.bind_refferal_code') AS code
            FROM oride_dw.dwd_oride_driver_cheating_detection_hi
            ) AS uc
        JOIN (SELECT
                dt,
                passenger_id as user_id,
                arrive_time,
                row_number() over(partition by passenger_id order by arrive_time) orders
            FROM oride_dw.dwd_oride_order_base_include_test_di
            WHERE status IN (4,5) AND dt = '{ds}'
            ) AS uo
        ON uc.user_id = uo.user_id
        WHERE uo.orders = 1 and from_unixtime(uo.arrive_time,'yyyy-MM-dd') = '{ds}'
        GROUP BY uc.code, uo.dt
    """.format(ds=dt)

    cursor = get_hive_cursor()
    try:
        cursor.execute("SET mapreduce.job.queuename=root.airflow")
        cursor.execute("SET hive.exec.parallel=true")
        logging.info(hql)
        cursor.execute(hql)
        res = cursor.fetchall()
    finally:
        cursor.close()

    sql = 'insert into promoter_data_day (code, day, pft, create_time) values '
    ext = ' on duplicate key update pft=values(pft), create_time=values(create_time)'

    mconn = get_db_conn('opay_spread_mysql')
    mysql = mconn.cursor()
    try:
        vals = []
        for (c, d, p, t) in res:
            vals.append(
                "('{c}', '{d}', '{p}', '{t}')".format(c=c, d=d, p=p, t=t))
            if len(vals) >= 1000:
                mysql.execute(sql + ",".join(vals) + ext)
                vals = []
        if vals:
            mysql.execute(sql + ",".join(vals) + ext)
        # Commit explicitly, as the other loaders for this connection do, so
        # the rows persist even when autocommit is disabled.
        mconn.commit()
    finally:
        mysql.close()
        mconn.close()
def create_hive_external_table(db, table, conn, **op_kwargs):
    """Create the Hive ODS table that mirrors a MySQL table.

    First asks ``SqoopSchemaUpdate.update_hive_schema`` to sync the schema
    (its return value is not used), then reads the MySQL column metadata
    from ``information_schema.COLUMNS`` and runs ``ods_create_table_hql``
    through ``HiveCliHook``.

    DECIMAL columns keep their MySQL column type with the
    ``unsigned``/``signed`` keywords removed. Every other type is mapped via
    ``mysql_type_to_hive``, falling back to ``string``.

    Args:
        db: MySQL schema name.
        table: MySQL table name; also substituted into the ``hive_table`` and
            ``hdfs_path`` templates as ``bs``.
        conn: connection id for ``get_db_conn``.
        **op_kwargs: ignored.
    """
    target_table = hive_table.format(bs=table)

    schema_updater = SqoopSchemaUpdate()
    schema_updater.update_hive_schema(
        hive_db=hive_db,
        hive_table=target_table,
        mysql_db=db,
        mysql_table=table,
        mysql_conn=conn
    )

    mysql_conn = get_db_conn(conn)
    meta_cursor = mysql_conn.cursor()
    meta_sql = '''
        select
            COLUMN_NAME,
            DATA_TYPE,
            COLUMN_COMMENT,
            COLUMN_TYPE
        from information_schema.COLUMNS
        where TABLE_SCHEMA='{db}' and
            TABLE_NAME='{table}'
        order by ORDINAL_POSITION
    '''.format(db=db, table=table)
    meta_cursor.execute(meta_sql)
    column_rows = meta_cursor.fetchall()

    columns = []
    for (col_name, data_type, comment, full_type) in column_rows:
        if data_type.upper() == 'DECIMAL':
            hive_type = full_type.replace('unsigned', '').replace('signed', '')
        else:
            hive_type = mysql_type_to_hive.get(data_type.upper(), 'string')
        columns.append("`%s` %s comment '%s'" % (col_name, hive_type, comment))
    mysql_conn.close()

    # SQL that creates the Hive table.
    hql = ods_create_table_hql.format(
        db_name=hive_db,
        table_name=target_table,
        columns=",\n".join(columns),
        hdfs_path=hdfs_path.format(bs=table)
    )
    logging.info(hql)
    runner = HiveCliHook()
    logging.info('Executing: %s', hql)
    runner.run_cli(hql)
def base_data(**op_kwargs):
    """Upsert per-referral-code bound user and device counts for one day.

    Runs a Hive query over ``dwd_oride_driver_cheating_detection_hi`` for
    partition ``ds`` and writes the
    ``(code, day, users_count, device_count, create_time)`` rows to
    ``promoter_data_day`` in batches of up to 1000 rows with
    ``ON DUPLICATE KEY UPDATE``.

    Args:
        **op_kwargs: ``ds`` is the partition date.
    """
    dt = op_kwargs.get('ds')
    hql = """
        SELECT
            t.code,
            from_unixtime(unix_timestamp(dt,'yyyy-MM-dd'), 'yyyyMMdd') as day,
            COUNT(DISTINCT t.bind_number) as users_count,
            COUNT(DISTINCT if (length(t.bind_device)>0, t.bind_device, NULL)) as device_count,
            unix_timestamp()
        FROM oride_dw.dwd_oride_driver_cheating_detection_hi
        LATERAL VIEW json_tuple(event_value, 'bind_refferal_code', 'bind_number', 'bind_device_id') t AS code, bind_number, bind_device
        WHERE dt = '{ds}'
        GROUP BY t.code, dt
    """.format(ds=dt)

    cursor = get_hive_cursor()
    try:
        cursor.execute("SET mapreduce.job.queuename=root.airflow")
        cursor.execute("SET hive.exec.parallel=true")
        logging.info(hql)
        cursor.execute(hql)
        res = cursor.fetchall()
    finally:
        cursor.close()

    sql = 'insert into promoter_data_day (code, day, users_count, device_count, create_time) values '
    ext = """
        on duplicate key update
            users_count=values(users_count),
            device_count=values(device_count),
            create_time=values(create_time)
    """

    mconn = get_db_conn('opay_spread_mysql')
    mysql = mconn.cursor()
    try:
        vals = []
        for (code, day, users, device, t) in res:
            vals.append("('{code}', '{day}', '{user}', '{d}', '{t}')".format(
                code=code, day=day, user=users, d=device, t=t))
            if len(vals) >= 1000:
                mysql.execute(sql + ",".join(vals) + ext)
                vals = []
        if vals:
            mysql.execute(sql + ",".join(vals) + ext)
        # Commit explicitly, as the other loaders for this connection do, so
        # the rows persist even when autocommit is disabled.
        mconn.commit()
    finally:
        mysql.close()
        mconn.close()
def __getcityList():
    """Return ``{city_id: city_name}`` read from ``data_city_conf``.

    Returns:
        dict: the mapping, or an empty dict when the lookup fails (the error
        is logged with its traceback).
    """
    try:
        mysql_conn = get_db_conn('sqoop_db')
        try:
            oride_db = mysql_conn.cursor()
            oride_db.execute('''
                select id, name from data_city_conf
            ''')
            results = oride_db.fetchall()
            oride_db.close()
        finally:
            # Release the connection even when the query fails.
            mysql_conn.close()
        return {city_id: city_name for (city_id, city_name) in results}
    except Exception:
        # Catch Exception rather than BaseException so that
        # KeyboardInterrupt / SystemExit still propagate.
        logging.exception('failed to load city list')
        return {}
def order_result_to_mysql(ds, **kwargs):
    """Upsert first-bill flags from Hive into ``promoter_driver_day``.

    Runs ``promoter_orderoverview_hql`` (formatted with ``ds``) and writes
    the rows in multi-row statements of at most 1000 rows, then commits once.
    Per row: backslashes are stripped from ``name`` and single quotes
    escaped; ``mobile`` values of 20 or more characters become empty;
    ``firstbill`` is taken from the row only when ``driver_type == 2``,
    otherwise 0. The eighth result column is read but not stored.

    Args:
        ds: date substituted into the Hive query.
        **kwargs: ignored.
    """
    hive_cur = get_hive_cursor()
    hql = promoter_orderoverview_hql.format(ds=ds)
    logging.info(hql)
    hive_cur.execute(hql)
    rows = hive_cur.fetchall()

    mysql_conn = get_db_conn('opay_spread_mysql')
    mysql_cur = mysql_conn.cursor()
    head = 'INSERT INTO promoter_driver_day (day, name, mobile, code, channel, driver_type, firstbill) VALUES'
    tail = 'ON DUPLICATE KEY UPDATE firstbill = values(firstbill)'
    row_tpl = "('{day}', '{name}', '{mobile}', '{code}', '{channel}', '{driver_type}', '{firstbill}')"

    batch = []
    for day, driver_type, channel, name, mobile, code, first, _ten in rows:
        if len(mobile) < 20:
            safe_mobile = mobile
        else:
            safe_mobile = ''
        if driver_type == 2:
            first_bill = first
        else:
            first_bill = 0
        batch.append(row_tpl.format(
            day=day,
            name=name.replace("\\", "").replace("'", "\\'"),
            mobile=safe_mobile,
            code=code,
            channel=channel,
            driver_type=driver_type,
            firstbill=first_bill))
        # Flush every 1000 rows to keep each statement bounded.
        if len(batch) >= 1000:
            mysql_cur.execute(head + ' ' + ','.join(batch) + ' ' + tail)
            batch = []
    if batch:
        mysql_cur.execute(head + ' ' + ','.join(batch) + ' ' + tail)

    mysql_conn.commit()
    hive_cur.close()
    mysql_cur.close()
    mysql_conn.close()
def dirver_daily_summary_process(rows, index):
    """Insert ``rows`` into ``data_driver_report`` inside one transaction.

    Autocommit is switched off, every row is inserted with a positional
    ``INSERT ... VALUES (%s, ...)`` sized to that row, and a commit is issued
    once all rows are written.

    Args:
        rows: iterable of row sequences; each row's cells are bound in order.
        index: number logged together with the process id.
    """
    logging.info('insert rows num %d, Pid[%d]', index, os.getpid())
    connection = get_db_conn()
    connection.autocommit(False)
    connection.commit()

    target = 'data_driver_report'
    cursor = connection.cursor()
    for record in rows:
        params = tuple(record)
        marks = ",".join(["%s"] * len(params))
        statement = "INSERT INTO " + "{0} VALUES ({1})".format(target, marks)
        cursor.execute(statement, params)

    connection.commit()
    connection.close()
def check_ds_data(**op_kwargs):
    """Alert when ``bi.ofood_merchant_offline_tag`` has no rows for a day.

    Counts rows whose ``update_time`` falls on ``ds`` and posts a message via
    ``comwx.postAppMessage`` when the query returns nothing or the count is
    not positive.

    Args:
        **op_kwargs: ``ds`` is the day to check; defaults to yesterday in
            process-local time.
    """
    ds = op_kwargs.get(
        'ds',
        time.strftime('%Y-%m-%d', time.localtime(time.time() - 86400)))
    sql = '''
        select count(1) as cnt
        from bi.ofood_merchant_offline_tag
        where from_unixtime(update_time,'%Y-%m-%d')='{pt}'
    '''.format(pt=ds)
    logging.info(sql)

    mysql_conn = get_db_conn('mysql_bi')
    mcursor = mysql_conn.cursor()
    try:
        mcursor.execute(sql)
        res = mcursor.fetchall()
    finally:
        mcursor.close()
        mysql_conn.close()
    logging.info(res)

    # Validate the result before indexing it, so that an empty result
    # triggers the alert instead of an IndexError/TypeError.
    has_data = False
    if res:
        (cnt,) = res[0]
        logging.info(cnt)
        has_data = cnt > 0

    if not has_data:
        comwx.postAppMessage(
            'ofood商家订单指标缺少{}数据, 请及时排查'.format(ds), '271')
def hiveresult_to_mysql(ds, **kwargs):
    """Copy a 7-column Hive result into MySQL with multi-row INSERTs.

    Runs ``kwargs['sql']`` (formatted with ``ds``) on Hive and writes the
    ``(day, driver_type, channel, name, mobile, code, drivers)`` rows to the
    ``opay_spread_mysql`` connection in statements of at most 1000 rows, each
    assembled as ``kwargs['sql_insert'] + values + kwargs['sql_ext']``. One
    commit is issued at the end. Backslashes are stripped from ``name`` and
    single quotes escaped; ``mobile`` values of 20 or more characters become
    empty.

    Args:
        ds: date substituted into the Hive query.
        **kwargs: must provide ``sql``, ``sql_insert`` and ``sql_ext``.
    """
    hive_cur = get_hive_cursor()
    hql = kwargs['sql'].format(ds=ds)
    logging.info(hql)
    hive_cur.execute(hql)
    rows = hive_cur.fetchall()

    mysql_conn = get_db_conn('opay_spread_mysql')
    mysql_cur = mysql_conn.cursor()
    head = kwargs['sql_insert']
    tail = kwargs['sql_ext']
    row_tpl = "('{day}', '{name}', '{mobile}', '{code}', '{channel}', '{driver_type}', '{dirvers}')"

    batch = []
    for day, driver_type, channel, name, mobile, code, drivers in rows:
        if len(mobile) < 20:
            safe_mobile = mobile
        else:
            safe_mobile = ''
        batch.append(row_tpl.format(
            day=day,
            name=name.replace("\\", "").replace("'", "\\'"),
            code=code,
            mobile=safe_mobile,
            channel=channel,
            driver_type=driver_type,
            dirvers=drivers))
        # Flush every 1000 rows to keep each statement bounded.
        if len(batch) >= 1000:
            mysql_cur.execute(head + ' ' + ','.join(batch) + ' ' + tail)
            batch = []
    if batch:
        mysql_cur.execute(head + ' ' + ','.join(batch) + ' ' + tail)

    mysql_conn.commit()
    hive_cur.close()
    mysql_cur.close()
    mysql_conn.close()
`not_sys_cancel_orders_dserv` int unsigned not null default 0 comment '司机业务应答后取消status = 6 and driver_id > 0 and cancel_role <> 3 and cancel_role <> 4', `picked_orders` int unsigned not null default 0 comment '订单业务成功接驾', `picked_orders_dserv` int unsigned not null default 0 comment '司机业务成接驾', `orders_accept` int unsigned not null default 0 comment '订单业务接单数', `orders_accept_dserv` int unsigned not null default 0 comment '司机业务接单数', `agg_orders_finish` int unsigned not null default 0 comment '订单业务累计完单数', `agg_orders_finish_dserv` int unsigned not null default 0 comment '司机业务累计完单数', primary key (`city_id`,`serv_type`,`order_time`) ) engine=innodb DEFAULT CHARSET=utf8; """, database='bi', mysql_conn_id='mysql_bi', dag=dag ) bidb_conn = get_db_conn('mysql_bi') bidb = bidb_conn.cursor() oridedb_conn = get_db_conn('sqoop_db') driver_type = '-1,0,1,2,99' """ 预插入统计时间节点 @:param op_kwargs """ def preInsertRowPoint(**op_kwargs): test_mode = op_kwargs.get('test_mode', False) if test_mode: ds = op_kwargs.get('ds', time.strftime('%Y-%m-%d', time.localtime(time.time()))) prev_day_start = int(time.mktime(datetime.strptime(ds, '%Y-%m-%d').timetuple())) prev_day_end = prev_day_start + 86400
def get_data_from_hive(ds, execution_date, **op_kwargs):
    """Load the current-hour NG BD-agent report partition into MySQL.

    Queries ``opay_dw.app_opay_bd_agent_report_ng_h`` for rows whose
    ``dt hour`` equals the current local hour (computed by the
    ``default.localTime`` UDF from ``execution_date``) and passes them to
    ``__data_to_mysql`` on an ``app_ali_bi_mysql`` cursor.

    Args:
        ds: Airflow date string; not used by the query.
        execution_date: datetime-like object; formatted as
            ``%Y-%m-%d %H:%M:%S`` for the UDF.
        **op_kwargs: ignored.
    """
    # Each SQL ``--`` comment sits on its own line: a line comment runs to the
    # end of the line, so it must never share a line with an active predicate.
    hql = '''
        SELECT
            create_date_hour,
            bd_admin_user_id,
            bd_admin_user_name,
            bd_admin_user_mobile,
            bd_admin_dept_id,
            bd_admin_job_id,
            bd_admin_leader_id,
            audited_agent_cnt,
            rejected_agent_cnt,
            ci_suc_order_cnt,
            ci_suc_order_amt,
            co_suc_order_cnt,
            co_suc_order_amt,
            pos_suc_amt,
            pos_suc_cnt,
            country_code,
            dt,
            hour
        from opay_dw.app_opay_bd_agent_report_ng_h
        where country_code = 'NG'
        -- 上一个小时
        --and concat(dt,' ',hour) >= date_format(default.localTime("{config}", 'NG', '{v_date}', -1), 'yyyy-MM-dd HH')
        --当前小时
        and concat(dt,' ',hour) = date_format(default.localTime("{config}", 'NG', '{v_date}', 0), 'yyyy-MM-dd HH')
    '''.format(
        v_date=execution_date.strftime("%Y-%m-%d %H:%M:%S"),
        config=config
    )
    logging.info(hql)

    hive_cursor = get_hive_cursor()
    try:
        hive_cursor.execute(hql)
        hive_data = hive_cursor.fetchall()
    finally:
        hive_cursor.close()

    mysql_conn = get_db_conn('app_ali_bi_mysql')
    mcursor = mysql_conn.cursor()
    try:
        # NOTE: the ``__data_only_mysql(mcursor, execution_date)`` pre-step is
        # commented out in this variant of the loader.
        # NOTE(review): no commit is issued here; presumably the helper or
        # the connection's autocommit setting handles it -- confirm.
        __data_to_mysql(mcursor, hive_data, [
            'create_date_hour', 'bd_admin_user_id', 'bd_admin_user_name',
            'bd_admin_user_mobile', 'bd_admin_dept_id', 'bd_admin_job_id',
            'bd_admin_leader_id', 'audited_agent_cnt', 'rejected_agent_cnt',
            'ci_suc_order_cnt', 'ci_suc_order_amt', 'co_suc_order_cnt',
            'co_suc_order_amt', 'pos_suc_amt', 'pos_suc_cnt', 'country_code',
            'dt', 'hour'
        ])
    finally:
        mcursor.close()
        mysql_conn.close()
def get_data_from_impala(**op_kwargs):
    """Build the per-driver daily obus report and upsert it into MySQL.

    The query combines three CTEs for partition ``ds`` -- driver master data,
    online duration derived from the work log, and order count / GMV for
    orders with status 1 or 2 -- and numbers the result rows by
    ``driver_id``. Rows are handed to ``__data_to_mysql`` together with the
    target column list and the ``ON DUPLICATE KEY UPDATE`` assignment list.

    Despite the name, the query is executed through ``get_hive_cursor()``.

    Args:
        **op_kwargs: ``ds`` is the partition date; defaults to yesterday in
            process-local time.
    """
    ds = op_kwargs.get(
        'ds', time.strftime('%Y-%m-%d', time.localtime(time.time() - 86400)))

    # Each SQL ``--`` comment ends its own line: a line comment runs to the
    # end of the line, so nothing executable may follow it on the same line.
    sql = '''
        WITH
        --司机数据
        driver_data as
        (
            select
                from_unixtime(unix_timestamp('{pt}','yyyy-MM-dd'), 'yyyyMMdd') as dt,
                dd.id as driver_id,
                dd.real_name as driver_name, --司机名字
                dd.phone_number as driver_phone, --司机电话
                dd.plate_number as driver_bus_number, --车牌号
                dd.cycle_id, ---环线代号
                cc.name as cycle_name, --所属线路
                0 as number_of_seats --座位数
            from (select
                    id,
                    real_name,
                    phone_number,
                    plate_number,
                    cycle_id
                from obus_dw_ods.ods_sqoop_data_driver_df
                where dt='{pt}'
                ) as dd
            left join (select
                    id,
                    `name`
                from obus_dw_ods.ods_sqoop_conf_cycle_df
                where dt='{pt}'
                ) as cc
            on dd.cycle_id = cc.id
        ),
        --工作数据
        work_data as
        (
            select
                from_unixtime(unix_timestamp('{pt}','yyyy-MM-dd'), 'yyyyMMdd') as dt,
                dr.id as driver_id,
                sum(if(dw.serv_mode=1 and dw.serv_mode1=0, round(abs(dw.create_time2-dw.create_time)/3600,2), 0)) as work_dur --司机今日在线时长(小时)
            from (select
                    driver_id,
                    serv_mode,
                    create_time,
                    lead(serv_mode,1,0) over(partition by driver_id order by create_time) serv_mode1,
                    lead(create_time,1,unix_timestamp('{pt} 23:59:59','yyyy-MM-dd HH:mm:ss')) over(partition by driver_id order by create_time) create_time2
                from obus_dw_ods.ods_sqoop_data_driver_work_log_df
                where dt='{pt}' and
                    from_unixtime(create_time, 'yyyy-MM-dd')='{pt}'
                ) as dw
            join (select
                    id
                from obus_dw_ods.ods_sqoop_data_driver_df
                where dt='{pt}'
                ) as dr
            on dw.driver_id = dr.id
            group by dr.id
        ),
        --订单数据
        order_data as
        (
            select
                from_unixtime(unix_timestamp('{pt}','yyyy-MM-dd'), 'yyyyMMdd') as dt,
                driver_id,
                count(1) as orders, ---本日已经完成的订单数
                sum(price) as mtd_gmv_today ---本日累计交易额
            from obus_dw_ods.ods_sqoop_data_order_df
            where dt='{pt}' and
                from_unixtime(create_time, 'yyyy-MM-dd') = '{pt}' and
                status in (1,2)
            group by driver_id
        )
        --结果集
        select
            *,
            row_number() over(partition by null order by driver_id) num
        from (select
                driver_data.dt,
                driver_data.driver_id,
                driver_data.driver_name,
                driver_data.driver_phone,
                driver_data.driver_bus_number,
                driver_data.cycle_id,
                nvl(driver_data.cycle_name, ''),
                driver_data.number_of_seats,
                IF(work_data.work_dur IS NULL, 0, work_data.work_dur),
                IF(order_data.orders IS NULL, 0, order_data.orders),
                IF(order_data.mtd_gmv_today IS NULL, 0, order_data.mtd_gmv_today)
            from driver_data
            left join work_data on driver_data.dt=work_data.dt and driver_data.driver_id=work_data.driver_id
            left join order_data on driver_data.dt = order_data.dt and driver_data.driver_id = order_data.driver_id
            ) as t
    '''.format(pt=ds)
    logging.info(sql)

    hive_cursor = get_hive_cursor()
    try:
        hive_cursor.execute(sql)
        result = hive_cursor.fetchall()
    finally:
        hive_cursor.close()

    mysql_conn = get_db_conn('mysql_bi')
    mcursor = mysql_conn.cursor()
    try:
        # NOTE(review): the SELECT yields ``num`` as its LAST column
        # (``select *, row_number() ... num``) while the column list below has
        # ``num`` in second position. Unless ``__data_to_mysql`` reorders
        # values, they are shifted relative to the columns -- confirm.
        # NOTE(review): no commit is issued here; presumably the helper or
        # the connection's autocommit setting handles it -- confirm.
        __data_to_mysql(
            mcursor,
            result,
            [
                'dt', 'num', 'driver_id', 'driver_name', 'driver_phone',
                'driver_bus_number', 'cycle_id', 'cycle_name',
                'number_of_seats', 'mtd_serv_time_today',
                'finished_orders_today', 'mtd_gmv_today'
            ],
            '''
                num=values(num),
                driver_name=values(driver_name),
                driver_phone=values(driver_phone),
                driver_bus_number=values(driver_bus_number),
                cycle_id=values(cycle_id),
                cycle_name=values(cycle_name),
                number_of_seats=values(number_of_seats),
                mtd_serv_time_today=values(mtd_serv_time_today),
                finished_orders_today=values(finished_orders_today),
                mtd_gmv_today=values(mtd_gmv_today)
            ''')
    finally:
        mcursor.close()
        mysql_conn.close()
def opayspreadCount(**op_kwargs):
    """Aggregate daily driver order stats per group and per team into MySQL.

    The day is taken from ``ds`` in test mode; otherwise it is the
    86400-second-aligned day containing the previous 10-minute slot. Driver
    organisation data (``__getOpaySpreadDrivers``) is left-joined with order
    counters (``__getOrideOrders``) on ``driver_id``, aggregated once by
    ``(city, group_name)`` and once by ``(city, group_name, team_id)``, and
    each aggregate is written through ``__dataToMysql``.

    Args:
        **op_kwargs:
            test_mode: when truthy, use ``ds`` (default: today) as the day.
            ds: ``YYYY-MM-DD`` day, only read in test mode.

    Raises:
        ValueError: if either data source returned ``None``.
    """
    if op_kwargs.get('test_mode', False):
        ds = op_kwargs.get(
            'ds', time.strftime('%Y-%m-%d', time.localtime(time.time())))
        prev_day_start = int(
            time.mktime(datetime.strptime(ds, '%Y-%m-%d').timetuple()))
    else:
        prev_timepoint = math.floor(int(time.time()) / 600) * 600 - 600
        prev_day_start = math.floor(prev_timepoint / 86400) * 86400
    prev_day_end = prev_day_start + 86400

    bi_conn = get_db_conn('mysql_bi')
    bi_cursor = bi_conn.cursor()

    orders_df = __getOrideOrders(prev_day_start, prev_day_end)
    framework_df = __getOpaySpreadDrivers()
    if orders_df is None or framework_df is None:
        raise ValueError('get orders or groups error')

    merged = framework_df.merge(orders_df, how='left', on=['driver_id'])
    day_label = time.strftime(
        '%Y-%m-%d 00:00:00', time.localtime(prev_day_start))

    # Order counters are summed identically in both aggregations.
    counter_sums = [
        ('ordertakes', 'sum'),
        ('orderfinishs', 'sum'),
        ('orderarrives', 'sum'),
        ('drivertakes', 'sum'),
        ('driverfinishs', 'sum'),
        ('driverarrives', 'sum'),
        ('driver5arrives', 'sum'),
    ]

    # Aggregate by city and group.
    group_spec = OrderedDict([
        ('city', 'min'),
        ('group_name', 'min'),
        ('driver_id', 'count'),
        ('team_id', 'nunique'),
    ] + counter_sums)
    group_results = merged.groupby(['city', 'group_name']).agg(group_spec)

    # Save the per-group result to the database.
    __dataToMysql(
        day_label,
        bi_cursor,
        group_results.values.tolist(),
        [
            'daily', 'city', 'group_name', 'drivers', 'teams', 'ordertakes',
            'orderfinishs', 'orderarrives', 'drivertakes', 'driverfinishs',
            'driverarrives', 'driver5arrives'
        ],
        '''
            teams=values(teams),
            drivers=values(drivers),
            ordertakes=values(ordertakes),
            orderfinishs=values(orderfinishs),
            orderarrives=values(orderarrives),
            drivertakes=values(drivertakes),
            driverfinishs=values(driverfinishs),
            driverarrives = values(driverarrives),
            driver5arrives=values(driver5arrives)
        ''')

    # Aggregate by city, group and team.
    team_spec = OrderedDict([
        ('city', 'min'),
        ('group_name', 'min'),
        ('team_id', 'min'),
        ('team_name', 'max'),
        ('driver_id', 'count'),
    ] + counter_sums)
    team_results = merged.groupby(
        ['city', 'group_name', 'team_id']).agg(team_spec)

    # Save the per-team result to the database.
    __dataToMysql(
        day_label,
        bi_cursor,
        team_results.values.tolist(),
        [
            'daily', 'city', 'group_name', 'team_id', 'team_name', 'drivers',
            'ordertakes', 'orderfinishs', 'orderarrives', 'drivertakes',
            'driverfinishs', 'driverarrives', 'driver5arrives'
        ],
        '''
            drivers=values(drivers),
            ordertakes=values(ordertakes),
            orderfinishs=values(orderfinishs),
            orderarrives=values(orderarrives),
            drivertakes=values(drivertakes),
            driverfinishs=values(driverfinishs),
            driverarrives = values(driverarrives),
            driver5arrives=values(driver5arrives)
        ''')

    bi_conn.close()
def _monitor_weekly_drop(current, last_week, ratio_floor, abs_drop):
    """Return True when ``current`` fell abnormally versus ``last_week``.

    Either ``last_week`` is at least ``ratio_floor`` and the value dropped by
    more than 80 %, or ``last_week`` is between 1 and 99 and the absolute drop
    exceeds ``abs_drop``.
    """
    return ((last_week >= ratio_floor and last_week > current
             and (last_week - current) / last_week > 0.8)
            or (last_week > 0 and last_week < 100
                and (last_week - current) > abs_drop))


def _monitor_run_checks(bidbconn, oride_db_conn, prev_timepoint):
    """Run the 10-minute data checks; stop after the first alert is posted."""
    prev_timestr = time.strftime('%Y-%m-%d %H:%M:00',
                                 time.localtime(prev_timepoint))

    # Query the city list to derive the expected number of metric rows.
    city_sql = '''
        select count(distinct id) from data_city_conf where id < 999000
    '''
    oridedb = oride_db_conn.cursor()
    oridedb.execute(city_sql)
    (city_cnt, ) = oridedb.fetchone()
    total_count = (int(city_cnt) + 1) * 5

    # NOTE(review): credentials are hard-coded in source; consider moving
    # them to a secret store / connection config.
    comwx = ComwxApi('wwd26d45f97ea74ad2',
                     'BLE_v25zCmnZaFUgum93j3zVBDK-DjtRkLisI_Wns4g', '1000011')

    # Query the metric totals recorded for the current time point.
    metrics_sql = '''
        select
            city_id, city_name, serv_type, order_time,
            (orders+orders_user+orders_pick+drivers_serv+drivers_orderable+orders_finish+
            avg_pick+avg_take+not_sys_cancel_orders+picked_orders+orders_accept+agg_orders_finish) as total
        from oride_orders_status_10min where order_time = '{}'
    '''.format(prev_timestr)
    bidb = bidbconn.cursor()
    logging.info(metrics_sql)
    bidb.execute(metrics_sql)
    metrics_cnt = 0
    for (city_id, city_name, serv_type, order_time, total) in bidb.fetchall():
        if city_id >= 999000:
            continue
        metrics_cnt += 1
        if city_id == 0 and serv_type == -1 and total <= 0:
            comwx.postAppMessage(
                '{0}[{1}]10分钟数据{2}数据记录指标全部为0异常,请及时排查,谢谢'.format(
                    city_name, serv_type, order_time), '271')
            return
    if metrics_cnt < total_count:
        comwx.postAppMessage(
            '10分钟数据{0}数据记录缺失异常({1}<{2}),请及时排查,谢谢'.format(
                prev_timestr, metrics_cnt, total_count), '271')
        return

    # Compare the last two time points with the same points one week ago.
    weekly_diff = '''
        select
            t1.city_id, t1.city_name, t1.serv_type, t1.order_time,
            t1.orders as t1orders,
            if(isnull(t2.orders) or t2.orders<=0, 0, t2.orders) as t2orders,
            t1.orders_user as t1ousers,
            if(isnull(t2.orders_user) or t2.orders_user<=0, 0, t2.orders_user) as t2ousers,
            t1.orders_pick as t1opicks,
            if(isnull(t2.orders_pick) or t2.orders_pick<=0, 0, t2.orders_pick) as t2opicks,
            t1.drivers_serv as t1dservs,
            if(isnull(t2.drivers_serv) or t2.drivers_serv<=0, 0, t2.drivers_serv) as t2dservs,
            t1.drivers_orderable as t1doables,
            if(isnull(t2.drivers_orderable) or t2.drivers_orderable<=0, 0, t2.drivers_orderable) as t2doables,
            t1.orders_finish as t1ofs,
            if(isnull(t2.orders_finish) or t2.orders_finish<=0, 0, t2.orders_finish) as t2ofs,
            t1.avg_pick as t1apicks,
            if(isnull(t2.avg_pick) or t2.avg_pick<=0, 0, t2.avg_pick) as t2apicks,
            t1.avg_take as t1atakes,
            if(isnull(t2.avg_take) or t2.avg_take<=0, 0, t2.avg_take) as t2atakes,
            t1.not_sys_cancel_orders as t1norders,
            if(isnull(t2.not_sys_cancel_orders) or t2.not_sys_cancel_orders<=0, 0, t2.not_sys_cancel_orders) as t2norders,
            t1.picked_orders as t1pos,
            if(isnull(t2.picked_orders) or t2.picked_orders<=0, 0, t2.picked_orders) as t2pos,
            t1.agg_orders_finish as t1aofs,
            if(isnull(t2.agg_orders_finish) or t2.agg_orders_finish<=0, 0, t2.agg_orders_finish) as t2aofs
        from
            (select * from oride_orders_status_10min where order_time>=from_unixtime({dsb2})) t1
        left join
            (select * from oride_orders_status_10min where order_time>=from_unixtime({dsb7}) and order_time<=from_unixtime({dsb7a3})) t2
        on
            t1.city_id = t2.city_id and
            t1.serv_type = t2.serv_type and
            t1.order_time = date_format(from_unixtime(unix_timestamp(t2.order_time)+86400*7), '%Y-%m-%d %H:%i:00')
    '''.format(dsb2=prev_timepoint - 1200,
               dsb7=prev_timepoint - 1200 - 86400 * 7,
               dsb7a3=prev_timepoint - 86400 * 7)
    logging.info(weekly_diff)
    bidb.execute(weekly_diff)
    for (city_id, city_name, serv_type, order_time, t1orders, t2orders,
         t1ousers, t2ousers, t1opicks, t2opicks, t1dservs, t2dservs,
         t1doables, t2doables, t1ofs, t2ofs, t1apicks, t2apicks, t1atakes,
         t2atakes, t1norders, t2norders, t1pos, t2pos, t1aofs,
         t2aofs) in bidb.fetchall():
        # Only the all-service-types rows (serv_type == -1) are checked.
        if serv_type != -1:
            continue
        # (current, last week, ratio floor, absolute drop, alert template)
        # NOTE(review): the driver checks use a ratio floor of 200 but the
        # absolute-drop rule only covers last-week values below 100, so
        # values in [100, 200) are never checked -- confirm this is intended.
        checks = (
            (t1orders, t2orders, 100, 40,
             '{0}[{1}]10分钟数据{2}下单数记录与上周同期对比异常,请及时排查,谢谢'),
            (t1dservs, t2dservs, 200, 80,
             '{0}[{1}]10分钟数据{2}在线司机数记录与上周同期对比异常,请及时排查,谢谢'),
            (t1doables, t2doables, 200, 80,
             '{0}[{1}]10分钟数据{2}可接单司机数记录与上周同期对比异常,请及时排查,谢谢'),
        )
        for current, last_week, ratio_floor, abs_drop, template in checks:
            if _monitor_weekly_drop(current, last_week, ratio_floor, abs_drop):
                comwx.postAppMessage(
                    template.format(city_name, serv_type, order_time), '271')
                return


def data_monitor(**op_kwargs):
    """Sanity-check the latest 10-minute order metrics and alert on anomalies.

    Sleeps 300 seconds, then checks ``oride_orders_status_10min`` for the
    previous 10-minute point: the global row must not be all-zero, the number
    of rows must reach ``(city count + 1) * 5``, and orders / serving drivers /
    orderable drivers must not have dropped sharply against the same time one
    week earlier. The first failed check posts a message through ``ComwxApi``
    and ends the run.
    """
    time.sleep(300)
    prev_timepoint = math.floor(int(time.time()) / 600) * 600 - 600
    bidbconn = get_db_conn('mysql_bi')
    try:
        oride_db_conn = get_db_conn('sqoop_db')
        try:
            _monitor_run_checks(bidbconn, oride_db_conn, prev_timepoint)
        finally:
            oride_db_conn.close()
    finally:
        # Connections used to leak on every path, including early returns.
        bidbconn.close()
def get_data_from_impala(**op_kwargs):
    """Compute the per-city obus daily report and pass it to MySQL.

    Runs one query through the cursor returned by ``get_hive_cursor()``; each
    metric is computed per city and once more for all cities together
    (``city_id`` 0). The fetched rows are handed to ``__data_to_mysql`` with a
    ``mysql_bi`` cursor, the target column list and an update clause.

    Keyword Args:
        ds: ``YYYY-MM-DD`` partition/day to report on (defaults to yesterday,
            local time).
    """
    ds = op_kwargs.get(
        'ds', time.strftime('%Y-%m-%d', time.localtime(time.time() - 86400)))
    # NOTE(review): both cycle CTEs read ods_sqoop_conf_cycle_df at the fixed
    # partition dt='2019-08-17' while every other table uses '{pt}' --
    # confirm this is intentional.
    sql = '''
        WITH
        --分城市
        cycle_data as (
            select
                from_unixtime(unix_timestamp('{pt}','yyyy-MM-dd'), 'yyyyMMdd') as dt,
                cy.city_id,
                count(distinct cy.id) as total_lines, --总线路数
                count(distinct dr.id) as total_drivers, --线路总司机数
                count(distinct if(serv_mode='1', dr.id, null)) as serv_drivers, --线路上司机数量
                count(distinct if(serv_mode='0', dr.id, null)) as no_serv_drivers --线路下司机数量
            from (select
                    cycle_id, id, serv_mode
                from obus_dw_ods.ods_sqoop_data_driver_df
                where dt='{pt}' and from_unixtime(login_time, 'yyyy-MM-dd') = '{pt}'
                ) as dr
            inner join (select
                    id, city_id
                from obus_dw_ods.ods_sqoop_conf_cycle_df
                where dt='2019-08-17' and status = '0'
                ) as cy
            on dr.cycle_id = cy.id
            group by cy.city_id
        ),
        --不分城市
        cycle_data_all as (
            select
                from_unixtime(unix_timestamp('{pt}','yyyy-MM-dd'), 'yyyyMMdd') as dt,
                0 as city_id,
                count(distinct cy.id) as total_lines, --总线路数
                count(distinct dr.id) as total_drivers, --线路总司机数
                count(distinct if(serv_mode=1, dr.id, null)) as serv_drivers, --线路上司机数量
                count(distinct if(serv_mode=0, dr.id, null)) as no_serv_drivers --线路下司机数量
            from (select
                    cycle_id, id, serv_mode
                from obus_dw_ods.ods_sqoop_data_driver_df
                where dt='{pt}' and from_unixtime(login_time, 'yyyy-MM-dd') = '{pt}'
                ) as dr
            inner join (select
                    id, city_id
                from obus_dw_ods.ods_sqoop_conf_cycle_df
                where dt='2019-08-17' and status = 0
                ) as cy
            on dr.cycle_id = cy.id
        ),
        --分城市
        order_data as (
            select
                from_unixtime(unix_timestamp('{pt}','yyyy-MM-dd'), 'yyyyMMdd') as dt,
                city_id,
                count(1) as line_orders, --线路总下单数
                sum(if(status in (1,2), 1, 0)) as line_finished_orders, --线路总完单数
                sum(if(status in (1,2), price, 0)) as line_gmv --线路收益
            from obus_dw_ods.ods_sqoop_data_order_df
            where dt='{pt}' and from_unixtime(cast(create_time as bigint), 'yyyy-MM-dd') = '{pt}'
            group by city_id
        ),
        --不分城市
        order_data_all as (
            select
                from_unixtime(unix_timestamp('{pt}','yyyy-MM-dd'), 'yyyyMMdd') as dt,
                0 as city_id,
                count(1) as line_orders,
                sum(if(status in (1,2), 1, 0)) as line_finished_orders,
                sum(if(status in (1,2), price, 0)) as line_gmv
            from obus_dw_ods.ods_sqoop_data_order_df
            where dt='{pt}' and from_unixtime(cast(create_time as bigint), 'yyyy-MM-dd') = '{pt}'
        ),
        --分城市
        station_data as (
            select
                from_unixtime(unix_timestamp('{pt}','yyyy-MM-dd'), 'yyyyMMdd') as dt,
                city_id,
                count(distinct id) as total_stations --总站点数
            from obus_dw_ods.ods_sqoop_conf_station_df
            where dt='{pt}'
            group by city_id
        ),
        --不分城市
        station_data_all as (
            select
                from_unixtime(unix_timestamp('{pt}','yyyy-MM-dd'), 'yyyyMMdd') as dt,
                0 as city_id,
                count(distinct id) as total_stations
            from obus_dw_ods.ods_sqoop_conf_station_df
            where dt='{pt}'
        ),
        --分城市
        users_data as (
            select
                from_unixtime(unix_timestamp('{pt}','yyyy-MM-dd'), 'yyyyMMdd') as dt,
                city_id,
                count(1) as users --新用户数量
            from (select
                    city_id, user_id, create_time,
                    row_number() over(partition by user_id order by arrive_time) orders
                from obus_dw_ods.ods_sqoop_data_order_df
                where dt='{pt}' and status in (1,2) and user_id > 0
                ) as t
            where from_unixtime(t.create_time, 'yyyy-MM-dd')='{pt}' and orders=1
            group by t.city_id
        ),
        --不分城市
        users_data_all as (
            select
                from_unixtime(unix_timestamp('{pt}','yyyy-MM-dd'), 'yyyyMMdd') as dt,
                0 as city_id,
                count(1) as users --新用户数量
            from (select
                    city_id, user_id, create_time,
                    row_number() over(partition by user_id order by arrive_time) orders
                from obus_dw_ods.ods_sqoop_data_order_df
                where dt='{pt}' and status in (1,2) and user_id > 0
                ) as t
            where from_unixtime(t.create_time, 'yyyy-MM-dd')='{pt}' and orders=1
        ),
        --分城市
        app_users_data as (
            select
                from_unixtime(unix_timestamp('{pt}','yyyy-MM-dd'), 'yyyyMMdd') as dt,
                do.city_id,
                sum(if(dp.mode=1 and do.orders=1, 1, 0)) as obusapp_new_users, ---ObusAPP新用户数量
                count(distinct if(dp.mode=1, do.user_id, null)) as money_ballet_users --今日钱包使用人数
            from (select
                    id, city_id, create_time, user_id,
                    row_number() over(partition by user_id order by arrive_time) orders
                from obus_dw_ods.ods_sqoop_data_order_df
                where dt='{pt}' and status in (1,2) and user_id > 0
                ) as do
            join (select
                    id, mode
                from obus_dw_ods.ods_sqoop_data_order_payment_df
                where dt='{pt}' and from_unixtime(create_time, 'yyyy-MM-dd')='{pt}'
                ) as dp
            on do.id = dp.id
            where from_unixtime(do.create_time, 'yyyy-MM-dd') = '{pt}'
            group by do.city_id
        ),
        --不分城市
        app_users_data_all as (
            select
                from_unixtime(unix_timestamp('{pt}','yyyy-MM-dd'), 'yyyyMMdd') as dt,
                0 as city_id,
                sum(if(dp.mode=1 and do.orders=1, 1, 0)) as obusapp_new_users, ---ObusAPP新用户数量
                count(distinct if(dp.mode=1, do.user_id, null)) as money_ballet_users --今日钱包使用人数
            from (select
                    id, city_id, create_time, user_id,
                    row_number() over(partition by user_id order by arrive_time) orders
                from obus_dw_ods.ods_sqoop_data_order_df
                where dt='{pt}' and status in (1,2) and user_id > 0
                ) as do
            join (select
                    id, mode
                from obus_dw_ods.ods_sqoop_data_order_payment_df
                where dt='{pt}' and from_unixtime(create_time, 'yyyy-MM-dd')='{pt}'
                ) as dp
            on do.id = dp.id
            where from_unixtime(do.create_time, 'yyyy-MM-dd') = '{pt}'
        ),
        --分城市
        app_ticket_data as (
            select
                from_unixtime(unix_timestamp('{pt}','yyyy-MM-dd'), 'yyyyMMdd') as dt,
                do.city_id,
                sum(if(dp.mode=2 and do.orders=1, 1, 0)) as ticket_new_users ---首次使用公交卡新用户数量
            from (select
                    id, city_id, create_time, ticket_id,
                    row_number() over(partition by ticket_id order by arrive_time) orders
                from obus_dw_ods.ods_sqoop_data_order_df
                where dt='{pt}' and status in (1,2) and ticket_id > 0
                ) as do
            join (select
                    id, ticket_id, mode
                from obus_dw_ods.ods_sqoop_data_order_payment_df
                where dt='{pt}' and from_unixtime(create_time, 'yyyy-MM-dd')='{pt}'
                ) as dp
            on do.id = dp.id and do.ticket_id = dp.ticket_id
            where from_unixtime(do.create_time, 'yyyy-MM-dd') = '{pt}'
            group by do.city_id
        ),
        --不分城市
        app_ticket_data_all as (
            select
                from_unixtime(unix_timestamp('{pt}','yyyy-MM-dd'), 'yyyyMMdd') as dt,
                0 as city_id,
                sum(if(dp.mode=2 and do.orders=1, 1, 0)) as ticket_new_users ---首次使用公交卡新用户数量
            from (select
                    id, city_id, create_time, ticket_id,
                    row_number() over(partition by ticket_id order by arrive_time) orders
                from obus_dw_ods.ods_sqoop_data_order_df
                where dt='{pt}' and status in (1,2) and ticket_id > 0
                ) as do
            join (select
                    id, ticket_id, mode
                from obus_dw_ods.ods_sqoop_data_order_payment_df
                where dt='{pt}' and from_unixtime(create_time, 'yyyy-MM-dd')='{pt}'
                ) as dp
            on do.id = dp.id and do.ticket_id = dp.ticket_id
            where from_unixtime(do.create_time, 'yyyy-MM-dd') = '{pt}'
        ),
        --分城市
        recharge_data as (
            select
                from_unixtime(unix_timestamp('{pt}','yyyy-MM-dd'), 'yyyyMMdd') as dt,
                du.city_id,
                count(distinct if(rc.status=1 and from_unixtime(rc.create_time,'yyyy-MM-dd')='{pt}', rc.user_id, null)) as recharge_users, --用户钱包充值人数
                count(distinct rc.user_id) as online_uv, --用户钱包总数量=线上uv
                sum(if(rc.status=1 and rc.recharge=1 and from_unixtime(rc.create_time,'yyyy-MM-dd')='{pt}', 1, 0)) as money_ballet_recharge_users --今日钱包新充值人数
            from (select
                    user_id, status, create_time,
                    row_number() over(partition by user_id order by create_time) recharge
                from obus_dw_ods.ods_sqoop_data_user_recharge_df
                where dt='{pt}' and user_id > 0
                ) as rc
            join (select
                    city_id, id
                from obus_dw_ods.ods_sqoop_data_user_df
                where dt='{pt}'
                ) as du
            on rc.user_id = du.id
            group by du.city_id
        ),
        --不分城市
        recharge_data_all as (
            select
                from_unixtime(unix_timestamp('{pt}','yyyy-MM-dd'), 'yyyyMMdd') as dt,
                0 as city_id,
                count(distinct if(rc.status=1 and from_unixtime(rc.create_time,'yyyy-MM-dd')='{pt}', rc.user_id, null)) as recharge_users, --用户钱包充值人数
                count(distinct rc.user_id) as online_uv, --用户钱包总数量=线上uv
                sum(if(rc.status=1 and rc.recharge=1 and from_unixtime(rc.create_time,'yyyy-MM-dd')='{pt}', 1, 0)) as money_ballet_recharge_users --今日钱包新充值人数
            from (select
                    user_id, status, create_time,
                    row_number() over(partition by user_id order by create_time) recharge
                from obus_dw_ods.ods_sqoop_data_user_recharge_df
                where dt='{pt}' and user_id > 0
                ) as rc
        ),
        --分城市
        ticket_data as (
            select
                from_unixtime(unix_timestamp('{pt}','yyyy-MM-dd'), 'yyyyMMdd') as dt,
                city_id,
                count(1) as tied_tickets --绑卡数
            from obus_dw_ods.ods_sqoop_data_ticket_df
            where dt='{pt}' and status=0 and from_unixtime(bind_time, 'yyyy-MM-dd') = '{pt}'
            group by city_id
        ),
        --不分城市
        ticket_data_all as (
            select
                from_unixtime(unix_timestamp('{pt}','yyyy-MM-dd'), 'yyyyMMdd') as dt,
                0 as city_id,
                count(1) as tied_tickets --绑卡数
            from obus_dw_ods.ods_sqoop_data_ticket_df
            where dt='{pt}' and status=0 and from_unixtime(bind_time, 'yyyy-MM-dd') = '{pt}'
        )
        --结果集
        select
            cycle_data.dt,
            cycle_data.city_id,
            nvl(dc.name,''),
            cycle_data.total_lines,
            cycle_data.total_drivers,
            cycle_data.serv_drivers,
            cycle_data.no_serv_drivers,
            IF(order_data.line_orders IS NULL, 0, order_data.line_orders),
            IF(order_data.line_finished_orders IS NULL, 0, order_data.line_finished_orders),
            IF(order_data.line_gmv IS NULL, 0, order_data.line_gmv),
            IF(station_data.total_stations IS NULL, 0, station_data.total_stations),
            IF(users_data.users IS NULL, 0, users_data.users),
            IF(app_users_data.obusapp_new_users IS NULL, 0, app_users_data.obusapp_new_users),
            IF(app_ticket_data.ticket_new_users IS NULL, 0, app_ticket_data.ticket_new_users),
            IF(app_users_data.money_ballet_users IS NULL, 0, app_users_data.money_ballet_users),
            IF(recharge_data.recharge_users IS NULL, 0, recharge_data.recharge_users),
            IF(recharge_data.online_uv IS NULL, 0, recharge_data.online_uv),
            IF(recharge_data.money_ballet_recharge_users IS NULL, 0, recharge_data.money_ballet_recharge_users),
            IF(ticket_data.tied_tickets IS NULL, 0, ticket_data.tied_tickets)
        from (select * from cycle_data union select * from cycle_data_all) as cycle_data
        left join (select * from order_data union select * from order_data_all) as order_data
            on cycle_data.dt = order_data.dt and cycle_data.city_id=order_data.city_id
        left join (select * from station_data union select * from station_data_all) as station_data
            on station_data.dt = cycle_data.dt and station_data.city_id = cycle_data.city_id
        left join (select * from users_data union select * from users_data_all) as users_data
            on users_data.dt = cycle_data.dt and users_data.city_id = cycle_data.city_id
        left join (select * from app_users_data union select * from app_users_data_all) as app_users_data
            on app_users_data.dt = cycle_data.dt and app_users_data.city_id = cycle_data.city_id
        left join (select * from recharge_data union select * from recharge_data_all) as recharge_data
            on recharge_data.dt = cycle_data.dt and recharge_data.city_id = cycle_data.city_id
        left join (select * from ticket_data union select * from ticket_data_all) as ticket_data
            on ticket_data.dt = cycle_data.dt and ticket_data.city_id = cycle_data.city_id
        left join (select * from app_ticket_data union select * from app_ticket_data_all) as app_ticket_data
            on app_ticket_data.dt = cycle_data.dt and app_ticket_data.city_id = cycle_data.city_id
        left join (select id, name from obus_dw_ods.ods_sqoop_conf_city_df where dt='{pt}' and validate=1) as dc
            on cycle_data.city_id = dc.id
    '''.format(
        pt=ds
    )
    logging.info(sql)

    # Fetch everything first so the Hive cursor can be released before the
    # MySQL write, and is released even when the query fails.
    hive_cursor = get_hive_cursor()
    try:
        hive_cursor.execute(sql)
        result = hive_cursor.fetchall()
    finally:
        hive_cursor.close()

    mysql_conn = get_db_conn('mysql_bi')
    try:
        mcursor = mysql_conn.cursor()
        try:
            __data_to_mysql(
                mcursor, result,
                ['dt', 'city_id', 'city', 'total_lines_double', 'total_drivers', 'serv_drivers',
                 'no_serv_drivers', 'lines_orders_double', 'lines_finished_orders_double',
                 'line_gmv_double', 'total_stations', 'new_users', 'obusapp_new_users', 'ticket_new_users',
                 'money_ballet_users', 'recharge_users', 'online_uv', 'money_ballet_recharge_users', 'tied_cards'],
                '''
                    total_lines_double=values(total_lines_double),
                    total_drivers=values(total_drivers),
                    serv_drivers=values(serv_drivers),
                    no_serv_drivers=values(no_serv_drivers),
                    lines_orders_double=values(lines_orders_double),
                    lines_finished_orders_double=values(lines_finished_orders_double),
                    total_stations=values(total_stations),
                    line_gmv_double=values(line_gmv_double),
                    new_users=values(new_users),
                    obusapp_new_users=values(obusapp_new_users),
                    ticket_new_users=values(ticket_new_users),
                    recharge_users=values(recharge_users),
                    online_uv=values(online_uv),
                    money_ballet_users=values(money_ballet_users),
                    tied_cards=values(tied_cards),
                    money_ballet_recharge_users=values(money_ballet_recharge_users)
                '''
            )
        finally:
            mcursor.close()
    finally:
        # The connection was previously never closed.
        mysql_conn.close()
def get_data_from_impala(**op_kwargs):
    """Compute the per-station obus daily report and pass it to MySQL.

    Runs one query through the cursor returned by ``get_hive_cursor()`` that
    starts from every station and left-joins line, driver, order, new-user and
    get-off figures onto it. The fetched rows are handed to
    ``__data_to_mysql`` with a ``mysql_bi`` cursor, the target column list and
    an update clause.

    Keyword Args:
        ds: ``YYYY-MM-DD`` partition/day to report on (defaults to yesterday,
            local time).
    """
    ds = op_kwargs.get(
        'ds', time.strftime('%Y-%m-%d', time.localtime(time.time() - 86400)))
    sql = '''
        WITH
        --线路数据
        line_data as (
            select
                from_unixtime(unix_timestamp('{pt}','yyyy-MM-dd'), 'yyyyMMdd') as dt,
                city_id,
                id,
                name --line_name
            from obus_dw_ods.ods_sqoop_conf_line_df
            where dt='{pt}'
        ),
        --站点数据
        station_data as (
            select
                from_unixtime(unix_timestamp('{pt}','yyyy-MM-dd'), 'yyyyMMdd') as dt,
                cs.city_id,
                cls.line_id,
                cs.id, --站点ID
                cs.name --站点名
            from (select
                    id, city_id, name
                from obus_dw_ods.ods_sqoop_conf_station_df
                where dt='{pt}'
                ) as cs
            left join (select
                    line_id, station_id
                from obus_dw_ods.ods_sqoop_conf_line_stations_df
                where dt='{pt}') as cls
            on cs.id = cls.station_id
        ),
        --司机数据
        driver_data as (
            select
                from_unixtime(unix_timestamp('{pt}','yyyy-MM-dd'), 'yyyyMMdd') as dt,
                cl.city_id,
                cl.id,
                cl.start_station,
                count(1) as total_drivers, --司机总人数
                sum(if(dd.serv_mode=1, 1, 0)) as serv_drivers, --上班司机数
                sum(if(dd.serv_status=1 and dd.serv_mode=1, 1, 0)) as serv_on_the_road_drivers, --上班行驶司机数
                sum(if(dd.serv_mode=1 and dd.serv_status in (0,2), 1, 0)) as serv_idle_drivers, --上班未行驶司机数
                sum(if(dd.serv_mode=0, 1, 0)) as no_serv_drivers --下班司机数
            from (select
                    id, city_id, start_station
                from obus_dw_ods.ods_sqoop_conf_line_df
                where dt='{pt}'
                ) as cl
            join (select
                    id, line_id, serv_mode, serv_status
                from obus_dw_ods.ods_sqoop_data_driver_df
                where dt='{pt}' and from_unixtime(login_time, 'yyyy-MM-dd')='{pt}'
                ) as dd
            on cl.id = dd.line_id
            group by cl.city_id, cl.id, cl.start_station
        ),
        --线路订单数据
        line_order_data as (
            select
                from_unixtime(unix_timestamp('{pt}','yyyy-MM-dd'), 'yyyyMMdd') as dt,
                city_id,
                line_id,
                count(1) as lines_orders, ---线路总订单数
                sum(if(status in (1,2), 1, 0)) as lines_finished_orders, ---线路总完单数
                sum(if(status in (1,2), price, 0)) as line_gmv_single --线路收益(单)
            from obus_dw_ods.ods_sqoop_data_order_df
            where dt='{pt}' and from_unixtime(create_time, 'yyyy-MM-dd') = '{pt}'
            group by city_id, line_id
        ),
        --站点订单数据
        station_order_data as (
            select
                from_unixtime(unix_timestamp('{pt}','yyyy-MM-dd'), 'yyyyMMdd') as dt,
                city_id,
                line_id,
                start_station_id,
                count(1) as station_orders, --分站点订单数
                sum(if(status in (1,2), 1, 0)) as station_finished_orders, ---分站点完单数
                count(distinct if(start_station_id>0 and status in (0,1,2), user_id, null)) as get_on_users --上车乘客数
            from obus_dw_ods.ods_sqoop_data_order_df
            where dt='{pt}' and from_unixtime(create_time, 'yyyy-MM-dd') = '{pt}'
            group by city_id, line_id, start_station_id
        ),
        --新用户数量
        new_users_data as (
            select
                from_unixtime(unix_timestamp('{pt}','yyyy-MM-dd'), 'yyyyMMdd') as dt,
                city_id,
                line_id,
                start_station_id,
                sum(if(orders=1, 1, 0)) as new_users --新用户数量
            from (select
                    city_id, line_id, start_station_id, create_time, user_id,
                    row_number() over(partition by user_id order by arrive_time) orders
                from obus_dw_ods.ods_sqoop_data_order_df
                where dt='{pt}' and status in (1,2) and user_id>0
                ) as t
            where from_unixtime(create_time,'yyyy-MM-dd')='{pt}'
            group by city_id, line_id, start_station_id
        ),
        --下车乘客数
        get_off_users as (
            select
                from_unixtime(unix_timestamp('{pt}','yyyy-MM-dd'), 'yyyyMMdd') as dt,
                city_id,
                line_id,
                end_station_id,
                count(distinct if(end_station_id>0 and status in (0,1,2), user_id, null)) as get_off_users --下车乘客数
            from obus_dw_ods.ods_sqoop_data_order_df
            where dt='{pt}' and from_unixtime(create_time, 'yyyy-MM-dd') = '{pt}'
            group by city_id, line_id, end_station_id
        )
        --结果集
        select
            station_data.dt,
            station_data.city_id,
            nvl(dc.name, ''),
            IF(station_data.line_id IS NULL, 0, station_data.line_id),
            nvl(line_data.name, ''),
            station_data.id,
            station_data.name,
            IF(driver_data.total_drivers IS NULL, 0, driver_data.total_drivers),
            IF(driver_data.serv_drivers IS NULL, 0, driver_data.serv_drivers),
            IF(driver_data.serv_on_the_road_drivers IS NULL, 0, driver_data.serv_on_the_road_drivers),
            IF(driver_data.serv_idle_drivers IS NULL, 0, driver_data.serv_idle_drivers),
            IF(driver_data.no_serv_drivers IS NULL, 0, driver_data.no_serv_drivers),
            IF(line_order_data.lines_orders IS NULL, 0, line_order_data.lines_orders),
            IF(station_order_data.station_orders IS NULL, 0, station_order_data.station_orders),
            IF(line_order_data.lines_finished_orders IS NULL, 0, line_order_data.lines_finished_orders),
            IF(station_order_data.station_finished_orders IS NULL, 0, station_order_data.station_finished_orders),
            IF(new_users_data.new_users IS NULL, 0, new_users_data.new_users),
            IF(station_order_data.get_on_users IS NULL, 0, station_order_data.get_on_users),
            IF(get_off_users.get_off_users IS NULL, 0, get_off_users.get_off_users),
            IF(line_order_data.line_gmv_single IS NULL, 0, line_order_data.line_gmv_single)
        from station_data
        left join line_data
            on station_data.dt=line_data.dt and
               station_data.city_id=line_data.city_id and
               station_data.line_id=line_data.id
        left join driver_data
            on station_data.dt = driver_data.dt and
               station_data.city_id = driver_data.city_id and
               station_data.line_id = driver_data.id and
               station_data.id = driver_data.start_station
        left join line_order_data
            on station_data.dt = line_order_data.dt and
               station_data.city_id = line_order_data.city_id and
               station_data.line_id = line_order_data.line_id
        left join station_order_data
            on station_data.dt = station_order_data.dt and
               station_data.city_id = station_order_data.city_id and
               station_data.line_id = station_order_data.line_id and
               station_data.id = station_order_data.start_station_id
        left join new_users_data
            on station_data.dt = new_users_data.dt and
               station_data.city_id = new_users_data.city_id and
               station_data.line_id = new_users_data.line_id and
               station_data.id = new_users_data.start_station_id
        left join get_off_users
            on station_data.dt = get_off_users.dt and
               station_data.city_id = get_off_users.city_id and
               station_data.line_id = get_off_users.line_id and
               station_data.id = get_off_users.end_station_id
        left join (select id, name from obus_dw_ods.ods_sqoop_conf_city_df where dt='{pt}' and validate=1) as dc
            on station_data.city_id = dc.id
    '''.format(
        pt=ds
    )
    logging.info(sql)

    # Fetch everything first so the Hive cursor can be released before the
    # MySQL write, and is released even when the query fails.
    hive_cursor = get_hive_cursor()
    try:
        hive_cursor.execute(sql)
        result = hive_cursor.fetchall()
    finally:
        hive_cursor.close()

    mysql_conn = get_db_conn('mysql_bi')
    try:
        mcursor = mysql_conn.cursor()
        try:
            __data_to_mysql(
                mcursor, result,
                ['dt', 'city_id', 'city', 'line_id', 'line_name', 'station_id', 'station_name',
                 'total_drivers', 'serv_drivers', 'serv_on_the_road_drivers', 'serv_idle_drivers',
                 'no_serv_drivers', 'lines_orders', 'station_orders', 'lines_finished_orders',
                 'station_finished_orders', 'new_users', 'get_on_users', 'get_off_users',
                 'line_gmv_single'],
                '''
                    total_drivers=values(total_drivers),
                    serv_drivers=values(serv_drivers),
                    serv_on_the_road_drivers=values(serv_on_the_road_drivers),
                    serv_idle_drivers=values(serv_idle_drivers),
                    no_serv_drivers=values(no_serv_drivers),
                    lines_orders=values(lines_orders),
                    station_orders=values(station_orders),
                    lines_finished_orders=values(lines_finished_orders),
                    station_finished_orders=values(station_finished_orders),
                    new_users=values(new_users),
                    get_on_users=values(get_on_users),
                    get_off_users=values(get_off_users),
                    line_gmv_single=values(line_gmv_single)
                '''
            )
        finally:
            mcursor.close()
    finally:
        # The connection was previously never closed.
        mysql_conn.close()
def init_mysql_table(**op_kwargs):
    """Copy a Hive table (or one ``dt`` partition of it) into MySQL.

    The MySQL table is created or altered via ``create_bi_mysql_table``. A
    newly created table is loaded from the whole Hive table, an existing one
    only from the rows with ``dt`` equal to ``ds``. Before loading, the target
    is truncated (``overwrite`` truthy) or its ``dt`` rows are deleted. Rows
    are written with ``replace into`` in batches of 1000.

    A failure while loading is logged and reported through ``ComwxApi``; it is
    not re-raised.

    Keyword Args:
        db: Hive database name, also used as the MySQL schema name.
        table: table name, used on both sides.
        mysql_conn: connection id passed to ``get_db_conn``.
        ds: partition date used for the incremental load and the delete.
        overwrite: when truthy, truncate the MySQL table before loading.
    """
    hive_db = op_kwargs.get('db')
    hive_table = op_kwargs.get('table')
    mysql_cursor = op_kwargs.get('mysql_conn')  # a connection id, not a cursor
    dt = op_kwargs.get('ds')
    overwrite = op_kwargs.get('overwrite')
    batch_size = 1000

    hive_cursor = get_hive_cursor('hiveserver2_default')
    mconn = None
    mcursor = None
    try:
        hive_columns = get_hive_table_columns(hive_cursor, hive_db, hive_table)

        # Build the Hive select list (NULLs replaced by a per-type default)
        # and the matching MySQL column names.
        cols = []
        mcols = []
        for v in hive_columns:
            col_name = v['name'].lower()
            col_type = v['type']
            if "int" in col_type:
                cols.append("if(`{}` is NULL, 0, `{}`)".format(
                    col_name, col_name))
            elif col_type in ('float', 'double', 'decimal'):
                cols.append("if(`{}` is NULL, '0.00', `{}`)".format(
                    col_name, col_name))
            elif col_type in ('array', 'map', 'struct'):
                # Complex types are replaced by an empty string.
                cols.append("''")
            else:
                cols.append("if(`{}` is NULL, '', `{}`)".format(
                    col_name, col_name))
            mcols.append(col_name)

        new_table = create_bi_mysql_table(mysql_cursor, hive_db, hive_table,
                                          hive_columns)
        if new_table:
            # New table: full load.
            hql = '''
                SELECT
                    {cols}
                FROM
                    {db}.{table}
            '''.format(db=hive_db, table=hive_table, cols=",".join(cols))
        else:
            # Existing table: incremental load of one partition.
            hql = '''
                SELECT
                    {cols}
                FROM
                    {db}.{table}
                WHERE
                    dt = '{dt}'
            '''.format(db=hive_db,
                       table=hive_table,
                       cols=",".join(cols),
                       dt=dt)
        logging.info(hql)

        # NOTE(review): credentials are hard-coded in source; consider moving
        # them to a secret store / connection config.
        wxapi = ComwxApi('wwd26d45f97ea74ad2',
                         'BLE_v25zCmnZaFUgum93j3zVBDK-DjtRkLisI_Wns4g',
                         '1000011')
        try:
            mconn = get_db_conn(mysql_cursor)
            mcursor = mconn.cursor()
            if overwrite:
                mcursor.execute("TRUNCATE TABLE {db}.{table}".format(
                    db=hive_db, table=hive_table))
            else:
                mcursor.execute(
                    "DELETE FROM {db}.{table} WHERE dt = '{dt}'".format(
                        db=hive_db, table=hive_table, dt=dt))
            isql = 'replace into {db}.{table} (`{cols}`) values '.format(
                db=hive_db, table=hive_table, cols='`,`'.join(mcols))
            hive_cursor.execute(hql)
            while True:
                # fetchmany returns an empty sequence at end of data; a real
                # fetch error now reaches the handler below instead of being
                # mistaken for end-of-data.
                batch = hive_cursor.fetchmany(batch_size)
                if not batch:
                    break
                rows = [
                    "('{}')".format("','".join([
                        str(MySQLdb.escape_string(str(x)), encoding="utf-8")
                        for x in record
                    ])) for record in batch
                ]
                if len(rows) >= batch_size:
                    logging.info(len(rows))
                else:
                    logging.info("last: {}".format(len(rows)))
                mcursor.execute("{h} {v}".format(h=isql, v=",".join(rows)))
        except Exception as e:
            # Best-effort load: log with traceback, alert, do not re-raise.
            logging.exception(e)
            wxapi.postAppMessage(
                '重要重要重要:{}.{}数据写入mysql异常【{}】'.format(
                    hive_db, hive_table, dt), '271')
    finally:
        # Single cleanup point; handles never opened are skipped.
        for resource in (mcursor, mconn, hive_cursor):
            if resource is not None:
                resource.close()
def _query_metric_row(cursor, query, dt):
    """Run ``query`` for ``dt`` and return its first row passed through ``mapper``."""
    cursor.execute(query.format(dt=dt))
    rows = cursor.fetchall()
    return [mapper(value) for value in rows[0]]


def query_data(**op_kwargs):
    """Collect the daily metrics for ``ds`` from Hive and insert one report row.

    Repairs the partitions of the source tables, runs the metric queries
    (``query1`` .. ``query9``), combines them with ``get_driver_data`` into
    the value list expected by ``INSERT_SQL`` and executes the insert.

    Keyword Args:
        ds: the day to report on, substituted into every query as ``dt``.
    """
    dt = op_kwargs.get('ds')
    cursor = get_hive_cursor()
    cursor.execute("set hive.execution.engine=tez")
    repair_table_names = [
        "data_driver_extend", "data_driver_reward", "data_order",
        "data_order_payment", "data_user_extend", "user_action",
        "client_event"
    ]
    for name in repair_table_names:
        print(name)
        # Tables named data* are in oride_db, the others in oride_source.
        db_name = "oride_db." if name.startswith("data") else "oride_source."
        cursor.execute(repair_table_query % (db_name + name))

    [
        call_num, success_num, gmv, cancel_before_dispatching_num,
        cancel_after_dispatching_by_user_num,
        cancel_after_dispatching_by_driver_num, pickup_num, pickup_total_time,
        take_num, take_total_time, total_driver_price
    ] = _query_metric_row(cursor, query1, dt)
    print(1)
    [pay_num, total_price, total_c_discount,
     offline_num] = _query_metric_row(cursor, query2, dt)
    print(2)
    [call_user_num, finished_user_num,
     new_finished_user_num] = _query_metric_row(cursor, query4, dt)
    print(4)
    [total_driver_num, login_driver_num,
     new_driver_num] = _query_metric_row(cursor, query5, dt)
    print(5)
    [order_driver_num, finished_driver_num,
     new_finished_driver_num] = _query_metric_row(cursor, query6, dt)
    print(6)
    [bubble_num] = _query_metric_row(cursor, query7, dt)
    print(7)
    [new_passenger_num] = _query_metric_row(cursor, query9, dt)
    print(9)
    (transport_efficiency, avg_order_per_driver,
     online_driver_num) = get_driver_data(dt)
    print(10)

    # Order matters: it must match the column order of INSERT_SQL.
    data = [
        success_num,
        success_num / float(call_num) if call_num > 0 else 0,
        bubble_num,
        call_num,
        call_num / float(bubble_num) if bubble_num > 0 else 0,
        online_driver_num,
        order_driver_num,
        round(float(gmv), 2),
        round(float(gmv) / float(success_num) if success_num > 0 else 0, 2),
        round(float(total_driver_price), 2),
        round(float(total_c_discount), 2),
        round(
            float(total_driver_price) /
            float(success_num) if success_num > 0 else 0, 2),
        round(
            float(total_c_discount) /
            float(success_num) if success_num > 0 else 0, 2),
        float(total_driver_price + total_c_discount) /
        float(total_price) if total_price > 0 else 0,
        cancel_before_dispatching_num / float(call_num) if call_num > 0 else 0,
        cancel_after_dispatching_by_user_num /
        float(call_num) if call_num > 0 else 0,
        cancel_after_dispatching_by_driver_num /
        float(call_num) if call_num > 0 else 0,
        round(
            pickup_total_time /
            float(pickup_num * 60) if pickup_num > 0 else 0, 2),
        round(take_total_time / float(take_num) if take_num > 0 else 0, 2),
        total_driver_num,
        new_driver_num,
        finished_driver_num,
        new_finished_driver_num,
        new_finished_driver_num /
        float(finished_driver_num) if finished_driver_num > 0 else 0,
        call_user_num,
        finished_user_num,
        new_passenger_num,
        new_finished_user_num,
        # Guard on the actual divisor (was finished_driver_num, which could
        # raise ZeroDivisionError or wrongly report 0).
        new_finished_user_num /
        float(finished_user_num) if finished_user_num > 0 else 0,
        new_finished_user_num /
        float(new_passenger_num) if new_passenger_num > 0 else 0,
        pay_num - offline_num,
        offline_num,
        transport_efficiency,
        0,
        avg_order_per_driver
    ]
    insert_data = [None, dt] + data
    # NOTE(review): no commit()/close() is issued here; whether the insert is
    # persisted depends on how get_db_conn() configures the connection --
    # confirm.
    sql_conn = get_db_conn()
    sql_cursor = sql_conn.cursor()
    sql_cursor.execute(INSERT_SQL, insert_data)
def create_bi_mysql_table(conn, db, table, columns):
    """Create the MySQL table for a Hive table, or align an existing one.

    If ``information_schema`` lists no columns for ``db.table``, the database
    and table are created from ``columns``. Otherwise every column missing in
    MySQL is added at its Hive position, and every column whose mapped type
    differs from the MySQL type is changed.

    Args:
        conn: connection id passed to ``get_db_conn``.
        db: MySQL schema name.
        table: MySQL table name.
        columns: sequence of dicts with ``name``, ``type`` and ``comment``
            keys, in table order.

    Returns:
        True if the table was newly created, False if it already existed.
    """
    # Fallback for Hive types that are not in type_map.
    fallback_type = {"type": "varchar", "ext": "(255) not null default ''"}

    mconn = get_db_conn(conn)
    try:
        mcursor = mconn.cursor()
        try:
            sql = '''
                SELECT
                    COLUMN_NAME,
                    DATA_TYPE
                FROM
                    information_schema.COLUMNS
                WHERE
                    TABLE_SCHEMA='{db}' AND
                    TABLE_NAME='{table}'
                ORDER BY ORDINAL_POSITION
            '''.format(db=db, table=table)
            mcursor.execute(sql)
            res = mcursor.fetchall()

            # The MySQL table does not exist yet: create it.
            if len(res) <= 0:
                cols = []
                for v in columns:
                    types = type_map.get(v['type'].lower().strip(),
                                         fallback_type)
                    cols.append(
                        "`{name}` {type}{ext} comment '{comment}'".format(
                            name=v['name'],
                            type=types['type'],
                            ext=types['ext'],
                            comment=v['comment']))
                mcursor.execute("CREATE DATABASE IF NOT EXISTS {}".format(db))
                sql = '''
                    CREATE TABLE IF NOT EXISTS {db}.{table} (
                        {columns}
                    )engine=InnoDB default charset=utf8mb4
                '''.format(db=db, table=table, columns=",\n".join(cols))
                logging.info(sql)
                mcursor.execute(sql)
                return True

            # The MySQL table exists: index its columns by normalised name.
            mysql_columns = {
                col_name.lower().strip(): d_type.lower().strip()
                for (col_name, d_type) in res
            }
            sql = 'ALTER TABLE {db}.{table} '.format(db=db, table=table)
            for k, v in enumerate(columns):
                types = type_map.get(v['type'].lower().strip(), fallback_type)
                # Keys are lower-cased and stripped, so normalise the Hive
                # name the same way before looking it up.
                mysql_coltype = mysql_columns.get(v['name'].lower().strip())
                if not mysql_coltype:
                    if k == 0:
                        alter_sql = "add `{name}` {type} comment '{comment}' first".format(
                            name=v['name'],
                            type=types['type'] + types['ext'],
                            comment=v['comment'])
                    else:
                        alter_sql = "add `{name}` {type} comment '{comment}' after `{prev}`".format(
                            name=v['name'],
                            type=types['type'] + types['ext'],
                            comment=v['comment'],
                            prev=columns[k - 1]['name'].lower())
                elif types['type'] != mysql_coltype:
                    # Change the column being inspected. The previous code
                    # used a stale loop variable here and altered whichever
                    # MySQL column happened to be listed last.
                    alter_sql = "change `{name}` `{name}` {type} comment '{comment}'".format(
                        name=v['name'],
                        type=types['type'] + types['ext'],
                        comment=v['comment'])
                else:
                    continue
                logging.info(sql + alter_sql)
                mcursor.execute(sql + alter_sql)
            return False
        finally:
            mcursor.close()
    finally:
        # The connection was previously never closed.
        mconn.close()
def get_data_from_impala(**op_kwargs):
    """Build the per-driver daily report for one day and load it into MySQL.

    Runs one query through the cursor returned by ``get_hive_cursor`` and
    passes every fetched row to ``__data_to_mysql`` on the ``mysql_bi``
    connection, together with the target column list and the
    ``values(...)`` assignments for rows that already exist.

    Args:
        **op_kwargs: Task keyword arguments. ``ds`` (``YYYY-MM-DD``) selects
            the partition to report on; when absent, yesterday in local
            time is used.
    """
    ds = op_kwargs.get(
        'ds',
        time.strftime('%Y-%m-%d', time.localtime(time.time() - 86400)))
    sql = '''
        WITH
        --线路数据
        driver_data as
        (
            select
                from_unixtime(unix_timestamp('{pt}','yyyy-MM-dd'), 'yyyyMMdd') as dt,
                city_id,
                id,
                real_name --司机名字
            from obus_dw_ods.ods_sqoop_data_driver_df
            where dt='{pt}'
        ),
        --工作数据
        work_data as
        (
            select
                from_unixtime(unix_timestamp('{pt}','yyyy-MM-dd'), 'yyyyMMdd') as dt,
                dr.city_id,
                dr.id as driver_id,
                sum(if(dw.serv_mode=1 and dw.serv_mode1=0, round(abs(dw.create_time2-dw.create_time)/3600,2), 0)) as work_dur --司机今日在线时长(小时)
            from
                (select
                    driver_id,
                    serv_mode,
                    create_time,
                    lead(serv_mode,1,0) over(partition by driver_id order by create_time) serv_mode1,
                    lead(create_time,1,unix_timestamp('{pt} 23:59:59','yyyy-MM-dd HH:mm:ss')) over(partition by driver_id order by create_time) create_time2
                from obus_dw_ods.ods_sqoop_data_driver_work_log_df
                where dt='{pt}' and
                    from_unixtime(create_time, 'yyyy-MM-dd')='{pt}'
                ) as dw
            join
                (select
                    id,
                    city_id
                from obus_dw_ods.ods_sqoop_data_driver_df
                where dt='{pt}'
                ) as dr
            on dw.driver_id = dr.id
            group by dr.city_id, dr.id
        ),
        --司机圈数
        driver_cycle_data as
        (
            select
                from_unixtime(unix_timestamp('{pt}','yyyy-MM-dd'), 'yyyyMMdd') as dt,
                city_id,
                driver_id,
                count(distinct id)/2 as cycle_cnt --司机圈数
            from obus_dw_ods.ods_sqoop_data_driver_trip_df
            where dt = '{pt}' and
                from_unixtime(end_time, 'yyyy-MM-dd') = '{pt}' and
                status = 1
            group by city_id, driver_id
        ),
        --司机驾驶时长
        driver_time as
        (
            select
                from_unixtime(unix_timestamp('{pt}','yyyy-MM-dd'), 'yyyyMMdd') as dt,
                city_id,
                driver_id,
                round(sum(end_time - start_time)/3600, 2) as driver_time --司机驾驶时长
            from obus_dw_ods.ods_sqoop_data_driver_trip_df
            where dt = '{pt}' and
                from_unixtime(end_time, 'yyyy-MM-dd') = '{pt}' and
                status = 1
            group by city_id, driver_id
        ),
        --收入数据
        income_data as
        (
            select
                from_unixtime(unix_timestamp('{pt}','yyyy-MM-dd'), 'yyyyMMdd') as dt,
                dd.city_id,
                dd.id,
                sum(ddrd.amount_true) as driver_amount, ---司机收入
                sum(ddrd.amount_pay_obus) as obus_pay_driver_amount, ---Obus支付司机收入
                sum(ddrd.amount_pay_ticket) as tickets_pay_driver_amount --公交卡支付司机收入
            from
                (select
                    driver_id,
                    amount_true,
                    amount_pay_obus,
                    amount_pay_ticket
                from obus_dw_ods.ods_sqoop_data_driver_records_day_df
                where dt='{pt}' and
                    `day`=unix_timestamp('{pt}','yyyy-MM-dd')
                ) as ddrd
            join
                (select
                    id,
                    city_id
                from obus_dw_ods.ods_sqoop_data_driver_df
                where dt='{pt}'
                ) as dd
            on ddrd.driver_id = dd.id
            group by dd.city_id, dd.id
        )
        --结果集
        select
            *,
            row_number() over(partition by city_id order by driver_amount desc) num
        from
            (select
                driver_data.dt,
                driver_data.city_id,
                nvl(dc.name, ''),
                driver_data.id,
                driver_data.real_name,
                IF(work_data.work_dur IS NULL, 0, work_data.work_dur),
                IF(driver_cycle_data.cycle_cnt IS NULL, 0, driver_cycle_data.cycle_cnt),
                round(if(driver_cycle_data.cycle_cnt>0, driver_time.driver_time/driver_cycle_data.cycle_cnt, 0), 2),
                IF(income_data.driver_amount IS NULL, 0, income_data.driver_amount) as driver_amount,
                IF(income_data.obus_pay_driver_amount IS NULL, 0, income_data.obus_pay_driver_amount),
                IF(income_data.tickets_pay_driver_amount IS NULL, 0, income_data.tickets_pay_driver_amount)
            from driver_data
            left join work_data
                on driver_data.dt=work_data.dt and
                    driver_data.city_id=work_data.city_id and
                    driver_data.id=work_data.driver_id
            left join driver_cycle_data
                on driver_data.dt = driver_cycle_data.dt and
                    driver_data.city_id = driver_cycle_data.city_id and
                    driver_data.id = driver_cycle_data.driver_id
            left join driver_time
                on driver_data.dt = driver_time.dt and
                    driver_data.city_id = driver_time.city_id and
                    driver_data.id = driver_time.driver_id
            left join income_data
                on driver_data.dt = income_data.dt and
                    driver_data.city_id = income_data.city_id and
                    driver_data.id = income_data.id
            left join
                (select
                    id,
                    name
                from obus_dw_ods.ods_sqoop_conf_city_df
                where dt='{pt}' and validate=1) as dc
                on driver_data.city_id = dc.id
            ) as t
    '''.format(pt=ds)
    logging.info(sql)

    # Fetch everything first so the Hive cursor is released even when the
    # query fails, and before the MySQL load starts.
    hive_cursor = get_hive_cursor()
    try:
        hive_cursor.execute(sql)
        result = hive_cursor.fetchall()
    finally:
        hive_cursor.close()

    mysql_conn = get_db_conn('mysql_bi')
    try:
        mcursor = mysql_conn.cursor()
        try:
            __data_to_mysql(
                mcursor,
                result,
                [
                    'dt', 'city_id', 'city', 'driver_id', 'driver_name',
                    'serv_time', 'cycle_cnt', 'avg_time', 'driver_amount',
                    'obus_pay_driver_amount', 'tickets_pay_driver_amount',
                    'num'
                ],
                '''
                    serv_time=values(serv_time),
                    cycle_cnt=values(cycle_cnt),
                    avg_time=values(avg_time),
                    driver_amount=values(driver_amount),
                    obus_pay_driver_amount=values(obus_pay_driver_amount),
                    tickets_pay_driver_amount=values(tickets_pay_driver_amount),
                    num=values(num)
                ''')
            # NOTE(review): __data_to_mysql only receives the cursor, so it is
            # not visible here whether it commits. Committing explicitly is a
            # no-op if it already did (or autocommit is on) and mirrors how
            # get_driver_num finishes its MySQL write.
            mysql_conn.commit()
        finally:
            mcursor.close()
    finally:
        mysql_conn.close()
def first_driver_data(**op_kwargs):
    """Count drivers per promoter code whose first order arrived on ``ds``.

    Runs the aggregation through the cursor returned by ``get_hive_cursor``
    and upserts the resulting ``(code, day, dft, create_time)`` rows into
    ``promoter_data_day`` on the ``opay_spread_mysql`` connection.

    Args:
        **op_kwargs: Task keyword arguments; ``ds`` (``YYYY-MM-DD``) selects
            the partition to process.
    """
    cursor = get_hive_cursor()
    try:
        dt = op_kwargs.get('ds')
        cursor.execute("SET mapreduce.job.queuename=root.airflow")
        cursor.execute("SET hive.exec.parallel=true")
        hql = """
            SELECT
                uc.code,
                from_unixtime(unix_timestamp(ro.dt,'yyyy-MM-dd'), 'yyyyMMdd') AS day,
                COUNT(distinct ro.driver_id) as u,
                unix_timestamp()
            FROM
                (SELECT
                    r.driver_id,
                    p.code
                FROM
                    (SELECT
                        driver_id,
                        know_orider_extend
                    FROM oride_dw_ods.ods_sqoop_mass_rider_signups_df
                    WHERE dt = '{ds}' and
                        know_orider = 4
                    ) AS r
                JOIN
                    (select
                        code,
                        name
                    FROM oride_dw_ods.ods_sqoop_promoter_promoter_user_df
                    WHERE dt='{ds}'
                    ) AS p
                ON r.know_orider_extend = p.name
                ) AS uc
            JOIN
                (SELECT
                    dt,
                    driver_id,
                    arrive_time,
                    row_number() over(partition by driver_id order by arrive_time) orders
                FROM oride_dw.dwd_oride_order_base_include_test_di
                WHERE status IN (4,5) AND
                    dt = '{ds}'
                ) as ro
            ON uc.driver_id = ro.driver_id
            WHERE ro.orders = 1 AND
                from_unixtime(ro.arrive_time,'yyyy-MM-dd')='{ds}'
            GROUP BY uc.code, ro.dt
        """.format(ds=dt)
        logging.info(hql)
        cursor.execute(hql)
        res = cursor.fetchall()
    finally:
        cursor.close()

    # Bind values as parameters instead of formatting them into the
    # statement, so a code containing a quote cannot break the INSERT.
    insert_sql = (
        'insert into promoter_data_day (code, day, dft, create_time) '
        'values (%s, %s, %s, %s) '
        'on duplicate key update dft=values(dft), create_time=values(create_time)'
    )
    rows = [(code, day, dft, created) for (code, day, dft, created) in res]
    batch_size = 1000  # same per-statement row limit as before

    mconn = get_db_conn('opay_spread_mysql')
    try:
        mcursor = mconn.cursor()
        try:
            for start in range(0, len(rows), batch_size):
                mcursor.executemany(insert_sql, rows[start:start + batch_size])
            # Persist the upserts before the connection is closed; this is a
            # no-op when the connection runs in autocommit mode.
            mconn.commit()
        finally:
            mcursor.close()
    finally:
        mconn.close()