def export():
    file_name = f_name.get().strip()
    if file_name == '':
        tk.messagebox.showerror(message='文件名不能为空')
    else:
        wb = openpyxl.Workbook()
        sheet = wb.active
        sheet['A1'] = '图书编号'
        sheet['B1'] = '图书名称'
        sheet['C1'] = '图书价格'
        sheet['D1'] = '图书类别'
        sheet['E1'] = '图书库存'
        query = ("SELECT bid, bname, bprice, tname, bcount "
                 "FROM `booktype`, `bookinfo` WHERE `booktype`.tid=`bookinfo`.tid")
        cnx = DbUtil.open_db()
        cursor = cnx.cursor()
        cursor.execute(query)
        i = 2
        for (bid, bname, bprice, tname, bcount) in cursor:
            sheet['A%s' % i] = bid
            sheet['B%s' % i] = bname
            sheet['C%s' % i] = bprice
            sheet['D%s' % i] = tname
            sheet['E%s' % i] = bcount
            i = i + 1
        DbUtil.close_db(cursor, cnx)
        time_file = time.strftime("%Y-%m-%d-%H-%M", time.localtime())
        file_name = file_name + '-' + time_file + '.xlsx'
        wb.save(file_name)
        tk.messagebox.showinfo('成功', '导出图书信息成功')
        book_e.destroy()
def build_book(frame_book):
    for widget in frame_book.winfo_children():
        widget.destroy()
    columns = ("书编号", "书名", "书价", "类别", "库存")
    treeview = ttk.Treeview(frame_book, columns=columns, show='headings')
    treeview.column("书编号", width=100, anchor='center')  # define the columns; nothing is displayed yet
    treeview.column("书名", width=100, anchor='center')
    treeview.column("书价", width=100, anchor='center')
    treeview.column("类别", width=100, anchor='center')
    treeview.column("库存", width=100, anchor='center')
    treeview.heading('书编号', text='书编号')
    treeview.heading('书名', text='书名')
    treeview.heading('书价', text='书价')
    treeview.heading('类别', text='类别')
    treeview.heading('库存', text='库存')
    treeview.grid()
    query = ("SELECT bid, bname, bprice, tname, bcount "
             "FROM `booktype`, `bookinfo` WHERE `booktype`.tid=`bookinfo`.tid")
    cnx = DbUtil.open_db()
    cursor = cnx.cursor()
    cursor.execute(query)
    i = 1
    for (bid, bname, bprice, tname, bcount) in cursor:
        treeview.insert('', i, values=(bid, bname, bprice, tname, bcount))
        i = i + 1
    DbUtil.close_db(cursor, cnx)
    for col in columns:  # attach a sort command to every heading (replaces the manual per-column setup above)
        treeview.heading(col, text=col,
                         command=lambda _col=col: treeview_sort_column(treeview, _col, False))
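# treeview_sort_column() is referenced by the heading bindings above but is not part of
# these snippets. A minimal sketch of a typical implementation, given as an assumption
# rather than the project's actual helper:
def treeview_sort_column(tv, col, reverse):
    # read every row's value in this column, sort, and move the rows into the new order
    items = [(tv.set(k, col), k) for k in tv.get_children('')]
    items.sort(reverse=reverse)
    for index, (val, k) in enumerate(items):
        tv.move(k, '', index)
    # flip the sort direction for the next click on this heading
    tv.heading(col, command=lambda: treeview_sort_column(tv, col, not reverse))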
def inserttodb():
    # read the values from the input fields
    tid = t_id.get().strip()
    tidn = t_id_n.get().strip()
    tnamen = t_name_n.get().strip()
    if tid == '' or tidn == '' or tnamen == '':
        tk.messagebox.showerror(message='类别编号或名称为空')
    else:
        query = ("SELECT COUNT(*) FROM `booktype` WHERE tid='%s'" % (tid))
        cnx = DbUtil.open_db()
        cursor = cnx.cursor()
        cursor.execute(query)
        if cursor.fetchone()[0] == 1:
            query = ("UPDATE `booktype` SET tid='%s', tname='%s' WHERE tid='%s'" % (tidn, tnamen, tid))
            try:
                cursor.execute(query)
                cnx.commit()
                DbUtil.close_db(cursor, cnx)
                tk.messagebox.showinfo('成功', '修改类别成功')
                type_u.destroy()
                build_type(frame_type)
            except:
                tk.messagebox.showerror('错误', '修改类别失败')
        else:
            tk.messagebox.showerror('错误', '修改类别失败, 该类别编号不存在')
def update_flow_jobs(self, last_unixtime=None): if last_unixtime: flow_sql = """ SELECT a.app_code, f.* FROM flow f JOIN cfg_application a on f.app_id = a.app_id WHERE from_unixtime(modified_time) >= DATE_SUB(from_unixtime(%f), INTERVAL 1 HOUR) """ % last_unixtime else: flow_sql = """ SELECT a.app_code, f.* FROM flow f JOIN cfg_application a on f.app_id = a.app_id """ job_sql = """ SELECT * FROM flow_job WHERE app_id = %d and flow_id = %d """ url = self.elasticsearch_index_url + ':' + str(self.elasticsearch_port) + '/wherehows/flow_jobs/_bulk' params = [] self.wh_cursor.execute(flow_sql) rows = DbUtil.copy_dict_cursor(self.wh_cursor) row_count = 1 for row in rows: self.wh_cursor.execute(job_sql %(long(row['app_id']), long(row['flow_id']))) jobs = [] job_rows = DbUtil.copy_dict_cursor(self.wh_cursor) if job_rows: for job_row in job_rows: jobs.append({"app_id": job_row['app_id'], "flow_id": job_row['flow_id'], "job_id": job_row['job_id'], "job_name": job_row['job_name'] if job_row['job_name'] else '', "job_path": job_row['job_path'] if job_row['job_path'] else '', "job_type_id": job_row['job_type_id'], "job_type": job_row['job_type'] if job_row['job_type'] else '', "pre_jobs": job_row['pre_jobs'] if job_row['pre_jobs'] else '', "post_jobs": job_row['post_jobs'] if job_row['post_jobs'] else '', "is_current": job_row['is_current'] if job_row['is_current'] else '', "is_first": job_row['is_first'] if job_row['is_first'] else '', "is_last": job_row['is_last'] if job_row['is_last'] else ''}) params.append('{ "index": { "_id": ' + str(long(row['flow_id'])*10000 + long(row['app_id'])) + ' }}') if len(jobs) > 0: params.append( """{"app_id": %d, "flow_id": %d, "app_code": "%s", "flow_name": "%s", "flow_group": "%s", "flow_path": "%s", "flow_level": %d, "is_active": "%s", "is_scheduled": "%s", "pre_flows": "%s", "jobs": %s}""" % (row['app_id'], row['flow_id'], row['app_code'] if row['app_code'] else '', row['flow_name'] if row['flow_name'] else '', row['flow_group'] if row['flow_group'] else '', row['flow_path'] if row['flow_path'] else '', row['flow_level'], row['is_active'] if row['is_active'] else '', row['is_scheduled'] if row['is_scheduled'] else '', row['pre_flows'] if row['pre_flows'] else '', json.dumps(jobs))) else: params.append( """{"app_id": %d, "flow_id": %d, "app_code": "%s", "flow_name": "%s", "flow_group": "%s", "flow_path": "%s", "flow_level": %d, "is_active": "%s", "is_scheduled": "%s", "pre_flows": "%s", "jobs": ""}""" % (row['app_id'], row['flow_id'], row['app_code'] if row['app_code'] else '', row['flow_name'] if row['flow_name'] else '', row['flow_group'] if row['flow_group'] else '', row['flow_path'] if row['flow_path'] else '', row['flow_level'], row['is_active'] if row['is_active'] else '', row['is_scheduled'] if row['is_scheduled'] else '', row['pre_flows'] if row['pre_flows'] else '')) if row_count % 1000 == 0: self.bulk_insert(params, url) self.logger.info('flow jobs' + str(row_count)) params = [] row_count += 1 if len(params) > 0: self.logger.info('flow_jobs' + str(len(params))) self.bulk_insert(params, url)
def usr_log_in(window, var_usr_name, var_usr_pwd):
    # read the username and password from the input fields
    usr_name = var_usr_name.get().strip()
    usr_pwd = var_usr_pwd.get().strip()
    # username and password must not be empty
    if usr_name == '' or usr_pwd == '':
        tk.messagebox.showerror(message='用户名或密码不能为空!')
    else:
        # look the user up in the database
        query = ("SELECT COUNT(*) FROM `user` WHERE username = '%s'" % usr_name)
        cnx = DbUtil.open_db()
        cursor = cnx.cursor()
        cursor.execute(query)
        if cursor.fetchone()[0] == 1:
            # check whether the username and password match
            query = ("SELECT username, password, is_manager FROM `user` "
                     "WHERE username = '%s' AND password = '%s'" % (usr_name, usr_pwd))
            cursor.execute(query)
            result = cursor.fetchone()
            DbUtil.close_db(cursor, cnx)
            if result is not None:
                # tk.messagebox.showinfo(title='welcome', message='欢迎您:' + usr_name)
                # enter the main window
                is_manger = False if (result[2] == 0) else True
                manager_main(window, usr_name, is_manger)
            else:
                tk.messagebox.showerror(message='密码错误')
        # user not found: ask whether to sign up now
        else:
            is_signup = tk.messagebox.askyesno('欢迎', '您还没有注册,是否现在注册')
            if is_signup:
                usr_sign_up()
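# The lookups above splice user input straight into the SQL text with the % operator.
# A minimal sketch of the same username check using DB-API parameter binding instead,
# assuming DbUtil.open_db() returns a MySQL-style connection whose paramstyle is '%s'
# (e.g. MySQL Connector/Python or PyMySQL); count_user() is illustrative, not part of
# the original code:
def count_user(usr_name):
    cnx = DbUtil.open_db()
    cursor = cnx.cursor()
    # the driver escapes the bound value, so quotes in usr_name cannot break the query
    cursor.execute("SELECT COUNT(*) FROM `user` WHERE username = %s", (usr_name,))
    count = cursor.fetchone()[0]
    DbUtil.close_db(cursor, cnx)
    return count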
def inserttodb():
    # read the values from the input fields
    bido = b_id_o.get().strip()
    bid = b_id.get().strip()
    bname = b_name.get().strip()
    bprice = b_price.get().strip()
    tid = t_id.get().strip()
    bcount = b_count.get().strip()
    if bido == '' or bid == '' or bname == '' or bprice == '' or tid == '' or bcount == '':
        tk.messagebox.showerror(message='输入框不能为空')
    else:
        query = ("SELECT COUNT(*) FROM `bookinfo` WHERE bid='%s'" % (bido))
        cnx = DbUtil.open_db()
        cursor = cnx.cursor()
        cursor.execute(query)
        if cursor.fetchone()[0] == 1:
            # the row is located by the original book id (bido), so the WHERE clause
            # must filter on bid, not tid
            query = ("UPDATE `bookinfo` SET bid='%s', bname='%s', bprice='%s', tid='%s', bcount='%s' "
                     "WHERE bid='%s'" % (bid, bname, bprice, tid, bcount, bido))
            try:
                cursor.execute(query)
                cnx.commit()
                DbUtil.close_db(cursor, cnx)
                tk.messagebox.showinfo('成功', '修改图书信息成功')
                book_u.destroy()
                build_book(frame_book)
            except:
                tk.messagebox.showerror('错误', '修改图书信息失败')
        else:
            tk.messagebox.showerror('错误', '修改图书信息失败, 该图书编号不存在')
def collect_flow_jobs(self, flow_file, job_file, dag_file): self.logger.info("collect flow&jobs") flow_writer = FileWriter(flow_file) job_writer = FileWriter(job_file) dag_writer = FileWriter(dag_file) query = """ SELECT a.*, b.created_time FROM (SELECT w.app_name, w.app_path, max(w.id) as source_version, max(unix_timestamp(w.last_modified_time)) as last_modified_time from WF_JOBS w LEFT JOIN WF_JOBS s ON w.app_path = s.app_path AND w.created_time < s.created_time WHERE s.created_time IS NULL GROUP BY w.app_name, w.app_path) a JOIN (SELECT app_path, min(unix_timestamp(created_time)) as created_time FROM WF_JOBS GROUP BY app_path) b ON a.app_path = b.app_path """ self.oz_cursor.execute(query) rows = DbUtil.dict_cursor(self.oz_cursor) for row in rows: flow_record = OozieFlowRecord(self.app_id, row['app_name'], row['app_path'], 0, row['source_version'], row['created_time'], row['last_modified_time'], self.wh_exec_id) flow_writer.append(flow_record) query = """ select name, type, transition from WF_ACTIONS where wf_id = '{source_version}' """.format(source_version=row['source_version']) new_oz_cursor = self.oz_con.cursor() new_oz_cursor.execute(query) nodes = DbUtil.dict_cursor(new_oz_cursor) for node in nodes: job_record = OozieJobRecord(self.app_id, row['app_path'], row['source_version'], node['name'], row['app_path'] + "/" + node['name'], node['type'], self.wh_exec_id) job_writer.append(job_record) if node['transition'] != "*" and node['transition'] is not None: dag_edge = OozieFlowDagRecord(self.app_id, row['app_path'], row['source_version'], row['app_path'] + "/" + node['name'], row['app_path'] + "/" + node['transition'], self.wh_exec_id) dag_writer.append(dag_edge) new_oz_cursor.close() dag_writer.close() job_writer.close() flow_writer.close()
def insertEMPLOYEE(self, param):
    sql = """INSERT INTO EMPLOYEE(FIRST_NAME, LAST_NAME, AGE, SEX, INCOME)
             VALUES ('${FIRST_NAME}', '${LAST_NAME}', ${AGE}, '${SEX}', ${INCOME})"""
    # substitute each ${KEY} placeholder with the corresponding value from param
    for key in param:
        sql = sql.replace("${" + key + "}", param[key], 1)
    print sql
    DbUtil.getCursor(self.db).execute(sql)
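# The ${KEY} template above is filled in with raw str.replace() calls. A hedged
# alternative sketch that binds the same five values through the DB-API instead;
# insert_employee_bound() is illustrative and assumes the cursor returned by
# DbUtil.getCursor() accepts '%s' placeholders:
def insert_employee_bound(self, param):
    sql = ("INSERT INTO EMPLOYEE(FIRST_NAME, LAST_NAME, AGE, SEX, INCOME) "
           "VALUES (%s, %s, %s, %s, %s)")
    values = (param['FIRST_NAME'], param['LAST_NAME'],
              param['AGE'], param['SEX'], param['INCOME'])
    DbUtil.getCursor(self.db).execute(sql, values)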
def collect_flow_execs(self, flow_exec_file, job_exec_file, look_back_period): self.logger.info("collect flow&job executions") flow_exec_writer = FileWriter(flow_exec_file) job_exec_writer = FileWriter(job_exec_file) cmd = "SELECT * FROM workflow_info WHERE status is NULL" self.lz_cursor.execute(cmd) # rows = DbUtil.dict_cursor(self.lz_cursor) rows = DbUtil.copy_dict_cursor(self.lz_cursor) row_count = 0 for row in rows: flow_path = row['project_name'] + ":" + row['workflow_name'] flow_exec_record = LhotseFlowExecRecord( self.app_id, row["workflow_name"], flow_path, 0, 1, "SUCCEEDED", 1, row['owner'], long(time.mktime(row['create_time'].timetuple())), long(time.mktime(row['modify_time'].timetuple())), self.wh_exec_id) flow_exec_writer.append(flow_exec_record) job_exec_records = [] task_query = "SELECT * FROM task_info WHERE workflow_id = \"{0}\"".format( row['workflow_id']) new_lz_cursor = self.lz_cursor new_lz_cursor.execute(task_query) task_rows = DbUtil.dict_cursor(new_lz_cursor) for task in task_rows: if task['real_task_id'] is None: continue job_exec_record = LhotseJobExecRecord( self.app_id, flow_path, 0, 1, task['task_name'], flow_path + "/" + task['task_name'], long(task['real_task_id']), 'SUCCEEDED', 1, int(time.mktime(task['create_time'].timetuple())), int(time.mktime(task['modify_time'].timetuple())), self.wh_exec_id) job_exec_records.append(job_exec_record) ## LhotseJobExecRecord.sortAndSet(job_exec_records) for r in job_exec_records: job_exec_writer.append(r) row_count += 1 if row_count % 10000 == 0: flow_exec_writer.flush() job_exec_writer.flush() flow_exec_writer.close() job_exec_writer.close()
def update_dataset_field(self, last_time): if last_time: sql = """ SELECT * FROM dict_field_detail WHERE modified >= DATE_SUB(%s, INTERVAL 1 HOUR) """ % last_time else: sql = """ SELECT * FROM dict_field_detail """ comment_query = """ SELECT d.field_id, d.dataset_id, f.comment FROM dict_dataset_field_comment d LEFT JOIN field_comments f ON d.comment_id = f.id WHERE d.field_id = %d """ url = self.elasticsearch_index_url + ':' + str( self.elasticsearch_port) + '/wherehows/field/_bulk' params = [] self.wh_cursor.execute(sql) rows = DbUtil.copy_dict_cursor(self.wh_cursor) row_count = 1 for row in rows: self.wh_cursor.execute(comment_query % long(row['field_id'])) comments = [] comment_rows = DbUtil.copy_dict_cursor(self.wh_cursor) for comment_row in comment_rows: comments.append(comment_row['comment']) params.append('{ "index": { "_id": ' + str(row['field_id']) + ', "parent": ' + str(row['dataset_id']) + ' }}') if len(comments) > 0: params.append( """{ "comments": %s, "dataset_id": %d, "sort_id": %d, "field_name": "%s", "parent_path": "%s"}""" % (json.dumps(comments) if comments else '', row['dataset_id'] if row['dataset_id'] else 0, row['sort_id'] if row['sort_id'] else 0, row['field_name'] if row['field_name'] else '', row['parent_path'] if row['parent_path'] else '')) else: params.append( """{ "comments": "", "dataset_id": %d, "sort_id": %d, "field_name": "%s", "parent_path": "%s"}""" % (row['dataset_id'] if row['dataset_id'] else 0, row['sort_id'] if row['sort_id'] else 0, row['field_name'] if row['field_name'] else '', row['parent_path'] if row['parent_path'] else '')) if row_count % 1000 == 0: self.bulk_insert(params, url) params = [] row_count += 1 if len(params) > 0: self.bulk_insert(params, url)
def update_dataset(self, last_unixtime): if last_unixtime: sql = """ SELECT * FROM dict_dataset WHERE from_unixtime(modified_time) >= DATE_SUB(from_unixtime(%f), INTERVAL 1 HOUR) """ % last_unixtime else: sql = """ SELECT * FROM dict_dataset """ url = self.elasticsearch_index_url + ':' + str( self.elasticsearch_port) + '/wherehows/dataset/_bulk' params = [] self.wh_cursor.execute(sql) rows = DbUtil.copy_dict_cursor(self.wh_cursor) row_count = 1 for row in rows: params.append('{ "index": { "_id": ' + str(row['id']) + ' }}') params.append( """{ "name": "%s", "source": "%s", "urn": "%s", "location_prefix": "%s", "parent_name": "%s", "schema_type": "%s", "properties": %s, "schema": %s , "fields": %s}""" % (row['name'] if row['name'] else '', row['source'] if row['source'] else '', row['urn'] if row['urn'] else '', row['location_prefix'] if row['location_prefix'] else '', row['parent_name'] if row['parent_name'] else '', row['schema_type'] if row['schema_type'] else '', json.dumps(row['properties']) if row['properties'] else '', json.dumps(row['schema']) if row['schema'] else '', json.dumps(row['fields']) if row['fields'] else '')) if row_count % 1000 == 0: self.bulk_insert(params, url) params = [] row_count += 1 if len(params) > 0: self.bulk_insert(params, url)
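# bulk_insert() is called by the indexer methods in this section but its body is not
# included here. A minimal sketch of what such a helper could look like, assuming the
# standard Elasticsearch _bulk protocol (newline-delimited JSON with a trailing newline)
# and Python 2's urllib2; the real implementation may differ:
import urllib2

def bulk_insert(self, params, url):
    body = '\n'.join(params) + '\n'  # _bulk requires a terminating newline
    req = urllib2.Request(url, body, {'Content-Type': 'application/x-ndjson'})
    try:
        response = urllib2.urlopen(req)
        self.logger.debug(response.read())
    except urllib2.HTTPError as e:
        self.logger.error('bulk insert to %s failed: %s' % (url, e))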
def collect_job_execs(self, job_exec_file, lookback_period): self.logger.info("collect job execs") job_exec_writer = FileWriter(job_exec_file) query = """ select a.id as job_exec_id, a.name as job_name, j.id as flow_exec_id, a.status, a.user_retry_count, unix_timestamp(a.start_time) start_time, unix_timestamp(a.end_time) end_time, j.app_name as jname, j.app_path, transition from WF_ACTIONS a JOIN WF_JOBS j on a.wf_id = j.id where j.end_time > now() - INTERVAL %d MINUTE """ % (int(lookback_period)) self.oz_cursor.execute(query) rows = DbUtil.dict_cursor(self.oz_cursor) for row in rows: job_exec_record = OozieJobExecRecord(self.app_id, row['app_path'], row['flow_exec_id'], row['flow_exec_id'], row['job_name'], row['app_path'] + "/" + row['job_name'], row['job_exec_id'], row['status'], row['user_retry_count'], row['start_time'], row['end_time'], self.wh_exec_id) job_exec_writer.append(job_exec_record) job_exec_writer.close()
def collect_flow_schedules(self, schedule_file): # load flow scheduling info from table triggers self.logger.info("collect flow schedule") timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'" self.aw_cursor.execute(timezone) schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX" self.aw_cursor.execute(schema) schedule_writer = FileWriter(schedule_file) query = \ """SELECT J.SO_APPLICATION, J.SO_MODULE, S.AW_SCH_NAME, S.AW_SCH_INTERVAL, S.AW_ACTIVE, ROUND((cast((FROM_TZ(CAST(S.AW_SCH_START as timestamp), 'US/Pacific') at time zone 'GMT') as date) - to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as EFFECT_STARTED, ROUND((cast((FROM_TZ(CAST(S.AW_SCH_END as timestamp), 'US/Pacific') at time zone 'GMT') as date) - to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as EFFECT_END FROM SO_JOB_TABLE J JOIN AW_MODULE_SCHED S ON J.SO_JOB_SEQ = S.AW_JOB_SEQ WHERE J.SO_COMMAND_TYPE = 'CHAIN' AND S.AW_ACTIVE = 'Y' """ self.aw_cursor.execute(query) rows = DbUtil.dict_cursor(self.aw_cursor) for row in rows: schedule_record = AppworxFlowScheduleRecord(self.app_id, row['SO_APPLICATION'] + ":" + row['SO_MODULE'], row['AW_SCH_NAME'], int(row['AW_SCH_INTERVAL']), long(row['EFFECT_STARTED']), long(row['EFFECT_END']), '0', self.wh_exec_id ) schedule_writer.append(schedule_record) schedule_writer.close()
def update_comment(self, last_time): if last_time: sql = """ SELECT * FROM comments WHERE modified >= DATE_SUB(%s, INTERVAL 1 HOUR) """ % last_time else: sql = """ SELECT * FROM comments """ url = self.elasticsearch_index_url + ':' + str(self.elasticsearch_port) + '/wherehows/comment/_bulk' params = [] self.wh_cursor.execute(sql) rows = DbUtil.copy_dict_cursor(self.wh_cursor) row_count = 1 for row in rows: params.append('{ "index": { "_id": ' + str(row['id']) + ', "parent": ' + str(row['dataset_id']) + ' }}') params.append( """{ "text": %s, "user_id": %d, "dataset_id": %d, "comment_type": "%s"}""" % (json.dumps(row['text']) if row['text'] else '', row['user_id'] if row['user_id'] else 0, row['dataset_id'] if row['dataset_id'] else 0, row['comment_type'] if row['comment_type'] else '')) if row_count % 1000 == 0: self.bulk_insert(params, url) params = [] row_count += 1 if len(params) > 0: self.bulk_insert(params, url)
def __init__(self, master):
    self.data = DbUtil.DbUtil()
    self.master = master
    self.master.resizable(0, 0)
    master.title("Menu")
    self.mainFrame = Frame(master)
    self.mainFrame.pack()
    self.label = Label(self.mainFrame, text="Wybierz kategorię")
    self.label.pack()
    # build the category list
    self.selectCategoty = Listbox(self.mainFrame, selectmode=SINGLE)
    for i in self.data.data:
        self.selectCategoty.insert(0, i)
    self.selectCategoty.select_set(0)
    self.selectCategoty.pack(fill=X)
    # -----------------------#
    self.scale = Scale(self.mainFrame, from_=5, to=15, orient=HORIZONTAL, label=' Ilość słówek')
    self.scale.pack(fill=X)
    self.startButton = Button(self.mainFrame, text="Start", command=self.openCurseWindow)
    self.startButton.pack()
    self.wordCounter = 0
    self.mistakeCounter = 0
    self.gotWords = False
def collect_flow_schedules(self, schedule_file):
    self.logger.info("collect flow schedule")
    schedule_writer = FileWriter(schedule_file)
    query = """
        SELECT DISTINCT cj.id as ref_id, cj.frequency, cj.time_unit,
        unix_timestamp(cj.start_time) as start_time, unix_timestamp(cj.end_time) as end_time,
        wj.app_path
        FROM COORD_JOBS cj JOIN COORD_ACTIONS ca ON ca.job_id = cj.id JOIN WF_JOBS wj ON ca.external_id = wj.id
        WHERE cj.status = 'RUNNING'
        """
    self.oz_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.oz_cursor)
    for row in rows:
        schedule_record = OozieFlowScheduleRecord(self.app_id,
                                                  row['app_path'],
                                                  row['time_unit'],
                                                  str(row['frequency']),
                                                  None,
                                                  row['start_time'],
                                                  row['end_time'],
                                                  row['ref_id'],
                                                  self.wh_exec_id)
        schedule_writer.append(schedule_record)
    schedule_writer.close()
def __init__(self, configs):
    super(BackupData, self).__init__()
    self.__db = DbUtil.DbUtil(
        configs['dbhost'],
        Config.INPUT_DB_USERNAME,
        Config.INPUT_DB_PASSWORD,
        Config.INPUT_DB_DATABASE if configs['dbname'] is None else configs['dbname'],
        Config.INPUT_DB_CHARSET)
    self.__logger = Logger.Logger(__name__)
def collect_flow_execs(self, flow_exec_file, job_exec_file, look_back_period): self.logger.info( "collect flow&job executions") flow_exec_writer = FileWriter(flow_exec_file) job_exec_writer = FileWriter(job_exec_file) cmd = """select * from execution_flows where end_time > UNIX_TIMESTAMP(now() - INTERVAL %d MINUTE) * 1000 """ % (int(look_back_period)) self.az_cursor.execute(cmd) rows = DbUtil.dict_cursor(self.az_cursor) row_count = 0 for row in rows: json_column = 'flow_data' unzipped_content = gzip.GzipFile(mode='r', fileobj=StringIO.StringIO(row[json_column].tostring())).read() try: row[json_column] = json.loads(unzipped_content) except Exception as e: self.logger.error(e) pass flow_data = row[json_column] flow_path = flow_data['projectName'] + ":" + flow_data['flowId'] flow_exec_record = AzkabanFlowExecRecord(self.app_id, flow_data['flowId'], flow_path, row['version'], row['exec_id'], flow_data['status'], flow_data['attempt'], row['submit_user'], long(row['start_time']) / 1000, long(row['end_time']) / 1000, self.wh_exec_id) flow_exec_writer.append(flow_exec_record) nodes = flow_data['nodes'] job_exec_records = [] for node in nodes: job_exec_record = AzkabanJobExecRecord(self.app_id, flow_path, row['version'], row['exec_id'], node['id'], flow_path + "/" + node['id'], None, node['status'], node['attempt'], long(node['startTime']) / 1000, long(node['endTime']) / 1000, self.wh_exec_id) job_exec_records.append(job_exec_record) AzkabanJobExecUtil.sortAndSet(job_exec_records) for r in job_exec_records: job_exec_writer.append(r) row_count += 1 if row_count % 10000 == 0: flow_exec_writer.flush() job_exec_writer.flush() flow_exec_writer.close() job_exec_writer.close()
def submit_blog():
    title, blog = (request.form['title'], request.form['blog'])
    # app.logger.debug(request.form['blog'])
    # x = open('temp.txt', 'w')
    # x.write(request.form['blog'])
    # x.close()
    post_time = datetime.datetime.now().ctime()  # avoid shadowing the time module
    res = DbUtil.save_blog(title, post_time, blog)
    return res  # return the save result itself, not the literal string "res"
def saveItemInfo(self, param):
    sql = """INSERT INTO item_coupon(tao_token,coupon_short_link_url,qr_code_url,click_url,coupon_link_tao_token
             ,coupon_link,type,item_id,short_link_url,coupon_start_time
             ,coupon_end_time,coupon_amount,coupon_info,biz30day,tk_comm_fee
             ,coupon_left_count,coupon_total_count,coupon_start_fee,reserve_price)
             VALUES (${taoToken},${couponShortLinkUrl},${qrCodeUrl},${clickUrl},${couponLinkTaoToken},
             ${couponLink},${type},${itemid},${shortLinkUrl},${couponEffectiveStartTime},
             ${couponEffectiveEndTime},${couponAmount},${couponInfo},${biz30day},${tkCommFee}
             ,${couponLeftCount},${couponTotalCount},${couponStartFee},${reservePrice})"""
    # substitute each ${key} placeholder; NULL for missing values, quoted otherwise
    for key in param:
        if param[key] is None:
            value = 'Null'
        else:
            value = "'" + param[key] + "'"
        sql = sql.replace("${" + key + "}", value, 1)
    print sql
    DbUtil.getCursor(self.db).execute(sql)
def __init__(self):
    super(CreateTable, self).__init__()
    self.__configs = CommonUtil.parse_arguments()
    self.__db = DbUtil.DbUtil(
        self.__configs['dbhost'],
        Config.INPUT_DB_USERNAME,
        Config.INPUT_DB_PASSWORD,
        # the parsed arguments live on self.__configs; a bare `configs` is undefined here
        Config.INPUT_DB_DATABASE if self.__configs['dbname'] is None else self.__configs['dbname'],
        Config.INPUT_DB_CHARSET)
    self.__logger = Logger.Logger(__name__)
def __init__(self, configs):
    super(CalcTeacherMetric, self).__init__()
    self.__db = DbUtil.DbUtil(
        configs['dbhost'],
        Config.INPUT_DB_USERNAME,
        Config.INPUT_DB_PASSWORD,
        Config.INPUT_DB_DATABASE if configs['dbname'] is None else configs['dbname'],
        Config.INPUT_DB_CHARSET)
    self.__logger = Logger.Logger(__name__)
    self.__delimiter = '@'
def signtowcg():
    # read the values from the input fields
    nun = new_username.get().strip()
    np = new_pwd.get().strip()
    npf = new_pwd_confirm.get().strip()
    nn = new_name.get().strip()
    ng = new_gender.get().strip()
    ne = new_email.get().strip()
    nt = new_telephone.get().strip()
    nm = new_manager.get().strip()
    if np == '' or nun == '' or npf == '' or nn == '' or ng == '' or ne == '' or nt == '' or nm == '':
        tk.messagebox.showerror('错误', '输入框不能为空!')
    elif np != npf:
        tk.messagebox.showerror('错误', '密码前后不一致')
    elif re.match(r'^[a-zA-Z0-9_.-]+@[a-zA-Z0-9-]+(\.[a-zA-Z0-9-]+)*\.[a-zA-Z0-9]{2,6}$', ne) is None:
        tk.messagebox.showerror('错误', '邮箱格式不正确')
    elif re.match(r'^1([38][0-9]|4[579]|5[0-3,5-9]|6[6]|7[0135678]|9[89])\d{8}$', nt) is None:
        tk.messagebox.showerror('错误', '手机号格式不正确')
    else:
        query = ("SELECT COUNT(*) FROM `user` WHERE username = '%s'" % (nun))
        cnx = DbUtil.open_db()
        cursor = cnx.cursor()
        cursor.execute(query)
        if cursor.fetchone()[0] != 0:
            tk.messagebox.showerror('错误', '用户名已存在')
        else:
            query = ("INSERT INTO `user`(username, password, is_manager) VALUES ('%s', '%s', '%s')" % (nun, np, nm))
            query1 = ("INSERT INTO `userinfo`(username, password, `name`, gender, email, telephone) "
                      "VALUES ('%s', '%s', '%s', '%s', '%s', '%s')" % (nun, np, nn, ng, ne, nt))
            try:
                cursor.execute(query)
                cursor.execute(query1)
                cnx.commit()
                DbUtil.close_db(cursor, cnx)
                tk.messagebox.showinfo('欢迎', '注册成功')
                # close the sign-up window after a successful registration
                window_sign_up.destroy()
            except:
                tk.messagebox.showerror('错误', '注册失败')
                cnx.rollback()
def inserttodb():
    # read the values from the input fields
    tid = t_id.get().strip()
    tname = t_name.get().strip()
    if tid == '' or tname == '':
        tk.messagebox.showerror(message='类别编号或名称为空')
    else:
        query = ("INSERT INTO `booktype`(tid, tname) VALUES ('%s', '%s')" % (tid, tname))
        try:
            cnx = DbUtil.open_db()
            cursor = cnx.cursor()
            cursor.execute(query)
            cnx.commit()
            DbUtil.close_db(cursor, cnx)
            tk.messagebox.showinfo('成功', '新增类别成功')
            type_c.destroy()
            build_type(frame_type)
        except:
            tk.messagebox.showerror('错误', '新增类别失败')
def db_lookup(self, dbname, default=None):
    query = """
        SELECT db_id FROM cfg_database
        WHERE db_code = '%s' or short_connection_string = '%s'
        """
    self.aw_cursor.execute(query % (dbname, dbname))
    rows = DbUtil.dict_cursor(self.aw_cursor)
    for row in rows:
        return row['db_id']
    return 0
def collect_flow_schedules(self, schedule_file): # load flow scheduling info from table triggers self.logger.info("collect flow schedule") schedule_writer = FileWriter(schedule_file) query = "select * from triggers" self.az_cursor.execute(query) rows = DbUtil.dict_cursor(self.az_cursor) for row in rows: json_column = 'data' if row[json_column] != None: unzipped_content = gzip.GzipFile( mode='r', fileobj=StringIO.StringIO( row[json_column].tostring())).read() try: row[json_column] = json.loads(unzipped_content) except Exception as e: self.logger.error(e) pass if not "projectId" in row[json_column]["actions"][0][ "actionJson"]: continue # print json.dumps(row[json_column], indent=4) if row[json_column]["triggerCondition"]["checkers"][0][ "checkerJson"]["isRecurring"] == 'true': unit, frequency, cron_expr = None, None, None period = row[json_column]["triggerCondition"]["checkers"][ 0]["checkerJson"]["period"] if period is not None and period != "null" and period[ -1:] in self._period_unit_table: unit = self._period_unit_table[period[-1:]] frequency = int( row[json_column]["triggerCondition"]["checkers"][0] ["checkerJson"]["period"][:-1]) if "cronExpression" in row[json_column][ "triggerCondition"]["checkers"][0]["checkerJson"]: cron_expr = row[json_column]["triggerCondition"][ "checkers"][0]["checkerJson"]["cronExpression"] schedule_record = AzkabanFlowScheduleRecord( self.app_id, row[json_column]["actions"][0] ["actionJson"]["projectName"] + ':' + row[json_column] ["actions"][0]["actionJson"]["flowName"], unit, frequency, cron_expr, long(row[json_column]["triggerCondition"]["checkers"] [0]["checkerJson"]["firstCheckTime"]) / 1000, int( time.mktime( datetime.date(2099, 12, 31).timetuple())), '0', self.wh_exec_id) schedule_writer.append(schedule_record) schedule_writer.close()
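# self._period_unit_table is used above to decode Azkaban trigger periods of the form
# "<number><unit letter>" (e.g. "1d", "12h"). The table itself is not in these snippets;
# a plausible mapping, stated as an assumption rather than the verified original:
_period_unit_table = {
    'd': 'DAY',
    'h': 'HOUR',
    'm': 'MINUTE',
    's': 'SECOND',
    'w': 'WEEK',
    'M': 'MONTH',
}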
def getItemInfoInDB(self):
    sql = """SELECT tao_token as taoToken,
                    click_url as clickUrl,
                    coupon_link_tao_token as couponLinkTaoToken,
                    coupon_link as couponLink,
                    short_link_url as shortLinkUrl,
                    coupon_short_link_url as couponShortLinkUrl
             FROM item_coupon
             where item_id = ${itemid}"""
    cur = DbUtil.getCursor(self.db)
    cur.execute(sql)
    return cur.fetchone()
def __init__(self, configs):
    super(Preprocessor, self).__init__()
    self.__db = DbUtil.DbUtil(
        configs['dbhost'],
        Config.INPUT_DB_USERNAME,
        Config.INPUT_DB_PASSWORD,
        Config.INPUT_DB_DATABASE if configs['dbname'] is None else configs['dbname'],
        Config.INPUT_DB_CHARSET)
    self.__logger = Logger.Logger(__name__)
    self.__teaching = True if configs['teaching'] == 1 else False  # teaching=1: evaluate the classroom's teaching
    self.__teacher = True if configs['teaching'] == 2 else False   # teaching=2: evaluate the teacher via S-T analysis
def collect_flow_owners(self, owner_file):
    self.logger.info("collect owners")
    owner_writer = FileWriter(owner_file)
    query = "SELECT DISTINCT app_name, app_path, user_name from WF_JOBS"
    self.oz_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.oz_cursor)
    for row in rows:
        owner_record = OozieFlowOwnerRecord(self.app_id,
                                            row['app_path'],
                                            row['user_name'],
                                            self.wh_exec_id)
        owner_writer.append(owner_record)
    owner_writer.close()
def update_metric(self): sql = """ SELECT * FROM dict_business_metric """ url = self.elasticsearch_index_url + ':' + str(self.elasticsearch_port) + '/wherehows/metric/_bulk' params = [] self.wh_cursor.execute(sql) rows = DbUtil.copy_dict_cursor(self.wh_cursor) row_count = 1 for row in rows: params.append('{ "index": { "_id": ' + str(row['metric_id']) + ' }}') params.append( """{"metric_id": %d, "metric_name": %s, "metric_description": %s, "dashboard_name": %s, "metric_group": %s, "metric_category": %s, "metric_sub_category": %s, "metric_level": %s, "metric_source_type": %s, "metric_source": %s, "metric_source_dataset_id": %d, "metric_ref_id_type": %s, "metric_ref_id": %s, "metric_type": %s, "metric_additive_type": %s, "metric_grain": %s, "metric_display_factor": %f, "metric_display_factor_sym": %s, "metric_good_direction": %s, "metric_formula": %s, "dimensions": %s, "owners": %s, "tags": %s, "urn": %s, "metric_url": %s, "wiki_url": %s, "scm_url": %s}""" % (row['metric_id'], json.dumps(row['metric_name']) if row['metric_name'] else json.dumps(''), json.dumps(row['metric_description']) if row['metric_description'] else json.dumps(''), json.dumps(row['dashboard_name']) if row['dashboard_name'] else json.dumps(''), json.dumps(row['metric_group']) if row['metric_group'] else json.dumps(''), json.dumps(row['metric_category']) if row['metric_category'] else json.dumps(''), json.dumps(row['metric_sub_category']) if row['metric_sub_category'] else json.dumps(''), json.dumps(row['metric_level']) if row['metric_level'] else json.dumps(''), json.dumps(row['metric_source_type']) if row['metric_source_type'] else json.dumps(''), json.dumps(row['metric_source']) if row['metric_source'] else json.dumps(''), row['metric_source_dataset_id'] if row['metric_source_dataset_id'] else 0, json.dumps(row['metric_ref_id_type']) if row['metric_ref_id_type'] else json.dumps(''), json.dumps(row['metric_ref_id']) if row['metric_ref_id'] else json.dumps(''), json.dumps(row['metric_type']) if row['metric_type'] else json.dumps(''), json.dumps(row['metric_additive_type']) if row['metric_additive_type'] else json.dumps(''), json.dumps(row['metric_grain']) if row['metric_grain'] else json.dumps(''), row['metric_display_factor'] if row['metric_display_factor'] else 0.0, json.dumps(row['metric_display_factor_sym']) if row['metric_display_factor_sym'] else json.dumps(''), json.dumps(row['metric_good_direction']) if row['metric_good_direction'] else json.dumps(''), json.dumps(row['metric_formula']) if row['metric_formula'] else json.dumps(''), json.dumps(row['dimensions']) if row['dimensions'] else json.dumps(''), json.dumps(row['owners']) if row['owners'] else json.dumps(''), json.dumps(row['tags']) if row['tags'] else json.dumps(''), json.dumps(row['urn']) if row['urn'] else json.dumps(''), json.dumps(row['metric_url']) if row['metric_url'] else json.dumps(''), json.dumps(row['wiki_url']) if row['wiki_url'] else json.dumps(''), json.dumps(row['scm_url']) if row['scm_url'] else json.dumps(''))) if row_count % 1000 == 0: self.bulk_insert(params, url) params = [] row_count += 1 if len(params) > 0: self.bulk_insert(params, url)
def delete_type_by_id():
    # read the value from the input field
    tid = t_id.get().strip()
    if tid == '':
        tk.messagebox.showerror(message='类别编号为空')
    else:
        query = ("SELECT COUNT(*) FROM `booktype` WHERE tid='%s'" % (tid))
        cnx = DbUtil.open_db()
        cursor = cnx.cursor()
        cursor.execute(query)
        if cursor.fetchone()[0] == 1:
            query = ("DELETE FROM `booktype` WHERE tid='%s'" % (tid))
            try:
                cursor.execute(query)
                cnx.commit()
                DbUtil.close_db(cursor, cnx)
                tk.messagebox.showinfo('成功', '删除类别成功')
                type_d.destroy()
                build_type(frame_type)
            except:
                tk.messagebox.showerror('错误', '删除类别失败')
        else:
            tk.messagebox.showerror('错误', '删除类别失败, 该类别编号不存在')
def saveCategoryToDb(categoryList=None):
    print '--- saveCategoryToDb start ---'
    if categoryList is None or len(categoryList) == 0:
        print "categoryList is None could not save to db."
        return
    else:
        insert = 'insert into sh_category (cate_id, cate_name, parent_id, parent_cate_name, sort_num, cate_item_num,' \
                 ' description, listUrl, shopName, shopLogo, update_time) '
        sql_select_str = ''
        currentTime = int(time.time())
        for category in categoryList:
            sql_select_str += "SELECT '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s' union " \
                              % (category.cate_id, category.cate_name, category.parent_id,
                                 category.parent_cate_name, category.sort_num, category.cate_item_num,
                                 category.description, category.listUrl, category.shopName,
                                 category.shopLogo, currentTime)
        # assemble the full INSERT ... SELECT statement
        insert = insert + sql_select_str
        # strip the trailing 'union '
        insert = insert[:-6]
        # categories are replaced wholesale: clear the table before saving
        delete = 'delete from sh_category;'
        DbUtil.delete(delete)
        DbUtil.insert(insert)
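# The batch insert above is assembled by concatenating one "SELECT ... union" fragment
# per category. A hedged alternative sketch using executemany with bound parameters;
# save_category_bound() and its cursor argument are illustrative, since DbUtil's
# cursor-level API is not shown in these snippets:
import time

def save_category_bound(cursor, categoryList):
    sql = ("insert into sh_category (cate_id, cate_name, parent_id, parent_cate_name, "
           "sort_num, cate_item_num, description, listUrl, shopName, shopLogo, update_time) "
           "values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
    now = int(time.time())
    rows = [(c.cate_id, c.cate_name, c.parent_id, c.parent_cate_name, c.sort_num,
             c.cate_item_num, c.description, c.listUrl, c.shopName, c.shopLogo, now)
            for c in categoryList]
    cursor.executemany(sql, rows)  # one round trip, driver-escaped values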
def inserttodb():
    # read the values from the input fields
    bid = b_id.get().strip()
    bname = b_name.get().strip()
    bprice = b_price.get().strip()
    tid = t_id.get().strip()
    bcount = b_count.get().strip()
    if bid == '' or bname == '' or bprice == '' or tid == '' or bcount == '':
        tk.messagebox.showerror(message='输入框不能为空')
    else:
        query = ("INSERT INTO `bookinfo`(bid, bname, bprice, tid, bcount) VALUES ('%s', '%s', '%s', '%s', '%s')" % (
            bid, bname, bprice, tid, bcount))
        try:
            cnx = DbUtil.open_db()
            cursor = cnx.cursor()
            cursor.execute(query)
            cnx.commit()
            DbUtil.close_db(cursor, cnx)
            tk.messagebox.showinfo('成功', '新增图书信息成功')
            book_c.destroy()
            build_book(frame_book)
        except:
            tk.messagebox.showerror('错误', '新增图书信息失败')
def delete_book_by_id():
    # read the value from the input field
    bid = b_id.get().strip()
    if bid == '':
        tk.messagebox.showerror(message='图书编号为空')
    else:
        query = ("SELECT COUNT(*) FROM `bookinfo` WHERE bid='%s'" % (bid))
        cnx = DbUtil.open_db()
        cursor = cnx.cursor()
        cursor.execute(query)
        if cursor.fetchone()[0] == 1:
            query = ("DELETE FROM `bookinfo` WHERE bid='%s'" % (bid))
            try:
                cursor.execute(query)
                cnx.commit()
                DbUtil.close_db(cursor, cnx)
                tk.messagebox.showinfo('成功', '删除图书成功')
                book_d.destroy()
                build_book(frame_book)
            except:
                tk.messagebox.showerror('错误', '删除图书失败')
        else:
            tk.messagebox.showerror('错误', '删除图书失败, 该图书编号不存在')
def get_last_execution_unix_time(self): if self.last_execution_unix_time is None: try: query = """ SELECT MAX(end_time) as last_time FROM job_execution where app_id = %d """ self.wh_cursor.execute(query % self.app_id) rows = DbUtil.dict_cursor(self.wh_cursor) if rows: for row in rows: self.last_execution_unix_time = long(row['last_time']) break except: self.logger.error("Get the last execution time from job_execution failed") self.last_execution_unix_time = None return self.last_execution_unix_time
def get_last_execution_unix_time(self): if self.last_execution_unix_time is None: try: query = """ SELECT MAX(job_finished_unixtime) as last_time FROM job_execution_data_lineage """ self.wh_cursor.execute(query) rows = DbUtil.dict_cursor(self.wh_cursor) if rows: for row in rows: self.last_execution_unix_time = row['last_time'] break except: self.logger.error("Get the last execution time from job_execution_data_lineage failed") self.last_execution_unix_time = None return self.last_execution_unix_time
def collect_flow_owners(self, owner_file): # load user info from table project_permissions self.logger.info("collect owner&permissions") user_writer = FileWriter(owner_file) query = "select f.flow_id, p.name as project_name, p.version as project_verison, pp.name as owner, pp.permissions, pp.isGroup " \ "from project_flows f join project_permissions pp on f.project_id = pp.project_id join projects p on f.project_id = p.id where p.active = 1" self.az_cursor.execute(query) rows = DbUtil.dict_cursor(self.az_cursor) for row in rows: record = AzkabanFlowOwnerRecord(self.app_id, row['project_name'] + ':' + row["flow_id"], row["owner"], AzkabanPermission(row["permissions"]).toFlatString(), 'GROUP' if row['isGroup'] == 1 else 'LDAP', self.wh_exec_id) user_writer.append(record) user_writer.close()
def get_last_execution_unix_time(self): if self.last_execution_unix_time is None: try: query = """ SELECT MAX(job_finished_unixtime) as last_time FROM job_execution_data_lineage where app_id = %d """ self.aw_cursor.execute(query % self.app_id) rows = DbUtil.dict_cursor(self.aw_cursor) if rows: for row in rows: self.last_execution_unix_time = row['last_time'] break except: self.logger.error("Get the last execution time from job_execution_data_lineage failed") self.last_execution_unix_time = None ts = int(time.time()) if self.last_execution_unix_time is not None and (ts - self.last_execution_unix_time) > 5*60*60: self.logger.info('last execution unix time is:' + str(self.last_execution_unix_time)) self.last_execution_unix_time = None return self.last_execution_unix_time
def get_log_file_name(self, module_name, days_offset=1): if self.last_execution_unix_time: query = \ """select je.*, fj.job_type, fl.flow_path, CONCAT('o', je.job_exec_id, '.', LPAD(je.attempt_id, 2, 0)) as log_file_name, CONCAT('%s', DATE_FORMAT(FROM_UNIXTIME(je.end_time), '%%Y%%m%%d'), '/', CONCAT('o', je.job_exec_id, '.', LPAD(je.attempt_id, 2, 0)), '.gz') as gzipped_file_name from job_execution je JOIN flow_job fj on je.app_id = fj.app_id and je.flow_id = fj.flow_id and fj.is_current = 'Y' and je.job_id = fj.job_id JOIN flow fl on fj.app_id = fl.app_id and fj.flow_id = fl.flow_id WHERE je.app_id = %d and je.end_time >= %d - 3660 and fj.job_type = '%s' ORDER BY je.flow_exec_id DESC, je.job_exec_id """ self.logger.info(query % (self.aw_archive_dir, self.app_id, long(self.last_execution_unix_time), module_name)) self.aw_cursor.execute(query % (self.aw_archive_dir, self.app_id, long(self.last_execution_unix_time), module_name)) else: query = \ """select je.*, fj.job_type, fl.flow_path, CONCAT('o', je.job_exec_id, '.', LPAD(je.attempt_id, 2, 0)) as log_file_name, CONCAT('%s', DATE_FORMAT(FROM_UNIXTIME(je.end_time), '%%Y%%m%%d'), '/', CONCAT('o', je.job_exec_id, '.', LPAD(je.attempt_id, 2, 0)), '.gz') as gzipped_file_name from job_execution je JOIN flow_job fj on je.app_id = fj.app_id and je.flow_id = fj.flow_id and fj.is_current = 'Y' and je.job_id = fj.job_id JOIN flow fl on fj.app_id = fl.app_id and fj.flow_id = fl.flow_id WHERE je.app_id = %d and je.job_exec_status in ('FINISHED', 'SUCCEEDED', 'OK') and je.end_time >= unix_timestamp(CURRENT_DATE - INTERVAL %d DAY) and fj.job_type = '%s' ORDER BY je.flow_exec_id DESC, je.job_exec_id """ self.logger.info(query % (self.aw_archive_dir, self.app_id, int(self.look_back_days), module_name)) self.aw_cursor.execute(query % (self.aw_archive_dir, self.app_id, int(self.look_back_days), module_name)) job_rows = DbUtil.copy_dict_cursor(self.aw_cursor) self.logger.info("%d job executions will be scanned for lineage" % len(job_rows)) return job_rows
def collect_flow_execs(self, flow_exec_file, lookback_period): self.logger.info("collect flow execs") flow_exec_writer = FileWriter(flow_exec_file) query = "select id, app_name, app_path, unix_timestamp(start_time) as start_time, unix_timestamp(end_time) as end_time, run, status, user_name from WF_JOBS where end_time > now() - INTERVAL %d MINUTE" % (int(lookback_period)) self.oz_cursor.execute(query) rows = DbUtil.dict_cursor(self.oz_cursor) for row in rows: flow_exec_record = OozieFlowExecRecord(self.app_id, row['app_name'], row['app_path'], row['id'], row['id'], row['status'], row['run'], row['user_name'], row['start_time'], row['end_time'], self.wh_exec_id) flow_exec_writer.append(flow_exec_record) flow_exec_writer.close()
def collect_flow_owners(self, owner_file): self.logger.info("collect owner&permissions") timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'" self.aw_cursor.execute(timezone) schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX" self.aw_cursor.execute(schema) user_writer = FileWriter(owner_file) query = \ """SELECT DISTINCT J.SO_JOB_SEQ, J.SO_MODULE, J.SO_APPLICATION, U.SO_USER_NAME FROM SO_JOB_TABLE J JOIN SO_JOB_HISTORY H ON J.SO_JOB_SEQ = H.SO_JOB_SEQ JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ WHERE J.SO_COMMAND_TYPE = 'CHAIN' """ self.aw_cursor.execute(query) rows = DbUtil.dict_cursor(self.aw_cursor) for row in rows: record = AppworxFlowOwnerRecord(self.app_id, row['SO_APPLICATION'] + ':' + row["SO_MODULE"], row["SO_USER_NAME"], 'EXECUTE', 'GROUP', self.wh_exec_id) user_writer.append(record) user_writer.close()
def process_li_getreplacemergegeneral(self, module_name): self.logger.info("process %s" % module_name) if module_name not in ['LI_GETREPLACEMERGEGENERAL', 'LINKEDIN_SHELL','LI_WEBHDFS_GET']: return parameter_query = \ """ SELECT GROUP_CONCAT(param_value,'\x01') args FROM job_parameter WHERE app_id = %d AND job_exec_id = %d AND attempt_number = %d ORDER BY param_no """ rows = self.get_log_file_name(module_name=module_name) for row in rows: try: self.logger.info('Parsing log file: %s' % row['gzipped_file_name']) if module_name == 'LI_GETREPLACEMERGEGENERAL': self.aw_cursor.execute(parameter_query % (int(row['app_id']), int(row['job_exec_id']), int(row['attempt_id']))) arg_values = DbUtil.copy_dict_cursor(self.aw_cursor) if arg_values and len(arg_values) > 0: args = arg_values[0]['args'] results = AppworxLogParser(log_file_name = row['gzipped_file_name']).parse_log({}, command_type=module_name) if any(results) == 0 or not 'cluster' in results: self.logger.info('Skipped parsing %s' % row['gzipped_file_name']) continue self.logger.info(str(results)) matched_cluster = re.match(r'(.*)_.*', results['cluster']) if matched_cluster is not None: results['cluster'] = matched_cluster.group(1) db_id = int(self.db_lookup(results['cluster'])) self.logger.info(str(db_id)) self.clean_up_staging_lineage_dbs(row['app_id'], row['job_id'], row['job_exec_id'], row['attempt_id']) update_source_code_query = \ """ INSERT INTO job_attempt_source_code( application_id, job_id, attempt_number, script_name, script_path, script_type, created_date) VALUES( %d, %d, %d, '%s', '%s', '%s', now()) """ % (int(row['app_id']), int(row['job_id']), int(row['attempt_id']), results['script_name'], results['script_path'], results['script_type'] if module_name == 'LI_WEBHDFS_GET' else 'Shell') update_staging_lineage_query = \ """ INSERT IGNORE INTO stg_job_execution_data_lineage ( app_id, flow_exec_id, job_exec_id, flow_path, job_name, job_start_unixtime, job_finished_unixtime, db_id, abstracted_object_name, full_object_name, partition_start, partition_end, partition_type, storage_type, source_target_type, srl_no, source_srl_no, operation, record_count, insert_count, created_date, wh_etl_exec_id) SELECT %d, %d, %d, '%s', '%s', %d, %d, %d, '%s', '%s', CASE WHEN '%s' = 'None' THEN NULL ELSE '%s' END, CASE WHEN '%s' = 'None' THEN NULL ELSE '%s' END, CASE WHEN '%s' = 'None' THEN NULL ELSE '%s' END, '%s', '%s', %d, CASE WHEN %d = 0 THEN NULL ELSE %d END, '%s', %d, %d, UNIX_TIMESTAMP(now()), %d FROM dual """ self.aw_cursor.execute(update_source_code_query ) self.aw_con.commit() for k, tab in enumerate(results['table']): self.aw_cursor.execute(update_staging_lineage_query % (int(row['app_id']), int(row['flow_exec_id']), int(row['job_exec_id']), row['flow_path'], row['job_name'], int(row['start_time']), int(row['end_time']), db_id if tab['table_type'] == 'source' else 0, tab['abstracted_path'], tab['full_path'], tab['start_partition'], tab['start_partition'], tab['end_partition'] if tab['frequency'] != 'snapshot' else None, tab['end_partition'] if tab['frequency'] != 'snapshot' else None, tab['frequency'], tab['frequency'], tab['storage_type'], tab['table_type'], k + 1, 0, 0, 'Hadoop Get' if tab['table_type'] == 'source' else 'Move', 0, 0, int(row['wh_etl_exec_id']) )) self.aw_con.commit() self.logger.debug('Completed processing metadata for log file: %s' % row['gzipped_file_name']) except: self.logger.error(str(sys.exc_info()[0]))
def process_li_shell_gw(self): self.logger.info("process li_shell_gw") parameter_query = \ """ SELECT param_value FROM job_parameter WHERE app_id = %d AND job_exec_id = %d AND attempt_number = %d ORDER BY param_no """ rows = self.get_log_file_name(module_name='LI_SHELL_GW') self.logger.info(str(len(rows))) for row in rows: self.aw_cursor.execute(parameter_query % (int(row['app_id']), int(row['job_exec_id']), int(row['attempt_id']))) arg_values = DbUtil.copy_dict_cursor(self.aw_cursor) if arg_values and len(arg_values) > 0 and 'param_value' in arg_values[0]: args = arg_values[0]['param_value'].replace('"','') m = re.match("(.*?/trim.sh)\s+\d+\s+\d+", args) if m is not None: shell_script_name = os.path.basename(m.group(1)) shell_script_path = os.path.dirname(m.group(1)) app_path = shell_script_path.replace(self.remote_hadoop_script_dir,'') local_shell_script_path = self.local_script_path + self.remote_script_path + app_path self.logger.info(local_shell_script_path + '/' + shell_script_name) shell_script_content = open(local_shell_script_path + '/' + shell_script_name).read() pig_script = re.search(r'pig\s+\-f\s+(.*?)\s+',shell_script_content) if pig_script is not None: pig_script_name = os.path.basename(pig_script.group(1)) pig_script_path = os.path.dirname(pig_script.group(1)) # Compare md5 string of the file with the one parsed last time md5_str = hashlib.md5() md5_str.update(shell_script_content) self.logger.info('Parsing log file: %s' % row['gzipped_file_name']) src_tgt_map = PigLogParser(log_file_name=row['gzipped_file_name']).simple_parser() if any(src_tgt_map) == 0: self.logger.warn('Pig log parser fails to retrieve relevant information from file: %s. Most probably Pig script did not complete successfully' % row['gzipped_file_name']) continue self.logger.info(str(src_tgt_map)) db_id = self.db_lookup(src_tgt_map['cluster']) self.clean_up_staging_lineage_dbs(row['app_id'], row['job_id'], row['job_exec_id'], row['attempt_id']) update_source_code_query = \ """ INSERT INTO job_attempt_source_code( application_id, job_id, attempt_number, script_name, script_path, script_type, created_date) VALUES( %d, %d, %d, '%s', '%s', '%s', now()) """ % (int(row['app_id']), int(row['job_id']), int(row['attempt_id']), pig_script_name, pig_script_path, 'Pig') self.aw_cursor.execute(update_source_code_query ) update_staging_lineage_query = \ """ INSERT IGNORE INTO stg_job_execution_data_lineage ( app_id, flow_exec_id, job_exec_id, flow_path, job_name, job_start_unixtime, job_finished_unixtime, db_id, abstracted_object_name, full_object_name, partition_start, partition_end, partition_type, storage_type, source_target_type, srl_no, source_srl_no, operation, record_count, insert_count, created_date, wh_etl_exec_id) SELECT %d, %d, %d, '%s', '%s', %d, %d, %d, '%s', '%s', CASE WHEN '%s' = 'None' THEN NULL ELSE '%s' END, CASE WHEN '%s' = 'None' THEN NULL ELSE '%s' END, CASE WHEN '%s' = 'None' THEN NULL ELSE '%s' END, '%s', '%s', %d, CASE WHEN %d = 0 THEN NULL ELSE %d END, '%s', %d, %d, UNIX_TIMESTAMP(now()), %d FROM dual """ srl = 0 detail_srl = 1 for k,v in src_tgt_map.items(): if k in ['cluster'] or k in ['hive-cluster']: continue srl = srl + 1 self.aw_cursor.execute(update_staging_lineage_query % (int(row['app_id']), int(row['flow_exec_id']), int(row['job_exec_id']), row['flow_path'], row['job_name'], int(row['start_time']), int(row['end_time']), db_id, src_tgt_map[k]['abstracted_hdfs_path'], None, src_tgt_map[k]['min_start_partition'], src_tgt_map[k]['min_start_partition'], src_tgt_map[k]['max_end_partition'] 
if src_tgt_map[k]['table_type'] != 'snapshot' else None, src_tgt_map[k]['max_end_partition'] if src_tgt_map[k]['table_type'] != 'snapshot' else None, src_tgt_map[k]['partition_type'], src_tgt_map[k]['partition_type'], 'HDFS', src_tgt_map[k]['table_type'], int(srl), 0, 0, 'Load' if src_tgt_map[k]['table_type'] == 'source' else 'Store', 0, 0, int(row['wh_etl_exec_id']) )) srl = srl + 1 self.aw_con.commit() self.logger.debug('Completed writing metadata for: %s' % row['gzipped_file_name']) else: self.logger.error("Fails to get Pig script file name used in GW shell script: %s" % m.group(1))
def run(self): sql = """ SELECT a.application_id, a.job_id, a.attempt_number, a.script_path, a.script_type, a.script_name, fl.flow_name as chain_name, fj.job_name, CASE WHEN e.dir_short_path IS NOT NULL THEN CASE WHEN LOCATE('/scripts',a.script_path) > 0 THEN CONCAT('https://gitli.corp.linkedin.com', CONCAT(CONCAT(REPLACE(e.dir_short_path,'/code_base_','/source/code_base_'), SUBSTR(a.script_path,LOCATE('/scripts',a.script_path)) ), CONCAT('/',a.script_name) )) WHEN LOCATE('/sql',a.script_path) > 0 THEN CONCAT('https://gitli.corp.linkedin.com', CONCAT(CONCAT(REPLACE(e.dir_short_path,'/code_base_','/source/code_base_'), SUBSTR(a.script_path,LOCATE('/sql',a.script_path))), CONCAT('/',a.script_name))) END ELSE a.script_name END script_url FROM job_attempt_source_code a JOIN flow fl ON fl.app_id = a.application_id JOIN flow_job fj ON a.application_id = fj.app_id and fl.flow_id = fj.flow_id and a.job_id = fj.job_id LEFT OUTER JOIN cfg_file_system_dir_map d ON (d.directory_path = CASE WHEN LOCATE('/scripts',a.script_path) > 0 THEN SUBSTR(a.script_path,1,LOCATE('/scripts',a.script_path) - 1) WHEN LOCATE('/sql',a.script_path) > 0 THEN SUBSTR(a.script_path,1,LOCATE('/sql',a.script_path) - 1) END) LEFT OUTER JOIN cfg_scm_repository_dir e ON (d.map_to_object_id = e.scm_repo_dir_id) """ git_committer = """ SELECT git_path, commit_time, committer_name, committer_email, ui.user_id FROM ( SELECT CONCAT(CONCAT(CONCAT(CONCAT(repository_base_name,'/'), CONCAT(module_name,'/source/')),file_path), file_name) as git_path, committer_name, commit_time, committer_email FROM source_code_repository_info ) a LEFT JOIN dir_external_user_info ui on a.committer_name = ui.full_name WHERE a.git_path = '%s' GROUP BY committer_name """ insert_sql = """ INSERT IGNORE INTO job_execution_script ( app_id, job_id, script_name, script_path, script_type, chain_name, job_name, committer_name, committer_email, committer_ldap, commit_time, script_url) SELECT %d, %d, '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s' FROM dual """ try: self.logger.info('script collect') if self.last_execution_unix_time: sql += ' WHERE a.created_date >= from_unixtime(%d) - INTERVAL 1 DAY '% long(self.last_execution_unix_time) else: sql += ' WHERE a.created_date >= CURRENT_DATE - INTERVAL %d DAY '% long(self.look_back_days) self.logger.info(sql) self.wh_cursor.execute(sql) rows = DbUtil.copy_dict_cursor(self.wh_cursor) for row in rows: git_url = row['script_url'].replace('https://', 'git://') self.wh_cursor.execute(git_committer % git_url) git_rows = DbUtil.copy_dict_cursor(self.wh_cursor) if git_rows and len(git_rows) > 0: for git_row in git_rows: self.wh_cursor.execute(insert_sql % (int(row['application_id']), int(row['job_id']), row['script_name'], row['script_path'], row['script_type'], row['chain_name'], row['job_name'], git_row['committer_name'], git_row['committer_email'], git_row['user_id'] if git_row['user_id'] else git_row['committer_name'], git_row['commit_time'], row['script_url'] )) else: self.logger.info("git rows size is 0") self.logger.info(row['script_name']) self.wh_cursor.execute(insert_sql % (int(row['application_id']), int(row['job_id']), row['script_name'], row['script_path'], row['script_type'], row['chain_name'], row['job_name'], "", "", "", "", row['script_url'] )) self.wh_con.commit() finally: self.wh_cursor.close() self.wh_con.close()
def collect_flow_jobs(self, flow_file, job_file, dag_file): self.logger.info("collect flow&jobs [last_execution_unix_time=%s lookback_period=%s]" % (self.last_execution_unix_time, self.lookback_period)) timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'" self.aw_cursor.execute(timezone) schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX" self.aw_cursor.execute(schema) if self.last_execution_unix_time: time_filter = "(DATE '1970-01-01' - INTERVAL '8' HOUR) + (%d - 3600) / 86400" % long(self.last_execution_unix_time) else: time_filter = "SYSDATE - %d" % int(self.lookback_period) flow_query = \ """SELECT J.SO_JOB_SEQ, J.SO_APPLICATION, J.SO_MODULE, R.LAST_CHAIN_ID FROM SO_JOB_TABLE J JOIN ( SELECT SO_JOB_SEQ, MAX(SO_CHAIN_ID) as LAST_CHAIN_ID FROM ( SELECT SO_JOB_SEQ, SO_CHAIN_ID FROM SO_JOB_HISTORY WHERE SO_JOB_FINISHED >= %s AND SO_CHILD_COUNT > 0 UNION ALL SELECT SO_JOB_SEQ, SO_CHAIN_ID FROM SO_JOB_QUEUE WHERE SO_STATUS_NAME IN ('INITIATED', 'RUNNING', 'FINISHED') AND SO_CHILD_COUNT > 0 ) GROUP BY SO_JOB_SEQ ) R ON J.SO_JOB_SEQ = R.SO_JOB_SEQ WHERE SO_COMMAND_TYPE = 'CHAIN' ORDER BY 2,3 """ % time_filter job_query = \ """SELECT d.SO_TASK_NAME, d.SO_CHAIN_ORDER, d.SO_PREDECESSORS as PREDECESSORS, d.SO_DET_SEQ as JOB_ID, t.* FROM SO_CHAIN_DETAIL d JOIN SO_JOB_TABLE t ON d.SO_JOB_SEQ = t.SO_JOB_SEQ WHERE d.SO_CHAIN_SEQ = %d ORDER BY d.SO_CHAIN_ORDER """ self.aw_cursor.execute(flow_query) rows = DbUtil.dict_cursor(self.aw_cursor) flow_writer = FileWriter(flow_file) job_writer = FileWriter(job_file) dag_writer = FileWriter(dag_file) row_count = 0 for row in rows: flow_path = row['SO_APPLICATION'] + ":" + row['SO_MODULE'] flow_record = AppworxFlowRecord(self.app_id, long(row['SO_JOB_SEQ']), row['SO_MODULE'], row['SO_APPLICATION'], flow_path, 0, 0, 0, 'Y', self.wh_exec_id) flow_writer.append(flow_record) new_appworx_cursor = self.aw_con.cursor() new_appworx_cursor.execute(job_query % row['SO_JOB_SEQ']) job_rows = DbUtil.dict_cursor(new_appworx_cursor) for job in job_rows: job_record = AppworxJobRecord(self.app_id, long(row['SO_JOB_SEQ']), flow_path, 0, long(job['JOB_ID']), job['SO_TASK_NAME'], flow_path + '/' + job['SO_TASK_NAME'], job['SO_MODULE'], 'Y', self.wh_exec_id) command_type = job['SO_COMMAND_TYPE'] if command_type and command_type == 'CHAIN': job_record.setRefFlowPath(job['SO_APPLICATION'] + ":" + job['SO_MODULE']) job_record.setJobType('CHAIN') job_writer.append(job_record) predecessors_str = job['PREDECESSORS'] if predecessors_str: predecessors = re.findall(r"\&\/(.+?)\s\=\sS", predecessors_str) if predecessors: for predecessor in predecessors: dag_edge = AppworxFlowDagRecord(self.app_id, long(row['SO_JOB_SEQ']), flow_path, 0, flow_path + '/' + predecessor, flow_path + '/' + job['SO_TASK_NAME'], self.wh_exec_id) dag_writer.append(dag_edge) row_count += 1 if row_count % 1000 == 0: flow_writer.flush() job_writer.flush() dag_writer.flush() flow_writer.close() job_writer.close() dag_writer.close()
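# A small, self-contained check of the predecessor pattern used in collect_flow_jobs
# above. The sample condition string is fabricated; the regex captures every task name
# that appears as "&/<task> = S..." (for example "= SUCCESS"), and each captured name
# becomes the source of a dag edge pointing at the current task.
import re

sample_predecessors = '&/LOAD_DIM_ACCOUNT = SUCCESS AND &/LOAD_DIM_USER = SUCCESS'
print(re.findall(r"\&\/(.+?)\s\=\sS", sample_predecessors))
# -> ['LOAD_DIM_ACCOUNT', 'LOAD_DIM_USER']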
def index(): return render_template("index.html", title_list=DbUtil.load_blog_titles())
def collect_flow_execs(self, flow_exec_file, job_exec_file, look_back_period): self.logger.info("collect flow&job executions [last_execution_unix_time=%s lookback_period=%s]" % (self.last_execution_unix_time, self.lookback_period)) flow_exec_writer = FileWriter(flow_exec_file) job_exec_writer = FileWriter(job_exec_file) timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'" self.aw_cursor.execute(timezone) schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX" self.aw_cursor.execute(schema) flow_id_list = [] if self.last_execution_unix_time: flow_cmd = \ """SELECT J.SO_JOB_SEQ, J.SO_MODULE, J.SO_APPLICATION, H.SO_STATUS_NAME, H.SO_JOBID, H.SO_CHAIN_ID, ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) - to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_STARTED, ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) - to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_FINISHED, U.SO_USER_NAME FROM SO_JOB_TABLE J JOIN ( SELECT * FROM SO_JOB_HISTORY WHERE SO_JOB_FINISHED >= DATE '1970-01-01' - interval '8' hour + (%d - 3600) / 86400 AND SO_CHILD_COUNT > 0 UNION ALL SELECT * FROM SO_JOB_QUEUE WHERE SO_STATUS_NAME IN ('INITIATED', 'RUNNING', 'FINISHED') AND SO_CHILD_COUNT > 0 ) H ON J.SO_JOB_SEQ = H.SO_JOB_SEQ LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ WHERE J.SO_COMMAND_TYPE = 'CHAIN' """ % long(self.last_execution_unix_time) else: flow_cmd = \ """SELECT J.SO_JOB_SEQ, J.SO_MODULE, J.SO_APPLICATION, H.SO_STATUS_NAME, H.SO_JOBID, H.SO_CHAIN_ID, ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) - to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_STARTED, ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) - to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_FINISHED, U.SO_USER_NAME FROM SO_JOB_TABLE J JOIN ( SELECT * FROM SO_JOB_HISTORY WHERE SO_JOB_FINISHED >= SYSDATE - %d AND SO_CHILD_COUNT > 0 UNION ALL SELECT * FROM SO_JOB_QUEUE WHERE SO_STATUS_NAME IN ('INITIATED', 'RUNNING', 'FINISHED') AND SO_CHILD_COUNT > 0 ) H ON J.SO_JOB_SEQ = H.SO_JOB_SEQ LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ WHERE J.SO_COMMAND_TYPE = 'CHAIN' """ % int(self.lookback_period) ''' SO_CHAIN_ID = :flow_exec_id will find all job executions under the top level flow select SO_EXECUTE_ORDER, SO_JOBID, SO_PARENTS_JOBID, SO_DIRECT_PARENT_JOBID, SO_CHAIN_ID from so_job_history where SO_JOBID = SO_CHAIN_ID or SO_PARENTS_JOBID <> SO_CHAIN_ID ''' if self.last_execution_unix_time: job_cmd = \ """SELECT D.SO_TASK_NAME, U.SO_USER_NAME, H.SO_STATUS_NAME, H.SO_JOBID, D.SO_DET_SEQ as JOB_ID, ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) - to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_STARTED, ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) - to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_FINISHED FROM SO_JOB_HISTORY H JOIN SO_CHAIN_DETAIL D ON D.SO_CHAIN_SEQ = H.SO_CHAIN_SEQ AND D.SO_DET_SEQ = H.SO_DET_SEQ LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ WHERE --H.SO_JOB_FINISHED >= DATE '1970-01-01' - interval '8' hour + (%d - 3600) / 86400) and H.SO_CHAIN_ID = %d""" else: job_cmd = \ """SELECT D.SO_TASK_NAME, U.SO_USER_NAME, H.SO_STATUS_NAME, H.SO_JOBID, D.SO_DET_SEQ as JOB_ID, ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time 
zone 'GMT') as date) - to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_STARTED, ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) - to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_FINISHED FROM SO_JOB_HISTORY H JOIN SO_CHAIN_DETAIL D ON D.SO_CHAIN_SEQ = H.SO_CHAIN_SEQ AND D.SO_DET_SEQ = H.SO_DET_SEQ LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ WHERE H.SO_JOB_FINISHED >= SYSDATE - %d and H.SO_CHAIN_ID = %d""" try: self.aw_cursor.execute(flow_cmd) except Exception as e: self.logger.error(str(e) + "\n" + flow_cmd) rows = DbUtil.dict_cursor(self.aw_cursor) row_count = 0 for row in rows: flow_path = row['SO_APPLICATION'] + ":" + row['SO_MODULE'] so_flow_id = row['SO_JOBID'] flow_attempt = 0 flow_exec_id = 0 try: flow_attempt = int(float(str(so_flow_id - int(so_flow_id))[1:])*100) flow_exec_id = int(so_flow_id) except Exception as e: self.logger.error(e) self.logger.debug("processing flow_exec_id: %8d" % flow_exec_id) flow_exec_record = AppworxFlowExecRecord(self.app_id, long(row['SO_JOB_SEQ']), row['SO_MODULE'], flow_path, 0, flow_exec_id, row['SO_STATUS_NAME'], flow_attempt, row['SO_USER_NAME'] if row['SO_USER_NAME'] else '', long(row['JOB_STARTED']), long(row['JOB_FINISHED'] if row['JOB_FINISHED'] else 0), self.wh_exec_id) flow_exec_writer.append(flow_exec_record) new_appworx_cursor = self.aw_con.cursor() if self.last_execution_unix_time: new_appworx_cursor.execute(job_cmd % (long(self.last_execution_unix_time), flow_exec_id)) else: new_appworx_cursor.execute(job_cmd % (int(self.lookback_period), flow_exec_id)) job_rows = DbUtil.dict_cursor(new_appworx_cursor) for job in job_rows: so_job_id = job['SO_JOBID'] job_attempt = 0 job_exec_id = 0 try: job_attempt = int(float(str(so_job_id - int(so_job_id))[1:])*100) job_exec_id = int(so_job_id) except Exception as e: self.logger.error(e) job_exec_record = AppworxJobExecRecord(self.app_id, long(row['SO_JOB_SEQ']), flow_path, 0, flow_exec_id, long(job['JOB_ID']), job['SO_TASK_NAME'], flow_path + "/" + job['SO_TASK_NAME'], job_exec_id, job['SO_STATUS_NAME'], job_attempt, long(job['JOB_STARTED']), long(job['JOB_FINISHED']), self.wh_exec_id) job_exec_writer.append(job_exec_record) row_count += 1 if row_count % 10000 == 0: flow_exec_writer.flush() job_exec_writer.flush() flow_exec_writer.close() job_exec_writer.close()
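# Hedged sketch of the SO_JOBID decoding done in collect_flow_execs above: the integer
# part of SO_JOBID is used as the execution id and the two fractional digits as the
# attempt number. The original code recovers the attempt via str()/float() slicing,
# relying on Python 2 str() rounding; the rounding-based variant below is an
# equivalent, more direct reading of the same encoding, shown with a made-up SO_JOBID.
so_flow_id = 1234567.02
flow_exec_id = int(so_flow_id)                                 # 1234567
flow_attempt = int(round((so_flow_id - flow_exec_id) * 100))   # 2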
def process_li_bteq(self): self.logger.info("process li bteq") bteq_rows = self.get_log_file_name(module_name='LI_BTEQ') kfk_rows = self.get_log_file_name(module_name='KFK_SONORA_STG_TO_FACT') rows = bteq_rows + kfk_rows parameter_query = \ """ SELECT param_value FROM job_parameter WHERE app_id = %d AND job_exec_id = %d AND attempt_number = %d ORDER BY param_no """ check_parsed_source_code_query = \ """ SELECT application_id, job_id, attempt_number FROM job_attempt_source_code WHERE script_name = '%s' AND script_path = '%s' AND script_md5_sum = CASE WHEN '%s' = 'None' THEN NULL ELSE UNHEX('%s') END AND application_id = %d ORDER BY job_id DESC LIMIT 1 """ check_parsed_lineage_query = \ """ SELECT * FROM stg_job_execution_data_lineage WHERE app_id = %d AND job_exec_id = %d """ update_staging_lineage_query = \ """ INSERT IGNORE INTO stg_job_execution_data_lineage ( app_id, flow_exec_id, job_exec_id, flow_path, job_name, job_start_unixtime, job_finished_unixtime, db_id, abstracted_object_name, full_object_name, partition_start, partition_end, partition_type, storage_type, source_target_type, srl_no, source_srl_no, operation, record_count, insert_count, created_date, wh_etl_exec_id) SELECT %d, %d, %d, '%s', '%s', %d, %d, %d, '%s', '%s', CASE WHEN '%s' = 'None' THEN NULL ELSE '%s' END, CASE WHEN '%s' = 'None' THEN NULL ELSE '%s' END, CASE WHEN '%s' = 'None' THEN NULL ELSE '%s' END, '%s', '%s', %d, CASE WHEN %d = 0 THEN NULL ELSE %d END, '%s', %d, %d, UNIX_TIMESTAMP(now()), %d FROM dual """ for row in rows: try: self.logger.info('Parsing log file: %s' % row['gzipped_file_name']) self.aw_cursor.execute(parameter_query % (int(row['app_id']), int(row['job_exec_id']), int(row['attempt_id']))) params = DbUtil.copy_dict_cursor(self.aw_cursor) key_values = {} for param in params: ParseUtil.value_from_cmd_line(param['param_value'], key_values) analyzed_script = False results = AppworxLogParser(log_file_name = row['gzipped_file_name']).parse_log({}, command_type='LI_BTEQ') if any(results) == 0: self.logger.info('Skipped parsing %s' % row['gzipped_file_name']) continue self.logger.info(str(results)) self.logger.info('Completed parsing log file: %s' % row['gzipped_file_name']) # Compare md5 string of the file with the one parsed last time md5_str = hashlib.md5() try: md5_str.update(open(self.local_script_path + results['script_path'] + '/' + results['script_name']).read()) except IOError: self.logger.warn("Fails to find script file: %s/%s. Skipping the file" % (results['script_path'], results['script_name'])) continue self.aw_cursor.execute(check_parsed_source_code_query % (results['script_name'], results['script_path'], md5_str, md5_str.hexdigest(), int(row['app_id']))) parsed_scripts = DbUtil.copy_dict_cursor(self.aw_cursor) if len(parsed_scripts) != 0: self.aw_cursor.execute(check_parsed_lineage_query % (int(row['app_id']), int(row['job_exec_id']))) parsed_lineage = DbUtil.copy_dict_cursor(self.aw_cursor) if len(parsed_lineage) == 0 or self.skip_already_parsed == 'N': analyzed_script = False else: self.logger.debug("%s/%s has already been analyzed. Skipping..." 
% (results['script_path'], results['script_name'])) analyzed_script = True self.clean_up_staging_lineage_dbs(row['app_id'], row['job_id'], row['job_exec_id'], row['attempt_id']) update_source_code_query = \ """ INSERT INTO job_attempt_source_code( application_id, job_id, attempt_number, script_name, script_path, script_type, created_date, script_md5_sum) VALUES( %d, %d, %d, '%s', '%s', '%s', now(), CASE WHEN '%s' = 'None' THEN NULL ELSE UNHEX('%s') END) """ % (int(row['app_id']), int(row['job_id']), int(row['attempt_id']), results['script_name'], results['script_path'], 'SQL', md5_str, md5_str.hexdigest()) self.aw_cursor.execute(update_source_code_query ) self.aw_con.commit() db_id = 0 try: db_id = self.db_lookup(results['host']) except KeyError: self.logger.error(sys.exc_info()[0]) bteq_load_srl = 1 if not analyzed_script: if 'table' in results: schema_name = '' if 'full_path' in results['table'][0] and results['table'][0]['full_path']: index = results['table'][0]['full_path'].index('.') if index != -1: schema_name = results['table'][0]['full_path'][0:index] elif 'schema_name' in results['table'][0] and results['table'][0]['schema_name']: schema_name = results['table'][0]['schema_name'] self.aw_cursor.execute(update_staging_lineage_query % (int(row['app_id']), int(row['flow_exec_id']), int(row['job_exec_id']), row['flow_path'], row['job_name'], int(row['start_time']), int(row['end_time']), 0, ('/' + schema_name + '/' + results['table'][0]['abstracted_path']) \ if schema_name else results['table'][0]['abstracted_path'], results['table'][0]['full_path'], None, None, None, None, None, None, results['table'][0]['storage_type'], results['table'][0]['table_type'], bteq_load_srl, 0, 0, 'Read', 0, 0, int(row['wh_etl_exec_id']) )) bteq_load_srl = bteq_load_srl + 1 schema_name = '' if 'full_path' in results['table'][1] and results['table'][1]['full_path']: full_table_name = results['table'][1]['full_path'] index = full_table_name.index('.') if index != -1: schema_name = full_table_name[0:index] elif 'schema_name' in results['table'][1] and results['table'][1]['schema_name']: full_table_name = results['table'][1]['schema_name'] + '.' + results['table'][1]['table_name'] schema_name = results['table'][1]['schema_name'] else: full_table_name = results['table'][1]['table_name'] self.aw_cursor.execute(update_staging_lineage_query % (int(row['app_id']), int(row['flow_exec_id']), int(row['job_exec_id']), row['flow_path'], row['job_name'], int(row['start_time']), int(row['end_time']), db_id, ('/' + schema_name + '/' + results['table'][1]['table_name']) \ if schema_name else results['table'][1]['table_name'], full_table_name, None, None, None, None, None, None, 'Teradata', results['table'][0]['table_type'], bteq_load_srl, 0, 0, 'Load', 0, 0, int(row['wh_etl_exec_id']) )) self.aw_con.commit() self.logger.info("Parsing script: %s/%s" % (results['script_path'], results['script_name'])) entries = BteqLogParser().parse( key_values, self.local_script_path + results['script_path'] + '/' + results['script_name'], results['script_path'], results['script_name'], self.bteq_source_target_override, self.metric_override ) metric_idx = 1 for srl, e in enumerate(entries): schema_name, table_name = ParseUtil.get_db_table_abstraction(e['relation']) full_table_name = schema_name + '.' 
+ table_name self.aw_cursor.execute(update_staging_lineage_query % (int(row['app_id']), int(row['flow_exec_id']), int(row['job_exec_id']), row['flow_path'], row['job_name'], int(row['start_time']), int(row['end_time']), db_id, '/' + schema_name + '/' + table_name, full_table_name, None, None, None, None, None, None, e['storage_type'], e['table_type'], (bteq_load_srl + srl + 1), 0, 0, e['operation'], 0, 0, int(row['wh_etl_exec_id']) )) self.aw_con.commit() else: for p in parsed_lineage: p['database_id'] = db_id self.aw_cursor.execute(update_staging_lineage_query % (int(row['app_id']), int(row['flow_exec_id']), int(row['job_exec_id']), row['flow_path'], row['job_name'], int(row['start_time']), int(row['end_time']), p['database_id'], p['abstracted_object_name'], p['full_object_name'], p['partition_start'], p['partition_start'], p['partition_end'], p['partition_end'], p['partition_type'], p['partition_type'], p['storage_type'], p['source_target_type'], p['srl_no'], 0, 0, p['operation'], 0, 0, int(row['wh_etl_exec_id']) )) self.aw_con.commit() self.logger.info('Completed processing metadata for log file: %s' % row['gzipped_file_name']) except: self.logger.error(str(sys.exc_info()[0]))
def blogpage(title): blog = DbUtil.load_blog(title) return render_template("blog.html", blog=blog)
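# The two Flask views above (index and blogpage) appear without their route decorators
# or app setup. A minimal wiring sketch, assuming a standard Flask app object and URL
# rules that are not given in the original code; the route paths below are guesses,
# and DbUtil here stands for whatever module provides load_blog_titles/load_blog.
from flask import Flask, render_template
import DbUtil

app = Flask(__name__)

@app.route("/")
def index():
    return render_template("index.html", title_list=DbUtil.load_blog_titles())

@app.route("/blog/<title>")
def blogpage(title):
    return render_template("blog.html", blog=DbUtil.load_blog(title))

if __name__ == "__main__":
    app.run(debug=True)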
def collect_flow_jobs(self, flow_file, job_file, dag_file): self.logger.info("collect flow&jobs") query = "SELECT distinct f.*, p.name as project_name FROM project_flows f inner join projects p on f.project_id = p.id and f.version = p.version where p.active = 1" self.az_cursor.execute(query) rows = DbUtil.dict_cursor(self.az_cursor) flow_writer = FileWriter(flow_file) job_writer = FileWriter(job_file) dag_writer = FileWriter(dag_file) row_count = 0 for row in rows: row['version'] = 0 if (row["version"] is None) else row["version"] json_column = 'json' unzipped_content = gzip.GzipFile(mode='r', fileobj=StringIO.StringIO(row[json_column].tostring())).read() try: row[json_column] = json.loads(unzipped_content) except: pass flow_path = row['project_name'] + ":" + row['flow_id'] flow_record = AzkabanFlowRecord(self.app_id, row['flow_id'], row['project_name'], flow_path, 0, row['modified_time'] / 1000, row["version"], 'Y', self.wh_exec_id) flow_writer.append(flow_record) # get flow jobs nodes = row[json_column]['nodes'] for node in nodes: job_record = AzkabanJobRecord(self.app_id, flow_path, row["version"], node['id'], flow_path + '/' + node['id'], node['jobType'], 'Y', self.wh_exec_id) if node['jobType'] == 'flow': job_record.setRefFlowPath(row['project_name'] + ":" + node['embeddedFlowId']) job_writer.append(job_record) # job dag edges = row[json_column]['edges'] for edge in edges: dag_edge = AzkabanFlowDagRecord(self.app_id, flow_path, row['version'], flow_path + '/' + edge['source'], flow_path + '/' + edge['target'], self.wh_exec_id) dag_writer.append(dag_edge) row_count += 1 if row_count % 1000 == 0: flow_writer.flush() job_writer.flush() dag_writer.flush() flow_writer.close() job_writer.close() dag_writer.close()
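# Standalone sketch of the decompression step inside collect_flow_jobs above: Azkaban
# stores each flow definition as a gzipped JSON blob in the project_flows json column,
# which the collector inflates with gzip.GzipFile over an in-memory buffer before
# walking its 'nodes' and 'edges'. Written in Python 2 style to match the surrounding
# code (StringIO, str blobs); the sample payload is fabricated.
import gzip
import json
import StringIO

def inflate_flow_json(blob):
    """Return the flow definition dict held in a gzipped JSON blob."""
    return json.loads(gzip.GzipFile(mode='r', fileobj=StringIO.StringIO(blob)).read())

# Round-trip check with a fabricated flow definition:
buf = StringIO.StringIO()
gz = gzip.GzipFile(mode='w', fileobj=buf)
gz.write(json.dumps({'nodes': [{'id': 'job_a', 'jobType': 'command'}], 'edges': []}))
gz.close()
print(inflate_flow_json(buf.getvalue()))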