Example #1
 def export():
     file_name = f_name.get().strip()
     if file_name == '':
         tk.messagebox.showerror(message='文件名不能为空')
     else:
         wb = openpyxl.Workbook()
         sheet = wb.active
         sheet['A1'] = '图书编号'
         sheet['B1'] = '图书名称'
         sheet['C1'] = '图书价格'
         sheet['D1'] = '图书类别'
         sheet['E1'] = '图书库存'
         query = (
             "SELECT bid, bname, bprice, tname, bcount FROM `booktype`, `bookinfo` WHERE `booktype`.tid=`bookinfo`.tid")
         cnx = DbUtil.open_db()
         cursor = cnx.cursor()
         cursor.execute(query)
         i = 2
         for (bid, bname, bprice, tname, bcount) in cursor:
             sheet['A%s' % i] = bid
             sheet['B%s' % i] = bname
             sheet['C%s' % i] = bprice
             sheet['D%s' % i] = tname
             sheet['E%s' % i] = bcount
             i = i + 1
         DbUtil.close_db(cursor, cnx)
         time_file = time.strftime("%Y-%m-%d-%H-%M", time.localtime())
         file_name = file_name + '-' + time_file + '.xlsx'
         wb.save(file_name)
         tk.messagebox.showinfo('成功', '导出图书信息成功')
         book_e.destroy()
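Every Tkinter example on this page calls DbUtil.open_db() and DbUtil.close_db(cursor, cnx) without showing the module itself. A minimal sketch of what such a helper could look like, assuming mysql.connector and placeholder connection settings (host, user, password and database name are not taken from the source):

import mysql.connector

# DbUtil.py -- hedged sketch; connection parameters below are placeholders
def open_db():
    # open a new connection to the book-management database
    return mysql.connector.connect(host='localhost', user='root',
                                   password='secret', database='bookdb')

def close_db(cursor, cnx):
    # release the cursor first, then the connection
    cursor.close()
    cnx.close()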
Example #2
def build_book(frame_book):
    for widget in frame_book.winfo_children():
        widget.destroy()
    columns = ("书编号", "书名", "书价", "类别", "库存")
    treeview = ttk.Treeview(frame_book, columns=columns, show='headings')
    treeview.column("书编号", width=100, anchor='center')  # defines the column; the heading text is set separately below
    treeview.column("书名", width=100, anchor='center')
    treeview.column("书价", width=100, anchor='center')
    treeview.column("类别", width=100, anchor='center')
    treeview.column("库存", width=100, anchor='center')
    treeview.heading('书编号', text='书编号')
    treeview.heading('书名', text='书名')
    treeview.heading('书价', text='书价')
    treeview.heading('类别', text='类别')
    treeview.heading('库存', text='库存')
    treeview.grid()

    query = (
        "SELECT bid, bname, bprice, tname, bcount FROM `booktype`, `bookinfo` WHERE `booktype`.tid=`bookinfo`.tid")
    cnx = DbUtil.open_db()
    cursor = cnx.cursor()
    cursor.execute(query)
    i = 1
    for (bid, bname, bprice, tname, bcount) in cursor:
        treeview.insert('', i, values=(bid, bname, bprice, tname, bcount))
        i = i + 1
    DbUtil.close_db(cursor, cnx)

    for col in columns:  # attach a sort command to every heading (loops over the manual headings above)
        treeview.heading(col, text=col, command=lambda _col=col: treeview_sort_column(treeview, _col, False))
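Example #2 binds every heading to a treeview_sort_column helper that is not shown above. A common Tkinter implementation looks like the sketch below (an assumption about the missing function; it sorts the displayed values as strings):

def treeview_sort_column(tv, col, reverse):
    # read the value of `col` for every row, sort, and move the rows into order
    items = [(tv.set(iid, col), iid) for iid in tv.get_children('')]
    items.sort(reverse=reverse)
    for index, (_, iid) in enumerate(items):
        tv.move(iid, '', index)
    # clicking the same heading again sorts in the opposite direction
    tv.heading(col, command=lambda: treeview_sort_column(tv, col, not reverse))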
Example #3
 def inserttodb():
     # get the contents of the input boxes
     tid = t_id.get().strip()
     tidn = t_id_n.get().strip()
     tnamen = t_name_n.get().strip()
     if tid == '' or tidn == '' or tnamen == '':
         tk.messagebox.showerror(message='类别编号或名称为空')
     else:
         query = ("SELECT COUNT(*) FROM `booktype` WHERE tid='%s'" % (tid))
         cnx = DbUtil.open_db()
         cursor = cnx.cursor()
         cursor.execute(query)
         if cursor.fetchone()[0] == 1:
             query = ("UPDATE `booktype` SET tid='%s', tname='%s' WHERE tid='%s'" % (tidn, tnamen, tid))
             try:
                 cursor.execute(query)
                 cnx.commit()
                 DbUtil.close_db(cursor, cnx)
                 tk.messagebox.showinfo('成功', '修改类别成功')
                 type_u.destroy()
                 build_type(frame_type)
             except:
                 tk.messagebox.showerror('错误', '修改类别失败')
         else:
             tk.messagebox.showerror('错误', '修改类别失败, 该类别编号不存在')
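The queries in these examples are built by interpolating raw user input with %, which breaks on quotes and is open to SQL injection. A hedged sketch of the same check-then-update written with DB-API placeholders, assuming the connection returned by DbUtil.open_db() comes from a MySQL driver that uses the %s paramstyle (e.g. mysql.connector or MySQLdb):

# same check-then-update as Example #3, but the driver quotes the values
cnx = DbUtil.open_db()
cursor = cnx.cursor()
cursor.execute("SELECT COUNT(*) FROM `booktype` WHERE tid = %s", (tid,))
if cursor.fetchone()[0] == 1:
    cursor.execute("UPDATE `booktype` SET tid = %s, tname = %s WHERE tid = %s",
                   (tidn, tnamen, tid))
    cnx.commit()
DbUtil.close_db(cursor, cnx)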
Example #4
  def update_flow_jobs(self, last_unixtime=None):
      if last_unixtime:
          flow_sql = """
            SELECT a.app_code, f.* FROM flow f JOIN cfg_application a on f.app_id = a.app_id
            WHERE from_unixtime(modified_time) >= DATE_SUB(from_unixtime(%f), INTERVAL 1 HOUR)
            """ % last_unixtime
      else:
          flow_sql = """
            SELECT a.app_code, f.* FROM flow f JOIN cfg_application a on f.app_id = a.app_id
            """
      job_sql = """
        SELECT * FROM flow_job WHERE app_id = %d and flow_id = %d
        """
      url = self.elasticsearch_index_url + ':' + str(self.elasticsearch_port) +  '/wherehows/flow_jobs/_bulk'
      params = []
      self.wh_cursor.execute(flow_sql)
      rows = DbUtil.copy_dict_cursor(self.wh_cursor)
      row_count = 1
      for row in rows:
          self.wh_cursor.execute(job_sql %(long(row['app_id']), long(row['flow_id'])))
          jobs = []
          job_rows = DbUtil.copy_dict_cursor(self.wh_cursor)
          if job_rows:
              for job_row in job_rows:
                  jobs.append({"app_id": job_row['app_id'], "flow_id": job_row['flow_id'], "job_id": job_row['job_id'],
                        "job_name": job_row['job_name'] if job_row['job_name'] else '',
                        "job_path": job_row['job_path'] if job_row['job_path'] else '',
                        "job_type_id": job_row['job_type_id'],
                        "job_type": job_row['job_type'] if job_row['job_type'] else '',
                        "pre_jobs": job_row['pre_jobs'] if job_row['pre_jobs'] else '',
                        "post_jobs": job_row['post_jobs'] if job_row['post_jobs'] else '',
                        "is_current": job_row['is_current'] if job_row['is_current'] else '',
                        "is_first": job_row['is_first'] if job_row['is_first'] else '',
                        "is_last": job_row['is_last'] if job_row['is_last'] else ''})

          params.append('{ "index": { "_id": ' + str(long(row['flow_id'])*10000 + long(row['app_id'])) + '  }}')
          if len(jobs) > 0:
              params.append(
                  """{"app_id": %d,  "flow_id": %d, "app_code": "%s", "flow_name": "%s", "flow_group": "%s", "flow_path": "%s", "flow_level": %d, "is_active": "%s", "is_scheduled": "%s", "pre_flows": "%s", "jobs": %s}"""
                  % (row['app_id'], row['flow_id'], row['app_code'] if row['app_code'] else '',
                     row['flow_name'] if row['flow_name'] else '', row['flow_group'] if row['flow_group'] else '',
                     row['flow_path'] if row['flow_path'] else '', row['flow_level'],
                     row['is_active'] if row['is_active'] else '', row['is_scheduled'] if row['is_scheduled'] else '',
                     row['pre_flows'] if row['pre_flows'] else '', json.dumps(jobs)))
          else:
              params.append(
                  """{"app_id": %d,  "flow_id": %d, "app_code": "%s", "flow_name": "%s", "flow_group": "%s", "flow_path": "%s", "flow_level": %d, "is_active": "%s", "is_scheduled": "%s", "pre_flows": "%s", "jobs": ""}"""
                  % (row['app_id'], row['flow_id'], row['app_code'] if row['app_code'] else '',
                     row['flow_name'] if row['flow_name'] else '', row['flow_group'] if row['flow_group'] else '',
                     row['flow_path'] if row['flow_path'] else '', row['flow_level'],
                     row['is_active'] if row['is_active'] else '', row['is_scheduled'] if row['is_scheduled'] else '',
                     row['pre_flows'] if row['pre_flows'] else ''))
          if row_count % 1000 == 0:
              self.bulk_insert(params, url)
              self.logger.info('flow jobs' + str(row_count))
              params = []
          row_count += 1
      if len(params) > 0:
          self.logger.info('flow_jobs' + str(len(params)))
          self.bulk_insert(params, url)
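update_flow_jobs collects newline-delimited action/document pairs and hands them to self.bulk_insert(params, url), which is not shown. A hedged sketch of such a method on the same indexer class, assuming Python 2 (urllib2, matching the long() calls above) and a plain HTTP request against the Elasticsearch _bulk endpoint:

import json
import urllib2  # the surrounding examples are Python 2

def bulk_insert(self, params, url):
    # the _bulk API expects one JSON object per line plus a trailing newline
    body = '\n'.join(params) + '\n'
    req = urllib2.Request(url, body, {'Content-Type': 'application/json'})
    try:
        response = urllib2.urlopen(req)
        result = json.load(response)
        if result.get('errors'):
            self.logger.error('bulk insert reported errors: %s' % result)
    except urllib2.HTTPError as e:
        self.logger.error('bulk insert failed: %s' % e)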
Example #5
def usr_log_in(window, var_usr_name, var_usr_pwd):
    # get the username and password from the input boxes
    usr_name = var_usr_name.get().strip()
    usr_pwd = var_usr_pwd.get().strip()
    # username and password must not be empty
    if usr_name == '' or usr_pwd == '':
        tk.messagebox.showerror(message='用户名或密码不能为空!')
    else:
        # look up the user in the database
        query = ("SELECT COUNT(*) FROM `user` WHERE username = '%s'" % usr_name)
        cnx = DbUtil.open_db()
        cursor = cnx.cursor()
        cursor.execute(query)
        if cursor.fetchone()[0] == 1:
            # check that the username and password match
            query = ("SELECT username, password, is_manager FROM `user` WHERE username = '%s' AND password = '%s'" % (
                usr_name, usr_pwd))
            cursor.execute(query)
            result = cursor.fetchone()
            DbUtil.close_db(cursor, cnx)
            if result is not None:
                # tk.messagebox.showinfo(title='welcome', message='欢迎您:' + usr_name)
                # enter the main window
                is_manager = (result[2] != 0)
                manager_main(window, usr_name, is_manager)
            else:
                tk.messagebox.showerror(message='密码错误')
        # not in the database: ask whether to sign up now
        else:
            is_signup = tk.messagebox.askyesno('欢迎', '您还没有注册,是否现在注册')
            if is_signup:
                usr_sign_up()
Example #6
 def inserttodb():
     # get the contents of the input boxes
     bido = b_id_o.get().strip()
     bid = b_id.get().strip()
     bname = b_name.get().strip()
     bprice = b_price.get().strip()
     tid = t_id.get().strip()
     bcount = b_count.get().strip()
     if bido == '' or bid == '' or bname == '' or bprice == '' or tid == '' or bcount == '':
         tk.messagebox.showerror(message='输入框不能为空')
     else:
         query = ("SELECT COUNT(*) FROM `bookinfo` WHERE bid='%s'" % (bido))
         cnx = DbUtil.open_db()
         cursor = cnx.cursor()
         cursor.execute(query)
         if cursor.fetchone()[0] == 1:
             query = (
                 "UPDATE `bookinfo` SET bid='%s', bname='%s', bprice='%s', tid='%s', bcount='%s' WHERE bid='%s'" % (
                     bid, bname, bprice, tid, bcount, bido))
             try:
                 cursor.execute(query)
                 cnx.commit()
                 DbUtil.close_db(cursor, cnx)
                 tk.messagebox.showinfo('成功', '修改图书信息成功')
                 book_u.destroy()
                 build_book(frame_book)
             except:
                 tk.messagebox.showerror('错误', '修改图书信息失败')
         else:
             tk.messagebox.showerror('错误', '修改图书信息失败, 该图书编号不存在')
Example #7
  def collect_flow_jobs(self, flow_file, job_file, dag_file):
    self.logger.info("collect flow&jobs")
    flow_writer = FileWriter(flow_file)
    job_writer = FileWriter(job_file)
    dag_writer = FileWriter(dag_file)
    query = """
            SELECT a.*, b.created_time FROM
              (SELECT w.app_name, w.app_path, max(w.id) as source_version, max(unix_timestamp(w.last_modified_time)) as last_modified_time
              from WF_JOBS w LEFT JOIN WF_JOBS s
              ON w.app_path = s.app_path AND w.created_time < s.created_time
              WHERE s.created_time IS NULL GROUP BY w.app_name, w.app_path) a
              JOIN
              (SELECT app_path, min(unix_timestamp(created_time)) as created_time FROM WF_JOBS GROUP BY app_path) b
              ON a.app_path = b.app_path
            """
    self.oz_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.oz_cursor)

    for row in rows:
      flow_record = OozieFlowRecord(self.app_id,
                                    row['app_name'],
                                    row['app_path'],
                                    0,
                                    row['source_version'],
                                    row['created_time'],
                                    row['last_modified_time'],
                                    self.wh_exec_id)
      flow_writer.append(flow_record)
      query = """
              select name, type, transition from WF_ACTIONS
              where wf_id = '{source_version}'
              """.format(source_version=row['source_version'])
      new_oz_cursor = self.oz_con.cursor()
      new_oz_cursor.execute(query)
      nodes = DbUtil.dict_cursor(new_oz_cursor)

      for node in nodes:
        job_record = OozieJobRecord(self.app_id,
                                    row['app_path'],
                                    row['source_version'],
                                    node['name'],
                                    row['app_path'] + "/" + node['name'],
                                    node['type'],
                                    self.wh_exec_id)
        job_writer.append(job_record)

        if node['transition'] != "*" and node['transition'] is not None:
          dag_edge = OozieFlowDagRecord(self.app_id,
                                        row['app_path'],
                                        row['source_version'],
                                        row['app_path'] + "/" + node['name'],
                                        row['app_path'] + "/" + node['transition'],
                                        self.wh_exec_id)
          dag_writer.append(dag_edge)
      new_oz_cursor.close()

    dag_writer.close()
    job_writer.close()
    flow_writer.close()
Example #8
 def insertEMPLOYEE(self, param):
     sql = """INSERT INTO EMPLOYEE(FIRST_NAME,
                 LAST_NAME, AGE, SEX, INCOME)
                 VALUES ('${FIRST_NAME}', '${LAST_NAME}', ${AGE}, '${SEX}', ${INCOME})"""
     for key in param:
         sql = sql.replace("${" + key + "}", param[key], 1)
     print sql
     DbUtil.getCursor(self.db).execute(sql)
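A hedged usage sketch for the template insert above (dao stands for an instance of the surrounding class; the values are made up). Note that str.replace only accepts strings, so even the numeric columns have to be passed as strings, and the values are spliced into the SQL verbatim:

dao.insertEMPLOYEE({'FIRST_NAME': 'Ada', 'LAST_NAME': 'Lovelace',
                    'AGE': '36', 'SEX': 'F', 'INCOME': '2000'})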
Example #9
    def collect_flow_jobs(self, flow_file, job_file, dag_file):
        self.logger.info("collect flow&jobs")
        flow_writer = FileWriter(flow_file)
        job_writer = FileWriter(job_file)
        dag_writer = FileWriter(dag_file)
        query = """
            SELECT a.*, b.created_time FROM
              (SELECT w.app_name, w.app_path, max(w.id) as source_version, max(unix_timestamp(w.last_modified_time)) as last_modified_time
              from WF_JOBS w LEFT JOIN WF_JOBS s
              ON w.app_path = s.app_path AND w.created_time < s.created_time
              WHERE s.created_time IS NULL GROUP BY w.app_name, w.app_path) a
              JOIN
              (SELECT app_path, min(unix_timestamp(created_time)) as created_time FROM WF_JOBS GROUP BY app_path) b
              ON a.app_path = b.app_path
            """
        self.oz_cursor.execute(query)
        rows = DbUtil.dict_cursor(self.oz_cursor)

        for row in rows:
            flow_record = OozieFlowRecord(self.app_id, row['app_name'],
                                          row['app_path'], 0,
                                          row['source_version'],
                                          row['created_time'],
                                          row['last_modified_time'],
                                          self.wh_exec_id)
            flow_writer.append(flow_record)
            query = """
              select name, type, transition from WF_ACTIONS
              where wf_id = '{source_version}'
              """.format(source_version=row['source_version'])
            new_oz_cursor = self.oz_con.cursor()
            new_oz_cursor.execute(query)
            nodes = DbUtil.dict_cursor(new_oz_cursor)

            for node in nodes:
                job_record = OozieJobRecord(
                    self.app_id, row['app_path'], row['source_version'],
                    node['name'], row['app_path'] + "/" + node['name'],
                    node['type'], self.wh_exec_id)
                job_writer.append(job_record)

                if node['transition'] != "*" and node['transition'] is not None:
                    dag_edge = OozieFlowDagRecord(
                        self.app_id, row['app_path'], row['source_version'],
                        row['app_path'] + "/" + node['name'],
                        row['app_path'] + "/" + node['transition'],
                        self.wh_exec_id)
                    dag_writer.append(dag_edge)
            new_oz_cursor.close()

        dag_writer.close()
        job_writer.close()
        flow_writer.close()
Example #10
    def collect_flow_execs(self, flow_exec_file, job_exec_file,
                           look_back_period):
        self.logger.info("collect flow&job executions")
        flow_exec_writer = FileWriter(flow_exec_file)
        job_exec_writer = FileWriter(job_exec_file)

        cmd = "SELECT * FROM workflow_info WHERE status is NULL"
        self.lz_cursor.execute(cmd)
        # rows = DbUtil.dict_cursor(self.lz_cursor)
        rows = DbUtil.copy_dict_cursor(self.lz_cursor)
        row_count = 0
        for row in rows:
            flow_path = row['project_name'] + ":" + row['workflow_name']
            flow_exec_record = LhotseFlowExecRecord(
                self.app_id, row["workflow_name"], flow_path, 0, 1,
                "SUCCEEDED", 1, row['owner'],
                long(time.mktime(row['create_time'].timetuple())),
                long(time.mktime(row['modify_time'].timetuple())),
                self.wh_exec_id)
            flow_exec_writer.append(flow_exec_record)

            job_exec_records = []
            task_query = "SELECT * FROM task_info WHERE workflow_id = \"{0}\"".format(
                row['workflow_id'])
            new_lz_cursor = self.lz_cursor
            new_lz_cursor.execute(task_query)
            task_rows = DbUtil.dict_cursor(new_lz_cursor)
            for task in task_rows:
                if task['real_task_id'] is None:
                    continue
                job_exec_record = LhotseJobExecRecord(
                    self.app_id, flow_path, 0, 1, task['task_name'],
                    flow_path + "/" + task['task_name'],
                    long(task['real_task_id']), 'SUCCEEDED', 1,
                    int(time.mktime(task['create_time'].timetuple())),
                    int(time.mktime(task['modify_time'].timetuple())),
                    self.wh_exec_id)
                job_exec_records.append(job_exec_record)

            ## LhotseJobExecRecord.sortAndSet(job_exec_records)
            for r in job_exec_records:
                job_exec_writer.append(r)

            row_count += 1
            if row_count % 10000 == 0:
                flow_exec_writer.flush()
                job_exec_writer.flush()

        flow_exec_writer.close()
        job_exec_writer.close()
Example #11
    def update_dataset_field(self, last_time):
        if last_time:
            sql = """
            SELECT * FROM dict_field_detail WHERE modified >= DATE_SUB(%s, INTERVAL 1 HOUR)
            """ % last_time
        else:
            sql = """
            SELECT * FROM dict_field_detail
          """

        comment_query = """
        SELECT d.field_id, d.dataset_id, f.comment FROM dict_dataset_field_comment d
        LEFT JOIN field_comments f ON d.comment_id = f.id WHERE d.field_id = %d
        """
        url = self.elasticsearch_index_url + ':' + str(
            self.elasticsearch_port) + '/wherehows/field/_bulk'
        params = []
        self.wh_cursor.execute(sql)
        rows = DbUtil.copy_dict_cursor(self.wh_cursor)
        row_count = 1
        for row in rows:
            self.wh_cursor.execute(comment_query % long(row['field_id']))
            comments = []
            comment_rows = DbUtil.copy_dict_cursor(self.wh_cursor)
            for comment_row in comment_rows:
                comments.append(comment_row['comment'])
            params.append('{ "index": { "_id": ' + str(row['field_id']) +
                          ', "parent": ' + str(row['dataset_id']) + '  }}')
            if len(comments) > 0:
                params.append(
                    """{ "comments": %s, "dataset_id": %d, "sort_id": %d, "field_name": "%s", "parent_path": "%s"}"""
                    % (json.dumps(comments) if comments else '',
                       row['dataset_id'] if row['dataset_id'] else 0,
                       row['sort_id'] if row['sort_id'] else 0,
                       row['field_name'] if row['field_name'] else '',
                       row['parent_path'] if row['parent_path'] else ''))
            else:
                params.append(
                    """{ "comments": "", "dataset_id": %d, "sort_id": %d, "field_name": "%s", "parent_path": "%s"}"""
                    % (row['dataset_id'] if row['dataset_id'] else 0,
                       row['sort_id'] if row['sort_id'] else 0,
                       row['field_name'] if row['field_name'] else '',
                       row['parent_path'] if row['parent_path'] else ''))
            if row_count % 1000 == 0:
                self.bulk_insert(params, url)
                params = []
            row_count += 1
        if len(params) > 0:
            self.bulk_insert(params, url)
Example #12
 def update_dataset(self, last_unixtime):
     if last_unixtime:
         sql = """
       SELECT * FROM dict_dataset WHERE from_unixtime(modified_time) >= DATE_SUB(from_unixtime(%f), INTERVAL 1 HOUR)
       """ % last_unixtime
     else:
         sql = """
     SELECT * FROM dict_dataset
     """
     url = self.elasticsearch_index_url + ':' + str(
         self.elasticsearch_port) + '/wherehows/dataset/_bulk'
     params = []
     self.wh_cursor.execute(sql)
     rows = DbUtil.copy_dict_cursor(self.wh_cursor)
     row_count = 1
     for row in rows:
         params.append('{ "index": { "_id": ' + str(row['id']) + ' }}')
         params.append(
             """{ "name": "%s", "source": "%s", "urn": "%s", "location_prefix": "%s", "parent_name": "%s",
       "schema_type": "%s", "properties": %s, "schema": %s , "fields": %s}"""
             % (row['name'] if row['name'] else '', row['source']
                if row['source'] else '', row['urn'] if row['urn'] else '',
                row['location_prefix'] if row['location_prefix'] else '',
                row['parent_name'] if row['parent_name'] else '',
                row['schema_type'] if row['schema_type'] else '',
                json.dumps(row['properties']) if row['properties'] else '',
                json.dumps(row['schema']) if row['schema'] else '',
                json.dumps(row['fields']) if row['fields'] else ''))
         if row_count % 1000 == 0:
             self.bulk_insert(params, url)
             params = []
         row_count += 1
     if len(params) > 0:
         self.bulk_insert(params, url)
Example #13
  def collect_job_execs(self, job_exec_file, lookback_period):
    self.logger.info("collect job execs")
    job_exec_writer = FileWriter(job_exec_file)
    query = """
            select  a.id as job_exec_id, a.name as job_name, j.id as flow_exec_id, a.status, a.user_retry_count,
            unix_timestamp(a.start_time) start_time, unix_timestamp(a.end_time) end_time,
            j.app_name as jname, j.app_path, transition from WF_ACTIONS a JOIN WF_JOBS j on a.wf_id = j.id where j.end_time > now() - INTERVAL %d MINUTE
            """ % (int(lookback_period))
    self.oz_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.oz_cursor)

    for row in rows:
      job_exec_record = OozieJobExecRecord(self.app_id,
                                           row['app_path'],
                                           row['flow_exec_id'],
                                           row['flow_exec_id'],
                                           row['job_name'],
                                           row['app_path'] + "/" + row['job_name'],
                                           row['job_exec_id'],
                                           row['status'],
                                           row['user_retry_count'],
                                           row['start_time'],
                                           row['end_time'],
                                           self.wh_exec_id)
      job_exec_writer.append(job_exec_record)
    job_exec_writer.close()
Example #14
 def collect_flow_schedules(self, schedule_file):
   # load flow scheduling info from table triggers
   self.logger.info("collect flow schedule")
   timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'"
   self.aw_cursor.execute(timezone)
   schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX"
   self.aw_cursor.execute(schema)
   schedule_writer = FileWriter(schedule_file)
   query = \
       """SELECT J.SO_APPLICATION, J.SO_MODULE, S.AW_SCH_NAME, S.AW_SCH_INTERVAL, S.AW_ACTIVE,
          ROUND((cast((FROM_TZ(CAST(S.AW_SCH_START as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
          to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as EFFECT_STARTED,
          ROUND((cast((FROM_TZ(CAST(S.AW_SCH_END as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
          to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as EFFECT_END
          FROM SO_JOB_TABLE J
          JOIN AW_MODULE_SCHED S ON J.SO_JOB_SEQ = S.AW_JOB_SEQ
          WHERE J.SO_COMMAND_TYPE = 'CHAIN' AND S.AW_ACTIVE = 'Y' """
   self.aw_cursor.execute(query)
   rows = DbUtil.dict_cursor(self.aw_cursor)
   for row in rows:
     schedule_record = AppworxFlowScheduleRecord(self.app_id,
                                                 row['SO_APPLICATION'] + ":" + row['SO_MODULE'],
                                                 row['AW_SCH_NAME'],
                                                 int(row['AW_SCH_INTERVAL']),
                                                 long(row['EFFECT_STARTED']),
                                                 long(row['EFFECT_END']),
                                                 '0',
                                                 self.wh_exec_id
                                                 )
     schedule_writer.append(schedule_record)
   schedule_writer.close()
Example #15
  def update_comment(self, last_time):
    if last_time:
        sql = """
          SELECT * FROM comments WHERE modified >= DATE_SUB(%s, INTERVAL 1 HOUR)
          """ % last_time
    else:
        sql = """
          SELECT * FROM comments
          """

    url = self.elasticsearch_index_url + ':' + str(self.elasticsearch_port) +  '/wherehows/comment/_bulk'
    params = []
    self.wh_cursor.execute(sql)
    rows = DbUtil.copy_dict_cursor(self.wh_cursor)
    row_count = 1
    for row in rows:
      params.append('{ "index": { "_id": ' + str(row['id']) + ', "parent": ' + str(row['dataset_id']) + '  }}')
      params.append(
          """{ "text": %s, "user_id": %d, "dataset_id": %d, "comment_type": "%s"}"""
          % (json.dumps(row['text']) if row['text'] else '', row['user_id'] if row['user_id'] else 0,
             row['dataset_id'] if row['dataset_id'] else 0, row['comment_type'] if row['comment_type'] else ''))
      if row_count % 1000 == 0:
        self.bulk_insert(params, url)
        params = []
      row_count += 1
    if len(params) > 0:
      self.bulk_insert(params, url)
Example #16
 def collect_flow_schedules(self, schedule_file):
     # load flow scheduling info from table triggers
     self.logger.info("collect flow schedule")
     timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'"
     self.aw_cursor.execute(timezone)
     schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX"
     self.aw_cursor.execute(schema)
     schedule_writer = FileWriter(schedule_file)
     query = \
         """SELECT J.SO_APPLICATION, J.SO_MODULE, S.AW_SCH_NAME, S.AW_SCH_INTERVAL, S.AW_ACTIVE,
        ROUND((cast((FROM_TZ(CAST(S.AW_SCH_START as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
        to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as EFFECT_STARTED,
        ROUND((cast((FROM_TZ(CAST(S.AW_SCH_END as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
        to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as EFFECT_END
        FROM SO_JOB_TABLE J
        JOIN AW_MODULE_SCHED S ON J.SO_JOB_SEQ = S.AW_JOB_SEQ
        WHERE J.SO_COMMAND_TYPE = 'CHAIN' AND S.AW_ACTIVE = 'Y' """
     self.aw_cursor.execute(query)
     rows = DbUtil.dict_cursor(self.aw_cursor)
     for row in rows:
         schedule_record = AppworxFlowScheduleRecord(
             self.app_id, row['SO_APPLICATION'] + ":" + row['SO_MODULE'],
             row['AW_SCH_NAME'], int(row['AW_SCH_INTERVAL']),
             long(row['EFFECT_STARTED']), long(row['EFFECT_END']), '0',
             self.wh_exec_id)
         schedule_writer.append(schedule_record)
     schedule_writer.close()
Example #17
 def update_dataset(self, last_unixtime):
   if last_unixtime:
       sql = """
         SELECT * FROM dict_dataset WHERE from_unixtime(modified_time) >= DATE_SUB(from_unixtime(%f), INTERVAL 1 HOUR)
         """ % last_unixtime
   else:
       sql = """
       SELECT * FROM dict_dataset
       """
   url = self.elasticsearch_index_url + ':' + str(self.elasticsearch_port) +  '/wherehows/dataset/_bulk'
   params = []
   self.wh_cursor.execute(sql)
   rows = DbUtil.copy_dict_cursor(self.wh_cursor)
   row_count = 1
   for row in rows:
     params.append('{ "index": { "_id": ' + str(row['id']) + ' }}')
     params.append(
         """{ "name": "%s", "source": "%s", "urn": "%s", "location_prefix": "%s", "parent_name": "%s",
         "schema_type": "%s", "properties": %s, "schema": %s , "fields": %s}"""
         % (row['name'] if row['name'] else '', row['source'] if row['source'] else '',
            row['urn'] if row['urn'] else '', row['location_prefix'] if row['location_prefix'] else '',
            row['parent_name'] if row['parent_name'] else '', row['schema_type'] if row['schema_type'] else '',
            json.dumps(row['properties'])  if row['properties'] else '',
            json.dumps(row['schema'])  if row['schema'] else '', json.dumps(row['fields'])  if row['fields'] else ''))
     if row_count % 1000 == 0:
       self.bulk_insert(params, url)
       params = []
     row_count += 1
   if len(params) > 0:
     self.bulk_insert(params, url)
Example #18
    def __init__(self, master):
        self.data = DbUtil.DbUtil()
        self.master = master
        self.master.resizable(0, 0)
        master.title("Menu")

        self.mainFrame = Frame(master)
        self.mainFrame.pack()

        self.label = Label(self.mainFrame, text="Wybierz kategorię")
        self.label.pack()

        # create the category list
        self.selectCategoty = Listbox(self.mainFrame, selectmode=SINGLE)
        for i in self.data.data:
            self.selectCategoty.insert(0, i)
        self.selectCategoty.select_set(0)
        self.selectCategoty.pack(fill=X)
        #-----------------------#

        self.scale = Scale(self.mainFrame,
                           from_=5,
                           to=15,
                           orient=HORIZONTAL,
                           label='       Ilość słówek')
        self.scale.pack(fill=X)

        self.startButton = Button(self.mainFrame,
                                  text="Start",
                                  command=self.openCurseWindow)
        self.startButton.pack()

        self.wordCounter = 0
        self.mistakeCounter = 0
        self.gotWords = False
Example #19
    def update_comment(self, last_time):
        if last_time:
            sql = """
          SELECT * FROM comments WHERE modified >= DATE_SUB(%s, INTERVAL 1 HOUR)
          """ % last_time
        else:
            sql = """
          SELECT * FROM comments
          """

        url = self.elasticsearch_index_url + ':' + str(
            self.elasticsearch_port) + '/wherehows/comment/_bulk'
        params = []
        self.wh_cursor.execute(sql)
        rows = DbUtil.copy_dict_cursor(self.wh_cursor)
        row_count = 1
        for row in rows:
            params.append('{ "index": { "_id": ' + str(row['id']) +
                          ', "parent": ' + str(row['dataset_id']) + '  }}')
            params.append(
                """{ "text": %s, "user_id": %d, "dataset_id": %d, "comment_type": "%s"}"""
                % (json.dumps(row['text']) if row['text'] else '',
                   row['user_id'] if row['user_id'] else 0,
                   row['dataset_id'] if row['dataset_id'] else 0,
                   row['comment_type'] if row['comment_type'] else ''))
            if row_count % 1000 == 0:
                self.bulk_insert(params, url)
                params = []
            row_count += 1
        if len(params) > 0:
            self.bulk_insert(params, url)
Example #20
  def collect_flow_schedules(self, schedule_file):
    self.logger.info("collect flow schedule")
    schedule_writer = FileWriter(schedule_file)
    query = """
            SELECT DISTINCT cj.id as ref_id, cj.frequency, cj.time_unit,
            unix_timestamp(cj.start_time) as start_time, unix_timestamp(cj.end_time) as end_time,
            wj.app_path
            FROM COORD_JOBS cj JOIN COORD_ACTIONS ca ON ca.job_id = cj.id JOIN WF_JOBS wj ON ca.external_id = wj.id
            WHERE cj.status = 'RUNNING'
            """
    self.oz_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.oz_cursor)

    for row in rows:
      schedule_record = OozieFlowScheduleRecord(self.app_id,
                                                row['app_path'],
                                                row['time_unit'],
                                                str(row['frequency']),
                                                None,
                                                row['start_time'],
                                                row['end_time'],
                                                row['ref_id'],
                                                self.wh_exec_id)
      schedule_writer.append(schedule_record)

    schedule_writer.close()
Example #21
 def __init__(self, configs):
     super(BackupData, self).__init__()
     self.__db = DbUtil.DbUtil(
         configs['dbhost'], Config.INPUT_DB_USERNAME,
         Config.INPUT_DB_PASSWORD, Config.INPUT_DB_DATABASE
         if configs['dbname'] is None else configs['dbname'],
         Config.INPUT_DB_CHARSET)
     self.__logger = Logger.Logger(__name__)
Example #22
  def collect_flow_execs(self, flow_exec_file, job_exec_file, look_back_period):
    self.logger.info( "collect flow&job executions")
    flow_exec_writer = FileWriter(flow_exec_file)
    job_exec_writer = FileWriter(job_exec_file)

    cmd = """select * from execution_flows where end_time > UNIX_TIMESTAMP(now() - INTERVAL %d MINUTE) * 1000 """ % (int(look_back_period))
    self.az_cursor.execute(cmd)
    rows = DbUtil.dict_cursor(self.az_cursor)
    row_count = 0
    for row in rows:
      json_column = 'flow_data'
      unzipped_content = gzip.GzipFile(mode='r', fileobj=StringIO.StringIO(row[json_column].tostring())).read()
      try:
        row[json_column] = json.loads(unzipped_content)
      except Exception as e:
        self.logger.error(e)
        pass
      flow_data = row[json_column]
      flow_path = flow_data['projectName'] + ":" + flow_data['flowId']
      flow_exec_record = AzkabanFlowExecRecord(self.app_id,
                                               flow_data['flowId'],
                                               flow_path,
                                               row['version'],
                                               row['exec_id'],
                                               flow_data['status'],
                                               flow_data['attempt'],
                                               row['submit_user'],
                                               long(row['start_time']) / 1000,
                                               long(row['end_time']) / 1000,
                                               self.wh_exec_id)
      flow_exec_writer.append(flow_exec_record)
      nodes = flow_data['nodes']
      job_exec_records = []
      for node in nodes:
        job_exec_record = AzkabanJobExecRecord(self.app_id,
                                                flow_path,
                                                row['version'],
                                                row['exec_id'],
                                                node['id'],
                                                flow_path + "/" + node['id'],
                                                None,
                                                node['status'],
                                                node['attempt'],
                                                long(node['startTime']) / 1000,
                                                long(node['endTime']) / 1000,
                                                self.wh_exec_id)
        job_exec_records.append(job_exec_record)

      AzkabanJobExecUtil.sortAndSet(job_exec_records)
      for r in job_exec_records:
        job_exec_writer.append(r)

      row_count += 1
      if row_count % 10000 == 0:
        flow_exec_writer.flush()
        job_exec_writer.flush()
    flow_exec_writer.close()
    job_exec_writer.close()
Example #23
  def collect_flow_execs(self, flow_exec_file, job_exec_file, look_back_period):
    self.logger.info( "collect flow&job executions")
    flow_exec_writer = FileWriter(flow_exec_file)
    job_exec_writer = FileWriter(job_exec_file)

    cmd = """select * from execution_flows where end_time > UNIX_TIMESTAMP(now() - INTERVAL %d MINUTE) * 1000 """ % (int(look_back_period))
    self.az_cursor.execute(cmd)
    rows = DbUtil.dict_cursor(self.az_cursor)
    row_count = 0
    for row in rows:
      json_column = 'flow_data'
      unzipped_content = gzip.GzipFile(mode='r', fileobj=StringIO.StringIO(row[json_column].tostring())).read()
      try:
        row[json_column] = json.loads(unzipped_content)
      except Exception as e:
        self.logger.error(e)
        pass
      flow_data = row[json_column]
      flow_path = flow_data['projectName'] + ":" + flow_data['flowId']
      flow_exec_record = AzkabanFlowExecRecord(self.app_id,
                                               flow_data['flowId'],
                                               flow_path,
                                               row['version'],
                                               row['exec_id'],
                                               flow_data['status'],
                                               flow_data['attempt'],
                                               row['submit_user'],
                                               long(row['start_time']) / 1000,
                                               long(row['end_time']) / 1000,
                                               self.wh_exec_id)
      flow_exec_writer.append(flow_exec_record)
      nodes = flow_data['nodes']
      job_exec_records = []
      for node in nodes:
        job_exec_record = AzkabanJobExecRecord(self.app_id,
                                                flow_path,
                                                row['version'],
                                                row['exec_id'],
                                                node['id'],
                                                flow_path + "/" + node['id'],
                                                None,
                                                node['status'],
                                                node['attempt'],
                                                long(node['startTime']) / 1000,
                                                long(node['endTime']) / 1000,
                                                self.wh_exec_id)
        job_exec_records.append(job_exec_record)

      AzkabanJobExecUtil.sortAndSet(job_exec_records)
      for r in job_exec_records:
        job_exec_writer.append(r)

      row_count += 1
      if row_count % 10000 == 0:
        flow_exec_writer.flush()
        job_exec_writer.flush()
    flow_exec_writer.close()
    job_exec_writer.close()
Example #24
def submit_blog():
    title, blog = (request.form['title'], request.form['blog'])
    # app.logger.debug(request.form['blog'])
    # x=open('temp.txt','w')
    # x.write(request.form['blog'])
    # x.close()
    post_time = datetime.datetime.now().ctime()  # avoid shadowing the time module
    res = DbUtil.save_blog(title, post_time, blog)
    return "res"
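submit_blog reads request.form, so it is presumably a Flask view; the decorator is not shown above. A hedged sketch of how it might be registered (the Flask app object, URL rule and endpoint name are assumptions):

from flask import Flask

app = Flask(__name__)
# equivalent to decorating submit_blog with @app.route('/submit', methods=['POST'])
app.add_url_rule('/submit', 'submit_blog', submit_blog, methods=['POST'])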
Example #25
 def saveItemInfo(self, param):
     sql = """INSERT INTO item_coupon(tao_token,coupon_short_link_url,qr_code_url,click_url,coupon_link_tao_token
                 ,coupon_link,type,item_id,short_link_url,coupon_start_time
                 ,coupon_end_time,coupon_amount,coupon_info,biz30day,tk_comm_fee
                 ,coupon_left_count,coupon_total_count,coupon_start_fee,reserve_price)
                 VALUES (${taoToken},${couponShortLinkUrl},${qrCodeUrl},${clickUrl},${couponLinkTaoToken},
                 ${couponLink},${type},${itemid},${shortLinkUrl},${couponEffectiveStartTime},
                 ${couponEffectiveEndTime},${couponAmount},${couponInfo},${biz30day},${tkCommFee}
                 ,${couponLeftCount},${couponTotalCount},${couponStartFee},${reservePrice})"""
     for key in param:
         value = ''
         if param[key] == None:
             value = 'Null'
         else:
             value = "'" + param[key] + "'"
         sql = sql.replace("${" + key + "}", value, 1)
     print sql
     DbUtil.getCursor(self.db).execute(sql)
Example #26
 def __init__(self):
     super(CreateTable, self).__init__()
     self.__configs = CommonUtil.parse_arguments()
     self.__db = DbUtil.DbUtil(
         self.__configs['dbhost'], Config.INPUT_DB_USERNAME,
         Config.INPUT_DB_PASSWORD, Config.INPUT_DB_DATABASE
         if self.__configs['dbname'] is None else self.__configs['dbname'],
         Config.INPUT_DB_CHARSET)
     self.__logger = Logger.Logger(__name__)
Example #27
 def __init__(self, configs):
     super(CalcTeacherMetric, self).__init__()
     self.__db = DbUtil.DbUtil(
         configs['dbhost'], Config.INPUT_DB_USERNAME,
         Config.INPUT_DB_PASSWORD, Config.INPUT_DB_DATABASE
         if configs['dbname'] is None else configs['dbname'],
         Config.INPUT_DB_CHARSET)
     self.__logger = Logger.Logger(__name__)
     self.__delimiter = '@'
Example #28
  def update_dataset_field(self, last_time):
      if last_time:
          sql = """
            SELECT * FROM dict_field_detail WHERE modified >= DATE_SUB(%s, INTERVAL 1 HOUR)
            """ % last_time
      else:
          sql = """
            SELECT * FROM dict_field_detail
          """

      comment_query = """
        SELECT d.field_id, d.dataset_id, f.comment FROM dict_dataset_field_comment d
        LEFT JOIN field_comments f ON d.comment_id = f.id WHERE d.field_id = %d
        """
      url = self.elasticsearch_index_url + ':' + str(self.elasticsearch_port)  + '/wherehows/field/_bulk'
      params = []
      self.wh_cursor.execute(sql)
      rows = DbUtil.copy_dict_cursor(self.wh_cursor)
      row_count = 1
      for row in rows:
          self.wh_cursor.execute(comment_query % long(row['field_id']))
          comments = []
          comment_rows = DbUtil.copy_dict_cursor(self.wh_cursor)
          for comment_row in comment_rows:
            comments.append(comment_row['comment'])
          params.append('{ "index": { "_id": ' +
                        str(row['field_id']) + ', "parent": ' + str(row['dataset_id']) + '  }}')
          if len(comments) > 0:
            params.append(
                """{ "comments": %s, "dataset_id": %d, "sort_id": %d, "field_name": "%s", "parent_path": "%s"}"""
                % (json.dumps(comments) if comments else '', row['dataset_id'] if row['dataset_id'] else 0,
                   row['sort_id'] if row['sort_id'] else 0,
                   row['field_name'] if row['field_name'] else '', row['parent_path'] if row['parent_path'] else ''))
          else:
            params.append(
                """{ "comments": "", "dataset_id": %d, "sort_id": %d, "field_name": "%s", "parent_path": "%s"}"""
                % (row['dataset_id'] if row['dataset_id'] else 0, row['sort_id'] if row['sort_id'] else 0,
                   row['field_name'] if row['field_name'] else '', row['parent_path'] if row['parent_path'] else ''))
          if row_count % 1000 == 0:
              self.bulk_insert(params, url)
              params = []
          row_count += 1
      if len(params) > 0:
          self.bulk_insert(params, url)
Example #29
    def signtowcg():
        # get the contents of the input boxes
        nun = new_username.get().strip()
        np = new_pwd.get().strip()
        npf = new_pwd_confirm.get().strip()
        nn = new_name.get().strip()
        ng = new_gender.get().strip()
        ne = new_email.get().strip()
        nt = new_telephone.get().strip()
        nm = new_manager.get().strip()

        if np == '' or nun == '' or npf == '' or nn == '' or ng == '' or ne == '' or nt == '' or nm == '':
            tk.messagebox.showerror('错误', '输入框不能为空!')
        elif np != npf:
            tk.messagebox.showerror('错误', '密码前后不一致')
        elif re.match(r'^[a-zA-Z0-9_.-]+@[a-zA-Z0-9-]+(\.[a-zA-Z0-9-]+)*\.[a-zA-Z0-9]{2,6}$', ne) is None:
            tk.messagebox.showerror('错误', '邮箱格式不正确')
        elif re.match(r'^1([38][0-9]|4[579]|5[0-3,5-9]|6[6]|7[0135678]|9[89])\d{8}$', nt) is None:
            tk.messagebox.showerror('错误', '手机号格式不正确')
        else:
            query = ("SELECT COUNT(*) FROM `user` WHERE username = '%s'" % (nun))
            cnx = DbUtil.open_db()
            cursor = cnx.cursor()
            cursor.execute(query)
            if cursor.fetchone()[0] != 0:
                tk.messagebox.showerror('错误', '用户名已存在')
            else:
                query = ("INSERT INTO `user`(username, password, is_manager) VALUES ('%s', '%s', '%s')" % (nun, np, nm))
                query1 = (
                    "INSERT INTO `userinfo`(username, password, `name`, gender, email, telephone) VALUES ('%s', '%s', '%s', '%s', '%s', '%s')" % (
                        nun, np, nn, ng, ne, nt))
                try:
                    cursor.execute(query)
                    cursor.execute(query1)
                    cnx.commit()
                    DbUtil.close_db(cursor, cnx)
                    tk.messagebox.showinfo('欢迎', '注册成功')
                    # close the sign-up window after successful registration
                    window_sign_up.destroy()
                except:
                    print()
                    tk.messagebox.showinfo('错误', '注册失败')
                    cnx.rollback()
Example #30
 def inserttodb():
     # get the contents of the input boxes
     tid = t_id.get().strip()
     tname = t_name.get().strip()
     if tid == '' or tname == '':
         tk.messagebox.showerror(message='类别编号或名称为空')
     else:
         query = ("INSERT INTO `booktype`(tid, tname) VALUES ('%s', '%s')" % (tid, tname))
         try:
             cnx = DbUtil.open_db()
             cursor = cnx.cursor()
             cursor.execute(query)
             cnx.commit()
             DbUtil.close_db(cursor, cnx)
             tk.messagebox.showinfo('成功', '新增类别成功')
             type_c.destroy()
             build_type(frame_type)
         except:
             tk.messagebox.showerror('错误', '新增类别失败')
Example #31
  def db_lookup(self, dbname, default=None):
    query = \
        """
        SELECT db_id FROM cfg_database WHERE db_code = '%s' or short_connection_string = '%s'
        """
    self.aw_cursor.execute(query % (dbname,dbname))
    rows = DbUtil.dict_cursor(self.aw_cursor)
    for row in rows:
      return row['db_id']

    return 0
Example #32
  def db_lookup(self, dbname, default=None):
    query = \
        """
        SELECT db_id FROM cfg_database WHERE db_code = '%s' or short_connection_string = '%s'
        """
    self.aw_cursor.execute(query % (dbname,dbname))
    rows = DbUtil.dict_cursor(self.aw_cursor)
    for row in rows:
      return row['db_id']

    return 0
Example #33
    def collect_flow_schedules(self, schedule_file):
        # load flow scheduling info from table triggers
        self.logger.info("collect flow schedule")
        schedule_writer = FileWriter(schedule_file)
        query = "select * from triggers"
        self.az_cursor.execute(query)
        rows = DbUtil.dict_cursor(self.az_cursor)
        for row in rows:
            json_column = 'data'
            if row[json_column] != None:
                unzipped_content = gzip.GzipFile(
                    mode='r',
                    fileobj=StringIO.StringIO(
                        row[json_column].tostring())).read()
                try:
                    row[json_column] = json.loads(unzipped_content)
                except Exception as e:
                    self.logger.error(e)
                    pass

                if "projectId" not in row[json_column]["actions"][0]["actionJson"]:
                    continue
                # print json.dumps(row[json_column], indent=4)

                if row[json_column]["triggerCondition"]["checkers"][0][
                        "checkerJson"]["isRecurring"] == 'true':
                    unit, frequency, cron_expr = None, None, None
                    period = row[json_column]["triggerCondition"]["checkers"][
                        0]["checkerJson"]["period"]
                    if period is not None and period != "null" and period[
                            -1:] in self._period_unit_table:
                        unit = self._period_unit_table[period[-1:]]
                        frequency = int(
                            row[json_column]["triggerCondition"]["checkers"][0]
                            ["checkerJson"]["period"][:-1])
                    if "cronExpression" in row[json_column][
                            "triggerCondition"]["checkers"][0]["checkerJson"]:
                        cron_expr = row[json_column]["triggerCondition"][
                            "checkers"][0]["checkerJson"]["cronExpression"]
                    schedule_record = AzkabanFlowScheduleRecord(
                        self.app_id, row[json_column]["actions"][0]
                        ["actionJson"]["projectName"] + ':' + row[json_column]
                        ["actions"][0]["actionJson"]["flowName"], unit,
                        frequency, cron_expr,
                        long(row[json_column]["triggerCondition"]["checkers"]
                             [0]["checkerJson"]["firstCheckTime"]) / 1000,
                        int(
                            time.mktime(
                                datetime.date(2099, 12, 31).timetuple())), '0',
                        self.wh_exec_id)
                    schedule_writer.append(schedule_record)
        schedule_writer.close()
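Example #33 uses self._period_unit_table to map the trailing letter of an Azkaban period string (for example 30m or 1d) to a unit name. The table itself is not shown; a plausible definition, offered purely as an assumption, would be:

# assumed mapping from an Azkaban period suffix to a schedule unit
self._period_unit_table = {'d': 'DAY',
                           'M': 'MONTH',
                           'h': 'HOUR',
                           'm': 'MINUTE',
                           'w': 'WEEK'}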
Example #34
 def getItemInfoInDB(self):
     sql = """SELECT tao_token as taoToken,
                     click_url as clickUrl,
                     coupon_link_tao_token as couponLinkTaoToken,
                     coupon_link as couponLink,
                     short_link_url as shortLinkUrl,
                     coupon_short_link_url as couponShortLinkUrl
                     FROM  item_coupon 
                     where item_id = ${itemid}"""
     cur = DbUtil.getCursor(self.db)
     cur.execute(sql)
     return cur.fetchone()
Example #35
 def __init__(self, configs):
     super(Preprocessor, self).__init__()
     self.__db = DbUtil.DbUtil(
         configs['dbhost'], Config.INPUT_DB_USERNAME,
         Config.INPUT_DB_PASSWORD, Config.INPUT_DB_DATABASE
         if configs['dbname'] is None else configs['dbname'],
         Config.INPUT_DB_CHARSET)
     self.__logger = Logger.Logger(__name__)
     self.__teaching = configs['teaching'] == 1  # teaching=1: evaluate the lesson's classroom teaching
     self.__teacher = configs['teaching'] == 2  # teaching=2: evaluate the teacher's instruction via S-T analysis
Example #36
    def collect_flow_owners(self, owner_file):
        self.logger.info("collect owners")
        owner_writer = FileWriter(owner_file)
        query = "SELECT DISTINCT app_name, app_path, user_name from WF_JOBS"
        self.oz_cursor.execute(query)
        rows = DbUtil.dict_cursor(self.oz_cursor)

        for row in rows:
            owner_record = OozieFlowOwnerRecord(self.app_id, row['app_path'],
                                                row['user_name'],
                                                self.wh_exec_id)
            owner_writer.append(owner_record)
        owner_writer.close()
Example #37
 def update_metric(self):
     sql = """
       SELECT * FROM dict_business_metric
       """
     url = self.elasticsearch_index_url + ':' + str(self.elasticsearch_port) +  '/wherehows/metric/_bulk'
     params = []
     self.wh_cursor.execute(sql)
     rows = DbUtil.copy_dict_cursor(self.wh_cursor)
     row_count = 1
     for row in rows:
         params.append('{ "index": { "_id": ' + str(row['metric_id']) + '  }}')
         params.append(
             """{"metric_id": %d,  "metric_name": %s, "metric_description": %s, "dashboard_name": %s,
             "metric_group": %s, "metric_category": %s, "metric_sub_category": %s, "metric_level": %s,
             "metric_source_type": %s, "metric_source": %s, "metric_source_dataset_id": %d,
             "metric_ref_id_type": %s, "metric_ref_id": %s, "metric_type": %s, "metric_additive_type": %s,
             "metric_grain": %s, "metric_display_factor": %f, "metric_display_factor_sym": %s,
             "metric_good_direction": %s, "metric_formula": %s, "dimensions": %s, "owners": %s, "tags": %s,
             "urn": %s, "metric_url": %s, "wiki_url": %s, "scm_url": %s}"""
             % (row['metric_id'], json.dumps(row['metric_name']) if row['metric_name'] else json.dumps(''),
                json.dumps(row['metric_description']) if row['metric_description'] else json.dumps(''),
                json.dumps(row['dashboard_name']) if row['dashboard_name'] else json.dumps(''),
                json.dumps(row['metric_group']) if row['metric_group'] else json.dumps(''),
                json.dumps(row['metric_category']) if row['metric_category'] else json.dumps(''),
                json.dumps(row['metric_sub_category']) if row['metric_sub_category'] else json.dumps(''),
                json.dumps(row['metric_level']) if row['metric_level'] else json.dumps(''),
                json.dumps(row['metric_source_type']) if row['metric_source_type'] else json.dumps(''),
                json.dumps(row['metric_source']) if row['metric_source'] else json.dumps(''),
                row['metric_source_dataset_id'] if row['metric_source_dataset_id'] else 0,
                json.dumps(row['metric_ref_id_type']) if row['metric_ref_id_type'] else json.dumps(''),
                json.dumps(row['metric_ref_id']) if row['metric_ref_id'] else json.dumps(''),
                json.dumps(row['metric_type']) if row['metric_type'] else json.dumps(''),
                json.dumps(row['metric_additive_type']) if row['metric_additive_type'] else json.dumps(''),
                json.dumps(row['metric_grain']) if row['metric_grain'] else json.dumps(''),
                row['metric_display_factor'] if row['metric_display_factor'] else 0.0,
                json.dumps(row['metric_display_factor_sym']) if row['metric_display_factor_sym'] else json.dumps(''),
                json.dumps(row['metric_good_direction']) if row['metric_good_direction'] else json.dumps(''),
                json.dumps(row['metric_formula']) if row['metric_formula'] else json.dumps(''),
                json.dumps(row['dimensions']) if row['dimensions'] else json.dumps(''),
                json.dumps(row['owners']) if row['owners'] else json.dumps(''),
                json.dumps(row['tags']) if row['tags'] else json.dumps(''),
                json.dumps(row['urn']) if row['urn'] else json.dumps(''),
                json.dumps(row['metric_url']) if row['metric_url'] else json.dumps(''),
                json.dumps(row['wiki_url']) if row['wiki_url'] else json.dumps(''),
                json.dumps(row['scm_url']) if row['scm_url'] else json.dumps('')))
         if row_count % 1000 == 0:
           self.bulk_insert(params, url)
           params = []
         row_count += 1
     if len(params) > 0:
         self.bulk_insert(params, url)
Example #38
 def delete_type_by_id():
     # get the contents of the input boxes
     tid = t_id.get().strip()
     if tid == '':
         tk.messagebox.showerror(message='类别编号为空')
     else:
         query = ("SELECT COUNT(*) FROM `booktype` WHERE tid='%s'" % (tid))
         cnx = DbUtil.open_db()
         cursor = cnx.cursor()
         cursor.execute(query)
         if cursor.fetchone()[0] == 1:
             query = ("DELETE FROM `booktype` WHERE tid='%s'" % (tid))
             try:
                 cursor.execute(query)
                 cnx.commit()
                 DbUtil.close_db(cursor, cnx)
                 tk.messagebox.showinfo('成功', '删除类别成功')
                 type_d.destroy()
                 build_type(frame_type)
             except:
                 tk.messagebox.showerror('错误', '删除类别失败')
         else:
             tk.messagebox.showerror('错误', '删除类别失败, 该类别编号不存在')
Example #39
  def collect_flow_owners(self, owner_file):
    self.logger.info("collect owners")
    owner_writer = FileWriter(owner_file)
    query = "SELECT DISTINCT app_name, app_path, user_name from WF_JOBS"
    self.oz_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.oz_cursor)

    for row in rows:
      owner_record = OozieFlowOwnerRecord(self.app_id,
                                          row['app_path'],
                                          row['user_name'],
                                          self.wh_exec_id)
      owner_writer.append(owner_record)
    owner_writer.close()
Example #40
0
def saveCategoryToDb(categoryList=None):
    print '--- saveCategoryToDb start ---'
    if categoryList is None or len(categoryList) == 0:
        print "categoryList is None could not save to db."
        return
    else:
        insert = 'insert into sh_category (cate_id, cate_name, parent_id, parent_cate_name, sort_num, cate_item_num,' \
                 ' description, listUrl, shopName, shopLogo, update_time) '
        sql_select_str = ''
        currentTime = int(time.time())
        for category in categoryList:
            sql_select_str += "SELECT '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s' union " \
                              % (category.cate_id, category.cate_name, category.parent_id, category.parent_cate_name,
                                 category.sort_num, category.cate_item_num, category.description, category.listUrl,
                                 category.shopName, category.shopLogo, currentTime)
        # build the SQL statement by appending the SELECT ... UNION fragments
        insert = insert + sql_select_str
        # trim the trailing " union "
        insert = insert[:-6]
        # saving categories requires clearing the existing rows first
        delete = 'delete from sh_category;'
        DbUtil.delete(delete)
        DbUtil.insert(insert)
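The same save can also be expressed with executemany instead of hand-concatenated SELECT ... UNION fragments. This is only a sketch: the original module exposes just DbUtil.delete and DbUtil.insert, so the cnx connection argument here is an assumption.

import time

def save_category_to_db(cnx, category_list):
    # Refresh sh_category in one transaction using parameter binding.
    if not category_list:
        return
    insert = ("insert into sh_category (cate_id, cate_name, parent_id, parent_cate_name, "
              "sort_num, cate_item_num, description, listUrl, shopName, shopLogo, update_time) "
              "values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
    now = int(time.time())
    rows = [(c.cate_id, c.cate_name, c.parent_id, c.parent_cate_name, c.sort_num,
             c.cate_item_num, c.description, c.listUrl, c.shopName, c.shopLogo, now)
            for c in category_list]
    cursor = cnx.cursor()
    cursor.execute("delete from sh_category")   # the table is fully rebuilt on each save
    cursor.executemany(insert, rows)
    cnx.commit()
    cursor.close()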
Example #41
0
 def inserttodb():
     # get the contents of the input boxes
     bid = b_id.get().strip()
     bname = b_name.get().strip()
     bprice = b_price.get().strip()
     tid = t_id.get().strip()
     bcount = b_count.get().strip()
     if bid == '' or bname == '' or bprice == '' or tid == '' or bcount == '':
         tk.messagebox.showerror(message='输入框不能为空')
     else:
         query = ("INSERT INTO `bookinfo`(bid, bname, bprice, tid, bcount) VALUES ('%s', '%s', '%s', '%s', '%s')" % (
             bid, bname, bprice, tid, bcount))
         try:
             cnx = DbUtil.open_db()
             cursor = cnx.cursor()
             cursor.execute(query)
             cnx.commit()
             DbUtil.close_db(cursor, cnx)
             tk.messagebox.showinfo('成功', '新增图书信息成功')
             book_c.destroy()
             build_book(frame_book)
         except:
             tk.messagebox.showerror('错误', '新增图书信息失败')
Example #42
0
 def delete_book_by_id():
     # get the contents of the input box
     bid = b_id.get().strip()
     if bid == '':
         tk.messagebox.showerror(message='图书编号为空')
     else:
         query = ("SELECT COUNT(*) FROM `bookinfo` WHERE bid='%s'" % (bid))
         cnx = DbUtil.open_db()
         cursor = cnx.cursor()
         cursor.execute(query)
         if cursor.fetchone()[0] == 1:
             query = ("DELETE FROM `bookinfo` WHERE bid='%s'" % (bid))
             try:
                 cursor.execute(query)
                 cnx.commit()
                 DbUtil.close_db(cursor, cnx)
                 tk.messagebox.showinfo('成功', '删除图书成功')
                 book_d.destroy()
                 build_book(frame_book)
             except:
                 tk.messagebox.showerror('错误', '删除图书失败')
         else:
             tk.messagebox.showerror('错误', '删除图书失败, 该图书编号不存在')
Example #43
0
    def collect_flow_schedules(self, schedule_file):
        # load flow scheduling info from table triggers
        self.logger.info("collect flow schedule")
        schedule_writer = FileWriter(schedule_file)
        query = "select * from triggers"
        self.az_cursor.execute(query)
        rows = DbUtil.dict_cursor(self.az_cursor)
        for row in rows:
            json_column = "data"
            if row[json_column] is not None:
                unzipped_content = gzip.GzipFile(
                    mode="r", fileobj=StringIO.StringIO(row[json_column].tostring())
                ).read()
                try:
                    row[json_column] = json.loads(unzipped_content)
                except Exception as e:
                    # if the blob cannot be parsed, skip this trigger row instead
                    # of indexing into the raw bytes below
                    self.logger.error(e)
                    continue

                if "projectId" not in row[json_column]["actions"][0]["actionJson"]:
                    continue
                # print json.dumps(row[json_column], indent=4)

                if row[json_column]["triggerCondition"]["checkers"][0]["checkerJson"]["isRecurring"] == "true":
                    unit, frequency, cron_expr = None, None, None
                    period = row[json_column]["triggerCondition"]["checkers"][0]["checkerJson"]["period"]
                    if period is not None and period != "null" and period[-1:] in self._period_unit_table:
                        unit = self._period_unit_table[period[-1:]]
                        frequency = int(
                            row[json_column]["triggerCondition"]["checkers"][0]["checkerJson"]["period"][:-1]
                        )
                    if "cronExpression" in row[json_column]["triggerCondition"]["checkers"][0]["checkerJson"]:
                        cron_expr = row[json_column]["triggerCondition"]["checkers"][0]["checkerJson"]["cronExpression"]
                    schedule_record = AzkabanFlowScheduleRecord(
                        self.app_id,
                        row[json_column]["actions"][0]["actionJson"]["projectName"]
                        + ":"
                        + row[json_column]["actions"][0]["actionJson"]["flowName"],
                        unit,
                        frequency,
                        cron_expr,
                        long(row[json_column]["triggerCondition"]["checkers"][0]["checkerJson"]["firstCheckTime"])
                        / 1000,
                        int(time.mktime(datetime.date(2099, 12, 31).timetuple())),
                        "0",
                        self.wh_exec_id,
                    )
                    schedule_writer.append(schedule_record)
        schedule_writer.close()
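The triggers table stores its data column as a gzip-compressed JSON blob, which the loop above decompresses inline. A small helper makes that step reusable; this sketch keeps the Python 2 style (StringIO, tostring()) of the surrounding code and assumes the driver returns the blob as an array type, as above.

import gzip
import json
import StringIO

def load_gzipped_json(blob):
    # Decompress a gzip-compressed BLOB column and parse it as JSON.
    raw = gzip.GzipFile(mode='r', fileobj=StringIO.StringIO(blob.tostring())).read()
    return json.loads(raw)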
Example #44
0
  def get_last_execution_unix_time(self):
    if self.last_execution_unix_time is None:
      try:
        query = """
            SELECT MAX(end_time) as last_time FROM job_execution where app_id = %d
            """
        self.wh_cursor.execute(query % self.app_id)
        rows = DbUtil.dict_cursor(self.wh_cursor)
        if rows:
          for row in rows:
            self.last_execution_unix_time = long(row['last_time'])
            break
      except:
        self.logger.error("Get the last execution time from job_execution failed")
        self.last_execution_unix_time = None

    return self.last_execution_unix_time
Example #45
0
  def get_last_execution_unix_time(self):
    if self.last_execution_unix_time is None:
      try:
        query = """
          SELECT MAX(job_finished_unixtime) as last_time FROM job_execution_data_lineage
          """
        self.wh_cursor.execute(query)
        rows = DbUtil.dict_cursor(self.wh_cursor)
        if rows:
          for row in rows:
            self.last_execution_unix_time = row['last_time']
            break
      except:
        self.logger.error("Get the last execution time from job_execution_data_lineage failed")
        self.last_execution_unix_time = None

    return self.last_execution_unix_time
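The timestamp returned here acts as a watermark: callers typically use it to choose between an incremental extraction and a full look-back. A minimal sketch of that caller pattern; the extractor name and the WHERE clause are illustrative assumptions, not taken from the module.

last_time = extractor.get_last_execution_unix_time()
if last_time is not None:
    time_filter = "WHERE job_finished_unixtime > %d" % last_time   # incremental run
else:
    time_filter = ""                                               # first run: no lower bound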
Example #46
0
  def collect_flow_owners(self, owner_file):
    # load user info from table project_permissions
    self.logger.info("collect owner&permissions")
    user_writer = FileWriter(owner_file)
    query = "select f.flow_id, p.name as project_name, p.version as project_verison, pp.name as owner, pp.permissions, pp.isGroup " \
            "from project_flows f join project_permissions pp on f.project_id = pp.project_id join projects p on f.project_id = p.id where p.active = 1"
    self.az_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.az_cursor)

    for row in rows:
      record = AzkabanFlowOwnerRecord(self.app_id,
                                      row['project_name'] + ':' + row["flow_id"],
                                      row["owner"],
                                      AzkabanPermission(row["permissions"]).toFlatString(),
                                      'GROUP' if row['isGroup'] == 1 else 'LDAP',
                                      self.wh_exec_id)
      user_writer.append(record)
    user_writer.close()
  def get_last_execution_unix_time(self):
    if self.last_execution_unix_time is None:
      try:
        query = """
          SELECT MAX(job_finished_unixtime) as last_time FROM job_execution_data_lineage where app_id = %d
          """
        self.aw_cursor.execute(query % self.app_id)
        rows = DbUtil.dict_cursor(self.aw_cursor)
        if rows:
          for row in rows:
            self.last_execution_unix_time = row['last_time']
            break
      except:
        self.logger.error("Get the last execution time from job_execution_data_lineage failed")
        self.last_execution_unix_time = None

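      # Treat a watermark older than 5 hours as stale; clearing it makes the
      # queries below fall back to the day-based look-back window instead of
      # an incremental window.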
      ts = int(time.time())
      if self.last_execution_unix_time is not None and (ts - self.last_execution_unix_time) > 5*60*60:
        self.logger.info('last execution unix time is:' + str(self.last_execution_unix_time))
        self.last_execution_unix_time = None
    return self.last_execution_unix_time
  def get_log_file_name(self, module_name, days_offset=1):
    if self.last_execution_unix_time:
      query = \
        """select je.*, fj.job_type, fl.flow_path,
           CONCAT('o', je.job_exec_id, '.', LPAD(je.attempt_id, 2, 0)) as log_file_name,
           CONCAT('%s', DATE_FORMAT(FROM_UNIXTIME(je.end_time), '%%Y%%m%%d'), '/',
                  CONCAT('o', je.job_exec_id, '.', LPAD(je.attempt_id, 2, 0)), '.gz') as gzipped_file_name
           from job_execution je
           JOIN flow_job fj on je.app_id = fj.app_id and je.flow_id = fj.flow_id and fj.is_current = 'Y' and
           je.job_id = fj.job_id
           JOIN flow fl on fj.app_id = fl.app_id and fj.flow_id = fl.flow_id
           WHERE je.app_id = %d
           and je.end_time >= %d - 3660
           and fj.job_type = '%s'
           ORDER BY je.flow_exec_id DESC, je.job_exec_id
        """
      self.logger.info(query % (self.aw_archive_dir, self.app_id, long(self.last_execution_unix_time), module_name))
      self.aw_cursor.execute(query %
                             (self.aw_archive_dir, self.app_id, long(self.last_execution_unix_time), module_name))
    else:
      query = \
        """select je.*, fj.job_type, fl.flow_path,
           CONCAT('o', je.job_exec_id, '.', LPAD(je.attempt_id, 2, 0)) as log_file_name,
           CONCAT('%s', DATE_FORMAT(FROM_UNIXTIME(je.end_time), '%%Y%%m%%d'), '/',
                  CONCAT('o', je.job_exec_id, '.', LPAD(je.attempt_id, 2, 0)), '.gz') as gzipped_file_name
           from job_execution je
           JOIN flow_job fj on je.app_id = fj.app_id and je.flow_id = fj.flow_id and fj.is_current = 'Y' and
           je.job_id = fj.job_id
           JOIN flow fl on fj.app_id = fl.app_id and fj.flow_id = fl.flow_id
           WHERE je.app_id = %d and je.job_exec_status in ('FINISHED', 'SUCCEEDED', 'OK')
           and je.end_time >= unix_timestamp(CURRENT_DATE - INTERVAL %d DAY) and fj.job_type = '%s'
           ORDER BY je.flow_exec_id DESC, je.job_exec_id
        """
      self.logger.info(query % (self.aw_archive_dir, self.app_id, int(self.look_back_days), module_name))
      self.aw_cursor.execute(query % (self.aw_archive_dir, self.app_id, int(self.look_back_days), module_name))
    job_rows = DbUtil.copy_dict_cursor(self.aw_cursor)
    self.logger.info("%d job executions will be scanned for lineage" % len(job_rows))

    return job_rows
Example #49
0
  def collect_flow_execs(self, flow_exec_file, lookback_period):
    self.logger.info("collect flow execs")
    flow_exec_writer = FileWriter(flow_exec_file)
    query = "select id, app_name, app_path, unix_timestamp(start_time) as start_time, unix_timestamp(end_time) as end_time, run, status, user_name from WF_JOBS where end_time > now() - INTERVAL %d MINUTE" % (int(lookback_period))
    self.oz_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.oz_cursor)

    for row in rows:
      flow_exec_record = OozieFlowExecRecord(self.app_id,
                                             row['app_name'],
                                             row['app_path'],
                                             row['id'],
                                             row['id'],
                                             row['status'],
                                             row['run'],
                                             row['user_name'],
                                             row['start_time'],
                                             row['end_time'],
                                             self.wh_exec_id)
      flow_exec_writer.append(flow_exec_record)

    flow_exec_writer.close()
Example #50
0
  def collect_flow_owners(self, owner_file):
    self.logger.info("collect owner&permissions")
    timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'"
    self.aw_cursor.execute(timezone)
    schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX"
    self.aw_cursor.execute(schema)
    user_writer = FileWriter(owner_file)
    query = \
        """SELECT DISTINCT J.SO_JOB_SEQ, J.SO_MODULE, J.SO_APPLICATION, U.SO_USER_NAME FROM SO_JOB_TABLE J
             JOIN SO_JOB_HISTORY H ON J.SO_JOB_SEQ = H.SO_JOB_SEQ
             JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
             WHERE J.SO_COMMAND_TYPE = 'CHAIN' """
    self.aw_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.aw_cursor)

    for row in rows:
      record = AppworxFlowOwnerRecord(self.app_id,
                                      row['SO_APPLICATION'] + ':' + row["SO_MODULE"],
                                      row["SO_USER_NAME"],
                                      'EXECUTE',
                                      'GROUP',
                                      self.wh_exec_id)
      user_writer.append(record)
    user_writer.close()
  def process_li_getreplacemergegeneral(self, module_name):
    self.logger.info("process %s" % module_name)
    if module_name not in ['LI_GETREPLACEMERGEGENERAL', 'LINKEDIN_SHELL','LI_WEBHDFS_GET']: return
    parameter_query = \
        """
        SELECT GROUP_CONCAT(param_value,'\x01') args
        FROM   job_parameter
        WHERE  app_id = %d
        AND    job_exec_id = %d
        AND    attempt_number = %d
        ORDER BY param_no
        """
    rows = self.get_log_file_name(module_name=module_name)
    for row in rows:
      try:
        self.logger.info('Parsing log file: %s' % row['gzipped_file_name'])
        if module_name == 'LI_GETREPLACEMERGEGENERAL':
          self.aw_cursor.execute(parameter_query %
                                   (int(row['app_id']), int(row['job_exec_id']), int(row['attempt_id'])))
          arg_values = DbUtil.copy_dict_cursor(self.aw_cursor)
          if arg_values and len(arg_values) > 0:
            args = arg_values[0]['args']

        results = AppworxLogParser(log_file_name = row['gzipped_file_name']).parse_log({}, command_type=module_name)
        if not results or 'cluster' not in results:
          self.logger.info('Skipped parsing %s' % row['gzipped_file_name'])
          continue
        self.logger.info(str(results))
        matched_cluster = re.match(r'(.*)_.*', results['cluster'])
        if matched_cluster is not None:
          results['cluster'] = matched_cluster.group(1)
        db_id = int(self.db_lookup(results['cluster']))
        self.logger.info(str(db_id))
        self.clean_up_staging_lineage_dbs(row['app_id'], row['job_id'], row['job_exec_id'], row['attempt_id'])
        update_source_code_query = \
          """
          INSERT INTO job_attempt_source_code(
            application_id, job_id, attempt_number, script_name, script_path, script_type, created_date)
          VALUES( %d, %d, %d, '%s', '%s', '%s', now())
          """ % (int(row['app_id']), int(row['job_id']), int(row['attempt_id']),
                 results['script_name'], results['script_path'],
                 results['script_type'] if module_name == 'LI_WEBHDFS_GET' else 'Shell')
        update_staging_lineage_query = \
          """
          INSERT IGNORE INTO stg_job_execution_data_lineage (
            app_id, flow_exec_id, job_exec_id, flow_path, job_name, job_start_unixtime, job_finished_unixtime,
            db_id, abstracted_object_name, full_object_name, partition_start, partition_end, partition_type,
            storage_type, source_target_type, srl_no, source_srl_no, operation, record_count, insert_count,
            created_date, wh_etl_exec_id)
            SELECT %d, %d, %d, '%s', '%s', %d, %d, %d, '%s', '%s',
            CASE WHEN '%s' = 'None' THEN NULL ELSE '%s' END,
            CASE WHEN '%s' = 'None' THEN NULL ELSE '%s' END,
            CASE WHEN '%s' = 'None' THEN NULL ELSE '%s' END,
            '%s', '%s', %d,
            CASE WHEN %d = 0 THEN NULL ELSE %d END,
            '%s', %d, %d, UNIX_TIMESTAMP(now()), %d
          FROM dual
          """
        self.aw_cursor.execute(update_source_code_query )
        self.aw_con.commit()

        for k, tab in enumerate(results['table']):
          self.aw_cursor.execute(update_staging_lineage_query %
                                (int(row['app_id']), int(row['flow_exec_id']), int(row['job_exec_id']),
                                 row['flow_path'], row['job_name'], int(row['start_time']), int(row['end_time']),
                                 db_id if tab['table_type'] == 'source' else 0, tab['abstracted_path'], tab['full_path'],
                                 tab['start_partition'], tab['start_partition'],
                                 tab['end_partition'] if tab['frequency'] != 'snapshot' else None,
                                 tab['end_partition'] if tab['frequency'] != 'snapshot' else None,
                                 tab['frequency'], tab['frequency'], tab['storage_type'],
                                 tab['table_type'], k + 1, 0, 0,
                                 'Hadoop Get' if tab['table_type'] == 'source' else 'Move',
                                0, 0, int(row['wh_etl_exec_id']) ))
        self.aw_con.commit()
        self.logger.debug('Completed processing metadata for log file: %s' % row['gzipped_file_name'])
      except:
        self.logger.error(str(sys.exc_info()[0]))
  def process_li_shell_gw(self):
    self.logger.info("process li_shell_gw")
    parameter_query = \
        """
        SELECT param_value
        FROM   job_parameter
        WHERE  app_id = %d
        AND    job_exec_id = %d
        AND    attempt_number = %d
        ORDER BY param_no
        """

    rows = self.get_log_file_name(module_name='LI_SHELL_GW')
    self.logger.info(str(len(rows)))
    for row in rows:
      self.aw_cursor.execute(parameter_query %
                                         (int(row['app_id']), int(row['job_exec_id']), int(row['attempt_id'])))
      arg_values = DbUtil.copy_dict_cursor(self.aw_cursor)
      if arg_values and len(arg_values) > 0 and 'param_value' in arg_values[0]:
        args = arg_values[0]['param_value'].replace('"','')
        m = re.match("(.*?/trim.sh)\s+\d+\s+\d+", args)
        if m is not None:
          shell_script_name = os.path.basename(m.group(1))
          shell_script_path = os.path.dirname(m.group(1))
          app_path = shell_script_path.replace(self.remote_hadoop_script_dir,'')
          local_shell_script_path = self.local_script_path + self.remote_script_path + app_path
          self.logger.info(local_shell_script_path + '/' + shell_script_name)
          shell_script_content = open(local_shell_script_path + '/' + shell_script_name).read()
          pig_script = re.search(r'pig\s+\-f\s+(.*?)\s+',shell_script_content)
          if pig_script is not None:
            pig_script_name = os.path.basename(pig_script.group(1))
            pig_script_path = os.path.dirname(pig_script.group(1))
            # Compare md5 string of the file with the one parsed last time
            md5_str = hashlib.md5()
            md5_str.update(shell_script_content)

            self.logger.info('Parsing log file: %s' % row['gzipped_file_name'])
            src_tgt_map = PigLogParser(log_file_name=row['gzipped_file_name']).simple_parser()
            if not src_tgt_map:
              self.logger.warn('Pig log parser fails to retrieve relevant information from file: %s. Most probably Pig script did not complete successfully' % row['gzipped_file_name'])
              continue
            self.logger.info(str(src_tgt_map))
            db_id = self.db_lookup(src_tgt_map['cluster'])
            self.clean_up_staging_lineage_dbs(row['app_id'], row['job_id'], row['job_exec_id'], row['attempt_id'])
            update_source_code_query = \
                """
                INSERT INTO job_attempt_source_code(
                  application_id, job_id, attempt_number, script_name, script_path, script_type, created_date)
                VALUES( %d, %d, %d, '%s', '%s', '%s', now())
                """ % (int(row['app_id']), int(row['job_id']), int(row['attempt_id']),
                       pig_script_name, pig_script_path, 'Pig')
            self.aw_cursor.execute(update_source_code_query )
            update_staging_lineage_query = \
                """
                INSERT IGNORE INTO stg_job_execution_data_lineage (
                  app_id, flow_exec_id, job_exec_id, flow_path, job_name, job_start_unixtime, job_finished_unixtime,
                  db_id, abstracted_object_name, full_object_name, partition_start, partition_end, partition_type,
                  storage_type, source_target_type, srl_no, source_srl_no, operation, record_count, insert_count,
                  created_date, wh_etl_exec_id)
                  SELECT %d, %d, %d, '%s', '%s', %d, %d, %d, '%s', '%s',
                  CASE WHEN '%s' = 'None' THEN NULL ELSE '%s' END,
                  CASE WHEN '%s' = 'None' THEN NULL ELSE '%s' END,
                  CASE WHEN '%s' = 'None' THEN NULL ELSE '%s' END,
                  '%s', '%s', %d,
                  CASE WHEN %d = 0 THEN NULL ELSE %d END,
                  '%s', %d, %d, UNIX_TIMESTAMP(now()), %d
                FROM dual
                """
            srl = 0
            detail_srl = 1
            for k,v in src_tgt_map.items():
              if k in ['cluster'] or k in ['hive-cluster']: continue
              srl = srl + 1
              self.aw_cursor.execute(update_staging_lineage_query %
                                     (int(row['app_id']), int(row['flow_exec_id']), int(row['job_exec_id']),
                                      row['flow_path'], row['job_name'], int(row['start_time']), int(row['end_time']),
                                      db_id, src_tgt_map[k]['abstracted_hdfs_path'], None,
                                      src_tgt_map[k]['min_start_partition'], src_tgt_map[k]['min_start_partition'],
                                      src_tgt_map[k]['max_end_partition'] if src_tgt_map[k]['table_type'] != 'snapshot' else None,
                                      src_tgt_map[k]['max_end_partition'] if src_tgt_map[k]['table_type'] != 'snapshot' else None,
                                      src_tgt_map[k]['partition_type'], src_tgt_map[k]['partition_type'], 'HDFS',
                                      src_tgt_map[k]['table_type'], int(srl), 0, 0,
                                      'Load' if src_tgt_map[k]['table_type'] == 'source' else 'Store',
                                      0, 0, int(row['wh_etl_exec_id']) ))


              srl = srl + 1
            self.aw_con.commit()
            self.logger.debug('Completed writing metadata for: %s' % row['gzipped_file_name'])
          else:
            self.logger.error("Fails to get Pig script file name used in GW shell script: %s" % m.group(1))
Example #53
0
  def run(self):
    sql = """
        SELECT a.application_id, a.job_id, a.attempt_number, a.script_path, a.script_type, a.script_name,
			fl.flow_name as chain_name, fj.job_name,
			CASE WHEN e.dir_short_path IS NOT NULL THEN CASE
			WHEN LOCATE('/scripts',a.script_path) > 0 THEN CONCAT('https://gitli.corp.linkedin.com',
			CONCAT(CONCAT(REPLACE(e.dir_short_path,'/code_base_','/source/code_base_'),
			SUBSTR(a.script_path,LOCATE('/scripts',a.script_path)) ),
			CONCAT('/',a.script_name) )) WHEN LOCATE('/sql',a.script_path) > 0 THEN
			CONCAT('https://gitli.corp.linkedin.com',
			CONCAT(CONCAT(REPLACE(e.dir_short_path,'/code_base_','/source/code_base_'),
			SUBSTR(a.script_path,LOCATE('/sql',a.script_path))),
			CONCAT('/',a.script_name))) END ELSE a.script_name END script_url
			FROM job_attempt_source_code a
			JOIN flow fl ON fl.app_id = a.application_id
			JOIN flow_job fj ON a.application_id = fj.app_id and fl.flow_id = fj.flow_id and a.job_id = fj.job_id
			LEFT OUTER JOIN cfg_file_system_dir_map d
			ON (d.directory_path =
			CASE WHEN LOCATE('/scripts',a.script_path) > 0 THEN
			SUBSTR(a.script_path,1,LOCATE('/scripts',a.script_path) - 1)
			WHEN LOCATE('/sql',a.script_path) > 0 THEN
			SUBSTR(a.script_path,1,LOCATE('/sql',a.script_path) - 1) END)
			LEFT OUTER JOIN cfg_scm_repository_dir e ON (d.map_to_object_id = e.scm_repo_dir_id)
        """

    git_committer = """
        SELECT git_path, commit_time, committer_name, committer_email, ui.user_id
		FROM ( SELECT CONCAT(CONCAT(CONCAT(CONCAT(repository_base_name,'/'),
		CONCAT(module_name,'/source/')),file_path), file_name) as git_path, committer_name, commit_time,
		committer_email FROM source_code_repository_info ) a
        LEFT JOIN dir_external_user_info ui on a.committer_name = ui.full_name
        WHERE a.git_path = '%s' GROUP BY committer_name
        """
    insert_sql = """
        INSERT IGNORE INTO job_execution_script (
        app_id, job_id, script_name, script_path, script_type, chain_name, job_name,
        committer_name, committer_email, committer_ldap, commit_time, script_url)
        SELECT %d, %d, '%s', '%s', '%s', '%s', '%s',
         '%s', '%s', '%s', '%s', '%s'
         FROM dual
         """
    try:
      self.logger.info('script collect')
      if self.last_execution_unix_time:
        sql += ' WHERE a.created_date >= from_unixtime(%d) - INTERVAL 1 DAY '% long(self.last_execution_unix_time)
      else:
        sql += ' WHERE a.created_date >= CURRENT_DATE - INTERVAL %d DAY '% long(self.look_back_days)
      self.logger.info(sql)
      self.wh_cursor.execute(sql)
      rows = DbUtil.copy_dict_cursor(self.wh_cursor)
      for row in rows:
        git_url = row['script_url'].replace('https://', 'git://')
        self.wh_cursor.execute(git_committer % git_url)
        git_rows = DbUtil.copy_dict_cursor(self.wh_cursor)
        if git_rows and len(git_rows) > 0:
          for git_row in git_rows:
            self.wh_cursor.execute(insert_sql %
                                 (int(row['application_id']), int(row['job_id']), row['script_name'],
                                  row['script_path'], row['script_type'], row['chain_name'], row['job_name'],
                                  git_row['committer_name'], git_row['committer_email'],
                                  git_row['user_id'] if git_row['user_id'] else git_row['committer_name'],
                                  git_row['commit_time'], row['script_url'] ))

        else:
          self.logger.info("git rows size is 0")
          self.logger.info(row['script_name'])
          self.wh_cursor.execute(insert_sql %
                               (int(row['application_id']), int(row['job_id']), row['script_name'],
                                row['script_path'], row['script_type'], row['chain_name'], row['job_name'],
                                "", "", "",
                                "", row['script_url'] ))
        self.wh_con.commit()
    finally:
      self.wh_cursor.close()
      self.wh_con.close()
Example #54
0
  def collect_flow_jobs(self, flow_file, job_file, dag_file):
    self.logger.info("collect flow&jobs [last_execution_unix_time=%s lookback_period=%s]"
                     % (self.last_execution_unix_time, self.lookback_period))
    timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'"
    self.aw_cursor.execute(timezone)
    schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX"
    self.aw_cursor.execute(schema)
    if self.last_execution_unix_time:
        time_filter = "(DATE '1970-01-01' - INTERVAL '8' HOUR) + (%d - 3600) / 86400" % long(self.last_execution_unix_time)
    else:
        time_filter = "SYSDATE - %d" % int(self.lookback_period)
    flow_query = \
        """SELECT J.SO_JOB_SEQ, J.SO_APPLICATION, J.SO_MODULE, R.LAST_CHAIN_ID
           FROM SO_JOB_TABLE J JOIN (
           SELECT SO_JOB_SEQ, MAX(SO_CHAIN_ID) as LAST_CHAIN_ID
           FROM
           ( SELECT SO_JOB_SEQ, SO_CHAIN_ID FROM SO_JOB_HISTORY
             WHERE SO_JOB_FINISHED >= %s
               AND SO_CHILD_COUNT > 0
             UNION ALL
             SELECT SO_JOB_SEQ, SO_CHAIN_ID FROM SO_JOB_QUEUE
             WHERE SO_STATUS_NAME IN ('INITIATED', 'RUNNING', 'FINISHED')
               AND SO_CHILD_COUNT > 0
           )
           GROUP BY SO_JOB_SEQ
           ) R ON J.SO_JOB_SEQ = R.SO_JOB_SEQ
           WHERE SO_COMMAND_TYPE = 'CHAIN'
           ORDER BY 2,3
        """ % time_filter
    job_query = \
        """SELECT d.SO_TASK_NAME, d.SO_CHAIN_ORDER, d.SO_PREDECESSORS as PREDECESSORS, d.SO_DET_SEQ as JOB_ID,
            t.* FROM SO_CHAIN_DETAIL d
            JOIN SO_JOB_TABLE t ON d.SO_JOB_SEQ = t.SO_JOB_SEQ
            WHERE d.SO_CHAIN_SEQ = %d
            ORDER BY d.SO_CHAIN_ORDER
        """
    self.aw_cursor.execute(flow_query)
    rows = DbUtil.dict_cursor(self.aw_cursor)
    flow_writer = FileWriter(flow_file)
    job_writer = FileWriter(job_file)
    dag_writer = FileWriter(dag_file)
    row_count = 0

    for row in rows:

      flow_path = row['SO_APPLICATION'] + ":" + row['SO_MODULE']

      flow_record = AppworxFlowRecord(self.app_id,
                                      long(row['SO_JOB_SEQ']),
                                      row['SO_MODULE'],
                                      row['SO_APPLICATION'],
                                      flow_path,
                                      0,
                                      0,
                                      0,
                                      'Y',
                                      self.wh_exec_id)
      flow_writer.append(flow_record)
      new_appworx_cursor = self.aw_con.cursor()
      new_appworx_cursor.execute(job_query % row['SO_JOB_SEQ'])
      job_rows = DbUtil.dict_cursor(new_appworx_cursor)
      for job in job_rows:
        job_record = AppworxJobRecord(self.app_id,
                                      long(row['SO_JOB_SEQ']),
                                      flow_path,
                                      0,
                                      long(job['JOB_ID']),
                                      job['SO_TASK_NAME'],
                                      flow_path + '/' + job['SO_TASK_NAME'],
                                      job['SO_MODULE'],
                                      'Y',
                                      self.wh_exec_id)
        command_type = job['SO_COMMAND_TYPE']
        if command_type and command_type == 'CHAIN':
          job_record.setRefFlowPath(job['SO_APPLICATION'] + ":" + job['SO_MODULE'])
          job_record.setJobType('CHAIN')

        job_writer.append(job_record)

        predecessors_str = job['PREDECESSORS']
        if predecessors_str:
          predecessors = re.findall(r"\&\/(.+?)\s\=\sS", predecessors_str)
          if predecessors:
            for predecessor in predecessors:
              dag_edge = AppworxFlowDagRecord(self.app_id,
                                             long(row['SO_JOB_SEQ']),
                                             flow_path,
                                             0,
                                             flow_path + '/' + predecessor,
                                             flow_path + '/' + job['SO_TASK_NAME'],
                                             self.wh_exec_id)
              dag_writer.append(dag_edge)
      row_count += 1

      if row_count % 1000 == 0:
        flow_writer.flush()
        job_writer.flush()
        dag_writer.flush()

    flow_writer.close()
    job_writer.close()
    dag_writer.close()
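The PREDECESSORS column is mined with re.findall(r"\&\/(.+?)\s\=\sS", ...). A short illustration of what that extraction yields on a made-up condition string; the input format is inferred from the regex, not from documented Appworx output.

import re

predecessors_str = "&/LOAD_STAGING = SUCCESS AND &/BUILD_DIMENSIONS = SUCCESS"   # made-up example
print(re.findall(r"\&\/(.+?)\s\=\sS", predecessors_str))
# -> ['LOAD_STAGING', 'BUILD_DIMENSIONS']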
Example #55
0
def index():
    return render_template("index.html", title_list=DbUtil.load_blog_titles())
Example #56
0
  def collect_flow_execs(self, flow_exec_file, job_exec_file, look_back_period):
    self.logger.info("collect flow&job executions [last_execution_unix_time=%s lookback_period=%s]"
                     % (self.last_execution_unix_time, self.lookback_period))
    flow_exec_writer = FileWriter(flow_exec_file)
    job_exec_writer = FileWriter(job_exec_file)
    timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'"
    self.aw_cursor.execute(timezone)
    schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX"
    self.aw_cursor.execute(schema)
    flow_id_list = []
    if self.last_execution_unix_time:
      flow_cmd = \
        """SELECT J.SO_JOB_SEQ, J.SO_MODULE, J.SO_APPLICATION, H.SO_STATUS_NAME, H.SO_JOBID, H.SO_CHAIN_ID,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_STARTED,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_FINISHED,
           U.SO_USER_NAME FROM SO_JOB_TABLE J
           JOIN (
             SELECT * FROM SO_JOB_HISTORY WHERE SO_JOB_FINISHED >= DATE '1970-01-01' - interval '8' hour + (%d - 3600) / 86400
                                            AND SO_CHILD_COUNT > 0
             UNION ALL
             SELECT * FROM SO_JOB_QUEUE WHERE SO_STATUS_NAME IN ('INITIATED', 'RUNNING', 'FINISHED')
                                          AND SO_CHILD_COUNT > 0
           ) H ON J.SO_JOB_SEQ = H.SO_JOB_SEQ
           LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
           WHERE J.SO_COMMAND_TYPE = 'CHAIN' """ % long(self.last_execution_unix_time)
    else:
      flow_cmd = \
        """SELECT J.SO_JOB_SEQ, J.SO_MODULE, J.SO_APPLICATION, H.SO_STATUS_NAME, H.SO_JOBID, H.SO_CHAIN_ID,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_STARTED,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_FINISHED,
           U.SO_USER_NAME FROM SO_JOB_TABLE J
           JOIN (
             SELECT * FROM SO_JOB_HISTORY WHERE SO_JOB_FINISHED >= SYSDATE - %d
                                            AND SO_CHILD_COUNT > 0
             UNION ALL
             SELECT * FROM SO_JOB_QUEUE WHERE SO_STATUS_NAME IN ('INITIATED', 'RUNNING', 'FINISHED')
                                          AND SO_CHILD_COUNT > 0
           ) H ON J.SO_JOB_SEQ = H.SO_JOB_SEQ
           LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
           WHERE J.SO_COMMAND_TYPE = 'CHAIN' """ % int(self.lookback_period)

    ''' SO_CHAIN_ID = :flow_exec_id will find all job executions under the top level flow

        select SO_EXECUTE_ORDER, SO_JOBID, SO_PARENTS_JOBID, SO_DIRECT_PARENT_JOBID, SO_CHAIN_ID
        from so_job_history where SO_JOBID = SO_CHAIN_ID or SO_PARENTS_JOBID <> SO_CHAIN_ID
    '''
    if self.last_execution_unix_time:
      job_cmd = \
        """SELECT D.SO_TASK_NAME, U.SO_USER_NAME, H.SO_STATUS_NAME, H.SO_JOBID, D.SO_DET_SEQ as JOB_ID,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_STARTED,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_FINISHED
           FROM SO_JOB_HISTORY H
           JOIN SO_CHAIN_DETAIL D ON D.SO_CHAIN_SEQ = H.SO_CHAIN_SEQ AND D.SO_DET_SEQ = H.SO_DET_SEQ
           LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
           WHERE --H.SO_JOB_FINISHED >= DATE '1970-01-01' - interval '8' hour + (%d - 3600) / 86400) and
           H.SO_CHAIN_ID = %d"""
    else:
      job_cmd = \
        """SELECT D.SO_TASK_NAME, U.SO_USER_NAME, H.SO_STATUS_NAME, H.SO_JOBID, D.SO_DET_SEQ as JOB_ID,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_STARTED,
           ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
           to_date('01-JAN-1970','DD-MON-YYYY'))* (86400)) as JOB_FINISHED
           FROM SO_JOB_HISTORY H
           JOIN SO_CHAIN_DETAIL D ON D.SO_CHAIN_SEQ = H.SO_CHAIN_SEQ AND D.SO_DET_SEQ = H.SO_DET_SEQ
           LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
           WHERE H.SO_JOB_FINISHED >= SYSDATE - %d and
           H.SO_CHAIN_ID = %d"""

    try:
      self.aw_cursor.execute(flow_cmd)
    except Exception as e:
      self.logger.error(str(e) + "\n" + flow_cmd)

    rows = DbUtil.dict_cursor(self.aw_cursor)
    row_count = 0
    for row in rows:
      flow_path = row['SO_APPLICATION'] + ":" + row['SO_MODULE']
      so_flow_id = row['SO_JOBID']
      flow_attempt = 0
      flow_exec_id = 0
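      # SO_JOBID appears to encode the attempt number in its fractional part:
      # the integer part is used as flow_exec_id and the fraction, scaled by
      # 100, as the attempt (float subtraction can introduce rounding here).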
      try:
        flow_attempt = int(float(str(so_flow_id - int(so_flow_id))[1:])*100)
        flow_exec_id = int(so_flow_id)
      except Exception as e:
        self.logger.error(e)
      self.logger.debug("processing flow_exec_id: %8d" % flow_exec_id)

      flow_exec_record = AppworxFlowExecRecord(self.app_id,
                                               long(row['SO_JOB_SEQ']),
                                               row['SO_MODULE'],
                                               flow_path,
                                               0,
                                               flow_exec_id,
                                               row['SO_STATUS_NAME'],
                                               flow_attempt,
                                               row['SO_USER_NAME'] if row['SO_USER_NAME'] else '',
                                               long(row['JOB_STARTED']),
                                               long(row['JOB_FINISHED'] if row['JOB_FINISHED'] else 0),
                                               self.wh_exec_id)
      flow_exec_writer.append(flow_exec_record)

      new_appworx_cursor = self.aw_con.cursor()
      if self.last_execution_unix_time:
        new_appworx_cursor.execute(job_cmd % (long(self.last_execution_unix_time), flow_exec_id))
      else:
        new_appworx_cursor.execute(job_cmd % (int(self.lookback_period), flow_exec_id))
      job_rows = DbUtil.dict_cursor(new_appworx_cursor)

      for job in job_rows:
        so_job_id = job['SO_JOBID']
        job_attempt = 0
        job_exec_id = 0
        try:
          job_attempt = int(float(str(so_job_id - int(so_job_id))[1:])*100)
          job_exec_id = int(so_job_id)
        except Exception as e:
          self.logger.error(e)

        job_exec_record = AppworxJobExecRecord(self.app_id,
                                               long(row['SO_JOB_SEQ']),
                                               flow_path,
                                               0,
                                               flow_exec_id,
                                               long(job['JOB_ID']),
                                               job['SO_TASK_NAME'],
                                               flow_path + "/" + job['SO_TASK_NAME'],
                                               job_exec_id,
                                               job['SO_STATUS_NAME'],
                                               job_attempt,
                                               long(job['JOB_STARTED']),
                                               long(job['JOB_FINISHED']),
                                               self.wh_exec_id)

        job_exec_writer.append(job_exec_record)
        row_count += 1
      if row_count % 10000 == 0:
        flow_exec_writer.flush()
        job_exec_writer.flush()

    flow_exec_writer.close()
    job_exec_writer.close()
  def process_li_bteq(self):
    self.logger.info("process li bteq")
    bteq_rows = self.get_log_file_name(module_name='LI_BTEQ')
    kfk_rows = self.get_log_file_name(module_name='KFK_SONORA_STG_TO_FACT')
    rows = bteq_rows + kfk_rows

    parameter_query = \
        """
        SELECT param_value
        FROM   job_parameter
        WHERE  app_id = %d
        AND    job_exec_id = %d
        AND    attempt_number = %d
        ORDER BY param_no
        """
    check_parsed_source_code_query = \
        """
        SELECT application_id, job_id, attempt_number
        FROM   job_attempt_source_code
        WHERE  script_name = '%s'
        AND    script_path = '%s'
        AND    script_md5_sum = CASE WHEN '%s' = 'None' THEN NULL ELSE UNHEX('%s') END
        AND    application_id = %d
        ORDER BY job_id DESC
        LIMIT 1
        """
    check_parsed_lineage_query = \
        """
        SELECT *
        FROM   stg_job_execution_data_lineage
        WHERE  app_id = %d
        AND    job_exec_id = %d
        """
    update_staging_lineage_query = \
        """
        INSERT IGNORE INTO stg_job_execution_data_lineage (
          app_id, flow_exec_id, job_exec_id, flow_path, job_name, job_start_unixtime, job_finished_unixtime,
          db_id, abstracted_object_name, full_object_name, partition_start, partition_end, partition_type,
          storage_type, source_target_type, srl_no, source_srl_no, operation, record_count, insert_count,
          created_date, wh_etl_exec_id)
          SELECT %d, %d, %d, '%s', '%s', %d, %d, %d, '%s', '%s',
          CASE WHEN '%s' = 'None' THEN NULL ELSE '%s' END,
          CASE WHEN '%s' = 'None' THEN NULL ELSE '%s' END,
          CASE WHEN '%s' = 'None' THEN NULL ELSE '%s' END,
          '%s', '%s', %d,
          CASE WHEN %d = 0 THEN NULL ELSE %d END,
          '%s', %d, %d, UNIX_TIMESTAMP(now()), %d
        FROM dual
        """
    for row in rows:
      try:
        self.logger.info('Parsing log file: %s' % row['gzipped_file_name'])
        self.aw_cursor.execute(parameter_query %
                             (int(row['app_id']), int(row['job_exec_id']), int(row['attempt_id'])))
        params = DbUtil.copy_dict_cursor(self.aw_cursor)
        key_values = {}
        for param in params:
          ParseUtil.value_from_cmd_line(param['param_value'], key_values)
        analyzed_script = False
        results = AppworxLogParser(log_file_name = row['gzipped_file_name']).parse_log({}, command_type='LI_BTEQ')
        if not results:
          self.logger.info('Skipped parsing %s' % row['gzipped_file_name'])
          continue
        self.logger.info(str(results))
        self.logger.info('Completed parsing log file: %s' % row['gzipped_file_name'])

        # Compare md5 string of the file with the one parsed last time
        md5_str = hashlib.md5()
        try:
          md5_str.update(open(self.local_script_path +
                            results['script_path'] + '/' +
                            results['script_name']).read())
        except IOError:
          self.logger.warn("Fails to find script file: %s/%s. Skipping the file" % (results['script_path'], results['script_name']))
          continue

        self.aw_cursor.execute(check_parsed_source_code_query %
                             (results['script_name'],
                              results['script_path'],
                              md5_str,
                              md5_str.hexdigest(),
                              int(row['app_id'])))
        parsed_scripts = DbUtil.copy_dict_cursor(self.aw_cursor)
        if len(parsed_scripts) != 0:
          self.aw_cursor.execute(check_parsed_lineage_query %
                               (int(row['app_id']),
                                int(row['job_exec_id'])))

          parsed_lineage = DbUtil.copy_dict_cursor(self.aw_cursor)
          if len(parsed_lineage) == 0 or self.skip_already_parsed == 'N':
            analyzed_script = False
          else:
            self.logger.debug("%s/%s has already been analyzed. Skipping..." %
                            (results['script_path'], results['script_name']))
            analyzed_script = True
        self.clean_up_staging_lineage_dbs(row['app_id'], row['job_id'], row['job_exec_id'], row['attempt_id'])
        update_source_code_query = \
          """
          INSERT INTO job_attempt_source_code(
            application_id, job_id, attempt_number, script_name, script_path, script_type, created_date, script_md5_sum)
          VALUES( %d, %d, %d, '%s', '%s', '%s', now(), CASE WHEN '%s' = 'None' THEN NULL ELSE UNHEX('%s') END)
          """ % (int(row['app_id']), int(row['job_id']), int(row['attempt_id']),
                 results['script_name'], results['script_path'],
                 'SQL', md5_str, md5_str.hexdigest())
        self.aw_cursor.execute(update_source_code_query )
        self.aw_con.commit()

        db_id = 0
        try:
          db_id = self.db_lookup(results['host'])
        except KeyError:
          self.logger.error(sys.exc_info()[0])
        bteq_load_srl = 1
        if not analyzed_script:
          if 'table' in results:
            schema_name = ''
            if 'full_path' in results['table'][0] and results['table'][0]['full_path']:
              index = results['table'][0]['full_path'].find('.')
              if index != -1:
                schema_name = results['table'][0]['full_path'][0:index]
            elif 'schema_name' in results['table'][0] and results['table'][0]['schema_name']:
              schema_name = results['table'][0]['schema_name']
            self.aw_cursor.execute(update_staging_lineage_query %
                                 (int(row['app_id']), int(row['flow_exec_id']), int(row['job_exec_id']),
                                  row['flow_path'], row['job_name'], int(row['start_time']), int(row['end_time']), 0,
                                  ('/' + schema_name + '/' + results['table'][0]['abstracted_path']) \
                                      if schema_name else results['table'][0]['abstracted_path'], results['table'][0]['full_path'],
                                  None, None, None, None, None, None, results['table'][0]['storage_type'],
                                  results['table'][0]['table_type'], bteq_load_srl, 0, 0,
                                  'Read',
                                  0, 0, int(row['wh_etl_exec_id']) ))
            bteq_load_srl = bteq_load_srl + 1
            schema_name = ''
            if 'full_path' in results['table'][1] and results['table'][1]['full_path']:
              full_table_name = results['table'][1]['full_path']
              index = full_table_name.find('.')
              if index != -1:
                schema_name = full_table_name[0:index]
            elif 'schema_name' in results['table'][1] and results['table'][1]['schema_name']:
              full_table_name = results['table'][1]['schema_name'] + '.' + results['table'][1]['table_name']
              schema_name = results['table'][1]['schema_name']
            else:
              full_table_name = results['table'][1]['table_name']
            self.aw_cursor.execute(update_staging_lineage_query %
                                 (int(row['app_id']), int(row['flow_exec_id']), int(row['job_exec_id']),
                                  row['flow_path'], row['job_name'], int(row['start_time']), int(row['end_time']), db_id,
                                  ('/' + schema_name + '/' + results['table'][1]['table_name']) \
                                    if schema_name else results['table'][1]['table_name'],
                                  full_table_name, None, None, None, None, None, None, 'Teradata',
                                  results['table'][0]['table_type'], bteq_load_srl, 0, 0,
                                  'Load',
                                  0, 0, int(row['wh_etl_exec_id']) ))
            self.aw_con.commit()
          self.logger.info("Parsing script: %s/%s" % (results['script_path'], results['script_name']))


          entries = BteqLogParser().parse(
              key_values,
              self.local_script_path +
              results['script_path'] + '/' +
              results['script_name'],
              results['script_path'],
              results['script_name'], self.bteq_source_target_override, self.metric_override
            )
          metric_idx = 1
          for srl, e in enumerate(entries):
            schema_name, table_name = ParseUtil.get_db_table_abstraction(e['relation'])
            full_table_name = schema_name + '.' + table_name
            self.aw_cursor.execute(update_staging_lineage_query %
                                   (int(row['app_id']), int(row['flow_exec_id']), int(row['job_exec_id']),
                                    row['flow_path'], row['job_name'], int(row['start_time']), int(row['end_time']),
                                    db_id, '/' + schema_name + '/' + table_name, full_table_name,
                                    None, None, None, None, None, None, e['storage_type'],
                                    e['table_type'], (bteq_load_srl + srl + 1), 0, 0,
                                    e['operation'],
                                    0, 0, int(row['wh_etl_exec_id']) ))
          self.aw_con.commit()
        else:
          for p in parsed_lineage:
            p['database_id'] = db_id
            self.aw_cursor.execute(update_staging_lineage_query %
                                 (int(row['app_id']), int(row['flow_exec_id']), int(row['job_exec_id']),
                                  row['flow_path'], row['job_name'], int(row['start_time']), int(row['end_time']),
                                  p['database_id'], p['abstracted_object_name'], p['full_object_name'],
                                  p['partition_start'], p['partition_start'], p['partition_end'], p['partition_end'],
                                  p['partition_type'], p['partition_type'], p['storage_type'],
                                  p['source_target_type'], p['srl_no'], 0, 0,
                                  p['operation'],
                                  0, 0, int(row['wh_etl_exec_id']) ))
          self.aw_con.commit()
        self.logger.info('Completed processing metadata for log file: %s' % row['gzipped_file_name'])
      except:
        self.logger.error(str(sys.exc_info()[0]))
Example #58
0
def blogpage(title):
    blog = DbUtil.load_blog(title)
    return render_template("blog.html", blog=blog)
Example #59
0
  def collect_flow_jobs(self, flow_file, job_file, dag_file):
    self.logger.info("collect flow&jobs")
    query = "SELECT distinct f.*, p.name as project_name FROM  project_flows f inner join projects p on f.project_id = p.id and f.version = p.version where p.active = 1"
    self.az_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.az_cursor)
    flow_writer = FileWriter(flow_file)
    job_writer = FileWriter(job_file)
    dag_writer = FileWriter(dag_file)
    row_count = 0

    for row in rows:
      row['version'] = 0 if (row["version"] is None) else row["version"]

      json_column = 'json'
      unzipped_content = gzip.GzipFile(mode='r', fileobj=StringIO.StringIO(row[json_column].tostring())).read()
      try:
        row[json_column] = json.loads(unzipped_content)
      except:
        pass

      flow_path = row['project_name'] + ":" + row['flow_id']

      flow_record = AzkabanFlowRecord(self.app_id,
                                      row['flow_id'],
                                      row['project_name'],
                                      flow_path,
                                      0,
                                      row['modified_time'] / 1000,
                                      row["version"],
                                      'Y',
                                      self.wh_exec_id)
      flow_writer.append(flow_record)

      # get flow jobs
      nodes = row[json_column]['nodes']
      for node in nodes:
        job_record = AzkabanJobRecord(self.app_id,
                                      flow_path,
                                      row["version"],
                                      node['id'],
                                      flow_path + '/' + node['id'],
                                      node['jobType'],
                                      'Y',
                                      self.wh_exec_id)
        if node['jobType'] == 'flow':
          job_record.setRefFlowPath(row['project_name'] + ":" + node['embeddedFlowId'])
        job_writer.append(job_record)

      # job dag
      edges = row[json_column]['edges']
      for edge in edges:
        dag_edge = AzkabanFlowDagRecord(self.app_id,
                                        flow_path,
                                        row['version'],
                                        flow_path + '/' + edge['source'],
                                        flow_path + '/' + edge['target'],
                                        self.wh_exec_id)
        dag_writer.append(dag_edge)

      row_count += 1

      if row_count % 1000 == 0:
        flow_writer.flush()
        job_writer.flush()
        dag_writer.flush()

    flow_writer.close()
    job_writer.close()
    dag_writer.close()