def collect_flow_execs(self, flow_exec_file, job_exec_file, look_back_period):
    self.logger.info("collect flow&job executions")
    flow_exec_writer = FileWriter(flow_exec_file)
    job_exec_writer = FileWriter(job_exec_file)
    cmd = """select * from execution_flows
             where end_time > UNIX_TIMESTAMP(now() - INTERVAL %d MINUTE) * 1000 """ % (int(look_back_period))
    self.az_cursor.execute(cmd)
    rows = DbUtil.dict_cursor(self.az_cursor)
    row_count = 0

    for row in rows:
        json_column = 'flow_data'
        unzipped_content = gzip.GzipFile(mode='r', fileobj=StringIO.StringIO(row[json_column].tostring())).read()
        try:
            row[json_column] = json.loads(unzipped_content)
        except Exception as e:
            # skip the row: everything below needs the decoded JSON document
            self.logger.error(e)
            continue

        flow_data = row[json_column]
        flow_path = flow_data['projectName'] + ":" + flow_data['flowId']

        flow_exec_record = AzkabanFlowExecRecord(self.app_id,
                                                 flow_data['flowId'],
                                                 flow_path,
                                                 row['version'],
                                                 row['exec_id'],
                                                 flow_data['status'],
                                                 flow_data['attempt'],
                                                 row['submit_user'],
                                                 long(row['start_time']) / 1000,
                                                 long(row['end_time']) / 1000,
                                                 self.wh_exec_id)
        flow_exec_writer.append(flow_exec_record)

        nodes = flow_data['nodes']
        job_exec_records = []
        for node in nodes:
            job_exec_record = AzkabanJobExecRecord(self.app_id,
                                                   flow_path,
                                                   row['version'],
                                                   row['exec_id'],
                                                   node['id'],
                                                   flow_path + "/" + node['id'],
                                                   None,
                                                   node['status'],
                                                   node['attempt'],
                                                   long(node['startTime']) / 1000,
                                                   long(node['endTime']) / 1000,
                                                   self.wh_exec_id)
            job_exec_records.append(job_exec_record)

        AzkabanJobExecUtil.sortAndSet(job_exec_records)
        for r in job_exec_records:
            job_exec_writer.append(r)

        row_count += 1
        if row_count % 10000 == 0:
            flow_exec_writer.flush()
            job_exec_writer.flush()

    flow_exec_writer.close()
    job_exec_writer.close()
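# Hedged sketch (hypothetical helper, not in the original script): the decode
# step the collector above inlines for Azkaban's execution_flows.flow_data
# BLOB, pulled out on its own.  Python 2 / Jython, matching the collectors.
import gzip
import json
import StringIO

def decode_flow_data(blob_bytes):
    # flow_data holds a gzip-compressed JSON document describing the execution
    raw = gzip.GzipFile(mode='r', fileobj=StringIO.StringIO(blob_bytes)).read()
    return json.loads(raw)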
def collect_flow_execs(self, flow_exec_file, job_exec_file, look_back_period):
    self.logger.info("collect flow&job executions")
    flow_exec_writer = FileWriter(flow_exec_file)
    job_exec_writer = FileWriter(job_exec_file)
    cmd = "SELECT * FROM workflow_info WHERE status is NULL"
    self.lz_cursor.execute(cmd)
    # rows = DbUtil.dict_cursor(self.lz_cursor)
    rows = DbUtil.copy_dict_cursor(self.lz_cursor)
    row_count = 0

    for row in rows:
        flow_path = row['project_name'] + ":" + row['workflow_name']
        flow_exec_record = LhotseFlowExecRecord(
            self.app_id,
            row["workflow_name"],
            flow_path,
            0,
            1,
            "SUCCEEDED",
            1,
            row['owner'],
            long(time.mktime(row['create_time'].timetuple())),
            long(time.mktime(row['modify_time'].timetuple())),
            self.wh_exec_id)
        flow_exec_writer.append(flow_exec_record)

        job_exec_records = []
        task_query = "SELECT * FROM task_info WHERE workflow_id = \"{0}\"".format(row['workflow_id'])
        # Note: this aliases the shared cursor rather than opening a new one;
        # it is safe only because `rows` was materialized by copy_dict_cursor above.
        new_lz_cursor = self.lz_cursor
        new_lz_cursor.execute(task_query)
        task_rows = DbUtil.dict_cursor(new_lz_cursor)
        for task in task_rows:
            if task['real_task_id'] is None:
                continue
            job_exec_record = LhotseJobExecRecord(
                self.app_id,
                flow_path,
                0,
                1,
                task['task_name'],
                flow_path + "/" + task['task_name'],
                long(task['real_task_id']),
                'SUCCEEDED',
                1,
                int(time.mktime(task['create_time'].timetuple())),
                int(time.mktime(task['modify_time'].timetuple())),
                self.wh_exec_id)
            job_exec_records.append(job_exec_record)

        # LhotseJobExecRecord.sortAndSet(job_exec_records)
        for r in job_exec_records:
            job_exec_writer.append(r)

        row_count += 1
        if row_count % 10000 == 0:
            flow_exec_writer.flush()
            job_exec_writer.flush()

    flow_exec_writer.close()
    job_exec_writer.close()
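# Hedged sketch (hypothetical helper, not in the original script): the
# datetime-to-epoch conversion the Lhotse records above apply to MySQL
# DATETIME columns such as create_time / modify_time.
import time
from datetime import datetime

def to_epoch_seconds(dt):
    # time.mktime interprets the datetime in the local timezone
    return int(time.mktime(dt.timetuple()))

print(to_epoch_seconds(datetime(2016, 6, 1, 8, 30, 0)))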
def collect_flow_execs(self, flow_exec_file, job_exec_file, look_back_period):
    self.logger.info("collect flow&job executions [last_execution_unix_time=%s lookback_period=%s]"
                     % (self.last_execution_unix_time, self.lookback_period))
    flow_exec_writer = FileWriter(flow_exec_file)
    job_exec_writer = FileWriter(job_exec_file)
    timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'"
    self.aw_cursor.execute(timezone)
    schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX"
    self.aw_cursor.execute(schema)
    flow_id_list = []

    if self.last_execution_unix_time:
        flow_cmd = \
            """SELECT J.SO_JOB_SEQ, J.SO_MODULE, J.SO_APPLICATION, H.SO_STATUS_NAME, H.SO_JOBID, H.SO_CHAIN_ID,
                 ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
                        to_date('01-JAN-1970','DD-MON-YYYY')) * (86400)) as JOB_STARTED,
                 ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
                        to_date('01-JAN-1970','DD-MON-YYYY')) * (86400)) as JOB_FINISHED,
                 U.SO_USER_NAME
               FROM SO_JOB_TABLE J
               JOIN (
                 SELECT * FROM SO_JOB_HISTORY
                 WHERE SO_JOB_FINISHED >= DATE '1970-01-01' - interval '8' hour + (%d - 3600) / 86400
                   AND SO_CHILD_COUNT > 0
                 UNION ALL
                 SELECT * FROM SO_JOB_QUEUE
                 WHERE SO_STATUS_NAME IN ('INITIATED', 'RUNNING', 'FINISHED')
                   AND SO_CHILD_COUNT > 0
               ) H ON J.SO_JOB_SEQ = H.SO_JOB_SEQ
               LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
               WHERE J.SO_COMMAND_TYPE = 'CHAIN' """ % long(self.last_execution_unix_time)
    else:
        flow_cmd = \
            """SELECT J.SO_JOB_SEQ, J.SO_MODULE, J.SO_APPLICATION, H.SO_STATUS_NAME, H.SO_JOBID, H.SO_CHAIN_ID,
                 ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
                        to_date('01-JAN-1970','DD-MON-YYYY')) * (86400)) as JOB_STARTED,
                 ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
                        to_date('01-JAN-1970','DD-MON-YYYY')) * (86400)) as JOB_FINISHED,
                 U.SO_USER_NAME
               FROM SO_JOB_TABLE J
               JOIN (
                 SELECT * FROM SO_JOB_HISTORY
                 WHERE SO_JOB_FINISHED >= SYSDATE - %d
                   AND SO_CHILD_COUNT > 0
                 UNION ALL
                 SELECT * FROM SO_JOB_QUEUE
                 WHERE SO_STATUS_NAME IN ('INITIATED', 'RUNNING', 'FINISHED')
                   AND SO_CHILD_COUNT > 0
               ) H ON J.SO_JOB_SEQ = H.SO_JOB_SEQ
               LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
               WHERE J.SO_COMMAND_TYPE = 'CHAIN' """ % int(self.lookback_period)

    '''
    SO_CHAIN_ID = :flow_exec_id will find all job executions under the top level flow

    select SO_EXECUTE_ORDER, SO_JOBID, SO_PARENTS_JOBID, SO_DIRECT_PARENT_JOBID, SO_CHAIN_ID
    from so_job_history
    where SO_JOBID = SO_CHAIN_ID or SO_PARENTS_JOBID <> SO_CHAIN_ID
    '''
    if self.last_execution_unix_time:
        # the first %d sits inside the commented-out time filter; both
        # placeholders must stay so the two-argument formatting below works
        job_cmd = \
            """SELECT D.SO_TASK_NAME, U.SO_USER_NAME, H.SO_STATUS_NAME, H.SO_JOBID, D.SO_DET_SEQ as JOB_ID,
                 ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
                        to_date('01-JAN-1970','DD-MON-YYYY')) * (86400)) as JOB_STARTED,
                 ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
                        to_date('01-JAN-1970','DD-MON-YYYY')) * (86400)) as JOB_FINISHED
               FROM SO_JOB_HISTORY H
               JOIN SO_CHAIN_DETAIL D ON D.SO_CHAIN_SEQ = H.SO_CHAIN_SEQ AND D.SO_DET_SEQ = H.SO_DET_SEQ
               LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
               WHERE
                 --H.SO_JOB_FINISHED >= DATE '1970-01-01' - interval '8' hour + (%d - 3600) / 86400) and
                 H.SO_CHAIN_ID = %d"""
    else:
        job_cmd = \
            """SELECT D.SO_TASK_NAME, U.SO_USER_NAME, H.SO_STATUS_NAME, H.SO_JOBID, D.SO_DET_SEQ as JOB_ID,
                 ROUND((cast((FROM_TZ(CAST(H.SO_JOB_STARTED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
                        to_date('01-JAN-1970','DD-MON-YYYY')) * (86400)) as JOB_STARTED,
                 ROUND((cast((FROM_TZ(CAST(H.SO_JOB_FINISHED as timestamp), 'US/Pacific') at time zone 'GMT') as date) -
                        to_date('01-JAN-1970','DD-MON-YYYY')) * (86400)) as JOB_FINISHED
               FROM SO_JOB_HISTORY H
               JOIN SO_CHAIN_DETAIL D ON D.SO_CHAIN_SEQ = H.SO_CHAIN_SEQ AND D.SO_DET_SEQ = H.SO_DET_SEQ
               LEFT JOIN SO_USER_TABLE U ON H.SO_USER_SEQ = U.SO_USER_SEQ
               WHERE H.SO_JOB_FINISHED >= SYSDATE - %d
                 and H.SO_CHAIN_ID = %d"""

    try:
        self.aw_cursor.execute(flow_cmd)
    except Exception as e:
        # str(e): concatenating the exception object itself would raise a TypeError
        self.logger.error(str(e) + "\n" + flow_cmd)

    rows = DbUtil.dict_cursor(self.aw_cursor)
    row_count = 0

    for row in rows:
        flow_path = row['SO_APPLICATION'] + ":" + row['SO_MODULE']
        so_flow_id = row['SO_JOBID']
        flow_attempt = 0
        flow_exec_id = 0
        try:
            # the fractional part of SO_JOBID encodes the attempt number
            flow_attempt = int(float(str(so_flow_id - int(so_flow_id))[1:]) * 100)
            flow_exec_id = int(so_flow_id)
        except Exception as e:
            self.logger.error(e)
        self.logger.debug("processing flow_exec_id: %8d" % flow_exec_id)

        flow_exec_record = AppworxFlowExecRecord(self.app_id,
                                                 long(row['SO_JOB_SEQ']),
                                                 row['SO_MODULE'],
                                                 flow_path,
                                                 0,
                                                 flow_exec_id,
                                                 row['SO_STATUS_NAME'],
                                                 flow_attempt,
                                                 row['SO_USER_NAME'] if row['SO_USER_NAME'] else '',
                                                 long(row['JOB_STARTED']),
                                                 long(row['JOB_FINISHED'] if row['JOB_FINISHED'] else 0),
                                                 self.wh_exec_id)
        flow_exec_writer.append(flow_exec_record)

        new_appworx_cursor = self.aw_con.cursor()
        if self.last_execution_unix_time:
            new_appworx_cursor.execute(job_cmd % (long(self.last_execution_unix_time), flow_exec_id))
        else:
            new_appworx_cursor.execute(job_cmd % (int(self.lookback_period), flow_exec_id))
        job_rows = DbUtil.dict_cursor(new_appworx_cursor)

        for job in job_rows:
            so_job_id = job['SO_JOBID']
            job_attempt = 0
            job_exec_id = 0
            try:
                job_attempt = int(float(str(so_job_id - int(so_job_id))[1:]) * 100)
                job_exec_id = int(so_job_id)
            except Exception as e:
                self.logger.error(e)

            job_exec_record = AppworxJobExecRecord(self.app_id,
                                                   long(row['SO_JOB_SEQ']),
                                                   flow_path,
                                                   0,
                                                   flow_exec_id,
                                                   long(job['JOB_ID']),
                                                   job['SO_TASK_NAME'],
                                                   flow_path + "/" + job['SO_TASK_NAME'],
                                                   job_exec_id,
                                                   job['SO_STATUS_NAME'],
                                                   job_attempt,
                                                   long(job['JOB_STARTED']),
                                                   long(job['JOB_FINISHED']),
                                                   self.wh_exec_id)
            job_exec_writer.append(job_exec_record)

        row_count += 1
        if row_count % 10000 == 0:
            flow_exec_writer.flush()
            job_exec_writer.flush()

    flow_exec_writer.close()
    job_exec_writer.close()
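# Hedged sketch (hypothetical helper): the collectors above derive the retry
# attempt from the fractional part of SO_JOBID (e.g. 12345.02 -> execution
# 12345, attempt 2).  This mirrors their string-slicing arithmetic using
# explicit rounding instead.
def split_so_jobid(so_job_id):
    exec_id = int(so_job_id)
    attempt = int(round((so_job_id - exec_id) * 100))
    return exec_id, attempt

assert split_so_jobid(12345.02) == (12345, 2)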
def collect_flow_jobs(self, flow_file, job_file, dag_file):
    self.logger.info("collect flow&jobs [last_execution_unix_time=%s lookback_period=%s]"
                     % (self.last_execution_unix_time, self.lookback_period))
    timezone = "ALTER SESSION SET TIME_ZONE = 'US/Pacific'"
    self.aw_cursor.execute(timezone)
    schema = "ALTER SESSION SET CURRENT_SCHEMA=APPWORX"
    self.aw_cursor.execute(schema)
    if self.last_execution_unix_time:
        time_filter = "(DATE '1970-01-01' - INTERVAL '8' HOUR) + (%d - 3600) / 86400" % long(self.last_execution_unix_time)
    else:
        time_filter = "SYSDATE - %d" % int(self.lookback_period)
    flow_query = \
        """SELECT J.SO_JOB_SEQ, J.SO_APPLICATION, J.SO_MODULE, R.LAST_CHAIN_ID
           FROM SO_JOB_TABLE J
           JOIN (
             SELECT SO_JOB_SEQ, MAX(SO_CHAIN_ID) as LAST_CHAIN_ID
             FROM (
               SELECT SO_JOB_SEQ, SO_CHAIN_ID FROM SO_JOB_HISTORY
               WHERE SO_JOB_FINISHED >= %s
                 AND SO_CHILD_COUNT > 0
               UNION ALL
               SELECT SO_JOB_SEQ, SO_CHAIN_ID FROM SO_JOB_QUEUE
               WHERE SO_STATUS_NAME IN ('INITIATED', 'RUNNING', 'FINISHED')
                 AND SO_CHILD_COUNT > 0
             )
             GROUP BY SO_JOB_SEQ
           ) R ON J.SO_JOB_SEQ = R.SO_JOB_SEQ
           WHERE SO_COMMAND_TYPE = 'CHAIN'
           ORDER BY 2,3
        """ % time_filter
    job_query = \
        """SELECT d.SO_TASK_NAME, d.SO_CHAIN_ORDER, d.SO_PREDECESSORS as PREDECESSORS, d.SO_DET_SEQ as JOB_ID, t.*
           FROM SO_CHAIN_DETAIL d
           JOIN SO_JOB_TABLE t ON d.SO_JOB_SEQ = t.SO_JOB_SEQ
           WHERE d.SO_CHAIN_SEQ = %d
           ORDER BY d.SO_CHAIN_ORDER
        """

    self.aw_cursor.execute(flow_query)
    rows = DbUtil.dict_cursor(self.aw_cursor)
    flow_writer = FileWriter(flow_file)
    job_writer = FileWriter(job_file)
    dag_writer = FileWriter(dag_file)
    row_count = 0

    for row in rows:
        flow_path = row['SO_APPLICATION'] + ":" + row['SO_MODULE']
        flow_record = AppworxFlowRecord(self.app_id,
                                        long(row['SO_JOB_SEQ']),
                                        row['SO_MODULE'],
                                        row['SO_APPLICATION'],
                                        flow_path,
                                        0, 0, 0, 'Y',
                                        self.wh_exec_id)
        flow_writer.append(flow_record)
        new_appworx_cursor = self.aw_con.cursor()
        new_appworx_cursor.execute(job_query % row['SO_JOB_SEQ'])
        job_rows = DbUtil.dict_cursor(new_appworx_cursor)

        for job in job_rows:
            job_record = AppworxJobRecord(self.app_id,
                                          long(row['SO_JOB_SEQ']),
                                          flow_path,
                                          0,
                                          long(job['JOB_ID']),
                                          job['SO_TASK_NAME'],
                                          flow_path + '/' + job['SO_TASK_NAME'],
                                          job['SO_MODULE'],
                                          'Y',
                                          self.wh_exec_id)
            command_type = job['SO_COMMAND_TYPE']
            if command_type and command_type == 'CHAIN':
                job_record.setRefFlowPath(job['SO_APPLICATION'] + ":" + job['SO_MODULE'])
                job_record.setJobType('CHAIN')
            job_writer.append(job_record)

            predecessors_str = job['PREDECESSORS']
            if predecessors_str:
                predecessors = re.findall(r"\&\/(.+?)\s\=\sS", predecessors_str)
                if predecessors:
                    for predecessor in predecessors:
                        dag_edge = AppworxFlowDagRecord(self.app_id,
                                                        long(row['SO_JOB_SEQ']),
                                                        flow_path,
                                                        0,
                                                        flow_path + '/' + predecessor,
                                                        flow_path + '/' + job['SO_TASK_NAME'],
                                                        self.wh_exec_id)
                        dag_writer.append(dag_edge)
        row_count += 1
        if row_count % 1000 == 0:
            flow_writer.flush()
            job_writer.flush()
            dag_writer.flush()

    flow_writer.close()
    job_writer.close()
    dag_writer.close()
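# Hedged illustration (sample string is hypothetical): how the PREDECESSORS
# regex above pulls upstream task names out of an Appworx condition string of
# the form "&/TASK = S...".
import re

predecessors_str = '&/LOAD_STAGING = SUCCESS AND &/VALIDATE_FEED = SUCCESS'
print(re.findall(r"\&\/(.+?)\s\=\sS", predecessors_str))
# -> ['LOAD_STAGING', 'VALIDATE_FEED']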
def transform(self, input, hive_metadata, hive_field_metadata):
    """
    convert from json to csv
    :param input: input json file
    :param hive_metadata: output data file for hive table metadata
    :param hive_field_metadata: output data file for hive field metadata
    :return:
    """
    f_json = open(input)
    all_data = json.load(f_json)
    f_json.close()

    schema_file_writer = FileWriter(hive_metadata)
    field_file_writer = FileWriter(hive_field_metadata)

    lineageInfo = LineageInfo()

    # one db info : 'type', 'database', 'tables'
    # one table info : required : 'name' , 'type', 'serializationFormat' ,'createTime', 'DB_ID', 'TBL_ID', 'SD_ID'
    #                  optional : 'schemaLiteral', 'schemaUrl', 'fieldDelimiter', 'fieldList'
    for one_db_info in all_data:
        i = 0
        for table in one_db_info['tables']:
            i += 1
            schema_json = {}
            prop_json = {}  # set the prop json

            for prop_name in TableInfo.optional_prop:
                if prop_name in table and table[prop_name] is not None:
                    prop_json[prop_name] = table[prop_name]

            if TableInfo.view_expended_text in prop_json:
                text = prop_json[TableInfo.view_expended_text].replace('`', '')
                array = HiveViewDependency.getViewDependency(text)
                l = []
                for a in array:
                    l.append(a)
                prop_json['view_depends_on'] = l

            # process either schema
            flds = {}
            field_detail_list = []
            if TableInfo.schema_literal in table and table[TableInfo.schema_literal] is not None:
                sort_id = 0
                try:
                    schema_data = json.loads(table[TableInfo.schema_literal])
                except ValueError:
                    # skip the table: the field loop below needs a parsed schema
                    self.logger.error("Schema json error for table : \n" + str(table))
                    continue
                schema_json = schema_data

                # process each field
                for field in schema_data['fields']:
                    field_name = field['name']
                    type = field['type']  # could be a list
                    default_value = field['default'] if 'default' in field else None
                    doc = field['doc'] if 'doc' in field else None
                    attributes_json = json.loads(field['attributes_json']) if 'attributes_json' in field else None
                    pk = delta = is_nullable = is_indexed = is_partitioned = inside_type = format = data_size = None
                    if attributes_json:
                        pk = attributes_json['pk'] if 'pk' in attributes_json else None
                        delta = attributes_json['delta'] if 'delta' in attributes_json else None
                        is_nullable = attributes_json['nullable'] if 'nullable' in attributes_json else None
                        inside_type = attributes_json['type'] if 'type' in attributes_json else None
                        format = attributes_json['format'] if 'format' in attributes_json else None
                    flds[field_name] = {'type': type}
                    # String urn, Integer sortId, Integer parentSortId, String parentPath, String fieldName,
                    # String dataType, String isNullable, String defaultValue, Integer dataSize, String namespace, String description
                    sort_id += 1
                    field_detail_list.append(
                        ["hive:///%s/%s" % (one_db_info['database'], table['name']), str(sort_id), '0', None,
                         field_name, '', type, data_size, None, None, is_nullable, is_indexed, is_partitioned,
                         default_value, None, json.dumps(attributes_json)])
            elif TableInfo.field_list in table:
                # construct a schema for data came from COLUMN_V2
                schema_json = {'type': 'record', 'name': table['name'], 'fields': table[TableInfo.field_list]}
                for field in table[TableInfo.field_list]:
                    field_name = field['ColumnName']
                    type = field['TypeName']
                    # ColumnName, IntegerIndex, TypeName, Comment
                    flds[field_name] = {'type': type}
                    pk = delta = is_nullable = is_indexed = is_partitioned = inside_type = format = data_size = default_value = None  # TODO ingest
                    field_detail_list.append(
                        ["hive:///%s/%s" % (one_db_info['database'], table['name']), field['IntegerIndex'], '0', None,
                         field_name, '', field['TypeName'], None, None, None, is_nullable, is_indexed, is_partitioned,
                         default_value, None, None])

            dataset_schema_record = DatasetSchemaRecord(
                table['name'], json.dumps(schema_json), json.dumps(prop_json), json.dumps(flds),
                "hive:///%s/%s" % (one_db_info['database'], table['name']), 'Hive', '',
                (table[TableInfo.create_time] if table.has_key(TableInfo.create_time) else None),
                (table["lastAlterTime"]) if table.has_key("lastAlterTime") else None)
            schema_file_writer.append(dataset_schema_record)

            for fields in field_detail_list:
                field_record = DatasetFieldRecord(fields)
                field_file_writer.append(field_record)

        schema_file_writer.flush()
        field_file_writer.flush()
        self.logger.info("%20s contains %6d tables" % (one_db_info['database'], i))

    schema_file_writer.close()
    field_file_writer.close()
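# Hedged sketch (hypothetical helper): the per-field extraction above,
# condensed.  It reads the optional Avro 'attributes_json' side channel with
# dict.get instead of repeated membership tests; same result for absent keys.
import json

def parse_avro_field(field):
    attrs = json.loads(field['attributes_json']) if 'attributes_json' in field else {}
    return {
        'name': field['name'],
        'type': field['type'],            # may be a union, i.e. a list
        'default': field.get('default'),
        'nullable': attrs.get('nullable'),
        'pk': attrs.get('pk'),
        'format': attrs.get('format'),
    }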
def transform(self, input, td_metadata, td_field_metadata):
    '''
    convert from json to csv
    :param input: input json file
    :param td_metadata: output data file for teradata metadata
    :param td_field_metadata: output data file for teradata field metadata
    :return:
    '''
    f_json = open(input)
    data = json.load(f_json)
    f_json.close()

    schema_file_writer = FileWriter(td_metadata)
    field_file_writer = FileWriter(td_field_metadata)

    for d in data:
        i = 0
        for k in d.keys():
            if k not in ['tables', 'views']:
                continue
            self.logger.info("%s %4d %s" % (datetime.datetime.now().strftime("%H:%M:%S"), len(d[k]), k))
            for t in d[k]:
                self.logger.info("%4d %s" % (i, t['name']))
                if t['name'] == 'HDFStoTD_2464_ERR_1':
                    continue
                i += 1
                output = {}
                prop_json = {}
                output['name'] = t['name']
                output['original_name'] = t['original_name']

                prop_json["createTime"] = t["createTime"] if t.has_key("createTime") else None
                prop_json["lastAlterTime"] = t["lastAlterTime"] if t.has_key("lastAlterTime") else None
                prop_json["lastAccessTime"] = t["lastAccessTime"] if t.has_key("lastAccessTime") else None
                prop_json["accessCount"] = t["accessCount"] if t.has_key("accessCount") else None
                prop_json["sizeInMbytes"] = t["sizeInMbytes"] if t.has_key("sizeInMbytes") else None
                if "type" in t:
                    prop_json["storage_type"] = t["type"]
                if "partition" in t:
                    prop_json["partition"] = t["partition"]
                if "partitions" in t:
                    prop_json["partitions"] = t["partitions"]
                if "hashKey" in t:
                    prop_json["hashKey"] = t["hashKey"]
                if "indices" in t:
                    prop_json["indices"] = t["indices"]
                if "referenceTables" in t:
                    prop_json["referenceTables"] = t["referenceTables"]
                if "viewSqlText" in t:
                    prop_json["viewSqlText"] = t["viewSqlText"]

                output['fields'] = []
                flds = {}
                field_detail_list = []
                sort_id = 0
                for c in t['columns']:
                    # output['fields'].append(
                    #     { 'name' : t['name'].encode('latin-1'),
                    #       'type' : None if c['data_type'] is None else c['data_type'].encode('latin-1'),
                    #       'attributes_json' : c}
                    # output['fields'][c['name'].encode('latin-1')].append({ "doc" : "", "type" : [None if c['data_type'] is None else c['data_type'].encode('latin-1')]})
                    sort_id += 1
                    output['fields'].append(
                        {"name": c['name'],
                         "doc": '',
                         "type": c['dataType'] if c['dataType'] else None,
                         "nullable": c['nullable'],
                         "maxByteLength": c['maxByteLength'],
                         "format": c['columnFormat'] if c.has_key('columnFormat') else None,
                         "accessCount": c['accessCount'] if c.has_key('accessCount') else None,
                         "lastAccessTime": c['lastAccessTime'] if c.has_key("lastAccessTime") else None})
                    flds[c['name']] = {'type': c['dataType'], "maxByteLength": c['maxByteLength']}
                    field_detail_list.append(
                        ["teradata:///%s/%s" % (d['database'], output['name']), str(sort_id), '0', '', c['name'], '',
                         c['dataType'] if 'dataType' in c and c['dataType'] is not None else '',
                         str(c['maxByteLength']) if 'maxByteLength' in c else '0',
                         str(c['precision']) if 'precision' in c and c['precision'] is not None else '',
                         str(c['scale']) if 'scale' in c and c['scale'] is not None else '',
                         c['nullable'] if 'nullable' in c and c['nullable'] is not None else 'Y',
                         '', '', '', '', '', '', ''])

                dataset_schema_record = DatasetSchemaRecord(
                    output['name'], json.dumps(output), json.dumps(prop_json), json.dumps(flds),
                    "teradata:///%s/%s" % (d['database'], output['name']), 'Teradata', output['original_name'],
                    (self.convert_timestamp(t["createTime"]) if t.has_key("createTime") else None),
                    (self.convert_timestamp(t["lastAlterTime"]) if t.has_key("lastAlterTime") else None))
                schema_file_writer.append(dataset_schema_record)

                for fields in field_detail_list:
                    field_record = DatasetFieldRecord(fields)
                    field_file_writer.append(field_record)

            schema_file_writer.flush()
            field_file_writer.flush()
            self.logger.info("%20s contains %6d %s" % (d['database'], i, k))

    schema_file_writer.close()
    field_file_writer.close()
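# Hedged sketch: a standalone version of the convert_timestamp helper the
# Teradata transform calls above; the format string is an assumption based on
# the HiveTransform.convert_timestamp defined later in this section.
import time

def convert_timestamp(time_string):
    # "YYYY-MM-DD HH:MM:SS" -> Unix seconds, interpreted in local time
    return int(time.mktime(time.strptime(time_string, "%Y-%m-%d %H:%M:%S")))

print(convert_timestamp("2016-06-01 08:30:00"))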
def transform(self, input, hive_instance, hive_metadata, hive_field_metadata):
    """
    convert from json to csv
    :param input: input json file
    :param hive_instance: output data file for hive instance
    :param hive_metadata: output data file for hive table metadata
    :param hive_field_metadata: output data file for hive field metadata
    :return:
    """
    all_data = []
    with open(input) as input_file:
        for line in input_file:
            all_data.append(json.loads(line))

    dataset_idx = -1

    instance_file_writer = FileWriter(hive_instance)
    schema_file_writer = FileWriter(hive_metadata)
    field_file_writer = FileWriter(hive_field_metadata)

    lineageInfo = LineageInfo()
    depends_sql = """
      SELECT d.NAME DB_NAME, case when t.TBL_NAME regexp '_[0-9]+_[0-9]+_[0-9]+$'
          then concat(substring(t.TBL_NAME, 1, length(t.TBL_NAME) - length(substring_index(t.TBL_NAME, '_', -3)) - 1),'_{version}')
        else t.TBL_NAME
        end dataset_name,
        concat('/', d.NAME, '/', t.TBL_NAME) object_name,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'dalids'
        else 'hive'
        end object_type,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'View'
        else
          case when LOCATE('view', LOWER(t.TBL_TYPE)) > 0 then 'View'
          when LOCATE('index', LOWER(t.TBL_TYPE)) > 0 then 'Index'
          else 'Table'
          end
        end object_sub_type,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'dalids'
        else 'hive'
        end prefix
      FROM TBLS t JOIN DBS d on t.DB_ID = d.DB_ID
      WHERE d.NAME = '{db_name}' and t.TBL_NAME = '{table_name}'
      """

    # one db info : 'type', 'database', 'tables'
    # one table info : required : 'name' , 'type', 'serializationFormat' ,'createTime', 'DB_ID', 'TBL_ID', 'SD_ID'
    #                  optional : 'schemaLiteral', 'schemaUrl', 'fieldDelimiter', 'fieldList'
    for one_db_info in all_data:
        i = 0
        for table in one_db_info['tables']:
            i += 1
            schema_json = {}
            prop_json = {}  # set the prop json

            for prop_name in TableInfo.optional_prop:
                if prop_name in table and table[prop_name] is not None:
                    prop_json[prop_name] = table[prop_name]

            view_expanded_text = ''

            if TableInfo.view_expended_text in prop_json:
                view_expanded_text = prop_json[TableInfo.view_expended_text]
                text = prop_json[TableInfo.view_expended_text].replace('`', '')  # this will be fixed after switching to Hive AST
                array = []
                try:
                    array = HiveViewDependency.getViewDependency(text)
                except Exception:
                    self.logger.error("HiveViewDependency.getViewDependency(%s) failed!" % (table['name']))

                l = []
                for a in array:
                    l.append(a)
                    names = str(a).split('.')
                    if names and len(names) >= 2:
                        db_name = names[0].lower()
                        table_name = names[1].lower()
                        if db_name and table_name:
                            # version='{version}' keeps the literal '{version}' token in the SQL
                            self.curs.execute(depends_sql.format(db_name=db_name, table_name=table_name, version='{version}'))
                            rows = self.curs.fetchall()
                            self.conn_hms.commit()
                            if rows and len(rows) > 0:
                                for row_index, row_value in enumerate(rows):
                                    dependent_record = HiveDependencyInstanceRecord(
                                        one_db_info['type'],
                                        table['type'],
                                        "/%s/%s" % (one_db_info['database'], table['name']),
                                        'dalids:///' + one_db_info['database'] + '/' + table['dataset_name']
                                        if one_db_info['type'].lower() == 'dalids'
                                        else 'hive:///' + one_db_info['database'] + '/' + table['dataset_name'],
                                        'depends on',
                                        'Y',
                                        row_value[3],
                                        row_value[4],
                                        row_value[2],
                                        row_value[5] + ':///' + row_value[0] + '/' + row_value[1],
                                        '')
                                    self.instance_writer.append(dependent_record)
                prop_json['view_depends_on'] = l
                self.instance_writer.flush()

            # process either schema
            flds = {}
            field_detail_list = []
            if TableInfo.schema_literal in table and \
               table[TableInfo.schema_literal] is not None and \
               table[TableInfo.schema_literal].startswith('{'):
                sort_id = 0
                urn = "hive:///%s/%s" % (one_db_info['database'], table['dataset_name'])
                self.logger.info("Getting schema literal for: %s" % (urn))
                try:
                    schema_data = json.loads(table[TableInfo.schema_literal])
                    schema_json = schema_data
                    acp = AvroColumnParser(schema_data, urn=urn)
                    result = acp.get_column_list_result()
                    field_detail_list += result
                except ValueError:
                    self.logger.error("Schema Literal JSON error for table: " + str(table))
            elif TableInfo.field_list in table:
                # Convert to avro
                uri = "hive:///%s/%s" % (one_db_info['database'], table['dataset_name'])
                if one_db_info['type'].lower() == 'dalids':
                    uri = "dalids:///%s/%s" % (one_db_info['database'], table['dataset_name'])
                else:
                    uri = "hive:///%s/%s" % (one_db_info['database'], table['dataset_name'])
                self.logger.info("Getting column definition for: %s" % (uri))
                try:
                    hcp = HiveColumnParser(table, urn=uri)
                    schema_json = {'fields': hcp.column_type_dict['fields'], 'type': 'record', 'name': table['name'], 'uri': uri}
                    field_detail_list += hcp.column_type_list
                except Exception:
                    self.logger.error("HiveColumnParser(%s) failed!" % (uri))
                    schema_json = {'fields': {}, 'type': 'record', 'name': table['name'], 'uri': uri}

            if one_db_info['type'].lower() == 'dalids':
                dataset_urn = "dalids:///%s/%s" % (one_db_info['database'], table['dataset_name'])
            else:
                dataset_urn = "hive:///%s/%s" % (one_db_info['database'], table['dataset_name'])

            dataset_instance_record = DatasetInstanceRecord(
                'dalids:///' + one_db_info['database'] + '/' + table['name']
                if one_db_info['type'].lower() == 'dalids'
                else 'hive:///' + one_db_info['database'] + '/' + table['name'],
                'grid', '', '', '*', 0,
                table['native_name'], table['logical_name'], table['version'], table['create_time'],
                json.dumps(schema_json), json.dumps(view_expanded_text), dataset_urn)
            instance_file_writer.append(dataset_instance_record)

            if dataset_urn not in self.dataset_dict:
                dataset_schema_record = DatasetSchemaRecord(
                    table['dataset_name'], json.dumps(schema_json), json.dumps(prop_json), json.dumps(flds),
                    dataset_urn, 'Hive', one_db_info['type'], table['type'], '',
                    table.get(TableInfo.create_time),
                    (int(table.get(TableInfo.source_modified_time, "0"))))
                schema_file_writer.append(dataset_schema_record)

                dataset_idx += 1
                self.dataset_dict[dataset_urn] = dataset_idx

            for fields in field_detail_list:
                field_record = DatasetFieldRecord(fields)
                field_file_writer.append(field_record)

        instance_file_writer.flush()
        schema_file_writer.flush()
        field_file_writer.flush()
        self.logger.info("%20s contains %6d tables" % (one_db_info['database'], i))

    instance_file_writer.close()
    schema_file_writer.close()
    field_file_writer.close()
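# Hedged illustration: why the code above passes version='{version}' into
# str.format.  The dependency query rewrites versioned table names to a
# '..._{version}' pattern, and substituting the placeholder with itself keeps
# the literal token in the emitted SQL (toy template shown).
template = "concat(t.TBL_NAME, '_{version}') WHERE d.NAME = '{db_name}'"
print(template.format(db_name='member_db', version='{version}'))
# -> concat(t.TBL_NAME, '_{version}') WHERE d.NAME = 'member_db'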
class HiveTransform:
    dataset_dict = {}

    def __init__(self):
        self.logger = LoggerFactory.getLogger('jython script : ' + self.__class__.__name__)
        username = args[Constant.HIVE_METASTORE_USERNAME]
        password = args[Constant.HIVE_METASTORE_PASSWORD]
        jdbc_driver = args[Constant.HIVE_METASTORE_JDBC_DRIVER]
        jdbc_url = args[Constant.HIVE_METASTORE_JDBC_URL]
        self.conn_hms = zxJDBC.connect(jdbc_url, username, password, jdbc_driver)
        self.curs = self.conn_hms.cursor()
        dependency_instance_file = args[Constant.HIVE_DEPENDENCY_CSV_FILE_KEY]
        self.instance_writer = FileWriter(dependency_instance_file)

    def transform(self, input, hive_instance, hive_metadata, hive_field_metadata):
        """
        convert from json to csv
        :param input: input json file
        :param hive_instance: output data file for hive instance
        :param hive_metadata: output data file for hive table metadata
        :param hive_field_metadata: output data file for hive field metadata
        :return:
        """
        f_json = open(input)
        all_data = json.load(f_json)
        f_json.close()

        dataset_idx = -1

        instance_file_writer = FileWriter(hive_instance)
        schema_file_writer = FileWriter(hive_metadata)
        field_file_writer = FileWriter(hive_field_metadata)

        lineageInfo = LineageInfo()
        depends_sql = """
          SELECT d.NAME DB_NAME, case when t.TBL_NAME regexp '_[0-9]+_[0-9]+_[0-9]+$'
              then concat(substring(t.TBL_NAME, 1, length(t.TBL_NAME) - length(substring_index(t.TBL_NAME, '_', -3)) - 1),'_{version}')
            else t.TBL_NAME
            end dataset_name,
            concat('/', d.NAME, '/', t.TBL_NAME) object_name,
            case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
              then 'dalids'
            else 'hive'
            end object_type,
            case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
              then 'View'
            else
              case when LOCATE('view', LOWER(t.TBL_TYPE)) > 0 then 'View'
              when LOCATE('index', LOWER(t.TBL_TYPE)) > 0 then 'Index'
              else 'Table'
              end
            end object_sub_type,
            case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and t.TBL_TYPE = 'VIRTUAL_VIEW'
              then 'dalids'
            else 'hive'
            end prefix
          FROM TBLS t JOIN DBS d on t.DB_ID = d.DB_ID
          WHERE d.NAME = '{db_name}' and t.TBL_NAME = '{table_name}'
          """

        # one db info : 'type', 'database', 'tables'
        # one table info : required : 'name' , 'type', 'serializationFormat' ,'createTime', 'DB_ID', 'TBL_ID', 'SD_ID'
        #                  optional : 'schemaLiteral', 'schemaUrl', 'fieldDelimiter', 'fieldList'
        for one_db_info in all_data:
            i = 0
            for table in one_db_info['tables']:
                i += 1
                schema_json = {}
                prop_json = {}  # set the prop json

                for prop_name in TableInfo.optional_prop:
                    if prop_name in table and table[prop_name] is not None:
                        prop_json[prop_name] = table[prop_name]

                view_expanded_text = ''

                if TableInfo.view_expended_text in prop_json:
                    view_expanded_text = prop_json[TableInfo.view_expended_text]
                    text = prop_json[TableInfo.view_expended_text].replace('`', '')
                    array = HiveViewDependency.getViewDependency(text)
                    l = []
                    for a in array:
                        l.append(a)
                        names = str(a).split('.')
                        if names and len(names) >= 2:
                            db_name = names[0].lower()
                            table_name = names[1].lower()
                            if db_name and table_name:
                                rows = []
                                self.curs.execute(depends_sql.format(db_name=db_name, table_name=table_name, version='{version}'))
                                rows = self.curs.fetchall()
                                if rows and len(rows) > 0:
                                    for row_index, row_value in enumerate(rows):
                                        dependent_record = HiveDependencyInstanceRecord(
                                            one_db_info['type'],
                                            table['type'],
                                            "/%s/%s" % (one_db_info['database'], table['name']),
                                            'dalids:///' + one_db_info['database'] + '/' + table['dataset_name']
                                            if one_db_info['type'].lower() == 'dalids'
                                            else 'hive:///' + one_db_info['database'] + '/' + table['dataset_name'],
                                            'depends on',
                                            'Y',
                                            row_value[3],
                                            row_value[4],
                                            row_value[2],
                                            row_value[5] + ':///' + row_value[0] + '/' + row_value[1],
                                            '')
                                        self.instance_writer.append(dependent_record)
                    prop_json['view_depends_on'] = l
                    self.instance_writer.flush()

                # process either schema
                flds = {}
                field_detail_list = []
                if TableInfo.schema_literal in table and \
                   table[TableInfo.schema_literal] is not None and \
                   table[TableInfo.schema_literal].startswith('{'):
                    sort_id = 0
                    urn = "hive:///%s/%s" % (one_db_info['database'], table['dataset_name'])
                    self.logger.info("Getting schema literal for: %s" % (urn))
                    try:
                        schema_data = json.loads(table[TableInfo.schema_literal])
                        schema_json = schema_data
                        acp = AvroColumnParser(schema_data, urn=urn)
                        result = acp.get_column_list_result()
                        field_detail_list += result
                    except ValueError:
                        self.logger.error("Schema Literal JSON error for table: " + str(table))
                elif TableInfo.field_list in table:
                    # Convert to avro
                    uri = "hive:///%s/%s" % (one_db_info['database'], table['dataset_name'])
                    if one_db_info['type'].lower() == 'dalids':
                        uri = "dalids:///%s/%s" % (one_db_info['database'], table['dataset_name'])
                    else:
                        uri = "hive:///%s/%s" % (one_db_info['database'], table['dataset_name'])
                    self.logger.info("Getting column definition for: %s" % (uri))
                    hcp = HiveColumnParser(table, urn=uri)
                    schema_json = {'fields': hcp.column_type_dict['fields'], 'type': 'record', 'name': table['name'], 'uri': uri}
                    field_detail_list += hcp.column_type_list

                if one_db_info['type'].lower() == 'dalids':
                    dataset_urn = "dalids:///%s/%s" % (one_db_info['database'], table['dataset_name'])
                else:
                    dataset_urn = "hive:///%s/%s" % (one_db_info['database'], table['dataset_name'])

                dataset_instance_record = DatasetInstanceRecord(
                    'dalids:///' + one_db_info['database'] + '/' + table['name']
                    if one_db_info['type'].lower() == 'dalids'
                    else 'hive:///' + one_db_info['database'] + '/' + table['name'],
                    'grid', 'eat1', 'eat1-nertz', '*', 0,
                    table['native_name'], table['logical_name'], table['version'], table['create_time'],
                    json.dumps(schema_json), json.dumps(view_expanded_text), dataset_urn)
                instance_file_writer.append(dataset_instance_record)

                if dataset_urn not in self.dataset_dict:
                    dataset_schema_record = DatasetSchemaRecord(
                        table['dataset_name'], json.dumps(schema_json), json.dumps(prop_json), json.dumps(flds),
                        dataset_urn, 'Hive', one_db_info['type'], table['type'], '',
                        (table[TableInfo.create_time] if table.has_key(TableInfo.create_time) else None),
                        (table["lastAlterTime"]) if table.has_key("lastAlterTime") else None)
                    schema_file_writer.append(dataset_schema_record)

                    dataset_idx += 1
                    self.dataset_dict[dataset_urn] = dataset_idx

                for fields in field_detail_list:
                    field_record = DatasetFieldRecord(fields)
                    field_file_writer.append(field_record)

            instance_file_writer.flush()
            schema_file_writer.flush()
            field_file_writer.flush()
            self.logger.info("%20s contains %6d tables" % (one_db_info['database'], i))

        instance_file_writer.close()
        schema_file_writer.close()
        field_file_writer.close()

    def convert_timestamp(self, time_string):
        return int(time.mktime(time.strptime(time_string, "%Y-%m-%d %H:%M:%S")))
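# Hedged usage sketch: driving HiveTransform the way these job scripts are
# typically invoked.  File paths are hypothetical; the global `args` dict and
# Constant keys are assumed to be populated by the surrounding ETL framework
# before the class is constructed.
if __name__ == "__main__":
    t = HiveTransform()
    t.transform('/tmp/hive_metadata.json',   # input produced by the extract step
                '/tmp/hive_instance.csv',
                '/tmp/hive_schema.csv',
                '/tmp/hive_field.csv')
    t.instance_writer.close()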
def collect_flow_jobs(self, flow_file, job_file, dag_file):
    self.logger.info("collect flow&jobs")
    query = "SELECT distinct f.*, p.name as project_name FROM project_flows f inner join projects p on f.project_id = p.id and f.version = p.version where p.active = 1"
    self.az_cursor.execute(query)
    rows = DbUtil.dict_cursor(self.az_cursor)
    flow_writer = FileWriter(flow_file)
    job_writer = FileWriter(job_file)
    dag_writer = FileWriter(dag_file)
    row_count = 0

    for row in rows:
        row['version'] = 0 if (row["version"] is None) else row["version"]
        json_column = 'json'
        unzipped_content = gzip.GzipFile(mode='r', fileobj=StringIO.StringIO(row[json_column].tostring())).read()
        try:
            row[json_column] = json.loads(unzipped_content)
        except Exception as e:
            # skip the flow: the nodes/edges below need the decoded JSON
            self.logger.error(e)
            continue

        flow_path = row['project_name'] + ":" + row['flow_id']

        flow_record = AzkabanFlowRecord(self.app_id,
                                        row['flow_id'],
                                        row['project_name'],
                                        flow_path,
                                        0,
                                        row['modified_time'] / 1000,
                                        row["version"],
                                        'Y',
                                        self.wh_exec_id)
        flow_writer.append(flow_record)

        # get flow jobs
        nodes = row[json_column]['nodes']
        for node in nodes:
            job_record = AzkabanJobRecord(self.app_id,
                                          flow_path,
                                          row["version"],
                                          node['id'],
                                          flow_path + '/' + node['id'],
                                          node['jobType'],
                                          'Y',
                                          self.wh_exec_id)
            if node['jobType'] == 'flow':
                job_record.setRefFlowPath(row['project_name'] + ":" + node['embeddedFlowId'])
            job_writer.append(job_record)

        # job dag
        edges = row[json_column]['edges']
        for edge in edges:
            dag_edge = AzkabanFlowDagRecord(self.app_id,
                                            flow_path,
                                            row['version'],
                                            flow_path + '/' + edge['source'],
                                            flow_path + '/' + edge['target'],
                                            self.wh_exec_id)
            dag_writer.append(dag_edge)

        row_count += 1
        if row_count % 1000 == 0:
            flow_writer.flush()
            job_writer.flush()
            dag_writer.flush()

    flow_writer.close()
    job_writer.close()
    dag_writer.close()
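# Hedged illustration (node dict is hypothetical): an Azkaban embedded flow is
# a node of jobType 'flow'; the collector above records it as a job whose
# reference flow path points at the sub-flow within the same project.
node = {'id': 'daily_agg', 'jobType': 'flow', 'embeddedFlowId': 'agg_subflow'}
if node['jobType'] == 'flow':
    ref_flow_path = 'myproject' + ":" + node['embeddedFlowId']
    print(ref_flow_path)  # -> myproject:agg_subflow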
def transform(self, input, hive_metadata, hive_field_metadata):
    """
    convert from json to csv
    :param input: input json file
    :param hive_metadata: output data file for hive table metadata
    :param hive_field_metadata: output data file for hive field metadata
    :return:
    """
    f_json = open(input)
    all_data = json.load(f_json)
    f_json.close()

    schema_file_writer = FileWriter(hive_metadata)
    field_file_writer = FileWriter(hive_field_metadata)

    lineageInfo = LineageInfo()
    depends_sql = """
      SELECT d.NAME DB_NAME, case when t.TBL_NAME regexp '_[0-9]+_[0-9]+_[0-9]+$'
          then concat(substring(t.TBL_NAME, 1, length(t.TBL_NAME) - length(substring_index(t.TBL_NAME, '_', -3)) - 1),'_{version}')
        else t.TBL_NAME
        end dataset_name,
        concat('/', d.NAME, '/', t.TBL_NAME) object_name,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'dalids'
        else 'hive'
        end object_type,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'View'
        else
          case when LOCATE('view', LOWER(t.TBL_TYPE)) > 0 then 'View'
          when LOCATE('index', LOWER(t.TBL_TYPE)) > 0 then 'Index'
          else 'Table'
          end
        end object_sub_type,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'dalids'
        else 'hive'
        end prefix
      FROM TBLS t JOIN DBS d on t.DB_ID = d.DB_ID
      WHERE d.NAME = '{db_name}' and t.TBL_NAME = '{table_name}'
      """

    # one db info : 'type', 'database', 'tables'
    # one table info : required : 'name' , 'type', 'serializationFormat' ,'createTime', 'DB_ID', 'TBL_ID', 'SD_ID'
    #                  optional : 'schemaLiteral', 'schemaUrl', 'fieldDelimiter', 'fieldList'
    for one_db_info in all_data:
        i = 0
        for table in one_db_info['tables']:
            i += 1
            schema_json = {}
            prop_json = {}  # set the prop json

            for prop_name in TableInfo.optional_prop:
                if prop_name in table and table[prop_name] is not None:
                    prop_json[prop_name] = table[prop_name]

            if TableInfo.view_expended_text in prop_json:
                text = prop_json[TableInfo.view_expended_text].replace('`', '')
                array = HiveViewDependency.getViewDependency(text)
                l = []
                for a in array:
                    l.append(a)
                    names = str(a).split('.')
                    if names and len(names) >= 2:
                        db_name = names[0]
                        table_name = names[1]
                        if db_name and table_name:
                            rows = []
                            self.curs.execute(depends_sql.format(db_name=db_name, table_name=table_name, version='{version}'))
                            rows = self.curs.fetchall()
                            if rows and len(rows) > 0:
                                for row_index, row_value in enumerate(rows):
                                    dependent_record = HiveDependencyInstanceRecord(
                                        one_db_info['type'],
                                        table['type'],
                                        "/%s/%s" % (one_db_info['database'], table['name']),
                                        'dalids:///' + one_db_info['database'] + '/' + table['name']
                                        if one_db_info['type'].lower() == 'dalids'
                                        else 'hive:///' + one_db_info['database'] + '/' + table['name'],
                                        'depends on',
                                        'is used by',
                                        row_value[3],
                                        row_value[4],
                                        row_value[2],
                                        row_value[5] + ':///' + row_value[0] + '/' + row_value[1],
                                        '')
                                    self.instance_writer.append(dependent_record)
                prop_json['view_depends_on'] = l
                self.instance_writer.flush()

            # process either schema
            flds = {}
            field_detail_list = []
            if TableInfo.schema_literal in table and table[TableInfo.schema_literal] is not None:
                sort_id = 0
                urn = "hive:///%s/%s" % (one_db_info['database'], table['name'])
                try:
                    schema_data = json.loads(table[TableInfo.schema_literal])
                    schema_json = schema_data
                    acp = AvroColumnParser(schema_data, urn=urn)
                    result = acp.get_column_list_result()
                    field_detail_list += result
                except ValueError:
                    self.logger.error("Schema json error for table : \n" + str(table))
            elif TableInfo.field_list in table:
                # Convert to avro
                uri = "hive:///%s/%s" % (one_db_info['database'], table['name'])
                if one_db_info['type'].lower() == 'dalids':
                    uri = "dalids:///%s/%s" % (one_db_info['database'], table['name'])
                else:
                    uri = "hive:///%s/%s" % (one_db_info['database'], table['name'])
                hcp = HiveColumnParser(table, urn=uri)
                schema_json = {'fields': hcp.column_type_dict['fields'], 'type': 'record', 'name': table['name'], 'uri': uri}
                field_detail_list += hcp.column_type_list

            if one_db_info['type'].lower() == 'dalids':
                dataset_urn = "dalids:///%s/%s" % (one_db_info['database'], table['name'])
            else:
                dataset_urn = "hive:///%s/%s" % (one_db_info['database'], table['name'])

            dataset_schema_record = DatasetSchemaRecord(
                table['name'], json.dumps(schema_json), json.dumps(prop_json), json.dumps(flds),
                dataset_urn, 'Hive', one_db_info['type'], table['type'], '',
                (table[TableInfo.create_time] if table.has_key(TableInfo.create_time) else None),
                (table["lastAlterTime"]) if table.has_key("lastAlterTime") else None)
            schema_file_writer.append(dataset_schema_record)

            for fields in field_detail_list:
                field_record = DatasetFieldRecord(fields)
                field_file_writer.append(field_record)

        schema_file_writer.flush()
        field_file_writer.flush()
        self.logger.info("%20s contains %6d tables" % (one_db_info['database'], i))

    schema_file_writer.close()
    field_file_writer.close()
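# Hedged sketch (hypothetical helper): the dalids-vs-hive URN rule applied
# repeatedly above, factored into one place.
def dataset_urn(db_type, database, name):
    prefix = 'dalids' if db_type.lower() == 'dalids' else 'hive'
    return "%s:///%s/%s" % (prefix, database, name)

print(dataset_urn('dalids', 'member_mp', 'profiles'))  # -> dalids:///member_mp/profiles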
def collect_flow_jobs(self, flow_file, job_file, dag_file):
    self.logger.info("collect flow&jobs")
    query = "SELECT * FROM workflow_info WHERE status is NULL"
    self.lz_cursor.execute(query)
    # rows = DbUtil.dict_cursor(self.lz_cursor)
    rows = DbUtil.copy_dict_cursor(self.lz_cursor)
    flow_writer = FileWriter(flow_file)
    job_writer = FileWriter(job_file)
    dag_writer = FileWriter(dag_file)
    row_count = 0

    for row in rows:
        self.logger.info("collect flow %d!" % row_count)
        flow_path = row['project_name'] + ":" + row['workflow_name']

        flow_record = LhotseFlowRecord(
            self.app_id,
            row['workflow_name'],
            row['project_name'],
            flow_path,
            0,
            int(time.mktime(row['create_time'].timetuple())),
            int(time.mktime(row['modify_time'].timetuple())),
            0,
            'Y',
            self.wh_exec_id)
        # for debug
        # self.logger.info("the flow record is: %s" % flow_record.toCsvString())
        flow_writer.append(flow_record)

        # get relative task of this workflow.
        task_query = "SELECT * FROM task_info WHERE workflow_id = \"{0}\"".format(row['workflow_id'])
        new_lz_cursor = self.lz_cursor
        new_lz_cursor.execute(task_query)
        task_rows = DbUtil.dict_cursor(new_lz_cursor)
        for task in task_rows:
            job_record = LhotseJobRecord(
                self.app_id,
                flow_path,
                0,
                task['task_name'],
                flow_path + '/' + task['task_name'],
                task['task_type_name'],
                'Y',
                self.wh_exec_id)
            job_writer.append(job_record)

        # task bridge; bridge's status need to be considered in the next stage
        task_bridge_query = "SELECT * FROM task_bridge WHERE workflow_id = \"{0}\"".format(row['workflow_id'])
        self.lz_cursor.execute(task_bridge_query)
        # task_bridge_rows = DbUtil.dict_cursor(self.lz_cursor)
        task_bridge_rows = DbUtil.copy_dict_cursor(self.lz_cursor)
        for bridge in task_bridge_rows:
            origin_task_query = "SELECT task_name FROM task_info WHERE task_id = \"{0}\"".format(bridge['origin_id'])
            self.lz_cursor.execute(origin_task_query)
            origin_tasks = self.lz_cursor.fetchone()

            target_task_query = "SELECT task_name FROM task_info WHERE task_id = \"{0}\"".format(bridge['target_id'])
            self.lz_cursor.execute(target_task_query)
            target_tasks = self.lz_cursor.fetchone()

            dag_edge = LhotseFlowDagRecord(
                self.app_id,
                flow_path,
                0,
                flow_path + '/' + origin_tasks[0],
                flow_path + '/' + target_tasks[0],
                self.wh_exec_id)
            dag_writer.append(dag_edge)

        row_count += 1
        if row_count % 1000 == 0:
            flow_writer.flush()
            job_writer.flush()
            dag_writer.flush()

    flow_writer.close()
    job_writer.close()
    dag_writer.close()
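# Hedged illustration (sample values are hypothetical): each task_bridge row
# above becomes one DAG edge after the two task_name lookups, i.e.
# '<flow_path>/<origin task>' -> '<flow_path>/<target task>'.
flow_path = 'myproject:daily_wf'
origin_name, target_name = 'extract_users', 'load_users'
edge = (flow_path + '/' + origin_name, flow_path + '/' + target_name)
print(edge)  # -> ('myproject:daily_wf/extract_users', 'myproject:daily_wf/load_users')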
def transform(self, input, hive_metadata, hive_field_metadata):
  """
  convert from json to csv
  :param input: input json file
  :param hive_metadata: output data file for hive table metadata
  :param hive_field_metadata: output data file for hive field metadata
  :return:
  """
  f_json = open(input)
  all_data = json.load(f_json)
  f_json.close()

  schema_file_writer = FileWriter(hive_metadata)
  field_file_writer = FileWriter(hive_field_metadata)

  lineageInfo = LineageInfo()

  # one db info : 'type', 'database', 'tables'
  # one table info : required : 'name', 'type', 'serializationFormat', 'createTime', 'DB_ID', 'TBL_ID', 'SD_ID'
  #                  optional : 'schemaLiteral', 'schemaUrl', 'fieldDelimiter', 'fieldList'
  for one_db_info in all_data:
    i = 0
    for table in one_db_info['tables']:
      i += 1
      schema_json = {}
      prop_json = {}

      # set the prop json
      for prop_name in TableInfo.optional_prop:
        if prop_name in table and table[prop_name] is not None:
          prop_json[prop_name] = table[prop_name]

      if TableInfo.view_expended_text in prop_json:
        text = prop_json[TableInfo.view_expended_text].replace('`', '')
        array = HiveViewDependency.getViewDependency(text)
        l = []
        for a in array:
          l.append(a)
        prop_json['view_depends_on'] = l

      # process either schema
      flds = {}
      field_detail_list = []

      if TableInfo.schema_literal in table and table[TableInfo.schema_literal] is not None:
        sort_id = 0
        urn = "hive:///%s/%s" % (one_db_info['database'], table['name'])
        try:
          schema_data = json.loads(table[TableInfo.schema_literal])
          schema_json = schema_data
          # extract fields to field record
          acp = AvroColumnParser(schema_data, urn=urn)
          result = acp.get_column_list_result()
          field_detail_list += result
        except ValueError:
          # keep the empty schema_json when the literal is not valid JSON
          self.logger.error("Schema json error for table : \n" + str(table))
      elif TableInfo.field_list in table:
        # Convert to avro
        uri = "hive:///%s/%s" % (one_db_info['database'], table['name'])
        hcp = HiveColumnParser(table, urn=uri)
        schema_json = {'fields': hcp.column_type_dict['fields'], 'type': 'record', 'name': table['name'], 'uri': uri}
        field_detail_list += hcp.column_type_list

      dataset_schema_record = DatasetSchemaRecord(
        table['name'], json.dumps(schema_json), json.dumps(prop_json), json.dumps(flds),
        "hive:///%s/%s" % (one_db_info['database'], table['name']), 'Hive', '',
        table[TableInfo.create_time] if table.has_key(TableInfo.create_time) else None,
        table["lastAlterTime"] if table.has_key("lastAlterTime") else None)
      schema_file_writer.append(dataset_schema_record)

      for fields in field_detail_list:
        field_record = DatasetFieldRecord(fields)
        field_file_writer.append(field_record)

    schema_file_writer.flush()
    field_file_writer.flush()
    self.logger.info("%20s contains %6d tables" % (one_db_info['database'], i))

  schema_file_writer.close()
  field_file_writer.close()
def transform(self, input, hive_instance, hive_metadata, hive_field_metadata, view_dependency):
  """
  convert from json to csv
  :param input: input json file
  :param hive_instance: output data file for hive instance
  :param hive_metadata: output data file for hive table metadata
  :param hive_field_metadata: output data file for hive field metadata
  :return:
  """
  all_data = []
  with open(input) as input_file:
    for line in input_file:
      all_data.append(json.loads(line))

  dataset_idx = -1

  instance_file_writer = FileWriter(hive_instance)
  schema_file_writer = FileWriter(hive_metadata)
  field_file_writer = FileWriter(hive_field_metadata)
  dependency_file_writer = FileWriter(view_dependency)

  depends_sql = """
    SELECT d.NAME DB_NAME, case when t.TBL_NAME regexp '_[0-9]+_[0-9]+_[0-9]+$'
        then concat(substring(t.TBL_NAME, 1, length(t.TBL_NAME) - length(substring_index(t.TBL_NAME, '_', -3)) - 1),'_{version}')
      else t.TBL_NAME
      end dataset_name,
      concat('/', d.NAME, '/', t.TBL_NAME) object_name,
      case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
        then 'dalids'
      else 'hive'
      end object_type,
      case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
        then 'View'
      else
        case when LOCATE('view', LOWER(t.TBL_TYPE)) > 0 then 'View'
          when LOCATE('index', LOWER(t.TBL_TYPE)) > 0 then 'Index'
          else 'Table'
        end
      end object_sub_type,
      case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and t.TBL_TYPE = 'VIRTUAL_VIEW'
        then 'dalids'
      else 'hive'
      end prefix
    FROM TBLS t JOIN DBS d on t.DB_ID = d.DB_ID
    WHERE d.NAME = '{db_name}' and t.TBL_NAME = '{table_name}'
    """

  # one db info : 'type', 'database', 'tables'
  # one table info : required : 'name', 'type', 'serializationFormat', 'createTime', 'DB_ID', 'TBL_ID', 'SD_ID'
  #                  optional : 'schemaLiteral', 'schemaUrl', 'fieldDelimiter', 'fieldList'
  for one_db_info in all_data:
    i = 0
    for table in one_db_info['tables']:
      i += 1
      schema_json = {}
      prop_json = {}

      # set the prop json
      for prop_name in TableInfo.optional_prop:
        if prop_name in table and table[prop_name] is not None:
          prop_json[prop_name] = table[prop_name]

      view_expanded_text = ''

      if TableInfo.view_expended_text in prop_json:
        view_expanded_text = prop_json[TableInfo.view_expended_text]
        text = prop_json[TableInfo.view_expended_text].replace('`', '')  # this will be fixed after switching to Hive AST
        array = []
        try:
          array = HiveViewDependency.getViewDependency(text)
        except:
          self.logger.error("HiveViewDependency.getViewDependency(%s) failed!" % (table['name']))

        l = []
        for a in array:
          l.append(a)
          names = str(a).split('.')
          if names and len(names) >= 2:
            db_name = names[0].lower()
            table_name = names[1].lower()
            if db_name and table_name:
              self.curs.execute(depends_sql.format(db_name=db_name, table_name=table_name, version='{version}'))
              rows = self.curs.fetchall()
              self.conn_hms.commit()
              if rows and len(rows) > 0:
                for row_index, row_value in enumerate(rows):
                  dependent_record = HiveDependencyInstanceRecord(
                    one_db_info['type'],
                    table['type'],
                    "/%s/%s" % (one_db_info['database'], table['name']),
                    'dalids:///' + one_db_info['database'] + '/' + table['dataset_name']
                      if one_db_info['type'].lower() == 'dalids'
                      else 'hive:///' + one_db_info['database'] + '/' + table['dataset_name'],
                    'depends on',
                    'Y',
                    row_value[3],
                    row_value[4],
                    row_value[2],
                    row_value[5] + ':///' + row_value[0] + '/' + row_value[1],
                    '')
                  dependency_file_writer.append(dependent_record)
        prop_json['view_depends_on'] = l
        dependency_file_writer.flush()

      # process either schema
      flds = {}
      field_detail_list = []

      if TableInfo.schema_literal in table and \
         table[TableInfo.schema_literal] is not None and \
         table[TableInfo.schema_literal].startswith('{'):
        sort_id = 0
        urn = "hive:///%s/%s" % (one_db_info['database'], table['dataset_name'])
        self.logger.info("Getting schema literal for: %s" % (urn))
        try:
          schema_data = json.loads(table[TableInfo.schema_literal])
          schema_json = schema_data
          acp = AvroColumnParser(schema_data, urn=urn)
          result = acp.get_column_list_result()
          field_detail_list += result
        except ValueError:
          self.logger.error("Schema Literal JSON error for table: " + str(table))
      elif TableInfo.field_list in table:
        # Convert to avro
        if one_db_info['type'].lower() == 'dalids':
          uri = "dalids:///%s/%s" % (one_db_info['database'], table['dataset_name'])
        else:
          uri = "hive:///%s/%s" % (one_db_info['database'], table['dataset_name'])
        self.logger.info("Getting column definition for: %s" % (uri))
        try:
          hcp = HiveColumnParser(table, urn=uri)
          schema_json = {'fields': hcp.column_type_dict['fields'], 'type': 'record', 'name': table['name'], 'uri': uri}
          field_detail_list += hcp.column_type_list
        except:
          self.logger.error("HiveColumnParser(%s) failed!" % (uri))
          schema_json = {'fields': {}, 'type': 'record', 'name': table['name'], 'uri': uri}

      if one_db_info['type'].lower() == 'dalids':
        dataset_urn = "dalids:///%s/%s" % (one_db_info['database'], table['dataset_name'])
      else:
        dataset_urn = "hive:///%s/%s" % (one_db_info['database'], table['dataset_name'])

      dataset_instance_record = DatasetInstanceRecord(
        'dalids:///' + one_db_info['database'] + '/' + table['name']
          if one_db_info['type'].lower() == 'dalids'
          else 'hive:///' + one_db_info['database'] + '/' + table['name'],
        'grid', '', '', '*', True,
        table['native_name'], table['logical_name'], table['version'], table['create_time'],
        json.dumps(schema_json), json.dumps(view_expanded_text), dataset_urn)
      instance_file_writer.append(dataset_instance_record)

      if dataset_urn not in self.dataset_dict:
        dataset_schema_record = DatasetSchemaRecord(
          table['dataset_name'], json.dumps(schema_json), json.dumps(prop_json), json.dumps(flds),
          dataset_urn, 'Hive', one_db_info['type'], table['type'], '',
          table.get(TableInfo.create_time), int(table.get(TableInfo.source_modified_time, "0")))
        schema_file_writer.append(dataset_schema_record)

        dataset_idx += 1
        self.dataset_dict[dataset_urn] = dataset_idx

        for fields in field_detail_list:
          field_record = DatasetFieldRecord(fields)
          field_file_writer.append(field_record)

    instance_file_writer.flush()
    schema_file_writer.flush()
    field_file_writer.flush()
    self.logger.info("%20s contains %6d tables" % (one_db_info['database'], i))

  instance_file_writer.close()
  schema_file_writer.close()
  field_file_writer.close()
  dependency_file_writer.close()
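# Example of the dataset_name rewrite performed by depends_sql above: a
# versioned Hive table name such as 'tracking_page_view_1_2_3' (table name
# invented for illustration) matches the regexp '_[0-9]+_[0-9]+_[0-9]+$', so
# its last three numeric segments are stripped and it is reported as
# 'tracking_page_view_{version}'; all physical versions of a view thus
# collapse onto a single logical dataset name.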
def transform(self, input, td_metadata, td_field_metadata):
  '''
  convert from json to csv
  :param input: input json file
  :param td_metadata: output data file for teradata metadata
  :param td_field_metadata: output data file for teradata field metadata
  :return:
  '''
  f_json = open(input)
  data = json.load(f_json)
  f_json.close()

  schema_file_writer = FileWriter(td_metadata)
  field_file_writer = FileWriter(td_field_metadata)

  for d in data:
    i = 0
    for k in d.keys():
      if k not in ['tables', 'views']:
        continue
      self.logger.info("%s %4d %s" % (datetime.datetime.now().strftime("%H:%M:%S"), len(d[k]), k))
      for t in d[k]:
        self.logger.info("%4d %s" % (i, t['name']))
        if t['name'] == 'HDFStoTD_2464_ERR_1':
          continue
        i += 1

        output = {}
        prop_json = {}
        output['name'] = t['name']
        output['original_name'] = t['original_name']

        prop_json["createTime"] = t["createTime"] if t.has_key("createTime") else None
        prop_json["lastAlterTime"] = t["lastAlterTime"] if t.has_key("lastAlterTime") else None
        prop_json["lastAccessTime"] = t["lastAccessTime"] if t.has_key("lastAccessTime") else None
        prop_json["accessCount"] = t["accessCount"] if t.has_key("accessCount") else None
        prop_json["sizeInMbytes"] = t["sizeInMbytes"] if t.has_key("sizeInMbytes") else None
        if "type" in t:
          prop_json["storage_type"] = t["type"]
        if "partition" in t:
          prop_json["partition"] = t["partition"]
        if "partitions" in t:
          prop_json["partitions"] = t["partitions"]
        if "hashKey" in t:
          prop_json["hashKey"] = t["hashKey"]
        if "indices" in t:
          prop_json["indices"] = t["indices"]
        if "referenceTables" in t:
          prop_json["referenceTables"] = t["referenceTables"]
        if "viewSqlText" in t:
          prop_json["viewSqlText"] = t["viewSqlText"]

        output['fields'] = []
        flds = {}
        field_detail_list = []
        sort_id = 0
        for c in t['columns']:
          sort_id += 1
          output['fields'].append(
            {"name": c['name'],
             "doc": '',
             "type": c['dataType'] if c['dataType'] else None,
             "nullable": c['nullable'],
             "maxByteLength": c['maxByteLength'],
             "format": c['columnFormat'] if c.has_key('columnFormat') else None,
             "accessCount": c['accessCount'] if c.has_key('accessCount') else None,
             "lastAccessTime": c['lastAccessTime'] if c.has_key("lastAccessTime") else None})
          flds[c['name']] = {'type': c['dataType'], "maxByteLength": c['maxByteLength']}
          field_detail_list.append(
            ["teradata:///%s/%s" % (d['database'], output['name']), str(sort_id), '0', '', c['name'], '',
             c['dataType'] if 'dataType' in c and c['dataType'] is not None else '',
             str(c['maxByteLength']) if 'maxByteLength' in c else '0',
             str(c['precision']) if 'precision' in c and c['precision'] is not None else '',
             str(c['scale']) if 'scale' in c and c['scale'] is not None else '',
             c['nullable'] if 'nullable' in c and c['nullable'] is not None else 'Y',
             '', '', '', '', '', '', ''])

        dataset_schema_record = DatasetSchemaRecord(
          output['name'], json.dumps(output), json.dumps(prop_json), json.dumps(flds),
          "teradata:///%s/%s" % (d['database'], output['name']), 'Teradata', output['original_name'],
          self.convert_timestamp(t["createTime"]) if t.has_key("createTime") else None,
          self.convert_timestamp(t["lastAlterTime"]) if t.has_key("lastAlterTime") else None)
        schema_file_writer.append(dataset_schema_record)

        for fields in field_detail_list:
          field_record = DatasetFieldRecord(fields)
          field_file_writer.append(field_record)

      schema_file_writer.flush()
      field_file_writer.flush()
      self.logger.info("%20s contains %6d %s" % (d['database'], i, k))

  schema_file_writer.close()
  field_file_writer.close()
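# Hypothetical driver (not in the original): how this Teradata transform would
# presumably be invoked by the surrounding Jython job. The class name and file
# paths are illustrative assumptions, not taken from the source.
# transformer = TeradataTransform()
# transformer.transform('/tmp/td_metadata.json',
#                       '/tmp/td_table_record.csv',
#                       '/tmp/td_field_record.csv')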
def transform(self, input, hive_metadata, hive_field_metadata):
  """
  convert from json to csv
  :param input: input json file
  :param hive_metadata: output data file for hive table metadata
  :param hive_field_metadata: output data file for hive field metadata
  :return:
  """
  f_json = open(input)
  all_data = json.load(f_json)
  f_json.close()

  schema_file_writer = FileWriter(hive_metadata)
  field_file_writer = FileWriter(hive_field_metadata)

  lineageInfo = LineageInfo()

  # one db info : 'type', 'database', 'tables'
  # one table info : required : 'name', 'type', 'serializationFormat', 'createTime', 'DB_ID', 'TBL_ID', 'SD_ID'
  #                  optional : 'schemaLiteral', 'schemaUrl', 'fieldDelimiter', 'fieldList'
  for one_db_info in all_data:
    i = 0
    for table in one_db_info['tables']:
      i += 1
      schema_json = {}
      prop_json = {}

      # set the prop json
      for prop_name in TableInfo.optional_prop:
        if prop_name in table and table[prop_name] is not None:
          prop_json[prop_name] = table[prop_name]

      if TableInfo.view_expended_text in prop_json:
        text = prop_json[TableInfo.view_expended_text].replace('`', '')
        array = HiveViewDependency.getViewDependency(text)
        l = []
        for a in array:
          l.append(a)
        prop_json['view_depends_on'] = l

      # process either schema
      flds = {}
      field_detail_list = []

      if TableInfo.schema_literal in table and table[TableInfo.schema_literal] is not None:
        sort_id = 0
        urn = "hive:///%s/%s" % (one_db_info['database'], table['name'])
        try:
          schema_data = json.loads(table[TableInfo.schema_literal])
          schema_json = schema_data
          acp = AvroColumnParser(schema_data, urn=urn)
          result = acp.get_column_list_result()
          field_detail_list += result
        except ValueError:
          self.logger.error("Schema json error for table : \n" + str(table))
      elif TableInfo.field_list in table:
        # Convert to avro
        uri = "hive:///%s/%s" % (one_db_info['database'], table['name'])
        hcp = HiveColumnParser(table, urn=uri)
        schema_json = {'fields': hcp.column_type_dict['fields'], 'type': 'record', 'name': table['name'], 'uri': uri}
        field_detail_list += hcp.column_type_list

      dataset_schema_record = DatasetSchemaRecord(
        table['name'], json.dumps(schema_json), json.dumps(prop_json), json.dumps(flds),
        "hive:///%s/%s" % (one_db_info['database'], table['name']), 'Hive', '',
        table[TableInfo.create_time] if table.has_key(TableInfo.create_time) else None,
        table["lastAlterTime"] if table.has_key("lastAlterTime") else None)
      schema_file_writer.append(dataset_schema_record)

      for fields in field_detail_list:
        field_record = DatasetFieldRecord(fields)
        field_file_writer.append(field_record)

    schema_file_writer.flush()
    field_file_writer.flush()
    self.logger.info("%20s contains %6d tables" % (one_db_info['database'], i))

  schema_file_writer.close()
  field_file_writer.close()
def collect_flow_jobs(self, flow_file, job_file, dag_file):
  self.logger.info("collect flow&jobs")
  query = "SELECT distinct f.*, p.name as project_name FROM project_flows f inner join projects p on f.project_id = p.id and f.version = p.version where p.active = 1"
  self.az_cursor.execute(query)
  rows = DbUtil.dict_cursor(self.az_cursor)
  flow_writer = FileWriter(flow_file)
  job_writer = FileWriter(job_file)
  dag_writer = FileWriter(dag_file)
  row_count = 0

  for row in rows:
    row['version'] = 0 if (row["version"] is None) else row["version"]

    json_column = 'json'
    unzipped_content = gzip.GzipFile(mode='r', fileobj=StringIO.StringIO(row[json_column].tostring())).read()
    try:
      row[json_column] = json.loads(unzipped_content)
    except:
      pass

    flow_path = row['project_name'] + ":" + row['flow_id']

    flow_record = AzkabanFlowRecord(self.app_id,
                                    row['flow_id'],
                                    row['project_name'],
                                    flow_path,
                                    0,
                                    row['modified_time'] / 1000,
                                    row["version"],
                                    'Y',
                                    self.wh_exec_id)
    flow_writer.append(flow_record)

    # get flow jobs
    nodes = row[json_column]['nodes']
    for node in nodes:
      job_record = AzkabanJobRecord(self.app_id,
                                    flow_path,
                                    row["version"],
                                    node['id'],
                                    flow_path + '/' + node['id'],
                                    node['jobType'],
                                    'Y',
                                    self.wh_exec_id)
      if node['jobType'] == 'flow':
        job_record.setRefFlowPath(row['project_name'] + ":" + node['embeddedFlowId'])
      job_writer.append(job_record)

    # job dag
    edges = row[json_column]['edges']
    for edge in edges:
      dag_edge = AzkabanFlowDagRecord(self.app_id,
                                      flow_path,
                                      row['version'],
                                      flow_path + '/' + edge['source'],
                                      flow_path + '/' + edge['target'],
                                      self.wh_exec_id)
      dag_writer.append(dag_edge)

    row_count += 1
    if row_count % 1000 == 0:
      flow_writer.flush()
      job_writer.flush()
      dag_writer.flush()

  flow_writer.close()
  job_writer.close()
  dag_writer.close()
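# The flow definition in project_flows.json is stored gzip-compressed; the
# decode above is equivalent to this standalone sketch (Python 2 / Jython,
# where tostring() turns the zxJDBC blob into a byte string):
# import gzip, json, StringIO
# def decode_flow_json(blob):
#   raw = StringIO.StringIO(blob.tostring())
#   return json.loads(gzip.GzipFile(mode='r', fileobj=raw).read())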
class HiveTransform:
  dataset_dict = {}

  def __init__(self):
    self.logger = LoggerFactory.getLogger('jython script : ' + self.__class__.__name__)
    username = args[Constant.HIVE_METASTORE_USERNAME]
    password = args[Constant.HIVE_METASTORE_PASSWORD]
    jdbc_driver = args[Constant.HIVE_METASTORE_JDBC_DRIVER]
    jdbc_url = args[Constant.HIVE_METASTORE_JDBC_URL]
    self.conn_hms = zxJDBC.connect(jdbc_url, username, password, jdbc_driver)
    self.curs = self.conn_hms.cursor()
    dependency_instance_file = args[Constant.HIVE_DEPENDENCY_CSV_FILE_KEY]
    self.instance_writer = FileWriter(dependency_instance_file)

  def transform(self, input, hive_instance, hive_metadata, hive_field_metadata):
    """
    convert from json to csv
    :param input: input json file
    :param hive_instance: output data file for hive instance
    :param hive_metadata: output data file for hive table metadata
    :param hive_field_metadata: output data file for hive field metadata
    :return:
    """
    f_json = open(input)
    all_data = json.load(f_json)
    f_json.close()

    dataset_idx = -1

    instance_file_writer = FileWriter(hive_instance)
    schema_file_writer = FileWriter(hive_metadata)
    field_file_writer = FileWriter(hive_field_metadata)

    lineageInfo = LineageInfo()

    depends_sql = """
      SELECT d.NAME DB_NAME, case when t.TBL_NAME regexp '_[0-9]+_[0-9]+_[0-9]+$'
          then concat(substring(t.TBL_NAME, 1, length(t.TBL_NAME) - length(substring_index(t.TBL_NAME, '_', -3)) - 1),'_{version}')
        else t.TBL_NAME
        end dataset_name,
        concat('/', d.NAME, '/', t.TBL_NAME) object_name,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'dalids'
        else 'hive'
        end object_type,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'View'
        else
          case when LOCATE('view', LOWER(t.TBL_TYPE)) > 0 then 'View'
            when LOCATE('index', LOWER(t.TBL_TYPE)) > 0 then 'Index'
            else 'Table'
          end
        end object_sub_type,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'dalids'
        else 'hive'
        end prefix
      FROM TBLS t JOIN DBS d on t.DB_ID = d.DB_ID
      WHERE d.NAME = '{db_name}' and t.TBL_NAME = '{table_name}'
      """

    # one db info : 'type', 'database', 'tables'
    # one table info : required : 'name', 'type', 'serializationFormat', 'createTime', 'DB_ID', 'TBL_ID', 'SD_ID'
    #                  optional : 'schemaLiteral', 'schemaUrl', 'fieldDelimiter', 'fieldList'
    for one_db_info in all_data:
      i = 0
      for table in one_db_info['tables']:
        i += 1
        schema_json = {}
        prop_json = {}

        # set the prop json
        for prop_name in TableInfo.optional_prop:
          if prop_name in table and table[prop_name] is not None:
            prop_json[prop_name] = table[prop_name]

        view_expanded_text = ''

        if TableInfo.view_expended_text in prop_json:
          view_expanded_text = prop_json[TableInfo.view_expended_text]
          text = prop_json[TableInfo.view_expended_text].replace('`', '')
          array = HiveViewDependency.getViewDependency(text)
          l = []
          for a in array:
            l.append(a)
            names = str(a).split('.')
            if names and len(names) >= 2:
              db_name = names[0].lower()
              table_name = names[1].lower()
              if db_name and table_name:
                rows = []
                self.curs.execute(depends_sql.format(db_name=db_name, table_name=table_name, version='{version}'))
                rows = self.curs.fetchall()
                if rows and len(rows) > 0:
                  for row_index, row_value in enumerate(rows):
                    dependent_record = HiveDependencyInstanceRecord(
                      one_db_info['type'],
                      table['type'],
                      "/%s/%s" % (one_db_info['database'], table['name']),
                      'dalids:///' + one_db_info['database'] + '/' + table['dataset_name']
                        if one_db_info['type'].lower() == 'dalids'
                        else 'hive:///' + one_db_info['database'] + '/' + table['dataset_name'],
                      'depends on',
                      'Y',
                      row_value[3],
                      row_value[4],
                      row_value[2],
                      row_value[5] + ':///' + row_value[0] + '/' + row_value[1],
                      '')
                    self.instance_writer.append(dependent_record)
          prop_json['view_depends_on'] = l
          self.instance_writer.flush()

        # process either schema
        flds = {}
        field_detail_list = []

        if TableInfo.schema_literal in table and \
           table[TableInfo.schema_literal] is not None and \
           table[TableInfo.schema_literal].startswith('{'):
          sort_id = 0
          urn = "hive:///%s/%s" % (one_db_info['database'], table['dataset_name'])
          self.logger.info("Getting schema literal for: %s" % (urn))
          try:
            schema_data = json.loads(table[TableInfo.schema_literal])
            schema_json = schema_data
            acp = AvroColumnParser(schema_data, urn=urn)
            result = acp.get_column_list_result()
            field_detail_list += result
          except ValueError:
            self.logger.error("Schema Literal JSON error for table: " + str(table))
        elif TableInfo.field_list in table:
          # Convert to avro
          if one_db_info['type'].lower() == 'dalids':
            uri = "dalids:///%s/%s" % (one_db_info['database'], table['dataset_name'])
          else:
            uri = "hive:///%s/%s" % (one_db_info['database'], table['dataset_name'])
          self.logger.info("Getting column definition for: %s" % (uri))
          hcp = HiveColumnParser(table, urn=uri)
          schema_json = {'fields': hcp.column_type_dict['fields'], 'type': 'record', 'name': table['name'], 'uri': uri}
          field_detail_list += hcp.column_type_list

        if one_db_info['type'].lower() == 'dalids':
          dataset_urn = "dalids:///%s/%s" % (one_db_info['database'], table['dataset_name'])
        else:
          dataset_urn = "hive:///%s/%s" % (one_db_info['database'], table['dataset_name'])

        dataset_instance_record = DatasetInstanceRecord(
          'dalids:///' + one_db_info['database'] + '/' + table['name']
            if one_db_info['type'].lower() == 'dalids'
            else 'hive:///' + one_db_info['database'] + '/' + table['name'],
          'grid', 'eat1', 'eat1-nertz', '*', 0,
          table['native_name'], table['logical_name'], table['version'], table['create_time'],
          json.dumps(schema_json), json.dumps(view_expanded_text), dataset_urn)
        instance_file_writer.append(dataset_instance_record)

        if dataset_urn not in self.dataset_dict:
          dataset_schema_record = DatasetSchemaRecord(
            table['dataset_name'], json.dumps(schema_json), json.dumps(prop_json), json.dumps(flds),
            dataset_urn, 'Hive', one_db_info['type'], table['type'], '',
            table[TableInfo.create_time] if table.has_key(TableInfo.create_time) else None,
            table["lastAlterTime"] if table.has_key("lastAlterTime") else None)
          schema_file_writer.append(dataset_schema_record)

          dataset_idx += 1
          self.dataset_dict[dataset_urn] = dataset_idx

          for fields in field_detail_list:
            field_record = DatasetFieldRecord(fields)
            field_file_writer.append(field_record)

      instance_file_writer.flush()
      schema_file_writer.flush()
      field_file_writer.flush()
      self.logger.info("%20s contains %6d tables" % (one_db_info['database'], i))

    instance_file_writer.close()
    schema_file_writer.close()
    field_file_writer.close()

  def convert_timestamp(self, time_string):
    return int(time.mktime(time.strptime(time_string, "%Y-%m-%d %H:%M:%S")))
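# Example for convert_timestamp above: the string is parsed with time.strptime
# and converted to Unix epoch seconds via time.mktime, so the result depends
# on the process's local timezone.
# int(time.mktime(time.strptime('2016-04-20 10:00:00', "%Y-%m-%d %H:%M:%S")))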