Beispiel #1
0
  def format_table_metadata_v2(self, rows, schema):
    """
    process info get from COLUMN_V2 into final table, several lines form one table info
    :param rows: the info get from COLUMN_V2 table, order by database name, table name
    :param schema: {database : _, type : _, tables : [{}, {} ...] }
    :return:
    """
    db_idx = len(schema) - 1
    table_idx = -1
    dataset_idx = -1
    dataset_urn_idx = -1

    field_list = []
    for row_index, row_value in enumerate(rows):
      field_list.append({'IntegerIndex': row_value[14], 'ColumnName': row_value[15], 'TypeName': row_value[16],
                         'Comment': row_value[17]})
      if row_index == len(rows) - 1 or (row_value[0] != rows[row_index+1][0] or row_value[1] != rows[row_index+1][1]): # if this is last record of current table
        # sort the field_list by IntegerIndex
        field_list = sorted(field_list, key=lambda k: k['IntegerIndex'])
        # process the record of table

        if row_value[20].lower() == 'dalids':
          urn = 'dalids:///' + row_value[0] + '/' + row_value[18]
          instance_record = DatasetInstanceRecord(row_value[25],
                                      long(self.db_id),
                                      'grid',
                                      'eat1',
                                      'eat1-nertz',
                                      '*',
                                      0,
                                      row_value[22],
                                      row_value[23],
                                      str(row_value[19]),
                                      long(row_value[3]),
                                      long(row_value[24]),
                                      self.wh_exec_id,
                                      'dalids:///' + row_value[0] + '/' + row_value[18])
          self.instance_writer.append(instance_record)
          dataset_urn_idx += 1
          self.instance_dict[row_value[25]] = dataset_urn_idx
        else:
          urn = 'hive:///' + row_value[0] + '/' + row_value[18]

        if urn in self.dataset_dict:
          continue

        table_record = {TableInfo.table_name: row_value[18], TableInfo.type: row_value[21], TableInfo.serialization_format: row_value[2],
                      TableInfo.create_time: row_value[3], TableInfo.db_id: row_value[4], TableInfo.table_id: row_value[5],
                      TableInfo.serde_id: row_value[6], TableInfo.location: row_value[7], TableInfo.table_type: row_value[8],
                      TableInfo.view_expended_text: row_value[9], TableInfo.input_format: row_value[10], TableInfo.output_format: row_value[11],
                      TableInfo.is_compressed: row_value[12], TableInfo.is_storedassubdirectories: row_value[13],
                      TableInfo.etl_source: 'COLUMN_V2', TableInfo.field_list: field_list[:]}
        dataset_idx += 1
        self.dataset_dict[urn] = dataset_idx
        field_list = [] # empty it

        if row_value[0] not in self.db_dict:
          schema.append({'database': row_value[0], 'type': row_value[20], 'tables': []})
          db_idx += 1
          self.db_dict[row_value[0]] = db_idx

        full_name = row_value[0] + '.' + row_value[1]

        # put in schema result
        if full_name not in self.table_dict:
          schema[db_idx]['tables'].append(table_record)
          table_idx += 1
          self.table_dict[full_name] = table_idx

    self.instance_writer.flush()
    self.logger.info("%s %6d tables processed for database %12s from COLUMN_V2" % (
      datetime.datetime.now(), table_idx + 1, row_value[0]))
Beispiel #2
0
    def transform(self, input, hive_instance, hive_metadata,
                  hive_field_metadata):
        """
    convert from json to csv
    :param input: input json file
    :param hive_metadata: output data file for hive table metadata
    :param hive_field_metadata: output data file for hive field metadata
    :return:
    """
        all_data = []
        with open(input) as input_file:
            for line in input_file:
                all_data.append(json.loads(line))

        dataset_idx = -1

        instance_file_writer = FileWriter(hive_instance)
        schema_file_writer = FileWriter(hive_metadata)
        field_file_writer = FileWriter(hive_field_metadata)

        lineageInfo = LineageInfo()
        depends_sql = """
      SELECT d.NAME DB_NAME, case when t.TBL_NAME regexp '_[0-9]+_[0-9]+_[0-9]+$'
          then concat(substring(t.TBL_NAME, 1, length(t.TBL_NAME) - length(substring_index(t.TBL_NAME, '_', -3)) - 1),'_{version}')
        else t.TBL_NAME
        end dataset_name,
        concat('/', d.NAME, '/', t.TBL_NAME) object_name,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'dalids'
        else 'hive'
        end object_type,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'View'
        else
            case when LOCATE('view', LOWER(t.TBL_TYPE)) > 0 then 'View'
          when LOCATE('index', LOWER(t.TBL_TYPE)) > 0 then 'Index'
            else 'Table'
          end
        end object_sub_type,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'dalids'
        else 'hive'
        end prefix
      FROM TBLS t JOIN DBS d on t.DB_ID = d.DB_ID
      WHERE d.NAME = '{db_name}' and t.TBL_NAME = '{table_name}'
      """

        # one db info : 'type', 'database', 'tables'
        # one table info : required : 'name' , 'type', 'serializationFormat' ,'createTime', 'DB_ID', 'TBL_ID', 'SD_ID'
        #                  optional : 'schemaLiteral', 'schemaUrl', 'fieldDelimiter', 'fieldList'
        for one_db_info in all_data:
            i = 0
            for table in one_db_info['tables']:
                i += 1
                schema_json = {}
                prop_json = {}  # set the prop json

                for prop_name in TableInfo.optional_prop:
                    if prop_name in table and table[prop_name] is not None:
                        prop_json[prop_name] = table[prop_name]

                view_expanded_text = ''

                if TableInfo.view_expended_text in prop_json:
                    view_expanded_text = prop_json[
                        TableInfo.view_expended_text]
                    text = prop_json[TableInfo.view_expended_text].replace(
                        '`',
                        '')  # this will be fixed after switching to Hive AST
                    array = []
                    try:
                        array = HiveViewDependency.getViewDependency(text)
                    except:
                        self.logger.error(
                            "HiveViewDependency.getViewDependency(%s) failed!"
                            % (table['name']))

                    l = []
                    for a in array:
                        l.append(a)
                        names = str(a).split('.')
                        if names and len(names) >= 2:
                            db_name = names[0].lower()
                            table_name = names[1].lower()
                            if db_name and table_name:
                                self.curs.execute(
                                    depends_sql.format(db_name=db_name,
                                                       table_name=table_name,
                                                       version='{version}'))
                                rows = self.curs.fetchall()
                                self.conn_hms.commit()
                                if rows and len(rows) > 0:
                                    for row_index, row_value in enumerate(
                                            rows):
                                        dependent_record = HiveDependencyInstanceRecord(
                                            one_db_info['type'], table['type'],
                                            "/%s/%s" %
                                            (one_db_info['database'],
                                             table['name']), 'dalids:///' +
                                            one_db_info['database'] + '/' +
                                            table['dataset_name']
                                            if one_db_info['type'].lower()
                                            == 'dalids' else 'hive:///' +
                                            one_db_info['database'] + '/' +
                                            table['dataset_name'],
                                            'depends on', 'Y', row_value[3],
                                            row_value[4], row_value[2],
                                            row_value[5] + ':///' +
                                            row_value[0] + '/' + row_value[1],
                                            '')
                                        self.instance_writer.append(
                                            dependent_record)
                    prop_json['view_depends_on'] = l
                    self.instance_writer.flush()

                # process either schema
                flds = {}
                field_detail_list = []

                if TableInfo.schema_literal in table and \
                   table[TableInfo.schema_literal] is not None and \
                   table[TableInfo.schema_literal].startswith('{'):
                    sort_id = 0
                    urn = "hive:///%s/%s" % (one_db_info['database'],
                                             table['dataset_name'])
                    self.logger.info("Getting schema literal for: %s" % (urn))
                    try:
                        schema_data = json.loads(
                            table[TableInfo.schema_literal])
                        schema_json = schema_data
                        acp = AvroColumnParser(schema_data, urn=urn)
                        result = acp.get_column_list_result()
                        field_detail_list += result
                    except ValueError:
                        self.logger.error(
                            "Schema Literal JSON error for table: " +
                            str(table))

                elif TableInfo.field_list in table:
                    # Convert to avro
                    uri = "hive:///%s/%s" % (one_db_info['database'],
                                             table['dataset_name'])
                    if one_db_info['type'].lower() == 'dalids':
                        uri = "dalids:///%s/%s" % (one_db_info['database'],
                                                   table['dataset_name'])
                    else:
                        uri = "hive:///%s/%s" % (one_db_info['database'],
                                                 table['dataset_name'])
                    self.logger.info("Getting column definition for: %s" %
                                     (uri))
                    try:
                        hcp = HiveColumnParser(table, urn=uri)
                        schema_json = {
                            'fields': hcp.column_type_dict['fields'],
                            'type': 'record',
                            'name': table['name'],
                            'uri': uri
                        }
                        field_detail_list += hcp.column_type_list
                    except:
                        self.logger.error("HiveColumnParser(%s) failed!" %
                                          (uri))
                        schema_json = {
                            'fields': {},
                            'type': 'record',
                            'name': table['name'],
                            'uri': uri
                        }

                if one_db_info['type'].lower() == 'dalids':
                    dataset_urn = "dalids:///%s/%s" % (one_db_info['database'],
                                                       table['dataset_name'])
                else:
                    dataset_urn = "hive:///%s/%s" % (one_db_info['database'],
                                                     table['dataset_name'])

                dataset_instance_record = DatasetInstanceRecord(
                    'dalids:///' + one_db_info['database'] + '/' +
                    table['name'] if one_db_info['type'].lower() == 'dalids'
                    else 'hive:///' + one_db_info['database'] + '/' +
                    table['name'], 'grid', '', '', '*', 0,
                    table['native_name'], table['logical_name'],
                    table['version'], table['create_time'],
                    json.dumps(schema_json), json.dumps(view_expanded_text),
                    dataset_urn)
                instance_file_writer.append(dataset_instance_record)

                if dataset_urn not in self.dataset_dict:
                    dataset_scehma_record = DatasetSchemaRecord(
                        table['dataset_name'], json.dumps(schema_json),
                        json.dumps(prop_json), json.dumps(flds), dataset_urn,
                        'Hive', one_db_info['type'], table['type'], '',
                        table.get(TableInfo.create_time),
                        (int(table.get(TableInfo.source_modified_time, "0"))))
                    schema_file_writer.append(dataset_scehma_record)

                    dataset_idx += 1
                    self.dataset_dict[dataset_urn] = dataset_idx

                    for fields in field_detail_list:
                        field_record = DatasetFieldRecord(fields)
                        field_file_writer.append(field_record)

            instance_file_writer.flush()
            schema_file_writer.flush()
            field_file_writer.flush()
            self.logger.info("%20s contains %6d tables" %
                             (one_db_info['database'], i))

        instance_file_writer.close()
        schema_file_writer.close()
        field_file_writer.close()