Python AvroColumnParser 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: AvroColumnParser

클래스/타입: AvroColumnParser

hotexamples.com에서의 예제들: 9

Python AvroColumnParser - 9개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 AvroColumnParser.AvroColumnParser에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

AvroColumnParser(5)

get_column_list_result(5)

예제 #1

파일 보기

파일: HBaseTransform.py 프로젝트: thomas-young-2013/Wherehows-eXtension

    def transform(this, raw_metadata, metadata_output, field_metadata_output):
        input_json_file = open(raw_metadata, 'r')
        schema_file_writer = FileWriter(metadata_output)
        field_file_writer = FileWriter(field_metadata_output)
        i = 0
        this.sort_id = 0
        o_urn = ''
        for line in input_json_file:
            try:
                j = json.loads(line)
            except:
                this.logger.error("   Invalid JSON:\n%s" % line)
                continue
            i += 1
            o_field_list_ = []
            this.sort_id = 0
            if not j.has_key('attributes'):
                o_properties = {"doc": null}
            else:
                o_properties = dict(j['attributes'].items())
                del j['attributes']
            if j.has_key('uri'):
                o_urn = j['uri']
                o_name = o_urn[o_urn.rfind('/') + 1:]
                o_source = 'Hbase'
            else:
                this.logger.info('*** Warning: "uri" is not found in %s' %
                                 j['name'])
                o_urn = ''
                o_name = ''
            if not j.has_key('fields'):
                o_fields = {"doc": None}
            else:
                o_fields = {}
                for f in j['fields']:
                    o_field_name = f['name']
                    o_fields[o_field_name] = dict(f)
                acp = AvroColumnParser(j, o_urn)
                o_field_list_ += acp.get_column_list_result()
            dataset_schema_record = DatasetSchemaRecord(
                o_name, json.dumps(j, sort_keys=True),
                json.dumps(o_properties, sort_keys=True), json.dumps(o_fields),
                o_urn, o_source, 'HBase', 'Table', None, None, None)
            schema_file_writer.append(dataset_schema_record)
            for fields in o_field_list_:
                field_record = DatasetFieldRecord(fields)
                field_file_writer.append(field_record)

        field_file_writer.close()
        schema_file_writer.close()
        input_json_file.close()

예제 #2

파일 보기

파일: AvroColumnParserTest.py 프로젝트: cswaroop/WhereHows

 def test_parse_complex(self):
   avro_json = json.loads(sample_avro_with_complex_field)
   acp = AvroColumnParser(avro_json)
   result = acp.get_column_list_result()
   self.assertEqual(result, expect_result)

예제 #3

파일 보기

파일: HiveTransform.py 프로젝트: mt0803/WhereHows

    def transform(self, input, hive_metadata, hive_field_metadata):
        """
    convert from json to csv
    :param input: input json file
    :param hive_metadata: output data file for hive table metadata
    :param hive_field_metadata: output data file for hive field metadata
    :return:
    """
        f_json = open(input)
        all_data = json.load(f_json)
        f_json.close()

        schema_file_writer = FileWriter(hive_metadata)
        field_file_writer = FileWriter(hive_field_metadata)

        lineageInfo = LineageInfo()

        # one db info : 'type', 'database', 'tables'
        # one table info : required : 'name' , 'type', 'serializationFormat' ,'createTime', 'DB_ID', 'TBL_ID', 'SD_ID'
        #                  optional : 'schemaLiteral', 'schemaUrl', 'fieldDelimiter', 'fieldList'
        for one_db_info in all_data:
            i = 0
            for table in one_db_info['tables']:
                i += 1
                schema_json = {}
                prop_json = {}  # set the prop json

                for prop_name in TableInfo.optional_prop:
                    if prop_name in table and table[prop_name] is not None:
                        prop_json[prop_name] = table[prop_name]

                if TableInfo.view_expended_text in prop_json:
                    text = prop_json[TableInfo.view_expended_text].replace(
                        '`', '')
                    array = HiveViewDependency.getViewDependency(text)
                    l = []
                    for a in array:
                        l.append(a)
                    prop_json['view_depends_on'] = l

                # process either schema
                flds = {}
                field_detail_list = []

                if TableInfo.schema_literal in table and table[
                        TableInfo.schema_literal] is not None:
                    sort_id = 0
                    try:
                        schema_data = json.loads(
                            table[TableInfo.schema_literal])
                    except ValueError:
                        self.logger.error("Schema json error for table : \n" +
                                          str(table))
                    schema_json = schema_data
                    # extract fields to field record
                    urn = "hive:///%s/%s" % (one_db_info['database'],
                                             table['name'])
                    acp = AvroColumnParser(schema_data, urn=urn)
                    result = acp.get_column_list_result()
                    field_detail_list += result

                elif TableInfo.field_list in table:
                    # Convert to avro
                    uri = "hive:///%s/%s" % (one_db_info['database'],
                                             table['name'])
                    hcp = HiveColumnParser(table, urn=uri)
                    schema_json = {
                        'fields': hcp.column_type_dict['fields'],
                        'type': 'record',
                        'name': table['name'],
                        'uri': uri
                    }
                    field_detail_list += hcp.column_type_list

                dataset_scehma_record = DatasetSchemaRecord(
                    table['name'], json.dumps(schema_json),
                    json.dumps(prop_json), json.dumps(flds),
                    "hive:///%s/%s" % (one_db_info['database'], table['name']),
                    'Hive', '', (table[TableInfo.create_time] if table.has_key(
                        TableInfo.create_time) else None),
                    (table["lastAlterTime"])
                    if table.has_key("lastAlterTime") else None)
                schema_file_writer.append(dataset_scehma_record)

                for fields in field_detail_list:
                    field_record = DatasetFieldRecord(fields)
                    field_file_writer.append(field_record)

            schema_file_writer.flush()
            field_file_writer.flush()
            self.logger.info("%20s contains %6d tables" %
                             (one_db_info['database'], i))

        schema_file_writer.close()
        field_file_writer.close()

예제 #4

파일 보기

파일: AvroColumnParserTest.py 프로젝트: thomas-young-2013/Wherehows-eXtension

 def test_parse_complex(self):
     avro_json = json.loads(sample_avro_with_complex_field)
     acp = AvroColumnParser(avro_json)
     result = acp.get_column_list_result()
     self.assertEqual(result, expect_result)

예제 #5

파일 보기

    def transform(self, input, hive_instance, hive_metadata,
                  hive_field_metadata):
        """
    convert from json to csv
    :param input: input json file
    :param hive_metadata: output data file for hive table metadata
    :param hive_field_metadata: output data file for hive field metadata
    :return:
    """
        all_data = []
        with open(input) as input_file:
            for line in input_file:
                all_data.append(json.loads(line))

        dataset_idx = -1

        instance_file_writer = FileWriter(hive_instance)
        schema_file_writer = FileWriter(hive_metadata)
        field_file_writer = FileWriter(hive_field_metadata)

        lineageInfo = LineageInfo()
        depends_sql = """
      SELECT d.NAME DB_NAME, case when t.TBL_NAME regexp '_[0-9]+_[0-9]+_[0-9]+$'
          then concat(substring(t.TBL_NAME, 1, length(t.TBL_NAME) - length(substring_index(t.TBL_NAME, '_', -3)) - 1),'_{version}')
        else t.TBL_NAME
        end dataset_name,
        concat('/', d.NAME, '/', t.TBL_NAME) object_name,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'dalids'
        else 'hive'
        end object_type,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'View'
        else
            case when LOCATE('view', LOWER(t.TBL_TYPE)) > 0 then 'View'
          when LOCATE('index', LOWER(t.TBL_TYPE)) > 0 then 'Index'
            else 'Table'
          end
        end object_sub_type,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'dalids'
        else 'hive'
        end prefix
      FROM TBLS t JOIN DBS d on t.DB_ID = d.DB_ID
      WHERE d.NAME = '{db_name}' and t.TBL_NAME = '{table_name}'
      """

        # one db info : 'type', 'database', 'tables'
        # one table info : required : 'name' , 'type', 'serializationFormat' ,'createTime', 'DB_ID', 'TBL_ID', 'SD_ID'
        #                  optional : 'schemaLiteral', 'schemaUrl', 'fieldDelimiter', 'fieldList'
        for one_db_info in all_data:
            i = 0
            for table in one_db_info['tables']:
                i += 1
                schema_json = {}
                prop_json = {}  # set the prop json

                for prop_name in TableInfo.optional_prop:
                    if prop_name in table and table[prop_name] is not None:
                        prop_json[prop_name] = table[prop_name]

                view_expanded_text = ''

                if TableInfo.view_expended_text in prop_json:
                    view_expanded_text = prop_json[
                        TableInfo.view_expended_text]
                    text = prop_json[TableInfo.view_expended_text].replace(
                        '`',
                        '')  # this will be fixed after switching to Hive AST
                    array = []
                    try:
                        array = HiveViewDependency.getViewDependency(text)
                    except:
                        self.logger.error(
                            "HiveViewDependency.getViewDependency(%s) failed!"
                            % (table['name']))

                    l = []
                    for a in array:
                        l.append(a)
                        names = str(a).split('.')
                        if names and len(names) >= 2:
                            db_name = names[0].lower()
                            table_name = names[1].lower()
                            if db_name and table_name:
                                self.curs.execute(
                                    depends_sql.format(db_name=db_name,
                                                       table_name=table_name,
                                                       version='{version}'))
                                rows = self.curs.fetchall()
                                self.conn_hms.commit()
                                if rows and len(rows) > 0:
                                    for row_index, row_value in enumerate(
                                            rows):
                                        dependent_record = HiveDependencyInstanceRecord(
                                            one_db_info['type'], table['type'],
                                            "/%s/%s" %
                                            (one_db_info['database'],
                                             table['name']), 'dalids:///' +
                                            one_db_info['database'] + '/' +
                                            table['dataset_name']
                                            if one_db_info['type'].lower()
                                            == 'dalids' else 'hive:///' +
                                            one_db_info['database'] + '/' +
                                            table['dataset_name'],
                                            'depends on', 'Y', row_value[3],
                                            row_value[4], row_value[2],
                                            row_value[5] + ':///' +
                                            row_value[0] + '/' + row_value[1],
                                            '')
                                        self.instance_writer.append(
                                            dependent_record)
                    prop_json['view_depends_on'] = l
                    self.instance_writer.flush()

                # process either schema
                flds = {}
                field_detail_list = []

                if TableInfo.schema_literal in table and \
                   table[TableInfo.schema_literal] is not None and \
                   table[TableInfo.schema_literal].startswith('{'):
                    sort_id = 0
                    urn = "hive:///%s/%s" % (one_db_info['database'],
                                             table['dataset_name'])
                    self.logger.info("Getting schema literal for: %s" % (urn))
                    try:
                        schema_data = json.loads(
                            table[TableInfo.schema_literal])
                        schema_json = schema_data
                        acp = AvroColumnParser(schema_data, urn=urn)
                        result = acp.get_column_list_result()
                        field_detail_list += result
                    except ValueError:
                        self.logger.error(
                            "Schema Literal JSON error for table: " +
                            str(table))

                elif TableInfo.field_list in table:
                    # Convert to avro
                    uri = "hive:///%s/%s" % (one_db_info['database'],
                                             table['dataset_name'])
                    if one_db_info['type'].lower() == 'dalids':
                        uri = "dalids:///%s/%s" % (one_db_info['database'],
                                                   table['dataset_name'])
                    else:
                        uri = "hive:///%s/%s" % (one_db_info['database'],
                                                 table['dataset_name'])
                    self.logger.info("Getting column definition for: %s" %
                                     (uri))
                    try:
                        hcp = HiveColumnParser(table, urn=uri)
                        schema_json = {
                            'fields': hcp.column_type_dict['fields'],
                            'type': 'record',
                            'name': table['name'],
                            'uri': uri
                        }
                        field_detail_list += hcp.column_type_list
                    except:
                        self.logger.error("HiveColumnParser(%s) failed!" %
                                          (uri))
                        schema_json = {
                            'fields': {},
                            'type': 'record',
                            'name': table['name'],
                            'uri': uri
                        }

                if one_db_info['type'].lower() == 'dalids':
                    dataset_urn = "dalids:///%s/%s" % (one_db_info['database'],
                                                       table['dataset_name'])
                else:
                    dataset_urn = "hive:///%s/%s" % (one_db_info['database'],
                                                     table['dataset_name'])

                dataset_instance_record = DatasetInstanceRecord(
                    'dalids:///' + one_db_info['database'] + '/' +
                    table['name'] if one_db_info['type'].lower() == 'dalids'
                    else 'hive:///' + one_db_info['database'] + '/' +
                    table['name'], 'grid', '', '', '*', 0,
                    table['native_name'], table['logical_name'],
                    table['version'], table['create_time'],
                    json.dumps(schema_json), json.dumps(view_expanded_text),
                    dataset_urn)
                instance_file_writer.append(dataset_instance_record)

                if dataset_urn not in self.dataset_dict:
                    dataset_scehma_record = DatasetSchemaRecord(
                        table['dataset_name'], json.dumps(schema_json),
                        json.dumps(prop_json), json.dumps(flds), dataset_urn,
                        'Hive', one_db_info['type'], table['type'], '',
                        table.get(TableInfo.create_time),
                        (int(table.get(TableInfo.source_modified_time, "0"))))
                    schema_file_writer.append(dataset_scehma_record)

                    dataset_idx += 1
                    self.dataset_dict[dataset_urn] = dataset_idx

                    for fields in field_detail_list:
                        field_record = DatasetFieldRecord(fields)
                        field_file_writer.append(field_record)

            instance_file_writer.flush()
            schema_file_writer.flush()
            field_file_writer.flush()
            self.logger.info("%20s contains %6d tables" %
                             (one_db_info['database'], i))

        instance_file_writer.close()
        schema_file_writer.close()
        field_file_writer.close()

예제 #6

파일 보기

파일: HiveTransform.py 프로젝트: CSRedRat/WhereHows

  def transform(self, input, hive_metadata, hive_field_metadata):
    """
    convert from json to csv
    :param input: input json file
    :param hive_metadata: output data file for hive table metadata
    :param hive_field_metadata: output data file for hive field metadata
    :return:
    """
    f_json = open(input)
    all_data = json.load(f_json)
    f_json.close()

    schema_file_writer = FileWriter(hive_metadata)
    field_file_writer = FileWriter(hive_field_metadata)

    lineageInfo = LineageInfo()
    depends_sql = """
      SELECT d.NAME DB_NAME, case when t.TBL_NAME regexp '_[0-9]+_[0-9]+_[0-9]+$'
          then concat(substring(t.TBL_NAME, 1, length(t.TBL_NAME) - length(substring_index(t.TBL_NAME, '_', -3)) - 1),'_{version}')
        else t.TBL_NAME
        end dataset_name,
        concat('/', d.NAME, '/', t.TBL_NAME) object_name,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'dalids'
        else 'hive'
        end object_type,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'View'
        else
            case when LOCATE('view', LOWER(t.TBL_TYPE)) > 0 then 'View'
          when LOCATE('index', LOWER(t.TBL_TYPE)) > 0 then 'Index'
            else 'Table'
          end
        end object_sub_type,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'dalids'
        else 'hive'
        end prefix
      FROM TBLS t JOIN DBS d on t.DB_ID = d.DB_ID
      WHERE d.NAME = '{db_name}' and t.TBL_NAME = '{table_name}'
      """

    # one db info : 'type', 'database', 'tables'
    # one table info : required : 'name' , 'type', 'serializationFormat' ,'createTime', 'DB_ID', 'TBL_ID', 'SD_ID'
    #                  optional : 'schemaLiteral', 'schemaUrl', 'fieldDelimiter', 'fieldList'
    for one_db_info in all_data:
      i = 0
      for table in one_db_info['tables']:
        i += 1
        schema_json = {}
        prop_json = {}  # set the prop json

        for prop_name in TableInfo.optional_prop:
          if prop_name in table and table[prop_name] is not None:
            prop_json[prop_name] = table[prop_name]

        if TableInfo.view_expended_text in prop_json:
          text = prop_json[TableInfo.view_expended_text].replace('`', '')
          array = HiveViewDependency.getViewDependency(text)
          l = []
          for a in array:
            l.append(a)
            names = str(a).split('.')
            if names and len(names) >= 2:
              db_name = names[0]
              table_name = names[1]
              if db_name and table_name:
                rows = []
                self.curs.execute(depends_sql.format(db_name=db_name, table_name=table_name, version='{version}'))
                rows = self.curs.fetchall()
                if rows and len(rows) > 0:
                  for row_index, row_value in enumerate(rows):
                    dependent_record = HiveDependencyInstanceRecord(
                                          one_db_info['type'],
                                          table['type'],
                                          "/%s/%s" % (one_db_info['database'], table['name']),
                                          'dalids:///' + one_db_info['database'] + '/' + table['name']
                                          if one_db_info['type'].lower() == 'dalids'
                                          else 'hive:///' + one_db_info['database'] + '/' + table['name'],
                                          'depends on',
                                          'is used by',
                                          row_value[3],
                                          row_value[4],
                                          row_value[2],
                                          row_value[5] + ':///' + row_value[0] + '/' + row_value[1], '')
                    self.instance_writer.append(dependent_record)
          prop_json['view_depends_on'] = l
          self.instance_writer.flush()

        # process either schema
        flds = {}
        field_detail_list = []

        if TableInfo.schema_literal in table and table[TableInfo.schema_literal] is not None:
          sort_id = 0
          urn = "hive:///%s/%s" % (one_db_info['database'], table['name'])
          try:
            schema_data = json.loads(table[TableInfo.schema_literal])
            schema_json = schema_data
            acp = AvroColumnParser(schema_data, urn = urn)
            result = acp.get_column_list_result()
            field_detail_list += result
          except ValueError:
            self.logger.error("Schema json error for table : \n" + str(table))

        elif TableInfo.field_list in table:
          # Convert to avro
          uri = "hive:///%s/%s" % (one_db_info['database'], table['name'])
          if one_db_info['type'].lower() == 'dalids':
            uri = "dalids:///%s/%s" % (one_db_info['database'], table['name'])
          else:
            uri = "hive:///%s/%s" % (one_db_info['database'], table['name'])
          hcp = HiveColumnParser(table, urn = uri)
          schema_json = {'fields' : hcp.column_type_dict['fields'], 'type' : 'record', 'name' : table['name'], 'uri' : uri}
          field_detail_list += hcp.column_type_list

        if one_db_info['type'].lower() == 'dalids':
          dataset_urn = "dalids:///%s/%s" % (one_db_info['database'], table['name'])
        else:
          dataset_urn = "hive:///%s/%s" % (one_db_info['database'], table['name'])
        dataset_scehma_record = DatasetSchemaRecord(table['name'], json.dumps(schema_json), json.dumps(prop_json),
                                                    json.dumps(flds),
                                                    dataset_urn,
                                                    'Hive', one_db_info['type'], table['type'],
                                                    '', (table[TableInfo.create_time] if table.has_key(
            TableInfo.create_time) else None), (table["lastAlterTime"]) if table.has_key("lastAlterTime") else None)
        schema_file_writer.append(dataset_scehma_record)

        for fields in field_detail_list:
          field_record = DatasetFieldRecord(fields)
          field_file_writer.append(field_record)

      schema_file_writer.flush()
      field_file_writer.flush()
      self.logger.info("%20s contains %6d tables" % (one_db_info['database'], i))

    schema_file_writer.close()
    field_file_writer.close()

예제 #7

파일 보기

파일: HiveTransform.py 프로젝트: SunZhaonan/WhereHows-1

  def transform(self, input, hive_metadata, hive_field_metadata):
    """
    convert from json to csv
    :param input: input json file
    :param hive_metadata: output data file for hive table metadata
    :param hive_field_metadata: output data file for hive field metadata
    :return:
    """
    f_json = open(input)
    all_data = json.load(f_json)
    f_json.close()

    schema_file_writer = FileWriter(hive_metadata)
    field_file_writer = FileWriter(hive_field_metadata)

    lineageInfo = LineageInfo()

    # one db info : 'type', 'database', 'tables'
    # one table info : required : 'name' , 'type', 'serializationFormat' ,'createTime', 'DB_ID', 'TBL_ID', 'SD_ID'
    #                  optional : 'schemaLiteral', 'schemaUrl', 'fieldDelimiter', 'fieldList'
    for one_db_info in all_data:
      i = 0
      for table in one_db_info['tables']:
        i += 1
        schema_json = {}
        prop_json = {}  # set the prop json

        for prop_name in TableInfo.optional_prop:
          if prop_name in table and table[prop_name] is not None:
            prop_json[prop_name] = table[prop_name]

        if TableInfo.view_expended_text in prop_json:
          text = prop_json[TableInfo.view_expended_text].replace('`', '')
          array = HiveViewDependency.getViewDependency(text)
          l = []
          for a in array:
            l.append(a)
          prop_json['view_depends_on'] = l

        # process either schema
        flds = {}
        field_detail_list = []

        if TableInfo.schema_literal in table and table[TableInfo.schema_literal] is not None:
          sort_id = 0
          urn = "hive:///%s/%s" % (one_db_info['database'], table['name'])
          try:
            schema_data = json.loads(table[TableInfo.schema_literal])
            schema_json = schema_data
            acp = AvroColumnParser(schema_data, urn = urn)
            result = acp.get_column_list_result()
            field_detail_list += result
          except ValueError:
            self.logger.error("Schema json error for table : \n" + str(table))
        elif TableInfo.field_list in table:
          # Convert to avro
          uri = "hive:///%s/%s" % (one_db_info['database'], table['name'])
          hcp = HiveColumnParser(table, urn = uri)
          schema_json = {'fields' : hcp.column_type_dict['fields'], 'type' : 'record', 'name' : table['name'], 'uri' : uri}
          field_detail_list += hcp.column_type_list

        dataset_scehma_record = DatasetSchemaRecord(table['name'], json.dumps(schema_json), json.dumps(prop_json),
                                                    json.dumps(flds),
                                                    "hive:///%s/%s" % (one_db_info['database'], table['name']), 'Hive',
                                                    '', (table[TableInfo.create_time] if table.has_key(
            TableInfo.create_time) else None), (table["lastAlterTime"]) if table.has_key("lastAlterTime") else None)
        schema_file_writer.append(dataset_scehma_record)

        for fields in field_detail_list:
          field_record = DatasetFieldRecord(fields)
          field_file_writer.append(field_record)

      schema_file_writer.flush()
      field_file_writer.flush()
      self.logger.info("%20s contains %6d tables" % (one_db_info['database'], i))

    schema_file_writer.close()
    field_file_writer.close()

예제 #8

파일 보기

파일: HiveTransform.py 프로젝트: alyiwang/WhereHows

  def transform(self, input, hive_instance, hive_metadata, hive_field_metadata, view_dependency):
    """
    convert from json to csv
    :param input: input json file
    :param hive_instance: output data file for hive instance
    :param hive_metadata: output data file for hive table metadata
    :param hive_field_metadata: output data file for hive field metadata
    :return:
    """
    all_data = []
    with open(input) as input_file:
        for line in input_file:
            all_data.append(json.loads(line))

    dataset_idx = -1

    instance_file_writer = FileWriter(hive_instance)
    schema_file_writer = FileWriter(hive_metadata)
    field_file_writer = FileWriter(hive_field_metadata)
    dependency_file_writer = FileWriter(view_dependency)

    depends_sql = """
      SELECT d.NAME DB_NAME, case when t.TBL_NAME regexp '_[0-9]+_[0-9]+_[0-9]+$'
          then concat(substring(t.TBL_NAME, 1, length(t.TBL_NAME) - length(substring_index(t.TBL_NAME, '_', -3)) - 1),'_{version}')
        else t.TBL_NAME
        end dataset_name,
        concat('/', d.NAME, '/', t.TBL_NAME) object_name,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'dalids'
        else 'hive'
        end object_type,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'View'
        else
            case when LOCATE('view', LOWER(t.TBL_TYPE)) > 0 then 'View'
          when LOCATE('index', LOWER(t.TBL_TYPE)) > 0 then 'Index'
            else 'Table'
          end
        end object_sub_type,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'dalids'
        else 'hive'
        end prefix
      FROM TBLS t JOIN DBS d on t.DB_ID = d.DB_ID
      WHERE d.NAME = '{db_name}' and t.TBL_NAME = '{table_name}'
      """

    # one db info : 'type', 'database', 'tables'
    # one table info : required : 'name' , 'type', 'serializationFormat' ,'createTime', 'DB_ID', 'TBL_ID', 'SD_ID'
    #                  optional : 'schemaLiteral', 'schemaUrl', 'fieldDelimiter', 'fieldList'
    for one_db_info in all_data:
      i = 0
      for table in one_db_info['tables']:
        i += 1
        schema_json = {}
        prop_json = {}  # set the prop json

        for prop_name in TableInfo.optional_prop:
          if prop_name in table and table[prop_name] is not None:
            prop_json[prop_name] = table[prop_name]

        view_expanded_text = ''

        if TableInfo.view_expended_text in prop_json:
          view_expanded_text = prop_json[TableInfo.view_expended_text]
          text = prop_json[TableInfo.view_expended_text].replace('`', '')	# this will be fixed after switching to Hive AST
          array = []
          try:
            array = HiveViewDependency.getViewDependency(text)
          except:
            self.logger.error("HiveViewDependency.getViewDependency(%s) failed!" % (table['name']))

          l = []
          for a in array:
            l.append(a)
            names = str(a).split('.')
            if names and len(names) >= 2:
              db_name = names[0].lower()
              table_name = names[1].lower()
              if db_name and table_name:
                self.curs.execute(depends_sql.format(db_name=db_name, table_name=table_name, version='{version}'))
                rows = self.curs.fetchall()
                self.conn_hms.commit()
                if rows and len(rows) > 0:
                  for row_index, row_value in enumerate(rows):
                    dependent_record = HiveDependencyInstanceRecord(
                                          one_db_info['type'],
                                          table['type'],
                                          "/%s/%s" % (one_db_info['database'], table['name']),
                                          'dalids:///' + one_db_info['database'] + '/' + table['dataset_name']
                                          if one_db_info['type'].lower() == 'dalids'
                                          else 'hive:///' + one_db_info['database'] + '/' + table['dataset_name'],
                                          'depends on',
                                          'Y',
                                          row_value[3],
                                          row_value[4],
                                          row_value[2],
                                          row_value[5] + ':///' + row_value[0] + '/' + row_value[1], '')
                    dependency_file_writer.append(dependent_record)
          prop_json['view_depends_on'] = l
          dependency_file_writer.flush()

        # process either schema
        flds = {}
        field_detail_list = []

        if TableInfo.schema_literal in table and \
           table[TableInfo.schema_literal] is not None and \
           table[TableInfo.schema_literal].startswith('{'):
          sort_id = 0
          urn = "hive:///%s/%s" % (one_db_info['database'], table['dataset_name'])
          self.logger.info("Getting schema literal for: %s" % (urn))
          try:
            schema_data = json.loads(table[TableInfo.schema_literal])
            schema_json = schema_data
            acp = AvroColumnParser(schema_data, urn = urn)
            result = acp.get_column_list_result()
            field_detail_list += result
          except ValueError:
            self.logger.error("Schema Literal JSON error for table: " + str(table))

        elif TableInfo.field_list in table:
          # Convert to avro
          uri = "hive:///%s/%s" % (one_db_info['database'], table['dataset_name'])
          if one_db_info['type'].lower() == 'dalids':
            uri = "dalids:///%s/%s" % (one_db_info['database'], table['dataset_name'])
          else:
            uri = "hive:///%s/%s" % (one_db_info['database'], table['dataset_name'])
          self.logger.info("Getting column definition for: %s" % (uri))
          try:
            hcp = HiveColumnParser(table, urn = uri)
            schema_json = {'fields' : hcp.column_type_dict['fields'], 'type' : 'record', 'name' : table['name'], 'uri' : uri}
            field_detail_list += hcp.column_type_list
          except:
            self.logger.error("HiveColumnParser(%s) failed!" % (uri))
            schema_json = {'fields' : {}, 'type' : 'record', 'name' : table['name'], 'uri' : uri}

        if one_db_info['type'].lower() == 'dalids':
          dataset_urn = "dalids:///%s/%s" % (one_db_info['database'], table['dataset_name'])
        else:
          dataset_urn = "hive:///%s/%s" % (one_db_info['database'], table['dataset_name'])

        dataset_instance_record = DatasetInstanceRecord('dalids:///' + one_db_info['database'] + '/' + table['name']
                                                if one_db_info['type'].lower() == 'dalids'
                                                else 'hive:///' + one_db_info['database'] + '/' + table['name'],
                                                'grid',
                                                '',
                                                '',
                                                '*',
                                                True,
                                                table['native_name'],
                                                table['logical_name'],
                                                table['version'],
                                                table['create_time'],
                                                json.dumps(schema_json),
                                                json.dumps(view_expanded_text),
                                                dataset_urn)
        instance_file_writer.append(dataset_instance_record)

        if dataset_urn not in self.dataset_dict:
          dataset_scehma_record = DatasetSchemaRecord(table['dataset_name'], json.dumps(schema_json),
                                                      json.dumps(prop_json),
                                                      json.dumps(flds), dataset_urn, 'Hive', one_db_info['type'],
                                                      table['type'], '',
                                                      table.get(TableInfo.create_time),
                                                      (int(table.get(TableInfo.source_modified_time,"0"))))
          schema_file_writer.append(dataset_scehma_record)

          dataset_idx += 1
          self.dataset_dict[dataset_urn] = dataset_idx

          for fields in field_detail_list:
            field_record = DatasetFieldRecord(fields)
            field_file_writer.append(field_record)

      instance_file_writer.flush()
      schema_file_writer.flush()
      field_file_writer.flush()
      self.logger.info("%20s contains %6d tables" % (one_db_info['database'], i))

    instance_file_writer.close()
    schema_file_writer.close()
    field_file_writer.close()
    dependency_file_writer.close()

예제 #9

파일 보기

파일: HdfsTransform.py 프로젝트: thomas-young-2013/Wherehows-eXtension

    def transform(self, raw_metadata, metadata_output, field_metadata_output):

        # sys.setdefaultencoding("UTF-8")

        input_json_file = open(raw_metadata, 'r')
        schema_file_writer = FileWriter(metadata_output)
        field_file_writer = FileWriter(field_metadata_output)
        i = 0
        self.sort_id = 0
        o_urn = ''
        p = ''

        for line in input_json_file:
            try:
                j = json.loads(line)
            except:
                self.logger.error("    Invalid JSON:\n%s" % line)
                continue

            i += 1
            o_field_list_ = []
            parent_field_path = ''
            self.sort_id = 0

            if not (j.has_key('attributes_json') or j.has_key('attributes')):
                o_properties = {"doc": null}
            else:
                o_properties = {}
                if j.has_key('attributes_json'):
                    o_properties = json.loads(j['attributes_json'])
                    del j['attributes_json']
                if j.has_key('attributes'):
                    o_properties = dict(j['attributes'].items() +
                                        o_properties.items())
                    del j['attributes']

            if j.has_key('uri'):
                o_urn = j['uri']
            elif o_properties.has_key('uri'):
                o_urn = o_properties['uri']
            else:
                self.logger.info('*** Warning: "uri" is not found in %s' %
                                 j['name'])
                o_urn = ''

            if o_urn.find('hdfs://') == 0:
                o_name = o_urn[o_urn.rfind('/') + 1:]
            elif o_properties.has_key('table_name'):
                o_name = o_properties['table_name']
            elif j.has_key('name') and j['name'][0:5] != 'TUPLE':
                o_name = j['name']
            else:
                o_name = o_urn[o_urn.rfind('/') + 1:]

            if j.has_key('id') or not j.has_key('fields'):  # esWritable schema
                o_fields = {}
                for k in j:
                    if not (k == 'uri' or k == 'attributes' or k == 'doc'):
                        if type(j[k]) == list:
                            o_fields[k] = {
                                "name": k,
                                "type": 'list',
                                "doc": str(j[k])
                            }
                        elif type(j[k]) == dict:
                            o_fields[k] = {
                                "name": k,
                                "type": 'dict',
                                "doc": str(j[k])
                            }
                        else:
                            o_fields[k] = {
                                "name": k,
                                "type": j[k],
                                "doc": None
                            }

                        self.sort_id += 1
                        o_field_list_.append([
                            o_urn, self.sort_id, 0, '', k, o_fields[k]['type'],
                            '', '', '', o_fields[k]['doc'].replace("\n", ' ')
                            if o_fields[k]['doc'] is not None else None
                        ])

            elif j.has_key('fields'):
                o_fields = {}
                for f in j['fields']:
                    o_field_name = f['name']
                    o_fields[o_field_name] = dict(f)  # for schema output
                    if f.has_key('attributes_json'):
                        f['attributes'] = json.loads(f['attributes_json'])
                        del f['attributes_json']

                acp = AvroColumnParser(j, o_urn)
                o_field_list_ += acp.get_column_list_result()

            else:
                o_fields = {"doc": None}

            if j.has_key('attributes') and not o_properties.has_key('source'):
                o_properties['source'] = j['attributes']['source']

            if o_urn.startswith(
                    'hdfs:///') and self.file_regex_source_map is not None:
                o_source = self.get_source(o_urn[7:])
            else:
                self.logger.warn(
                    "property : " + Constant.HDFS_FILE_SOURCE_MAP_KEY +
                    " is None, will use default source for all dataset")
                o_source = 'Hdfs'

            self.logger.info(
                "%4i (%6i): %4i fields, %4i total fields(including nested) found in [%s]@%s with source %s"
                % (i, len(j), len(o_fields), len(o_field_list_), o_name, o_urn,
                   o_source))

            dataset_schema_record = DatasetSchemaRecord(
                o_name, json.dumps(j, sort_keys=True),
                json.dumps(o_properties, sort_keys=True), json.dumps(o_fields),
                o_urn, o_source, None, None, None)
            schema_file_writer.append(dataset_schema_record)

            for fields in o_field_list_:
                field_record = DatasetFieldRecord(fields)
                field_file_writer.append(field_record)

        schema_file_writer.close()
        field_file_writer.close()
        input_json_file.close()