    def transform(self, raw_metadata, metadata_output, field_metadata_output):
        input_json_file = open(raw_metadata, 'r')
        schema_file_writer = FileWriter(metadata_output)
        field_file_writer = FileWriter(field_metadata_output)
        i = 0
        self.sort_id = 0
        o_urn = ''
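        # Each line of the raw metadata file is expected to be one standalone JSON document describing an HBase table.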
        for line in input_json_file:
            try:
                j = json.loads(line)
            except ValueError:
                self.logger.error("    Invalid JSON:\n%s" % line)
                continue
            i += 1
            o_field_list_ = []
            self.sort_id = 0
            if not j.has_key('attributes'):
                o_properties = {"doc": null}
            else:
                o_properties = dict(j['attributes'].items())
                del j['attributes']
            if j.has_key('uri'):
                o_urn = j['uri']
                o_name = o_urn[o_urn.rfind('/') + 1:]
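                # illustrative: a uri ending in '.../namespace/table_x' yields o_name == 'table_x'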
                o_source = 'Hbase'
            else:
                self.logger.info('*** Warning: "uri" is not found in %s' %
                                 j['name'])
                o_urn = ''
                o_name = ''
                o_source = 'Hbase'
            if not j.has_key('fields'):
                o_fields = {"doc": None}
            else:
                o_fields = {}
                for f in j['fields']:
                    o_field_name = f['name']
                    o_fields[o_field_name] = dict(f)
                acp = AvroColumnParser(j, o_urn)
                o_field_list_ += acp.get_column_list_result()
            dataset_schema_record = DatasetSchemaRecord(
                o_name, json.dumps(j, sort_keys=True),
                json.dumps(o_properties, sort_keys=True), json.dumps(o_fields),
                o_urn, o_source, 'HBase', 'Table', None, None, None)
            schema_file_writer.append(dataset_schema_record)
            for fields in o_field_list_:
                field_record = DatasetFieldRecord(fields)
                field_file_writer.append(field_record)

        field_file_writer.close()
        schema_file_writer.close()
        input_json_file.close()
# Example 2
    def transform(self, input, hive_metadata, hive_field_metadata):
        """
    convert from json to csv
    :param input: input json file
    :param hive_metadata: output data file for hive table metadata
    :param hive_field_metadata: output data file for hive field metadata
    :return:
    """
        f_json = open(input)
        all_data = json.load(f_json)
        f_json.close()

        schema_file_writer = FileWriter(hive_metadata)
        field_file_writer = FileWriter(hive_field_metadata)

        lineageInfo = LineageInfo()

        # one db info : 'type', 'database', 'tables'
        # one table info : required : 'name' , 'type', 'serializationFormat' ,'createTime', 'DB_ID', 'TBL_ID', 'SD_ID'
        #                  optional : 'schemaLiteral', 'schemaUrl', 'fieldDelimiter', 'fieldList'
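        # Illustrative shape of one db entry (values and any field names beyond those listed above are assumptions):
        #   {"type": "hive", "database": "db1",
        #    "tables": [{"name": "t1", "type": "MANAGED_TABLE", "createTime": 1400000000, ...}]}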
        for one_db_info in all_data:
            i = 0
            for table in one_db_info['tables']:
                i += 1
                schema_json = {}
                prop_json = {}  # set the prop json

                for prop_name in TableInfo.optional_prop:
                    if prop_name in table and table[prop_name] is not None:
                        prop_json[prop_name] = table[prop_name]

                if TableInfo.view_expended_text in prop_json:
                    text = prop_json[TableInfo.view_expended_text].replace(
                        '`', '')
                    array = HiveViewDependency.getViewDependency(text)
                    l = []
                    for a in array:
                        l.append(a)
                    prop_json['view_depends_on'] = l

                # process either schema
                flds = {}
                field_detail_list = []

                if TableInfo.schema_literal in table and table[
                        TableInfo.schema_literal] is not None:
                    sort_id = 0
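                    # AvroColumnParser flattens the (possibly nested) Avro schema into one field row per column, keyed by the dataset urn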
                    try:
                        schema_data = json.loads(
                            table[TableInfo.schema_literal])
                        schema_json = schema_data
                        # extract fields to field record
                        urn = "hive:///%s/%s" % (one_db_info['database'],
                                                 table['name'])
                        acp = AvroColumnParser(schema_data, urn=urn)
                        result = acp.get_column_list_result()
                        field_detail_list += result
                    except ValueError:
                        self.logger.error("Schema json error for table : \n" +
                                          str(table))

                elif TableInfo.field_list in table:
                    # Convert to avro
                    uri = "hive:///%s/%s" % (one_db_info['database'],
                                             table['name'])
                    hcp = HiveColumnParser(table, urn=uri)
                    schema_json = {
                        'fields': hcp.column_type_dict['fields'],
                        'type': 'record',
                        'name': table['name'],
                        'uri': uri
                    }
                    field_detail_list += hcp.column_type_list

                dataset_schema_record = DatasetSchemaRecord(
                    table['name'], json.dumps(schema_json),
                    json.dumps(prop_json), json.dumps(flds),
                    "hive:///%s/%s" % (one_db_info['database'], table['name']),
                    'Hive', '', (table[TableInfo.create_time] if table.has_key(
                        TableInfo.create_time) else None),
                    (table["lastAlterTime"])
                    if table.has_key("lastAlterTime") else None)
                schema_file_writer.append(dataset_schema_record)

                for fields in field_detail_list:
                    field_record = DatasetFieldRecord(fields)
                    field_file_writer.append(field_record)

            schema_file_writer.flush()
            field_file_writer.flush()
            self.logger.info("%20s contains %6d tables" %
                             (one_db_info['database'], i))

        schema_file_writer.close()
        field_file_writer.close()
# Example 3
    def transform(self, input, hive_metadata, hive_field_metadata):
        """
    convert from json to csv
    :param input: input json file
    :param hive_metadata: output data file for hive table metadata
    :param hive_field_metadata: output data file for hive field metadata
    :return:
    """
        f_json = open(input)
        all_data = json.load(f_json)
        f_json.close()

        schema_file_writer = FileWriter(hive_metadata)
        field_file_writer = FileWriter(hive_field_metadata)

        lineageInfo = LineageInfo()

        # one db info : 'type', 'database', 'tables'
        # one table info : required : 'name' , 'type', 'serializationFormat' ,'createTime', 'DB_ID', 'TBL_ID', 'SD_ID'
        #                  optional : 'schemaLiteral', 'schemaUrl', 'fieldDelimiter', 'fieldList'
        for one_db_info in all_data:
            i = 0
            for table in one_db_info['tables']:
                i += 1
                schema_json = {}
                prop_json = {}  # set the prop json

                for prop_name in TableInfo.optional_prop:
                    if prop_name in table and table[prop_name] is not None:
                        prop_json[prop_name] = table[prop_name]

                if TableInfo.view_expended_text in prop_json:
                    text = prop_json[TableInfo.view_expended_text].replace(
                        '`', '')
                    array = HiveViewDependency.getViewDependency(text)
                    l = []
                    for a in array:
                        l.append(a)
                    prop_json['view_depends_on'] = l

                # process either schema
                flds = {}
                field_detail_list = []
                if TableInfo.schema_literal in table and table[
                        TableInfo.schema_literal] is not None:
                    sort_id = 0
                    try:
                        schema_data = json.loads(
                            table[TableInfo.schema_literal])
                    except ValueError:
                        self.logger.error("Schema json error for table : \n" +
                                          str(table))
                        schema_data = {'fields': []}
                    schema_json = schema_data

                    # process each field
                    for field in schema_data['fields']:
                        field_name = field['name']
                        type = field['type']  # could be a list
                        default_value = field.get('default')
                        doc = field.get('doc')

                        attributes_json = json.loads(
                            field['attributes_json']
                        ) if 'attributes_json' in field else None
                        pk = delta = is_nullable = is_indexed = is_partitioned = inside_type = format = data_size = None
                        if attributes_json:
                            pk = attributes_json.get('pk')
                            delta = attributes_json.get('delta')
                            is_nullable = attributes_json.get('nullable')
                            inside_type = attributes_json.get('type')
                            format = attributes_json.get('format')

                        flds[field_name] = {'type': type}
                        # String urn, Integer sortId, Integer parentSortId, String parentPath, String fieldName,
                        #String dataType, String isNullable, String defaultValue, Integer dataSize, String namespace, String description
                        sort_id += 1
                        field_detail_list.append([
                            "hive:///%s/%s" %
                            (one_db_info['database'], table['name']),
                            str(sort_id), '0', None, field_name, '', type,
                            data_size, None, None, is_nullable, is_indexed,
                            is_partitioned, default_value, None,
                            json.dumps(attributes_json)
                        ])
                elif TableInfo.field_list in table:
                    schema_json = {
                        'type': 'record',
                        'name': table['name'],
                        'fields': table[TableInfo.field_list]
                    }  # construct a schema for data coming from COLUMN_V2
                    for field in table[TableInfo.field_list]:
                        field_name = field['ColumnName']
                        type = field['TypeName']
                        # ColumnName, IntegerIndex, TypeName, Comment
                        flds[field_name] = {'type': type}
                        pk = delta = is_nullable = is_indexed = is_partitioned = inside_type = format = data_size = default_value = None  # TODO ingest
                        field_detail_list.append([
                            "hive:///%s/%s" %
                            (one_db_info['database'], table['name']),
                            field['IntegerIndex'], '0', None, field_name, '',
                            field['TypeName'], None, None, None, is_nullable,
                            is_indexed, is_partitioned, default_value, None,
                            None
                        ])

                dataset_schema_record = DatasetSchemaRecord(
                    table['name'], json.dumps(schema_json),
                    json.dumps(prop_json), json.dumps(flds),
                    "hive:///%s/%s" % (one_db_info['database'], table['name']),
                    'Hive', '', (table[TableInfo.create_time] if table.has_key(
                        TableInfo.create_time) else None),
                    (table["lastAlterTime"])
                    if table.has_key("lastAlterTime") else None)
                schema_file_writer.append(dataset_schema_record)

                for fields in field_detail_list:
                    field_record = DatasetFieldRecord(fields)
                    field_file_writer.append(field_record)

            schema_file_writer.flush()
            field_file_writer.flush()
            self.logger.info("%20s contains %6d tables" %
                             (one_db_info['database'], i))

        schema_file_writer.close()
        field_file_writer.close()
# Example 4
    def transform(self, input, hive_instance, hive_metadata,
                  hive_field_metadata):
        """
    convert from json to csv
    :param input: input json file
    :param hive_metadata: output data file for hive table metadata
    :param hive_field_metadata: output data file for hive field metadata
    :return:
    """
        all_data = []
        with open(input) as input_file:
            for line in input_file:
                all_data.append(json.loads(line))

        dataset_idx = -1

        instance_file_writer = FileWriter(hive_instance)
        schema_file_writer = FileWriter(hive_metadata)
        field_file_writer = FileWriter(hive_field_metadata)

        lineageInfo = LineageInfo()
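        # depends_sql maps each referenced (db, table) to its dataset name, object type and urn prefix;
        # versioned tables whose names end in _N_N_N are collapsed into a single '_{version}' dataset,
        # and dali views (databases ending in _mp / _mp_versioned) use the 'dalids' prefix instead of 'hive'.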
        depends_sql = """
      SELECT d.NAME DB_NAME, case when t.TBL_NAME regexp '_[0-9]+_[0-9]+_[0-9]+$'
          then concat(substring(t.TBL_NAME, 1, length(t.TBL_NAME) - length(substring_index(t.TBL_NAME, '_', -3)) - 1),'_{version}')
        else t.TBL_NAME
        end dataset_name,
        concat('/', d.NAME, '/', t.TBL_NAME) object_name,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'dalids'
        else 'hive'
        end object_type,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and d.NAME not like 'dalitest%' and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'View'
        else
            case when LOCATE('view', LOWER(t.TBL_TYPE)) > 0 then 'View'
          when LOCATE('index', LOWER(t.TBL_TYPE)) > 0 then 'Index'
            else 'Table'
          end
        end object_sub_type,
        case when (d.NAME like '%\_mp' or d.NAME like '%\_mp\_versioned') and t.TBL_TYPE = 'VIRTUAL_VIEW'
          then 'dalids'
        else 'hive'
        end prefix
      FROM TBLS t JOIN DBS d on t.DB_ID = d.DB_ID
      WHERE d.NAME = '{db_name}' and t.TBL_NAME = '{table_name}'
      """

        # one db info : 'type', 'database', 'tables'
        # one table info : required : 'name' , 'type', 'serializationFormat' ,'createTime', 'DB_ID', 'TBL_ID', 'SD_ID'
        #                  optional : 'schemaLiteral', 'schemaUrl', 'fieldDelimiter', 'fieldList'
        for one_db_info in all_data:
            i = 0
            for table in one_db_info['tables']:
                i += 1
                schema_json = {}
                prop_json = {}  # set the prop json

                for prop_name in TableInfo.optional_prop:
                    if prop_name in table and table[prop_name] is not None:
                        prop_json[prop_name] = table[prop_name]

                view_expanded_text = ''

                if TableInfo.view_expended_text in prop_json:
                    view_expanded_text = prop_json[
                        TableInfo.view_expended_text]
                    text = prop_json[TableInfo.view_expended_text].replace(
                        '`',
                        '')  # this will be fixed after switching to Hive AST
                    array = []
                    try:
                        array = HiveViewDependency.getViewDependency(text)
                    except:
                        self.logger.error(
                            "HiveViewDependency.getViewDependency(%s) failed!"
                            % (table['name']))

                    l = []
                    for a in array:
                        l.append(a)
                        names = str(a).split('.')
                        if names and len(names) >= 2:
                            db_name = names[0].lower()
                            table_name = names[1].lower()
                            if db_name and table_name:
                                self.curs.execute(
                                    depends_sql.format(db_name=db_name,
                                                       table_name=table_name,
                                                       version='{version}'))
                                rows = self.curs.fetchall()
                                self.conn_hms.commit()
                                if rows and len(rows) > 0:
                                    for row_index, row_value in enumerate(rows):
                                        if one_db_info['type'].lower() == 'dalids':
                                            dependent_urn = 'dalids:///' + one_db_info['database'] + '/' + table['dataset_name']
                                        else:
                                            dependent_urn = 'hive:///' + one_db_info['database'] + '/' + table['dataset_name']
                                        dependent_record = HiveDependencyInstanceRecord(
                                            one_db_info['type'], table['type'],
                                            "/%s/%s" % (one_db_info['database'], table['name']),
                                            dependent_urn, 'depends on', 'Y',
                                            row_value[3], row_value[4], row_value[2],
                                            row_value[5] + ':///' + row_value[0] + '/' + row_value[1],
                                            '')
                                        self.instance_writer.append(dependent_record)
                    prop_json['view_depends_on'] = l
                    self.instance_writer.flush()

                # process either schema
                flds = {}
                field_detail_list = []

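                # Prefer the Avro schema literal when it is present and JSON-like; otherwise fall back to the Hive field list.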
                if TableInfo.schema_literal in table and \
                   table[TableInfo.schema_literal] is not None and \
                   table[TableInfo.schema_literal].startswith('{'):
                    sort_id = 0
                    urn = "hive:///%s/%s" % (one_db_info['database'],
                                             table['dataset_name'])
                    self.logger.info("Getting schema literal for: %s" % (urn))
                    try:
                        schema_data = json.loads(
                            table[TableInfo.schema_literal])
                        schema_json = schema_data
                        acp = AvroColumnParser(schema_data, urn=urn)
                        result = acp.get_column_list_result()
                        field_detail_list += result
                    except ValueError:
                        self.logger.error(
                            "Schema Literal JSON error for table: " +
                            str(table))

                elif TableInfo.field_list in table:
                    # Convert to avro
                    uri = "hive:///%s/%s" % (one_db_info['database'],
                                             table['dataset_name'])
                    if one_db_info['type'].lower() == 'dalids':
                        uri = "dalids:///%s/%s" % (one_db_info['database'],
                                                   table['dataset_name'])
                    else:
                        uri = "hive:///%s/%s" % (one_db_info['database'],
                                                 table['dataset_name'])
                    self.logger.info("Getting column definition for: %s" %
                                     (uri))
                    try:
                        hcp = HiveColumnParser(table, urn=uri)
                        schema_json = {
                            'fields': hcp.column_type_dict['fields'],
                            'type': 'record',
                            'name': table['name'],
                            'uri': uri
                        }
                        field_detail_list += hcp.column_type_list
                    except:
                        self.logger.error("HiveColumnParser(%s) failed!" %
                                          (uri))
                        schema_json = {
                            'fields': {},
                            'type': 'record',
                            'name': table['name'],
                            'uri': uri
                        }

                if one_db_info['type'].lower() == 'dalids':
                    dataset_urn = "dalids:///%s/%s" % (one_db_info['database'],
                                                       table['dataset_name'])
                else:
                    dataset_urn = "hive:///%s/%s" % (one_db_info['database'],
                                                     table['dataset_name'])

                if one_db_info['type'].lower() == 'dalids':
                    instance_urn = 'dalids:///' + one_db_info['database'] + '/' + table['name']
                else:
                    instance_urn = 'hive:///' + one_db_info['database'] + '/' + table['name']
                dataset_instance_record = DatasetInstanceRecord(
                    instance_urn, 'grid', '', '', '*', 0,
                    table['native_name'], table['logical_name'],
                    table['version'], table['create_time'],
                    json.dumps(schema_json), json.dumps(view_expanded_text),
                    dataset_urn)
                instance_file_writer.append(dataset_instance_record)

                if dataset_urn not in self.dataset_dict:
                    dataset_schema_record = DatasetSchemaRecord(
                        table['dataset_name'], json.dumps(schema_json),
                        json.dumps(prop_json), json.dumps(flds), dataset_urn,
                        'Hive', one_db_info['type'], table['type'], '',
                        table.get(TableInfo.create_time),
                        (int(table.get(TableInfo.source_modified_time, "0"))))
                    schema_file_writer.append(dataset_schema_record)

                    dataset_idx += 1
                    self.dataset_dict[dataset_urn] = dataset_idx

                    for fields in field_detail_list:
                        field_record = DatasetFieldRecord(fields)
                        field_file_writer.append(field_record)

            instance_file_writer.flush()
            schema_file_writer.flush()
            field_file_writer.flush()
            self.logger.info("%20s contains %6d tables" %
                             (one_db_info['database'], i))

        instance_file_writer.close()
        schema_file_writer.close()
        field_file_writer.close()
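# Example 5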
    def transform(self, input, td_metadata, td_field_metadata):
        '''
    convert from json to csv
    :param input: input json file
    :param td_metadata: output data file for teradata metadata
    :param td_field_metadata: output data file for teradata field metadata
    :return:
    '''
        f_json = open(input)
        data = json.load(f_json)
        f_json.close()

        schema_file_writer = FileWriter(td_metadata)
        field_file_writer = FileWriter(td_field_metadata)

        for d in data:
            i = 0
            for k in d.keys():
                if k not in ['tables', 'views']:
                    continue
                self.logger.info("%s %4d %s" %
                                 (datetime.datetime.now().strftime("%H:%M:%S"),
                                  len(d[k]), k))
                for t in d[k]:
                    self.logger.info("%4d %s" % (i, t['name']))
                    if t['name'] == 'HDFStoTD_2464_ERR_1':
                        continue
                    i += 1
                    output = {}
                    prop_json = {}
                    output['name'] = t['name']
                    output['original_name'] = t['original_name']

                    prop_json["createTime"] = t["createTime"] if t.has_key(
                        "createTime") else None
                    prop_json[
                        "lastAlterTime"] = t["lastAlterTime"] if t.has_key(
                            "lastAlterTime") else None
                    prop_json[
                        "lastAccessTime"] = t["lastAccessTime"] if t.has_key(
                            "lastAccessTime") else None
                    prop_json["accessCount"] = t["accessCount"] if t.has_key(
                        "accessCount") else None
                    prop_json["sizeInMbytes"] = t["sizeInMbytes"] if t.has_key(
                        "sizeInMbytes") else None
                    if "type" in t:
                        prop_json["storage_type"] = t["type"]
                    if "partition" in t:
                        prop_json["partition"] = t["partition"]
                    if "partitions" in t:
                        prop_json["partitions"] = t["partitions"]
                    if "hashKey" in t:
                        prop_json["hashKey"] = t["hashKey"]
                    if "indices" in t:
                        prop_json["indices"] = t["indices"]
                    if "referenceTables" in t:
                        prop_json["referenceTables"] = t["referenceTables"]
                    if "viewSqlText" in t:
                        prop_json["viewSqlText"] = t["viewSqlText"]

                    output['fields'] = []
                    flds = {}
                    field_detail_list = []
                    sort_id = 0
                    for c in t['columns']:
                        # output['fields'].append(
                        #                    { 'name' : t['name'].encode('latin-1'),
                        #                      'type' : None if c['data_type'] is None else c['data_type'].encode('latin-1'),
                        #                      'attributes_json' : c}
                        #                output['fields'][c['name'].encode('latin-1')].append({ "doc" : "", "type" : [None if c['data_type'] is None else c['data_type'].encode('latin-1')]})
                        sort_id += 1
                        output['fields'].append({
                            "name": c['name'],
                            "doc": '',
                            "type": c['dataType'] if c['dataType'] else None,
                            "nullable": c['nullable'],
                            "maxByteLength": c['maxByteLength'],
                            "format": c.get('columnFormat'),
                            "accessCount": c.get('accessCount'),
                            "lastAccessTime": c.get('lastAccessTime')
                        })

                        flds[c['name']] = {
                            'type': c['dataType'],
                            "maxByteLength": c['maxByteLength']
                        }

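                        # one positional row per column, keyed by the teradata:/// urn and the 1-based sort_id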
                        field_detail_list.append([
                            "teradata:///%s/%s" %
                            (d['database'], output['name']),
                            str(sort_id), '0', '', c['name'], '',
                            c['dataType'] if 'dataType' in c
                            and c['dataType'] is not None else '',
                            str(c['maxByteLength'])
                            if 'maxByteLength' in c else '0',
                            str(c['precision']) if 'precision' in c
                            and c['precision'] is not None else '',
                            str(c['scale'])
                            if 'scale' in c and c['scale'] is not None else '',
                            c['nullable'] if 'nullable' in c
                            and c['nullable'] is not None else 'Y', '', '', '',
                            '', '', '', ''
                        ])

                    dataset_schema_record = DatasetSchemaRecord(
                        output['name'], json.dumps(output),
                        json.dumps(prop_json), json.dumps(flds),
                        "teradata:///%s/%s" % (d['database'], output['name']),
                        'Teradata', output['original_name'],
                        (self.convert_timestamp(t["createTime"])
                         if t.has_key("createTime") else None),
                        (self.convert_timestamp(t["lastAlterTime"])
                         if t.has_key("lastAlterTime") else None))
                    schema_file_writer.append(dataset_schema_record)

                    for fields in field_detail_list:
                        field_record = DatasetFieldRecord(fields)
                        field_file_writer.append(field_record)

                schema_file_writer.flush()
                field_file_writer.flush()
                self.logger.info("%20s contains %6d %s" %
                                 (d['database'], i, k))

        schema_file_writer.close()
        field_file_writer.close()
# Example 6
    def transform(self, raw_metadata, metadata_output, field_metadata_output):

        # sys.setdefaultencoding("UTF-8")

        input_json_file = open(raw_metadata, 'r')
        schema_file_writer = FileWriter(metadata_output)
        field_file_writer = FileWriter(field_metadata_output)
        i = 0
        self.sort_id = 0
        o_urn = ''
        p = ''

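        # Recursive helper: walks an Avro-style field list and appends one flattened
        # row per field (including fields of nested records and array items) to output_list_.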
        def fields_json_to_csv(output_list_, parent_field_path, field_list_):
            # output_list_ : list that accumulates one row per field
            # parent_field_path : dotted path of the enclosing record ('' at the top level)
            # field_list_ : list of Avro-style field definitions
            parent_id = self.sort_id

            for f in field_list_:
                self.sort_id += 1

                o_field_name = f['name']
                o_field_data_type = ''
                o_field_data_size = None
                o_field_nullable = 'N'
                o_field_default = ''
                o_field_namespace = ''
                o_field_doc = ''
                effective_type_index_in_type = -1

                if f.has_key('namespace'):
                    o_field_namespace = f['namespace']

                if f.has_key('default') and f['default'] is not None:
                    o_field_default = f['default']

                if not f.has_key('type'):
                    o_field_data_type = None
                elif type(f['type']) == list:
                    i = effective_type_index = -1
                    for data_type in f['type']:
                        i += 1  # current index
                        if data_type is None or data_type == 'null':
                            o_field_nullable = 'Y'
                        elif type(data_type) == dict:
                            o_field_data_type = data_type['type']
                            effective_type_index_in_type = i

                            if data_type.has_key('namespace'):
                                o_field_namespace = data_type['namespace']
                            elif data_type.has_key('name'):
                                o_field_namespace = data_type['name']

                            if data_type.has_key('size'):
                                o_field_data_size = data_type['size']
                            else:
                                o_field_data_size = None

                        else:
                            o_field_data_type = data_type
                            effective_type_index_in_type = i
                elif type(f['type']) == dict:
                    o_field_data_type = f['type']['type']
                else:
                    o_field_data_type = f['type']
                    if f.has_key('attributes') and f['attributes'].has_key(
                            'nullable'):
                        o_field_nullable = 'Y' if f['attributes'][
                            'nullable'] else 'N'
                    if f.has_key('attributes') and f['attributes'].has_key(
                            'size'):
                        o_field_data_size = f['attributes']['size']

                if f.has_key('doc'):
                    if len(f['doc']) == 0 and f.has_key('attributes'):
                        o_field_doc = json.dumps(f['attributes'])
                    else:
                        o_field_doc = f['doc']
                elif f.has_key('comment'):
                    o_field_doc = f['comment']

                output_list_.append([
                    o_urn, self.sort_id, parent_id, parent_field_path,
                    o_field_name, o_field_data_type, o_field_nullable,
                    o_field_default, o_field_data_size, o_field_namespace,
                    o_field_doc.replace("\n", ' ')
                    if o_field_doc is not None else None
                ])

                # check if this field is a nested record
                if type(f['type']) == dict and f['type'].has_key('fields'):
                    current_field_path = o_field_name if parent_field_path == '' else parent_field_path + '.' + o_field_name
                    fields_json_to_csv(output_list_, current_field_path,
                                       f['type']['fields'])
                elif type(f['type']) == dict and f['type'].has_key(
                        'items') and type(
                            f['type']['items']
                        ) == dict and f['type']['items'].has_key('fields'):
                    current_field_path = o_field_name if parent_field_path == '' else parent_field_path + '.' + o_field_name
                    fields_json_to_csv(output_list_, current_field_path,
                                       f['type']['items']['fields'])

                if effective_type_index_in_type >= 0 and type(
                        f['type'][effective_type_index_in_type]) == dict:
                    if f['type'][effective_type_index_in_type].has_key(
                            'items') and type(
                                f['type'][effective_type_index_in_type]
                                ['items']) == list:

                        for item in f['type'][effective_type_index_in_type][
                                'items']:
                            if type(item) == dict and item.has_key('fields'):
                                current_field_path = o_field_name if parent_field_path == '' else parent_field_path + '.' + o_field_name
                                fields_json_to_csv(output_list_,
                                                   current_field_path,
                                                   item['fields'])
                    elif f['type'][effective_type_index_in_type].has_key(
                            'items') and f['type'][
                                effective_type_index_in_type]['items'].has_key(
                                    'fields'):
                        # type: [ null, { type: array, items: { name: xxx, type: record, fields: [] } } ]
                        current_field_path = o_field_name if parent_field_path == '' else parent_field_path + '.' + o_field_name
                        fields_json_to_csv(
                            output_list_, current_field_path, f['type']
                            [effective_type_index_in_type]['items']['fields'])
                    elif f['type'][effective_type_index_in_type].has_key(
                            'fields'):
                        # if f['type'][effective_type_index_in_type].has_key('namespace'):
                        # o_field_namespace = f['type'][effective_type_index_in_type]['namespace']
                        current_field_path = o_field_name if parent_field_path == '' else parent_field_path + '.' + o_field_name
                        fields_json_to_csv(
                            output_list_, current_field_path,
                            f['type'][effective_type_index_in_type]['fields'])

                        # End of function

        for line in input_json_file:
            try:
                j = json.loads(line)
            except:
                self.logger.error("    Invalid JSON:\n%s" % line)
                continue

            i += 1
            o_field_list_ = []
            parent_field_path = ''
            self.sort_id = 0

            if not (j.has_key('attributes_json') or j.has_key('attributes')):
                o_properties = {"doc": null}
            else:
                o_properties = {}
                if j.has_key('attributes_json'):
                    o_properties = json.loads(j['attributes_json'])
                    del j['attributes_json']
                if j.has_key('attributes'):
                    o_properties = dict(j['attributes'].items() +
                                        o_properties.items())
                    del j['attributes']

            if j.has_key('uri'):
                o_urn = j['uri']
            elif o_properties.has_key('uri'):
                o_urn = o_properties['uri']
            else:
                self.logger.info('*** Warning: "uri" is not found in %s' %
                                 j['name'])
                o_urn = ''

            if o_urn.find('hdfs://') == 0:
                o_name = o_urn[o_urn.rfind('/') + 1:]
            elif o_properties.has_key('table_name'):
                o_name = o_properties['table_name']
            elif j.has_key('name') and j['name'][0:5] != 'TUPLE':
                o_name = j['name']
            else:
                o_name = o_urn[o_urn.rfind('/') + 1:]

            if j.has_key('id') or not j.has_key('fields'):  # esWritable schema
                o_fields = {}
                for k in j:
                    if not (k == 'uri' or k == 'attributes' or k == 'doc'):
                        if type(j[k]) == list:
                            o_fields[k] = {
                                "name": k,
                                "type": 'list',
                                "doc": str(j[k])
                            }
                        elif type(j[k]) == dict:
                            o_fields[k] = {
                                "name": k,
                                "type": 'dict',
                                "doc": str(j[k])
                            }
                        else:
                            o_fields[k] = {
                                "name": k,
                                "type": j[k],
                                "doc": None
                            }

                        self.sort_id += 1
                        o_field_list_.append([
                            o_urn, self.sort_id, 0, '', k, o_fields[k]['type'],
                            '', '', '', o_fields[k]['doc'].replace("\n", ' ')
                            if o_fields[k]['doc'] is not None else None
                        ])

            elif j.has_key('fields'):
                o_fields = {}
                for f in j['fields']:
                    o_field_name = f['name']
                    o_fields[o_field_name] = dict(f)  # for schema output
                    if f.has_key('attributes_json'):
                        f['attributes'] = json.loads(f['attributes_json'])
                        del f['attributes_json']

                fields_json_to_csv(o_field_list_, '', j['fields'])

            else:
                o_fields = {"doc": None}

            if j.has_key('attributes') and not o_properties.has_key('source'):
                o_properties['source'] = j['attributes']['source']

            if o_urn.startswith(
                    'hdfs:///') and self.file_regex_source_map is not None:
                o_source = self.get_source(o_urn[7:])
            else:
                self.logger.warn(
                    "property : " + Constant.HDFS_FILE_SOURCE_MAP_KEY +
                    " is None, will use default source for all dataset")
                o_source = 'Hdfs'

            self.logger.info(
                "%4i (%6i): %4i fields, %4i total fields(including nested) found in [%s]@%s with source %s"
                % (i, len(j), len(o_fields), len(o_field_list_), o_name, o_urn,
                   o_source))

            dataset_schema_record = DatasetSchemaRecord(
                o_name, json.dumps(j, sort_keys=True),
                json.dumps(o_properties, sort_keys=True), json.dumps(o_fields),
                o_urn, o_source, None, None, None)
            schema_file_writer.append(dataset_schema_record)

            for fields in o_field_list_:
                field_record = DatasetFieldRecord(fields)
                field_file_writer.append(field_record)

        schema_file_writer.close()
        field_file_writer.close()
        input_json_file.close()
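# Example 7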
    def transform(self, raw_metadata, metadata_output, field_metadata_output):

        # sys.setdefaultencoding("UTF-8")

        input_json_file = open(raw_metadata, 'r')
        schema_file_writer = FileWriter(metadata_output)
        field_file_writer = FileWriter(field_metadata_output)
        i = 0
        self.sort_id = 0
        o_urn = ''
        p = ''

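        # Unlike the previous example, nested-field flattening is delegated to AvroColumnParser
        # (in the 'fields' branch below) instead of a local recursive helper.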
        for line in input_json_file:
            try:
                j = json.loads(line)
            except:
                self.logger.error("    Invalid JSON:\n%s" % line)
                continue

            i += 1
            o_field_list_ = []
            parent_field_path = ''
            self.sort_id = 0

            if not (j.has_key('attributes_json') or j.has_key('attributes')):
                o_properties = {"doc": null}
            else:
                o_properties = {}
                if j.has_key('attributes_json'):
                    o_properties = json.loads(j['attributes_json'])
                    del j['attributes_json']
                if j.has_key('attributes'):
                    o_properties = dict(j['attributes'].items() +
                                        o_properties.items())
                    del j['attributes']

            if j.has_key('uri'):
                o_urn = j['uri']
            elif o_properties.has_key('uri'):
                o_urn = o_properties['uri']
            else:
                self.logger.info('*** Warning: "uri" is not found in %s' %
                                 j['name'])
                o_urn = ''

            if o_urn.find('hdfs://') == 0:
                o_name = o_urn[o_urn.rfind('/') + 1:]
            elif o_properties.has_key('table_name'):
                o_name = o_properties['table_name']
            elif j.has_key('name') and j['name'][0:5] != 'TUPLE':
                o_name = j['name']
            else:
                o_name = o_urn[o_urn.rfind('/') + 1:]

            if j.has_key('id') or not j.has_key('fields'):  # esWritable schema
                o_fields = {}
                for k in j:
                    if not (k == 'uri' or k == 'attributes' or k == 'doc'):
                        if type(j[k]) == list:
                            o_fields[k] = {
                                "name": k,
                                "type": 'list',
                                "doc": str(j[k])
                            }
                        elif type(j[k]) == dict:
                            o_fields[k] = {
                                "name": k,
                                "type": 'dict',
                                "doc": str(j[k])
                            }
                        else:
                            o_fields[k] = {
                                "name": k,
                                "type": j[k],
                                "doc": None
                            }

                        self.sort_id += 1
                        o_field_list_.append([
                            o_urn, self.sort_id, 0, '', k, o_fields[k]['type'],
                            '', '', '', o_fields[k]['doc'].replace("\n", ' ')
                            if o_fields[k]['doc'] is not None else None
                        ])

            elif j.has_key('fields'):
                o_fields = {}
                for f in j['fields']:
                    o_field_name = f['name']
                    o_fields[o_field_name] = dict(f)  # for schema output
                    if f.has_key('attributes_json'):
                        f['attributes'] = json.loads(f['attributes_json'])
                        del f['attributes_json']

                acp = AvroColumnParser(j, o_urn)
                o_field_list_ += acp.get_column_list_result()

            else:
                o_fields = {"doc": None}

            if j.has_key('attributes') and not o_properties.has_key('source'):
                o_properties['source'] = j['attributes']['source']

            if o_urn.startswith(
                    'hdfs:///') and self.file_regex_source_map is not None:
                o_source = self.get_source(o_urn[7:])
            else:
                self.logger.warn(
                    "property : " + Constant.HDFS_FILE_SOURCE_MAP_KEY +
                    " is None, will use default source for all dataset")
                o_source = 'Hdfs'

            self.logger.info(
                "%4i (%6i): %4i fields, %4i total fields(including nested) found in [%s]@%s with source %s"
                % (i, len(j), len(o_fields), len(o_field_list_), o_name, o_urn,
                   o_source))

            dataset_schema_record = DatasetSchemaRecord(
                o_name, json.dumps(j, sort_keys=True),
                json.dumps(o_properties, sort_keys=True), json.dumps(o_fields),
                o_urn, o_source, None, None, None)
            schema_file_writer.append(dataset_schema_record)

            for fields in o_field_list_:
                field_record = DatasetFieldRecord(fields)
                field_file_writer.append(field_record)

        schema_file_writer.close()
        field_file_writer.close()
        input_json_file.close()