def main():
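    """Import a Hive metastore into the AWS Glue Data Catalog.

    The source is either a set of JSON entity dumps on S3 (from-s3 mode) or a
    live metastore reached through a Glue JDBC connection (from-jdbc mode).
    Helpers such as get_options, validate_options_in_mode and
    metastore_import_from_s3 are assumed to be defined elsewhere in this script.
    """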
    # arguments
    from_s3 = 'from-s3'
    from_jdbc = 'from-jdbc'
    parser = argparse.ArgumentParser(prog=sys.argv[0])
    parser.add_argument('-m', '--mode', required=True, choices=[from_s3, from_jdbc], help='Choose to migrate metastore either from JDBC or from S3')
    parser.add_argument('-c', '--connection-name', required=False, help='Glue Connection name for Hive metastore JDBC connection')
    parser.add_argument('-R', '--region', required=False, help='AWS region of target Glue DataCatalog, defaults to "us-east-1"')
    parser.add_argument('-d', '--database-prefix', required=False, help='Optional prefix for database names in Glue DataCatalog')
    parser.add_argument('-t', '--table-prefix', required=False, help='Optional prefix for table names in Glue DataCatalog')
    parser.add_argument('-D', '--database-input-path', required=False, help='An S3 path containing json files of metastore database entities')
    parser.add_argument('-T', '--table-input-path', required=False, help='An S3 path containing json files of metastore table entities')
    parser.add_argument('-P', '--partition-input-path', required=False, help='An S3 path containing json files of metastore partition entities')
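    # -D/-T/-P only apply in from-s3 mode and -c only in from-jdbc mode; the
    # mode-specific validation below enforces this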

    options = get_options(parser, sys.argv)
    if options['mode'] == from_s3:
        validate_options_in_mode(
            options=options, mode=from_s3,
            required_options=['database_input_path', 'table_input_path', 'partition_input_path'],
            not_allowed_options=['database_prefix', 'table_prefix']
        )
    elif options['mode'] == from_jdbc:
        validate_options_in_mode(
            options=options, mode=from_jdbc,
            required_options=['connection_name'],
            not_allowed_options=['database_input_path', 'table_input_path', 'partition_input_path']
        )
    else:
        raise AssertionError('unknown mode ' + options['mode'])

    validate_aws_regions(options['region'])

    # spark env
    (conf, sc, sql_context) = get_spark_env()
    glue_context = GlueContext(sc)

    # launch job
    if options['mode'] == from_s3:
        metastore_import_from_s3(
            sql_context=sql_context,
            glue_context=glue_context,
            db_input_dir=options['database_input_path'],
            tbl_input_dir=options['table_input_path'],
            parts_input_dir=options['partition_input_path'],
            datacatalog_name='datacatalog',
            region=options.get('region') or 'us-east-1'
        )
    elif options['mode'] == from_jdbc:
        # extract_jdbc_conf returns the connection properties (url, user,
        # password, ...) stored in the named Glue Connection; extract once and
        # reuse below
        jdbc_conf = glue_context.extract_jdbc_conf(options['connection_name'])
        metastore_full_migration(
            sc=sc,
            sql_context=sql_context,
            glue_context=glue_context,
            connection=jdbc_conf,
            db_prefix=options.get('database_prefix') or '',
            table_prefix=options.get('table_prefix') or '',
            datacatalog_name='datacatalog',
            region=options.get('region') or 'us-east-1'
        )
def main():
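    """Export the AWS Glue Data Catalog to S3 or to a Hive metastore.

    Reads the listed databases from the Data Catalog and writes them either as
    JSON files to an output path (to-s3 mode) or into a Hive metastore over a
    Glue JDBC connection (to-jdbc mode). Helpers such as
    read_databases_from_catalog and datacatalog_migrate_to_s3 are assumed to be
    defined elsewhere in this script.
    """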
    to_s3 = 'to-s3'
    to_jdbc = 'to-jdbc'
    parser = argparse.ArgumentParser(prog=sys.argv[0])
    parser.add_argument(
        '-m',
        '--mode',
        required=True,
        choices=[to_s3, to_jdbc],
        help='Choose to migrate from datacatalog to s3 or to metastore')
    parser.add_argument(
        '--database-names',
        required=True,
        help=
        'Semicolon-separated list of database names in the Data Catalog to export'
    )
    parser.add_argument('-o',
                        '--output-path',
                        required=False,
                        help='Output path, either local directory or S3 path')
    parser.add_argument(
        '-c',
        '--connection-name',
        required=False,
        help='Glue Connection name for Hive metastore JDBC connection')
    parser.add_argument(
        '-R',
        '--region',
        required=False,
        help='AWS region of source Glue DataCatalog, defaults to "us-east-1"')

    options = get_options(parser, sys.argv)
    if options['mode'] == to_s3:
        validate_options_in_mode(options=options,
                                 mode=to_s3,
                                 required_options=['output_path'],
                                 not_allowed_options=['connection_name'])
    elif options['mode'] == to_jdbc:
        validate_options_in_mode(options=options,
                                 mode=to_jdbc,
                                 required_options=['connection_name'],
                                 not_allowed_options=['output_path'])
    else:
        raise AssertionError('unknown mode ' + options['mode'])

    validate_aws_regions(options['region'])

    # spark env
    (conf, sc, sql_context) = get_spark_env()
    glue_context = GlueContext(sc)

    # extract from datacatalog reader
    database_arr = options['database_names'].split(';')

    (databases, tables,
     partitions) = read_databases_from_catalog(sql_context=sql_context,
                                               glue_context=glue_context,
                                               datacatalog_name='datacatalog',
                                               database_arr=database_arr,
                                               region=options.get('region')
                                               or 'us-east-1')
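    # databases, tables and partitions are assumed to be Spark DataFrames of
    # catalog entities; below they are either serialized to S3 as JSON or
    # pushed into the Hive metastore over JDBC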

    if options['mode'] == to_s3:
        output_path = get_output_dir(options['output_path'])
        datacatalog_migrate_to_s3(databases=databases,
                                  tables=tables,
                                  partitions=partitions,
                                  output_path=output_path)
    elif options['mode'] == to_jdbc:
        connection_name = options['connection_name']
        datacatalog_migrate_to_hive_metastore(
            sc=sc,
            sql_context=sql_context,
            databases=databases,
            tables=tables,
            partitions=partitions,
            connection=glue_context.extract_jdbc_conf(connection_name))
def main():
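    """Variant of the metastore importer above without a --region option.

    Reads metastore entities either from JSON dumps on S3 or over a Glue JDBC
    connection and writes them into the Glue Data Catalog; shared helpers such
    as metastore_full_migration are assumed to be defined elsewhere in this
    script.
    """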
    # arguments
    from_s3 = 'from-s3'
    from_jdbc = 'from-jdbc'
    parser = argparse.ArgumentParser(prog=sys.argv[0])
    parser.add_argument(
        '-m',
        '--mode',
        required=True,
        choices=[from_s3, from_jdbc],
        help='Choose to migrate metastore either from JDBC or from S3')
    parser.add_argument(
        '-c',
        '--connection-name',
        required=False,
        help='Glue Connection name for Hive metastore JDBC connection')
    parser.add_argument(
        '-d',
        '--database-prefix',
        required=False,
        help='Optional prefix for database names in Glue DataCatalog')
    parser.add_argument(
        '-t',
        '--table-prefix',
        required=False,
        help='Optional prefix for table names in Glue DataCatalog')
    parser.add_argument(
        '-D',
        '--database-input-path',
        required=False,
        help='An S3 path containing json files of metastore database entities')
    parser.add_argument(
        '-T',
        '--table-input-path',
        required=False,
        help='An S3 path containing json files of metastore table entities')
    parser.add_argument(
        '-P',
        '--partition-input-path',
        required=False,
        help='An S3 path containing json files of metastore partition entities'
    )

    options = get_options(parser, sys.argv)
    if options['mode'] == from_s3:
        validate_options_in_mode(
            options=options,
            mode=from_s3,
            required_options=[
                'database_input_path', 'table_input_path',
                'partition_input_path'
            ],
            not_allowed_options=['database_prefix', 'table_prefix'])
    elif options['mode'] == from_jdbc:
        validate_options_in_mode(options=options,
                                 mode=from_jdbc,
                                 required_options=['connection_name'],
                                 not_allowed_options=[
                                     'database_input_path', 'table_input_path',
                                     'partition_input_path'
                                 ])
    else:
        raise AssertionError('unknown mode ' + options['mode'])

    # spark env
    (conf, sc, sql_context) = get_spark_env()
    glue_context = GlueContext(sc)

    # launch job
    if options['mode'] == from_s3:
        metastore_import_from_s3(
            sql_context=sql_context,
            glue_context=glue_context,
            db_input_dir=options['database_input_path'],
            tbl_input_dir=options['table_input_path'],
            parts_input_dir=options['partition_input_path'],
            datacatalog_name='datacatalog')
    elif options['mode'] == from_jdbc:
        # dict.has_key() only exists on Python 2; use .get() so the script also
        # runs on Python 3, and reuse the extracted JDBC connection properties
        jdbc_conf = glue_context.extract_jdbc_conf(options['connection_name'])
        metastore_full_migration(
            sc=sc,
            sql_context=sql_context,
            glue_context=glue_context,
            connection=jdbc_conf,
            db_prefix=options.get('database_prefix') or '',
            table_prefix=options.get('table_prefix') or '',
            datacatalog_name='datacatalog')
def main():
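    """Export the Glue Data Catalog to S3 or to a Hive metastore.

    Variant of the exporter above that also accepts ALL as the database list
    (expanded via the boto3 Glue client) and an optional --latest flag that
    writes a second copy of the export to a latest/ folder.
    """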
    to_s3 = 'to-s3'
    to_jdbc = 'to-jdbc'
    parser = argparse.ArgumentParser(prog=sys.argv[0])
    parser.add_argument(
        '-m',
        '--mode',
        required=True,
        choices=[to_s3, to_jdbc],
        help='Choose to migrate from datacatalog to s3 or to metastore')
    parser.add_argument(
        '--database-names',
        required=True,
        help='Comma-separated list of database names in the Data Catalog to '
        'export, or ALL to export every database')
    parser.add_argument('-o',
                        '--output-path',
                        required=False,
                        help='Output path, either local directory or S3 path')
    parser.add_argument(
        '-c',
        '--connection-name',
        required=False,
        help='Glue Connection name for Hive metastore JDBC connection')
    parser.add_argument(
        '-R',
        '--region',
        required=False,
        help='AWS region of source Glue DataCatalog, defaults to "us-east-1"')
    parser.add_argument(
        '-l',
        '--latest',
        required=False,
        action='store_true',
        help='Copy the export folder to a latest/ folder (overwriting)')

    options = get_options(parser, sys.argv)
    if options['mode'] == to_s3:
        validate_options_in_mode(options=options,
                                 mode=to_s3,
                                 required_options=['output_path'],
                                 not_allowed_options=['connection_name'])
    elif options['mode'] == to_jdbc:
        validate_options_in_mode(options=options,
                                 mode=to_jdbc,
                                 required_options=['connection_name'],
                                 not_allowed_options=['output_path'])
    else:
        raise AssertionError('unknown mode ' + options['mode'])

    validate_aws_regions(options['region'])
    client = boto3.client('glue', region_name=options['region'])
    # spark env
    (conf, sc, sql_context) = get_spark_env()
    glue_context = GlueContext(sc)
    # extract from datacatalog reader
    database_arr = options['database_names'].split(',')
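    # 'ALL' expands to every database returned by the Glue API; note that
    # get_databases() is paginated, so accounts with many databases may need a
    # paginator here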
    if database_arr[0] == 'ALL':
        # get the database names from glue
        resp = client.get_databases()
        if resp.get('DatabaseList'):
            database_arr = [db['Name'] for db in resp['DatabaseList']]
        else:
            # trying to add a default database if there is an error
            database_arr = ['default']

    (databases, tables,
     partitions) = read_databases_from_catalog(sql_context=sql_context,
                                               glue_context=glue_context,
                                               datacatalog_name='datacatalog',
                                               database_arr=database_arr,
                                               region=options.get('region')
                                               or 'us-east-1')

    if options['mode'] == to_s3:
        output_path = get_output_dir(options['output_path'])
        datacatalog_migrate_to_s3(databases=databases,
                                  tables=tables,
                                  partitions=partitions,
                                  output_path=output_path)
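        # --latest writes a second, overwritable copy of the export under a
        # latest/ folder, e.g. so downstream jobs can read from a stable path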
        if options['latest']:
            output_path = get_output_dir(options['output_path'], 'latest')
            datacatalog_migrate_to_s3(databases=databases,
                                      tables=tables,
                                      partitions=partitions,
                                      output_path=output_path)
    elif options['mode'] == to_jdbc:
        connection_name = options['connection_name']
        datacatalog_migrate_to_hive_metastore(
            sc=sc,
            sql_context=sql_context,
            databases=databases,
            tables=tables,
            partitions=partitions,
            connection=glue_context.extract_jdbc_conf(connection_name))
def main():
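    """Export the Glue Data Catalog to S3 or to a Hive metastore.

    Same flow as the exporter above, with the databases given as a single
    semicolon-separated --database-names value; shared helpers such as
    read_databases_from_catalog are assumed to be defined elsewhere in this
    script.
    """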
    to_s3 = 'to-s3'
    to_jdbc = 'to-jdbc'
    parser = argparse.ArgumentParser(prog=sys.argv[0])
    parser.add_argument('-m', '--mode', required=True, choices=[to_s3, to_jdbc], help='Choose to migrate from datacatalog to s3 or to metastore')
    parser.add_argument('--database-names', required=True, help='Semicolon-separated list of database names in the Data Catalog to export')
    parser.add_argument('-o', '--output-path', required=False, help='Output path, either local directory or S3 path')
    parser.add_argument('-c', '--connection-name', required=False, help='Glue Connection name for Hive metastore JDBC connection')
    parser.add_argument('-R', '--region', required=False, help='AWS region of source Glue DataCatalog, defaults to "us-east-1"')

    options = get_options(parser, sys.argv)
    if options['mode'] == to_s3:
        validate_options_in_mode(
            options=options, mode=to_s3,
            required_options=['output_path'],
            not_allowed_options=['connection_name']
        )
    elif options['mode'] == to_jdbc:
        validate_options_in_mode(
            options=options, mode=to_jdbc,
            required_options=['connection_name'],
            not_allowed_options=['output_path']
        )
    else:
        raise AssertionError('unknown mode ' + options['mode'])

    validate_aws_regions(options['region'])

    # spark env
    (conf, sc, sql_context) = get_spark_env()
    glue_context = GlueContext(sc)

    # extract from datacatalog reader
    database_arr = options['database_names'].split(';')
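    # e.g. --database-names 'sales;marketing' yields ['sales', 'marketing']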

    (databases, tables, partitions) = read_databases_from_catalog(
        sql_context=sql_context,
        glue_context=glue_context,
        datacatalog_name='datacatalog',
        database_arr=database_arr,
        region=options.get('region') or 'us-east-1'
    )

    if options['mode'] == to_s3:
        output_path = get_output_dir(options['output_path'])
        datacatalog_migrate_to_s3(
            databases=databases,
            tables=tables,
            partitions=partitions,
            output_path=output_path
        )
    elif options['mode'] == to_jdbc:
        connection_name = options['connection_name']
        datacatalog_migrate_to_hive_metastore(
            sc=sc,
            sql_context=sql_context,
            databases=databases,
            tables=tables,
            partitions=partitions,
            connection=glue_context.extract_jdbc_conf(connection_name)
        )