def main():
    # arguments
    from_s3 = 'from-s3'
    from_jdbc = 'from-jdbc'
    parser = argparse.ArgumentParser(prog=sys.argv[0])
    parser.add_argument('-m', '--mode', required=True, choices=[from_s3, from_jdbc],
                        help='Choose to migrate metastore either from JDBC or from S3')
    parser.add_argument('-c', '--connection-name', required=False,
                        help='Glue Connection name for Hive metastore JDBC connection')
    parser.add_argument('-R', '--region', required=False,
                        help='AWS region of target Glue DataCatalog, default to "us-east-1"')
    parser.add_argument('-d', '--database-prefix', required=False,
                        help='Optional prefix for database names in Glue DataCatalog')
    parser.add_argument('-t', '--table-prefix', required=False,
                        help='Optional prefix for table names in Glue DataCatalog')
    parser.add_argument('-D', '--database-input-path', required=False,
                        help='An S3 path containing json files of metastore database entities')
    parser.add_argument('-T', '--table-input-path', required=False,
                        help='An S3 path containing json files of metastore table entities')
    parser.add_argument('-P', '--partition-input-path', required=False,
                        help='An S3 path containing json files of metastore partition entities')

    options = get_options(parser, sys.argv)
    if options['mode'] == from_s3:
        validate_options_in_mode(
            options=options, mode=from_s3,
            required_options=['database_input_path', 'table_input_path', 'partition_input_path'],
            not_allowed_options=['database_prefix', 'table_prefix']
        )
    elif options['mode'] == from_jdbc:
        validate_options_in_mode(
            options=options, mode=from_jdbc,
            required_options=['connection_name'],
            not_allowed_options=['database_input_path', 'table_input_path', 'partition_input_path']
        )
    else:
        raise AssertionError('unknown mode ' + options['mode'])

    validate_aws_regions(options['region'])

    # spark env
    (conf, sc, sql_context) = get_spark_env()
    glue_context = GlueContext(sc)

    # launch job
    if options['mode'] == from_s3:
        metastore_import_from_s3(
            sql_context=sql_context,
            glue_context=glue_context,
            db_input_dir=options['database_input_path'],
            tbl_input_dir=options['table_input_path'],
            parts_input_dir=options['partition_input_path'],
            datacatalog_name='datacatalog',
            region=options.get('region') or 'us-east-1'
        )
    elif options['mode'] == from_jdbc:
        metastore_full_migration(
            sc=sc,
            sql_context=sql_context,
            glue_context=glue_context,
            connection=glue_context.extract_jdbc_conf(options['connection_name']),
            db_prefix=options.get('database_prefix') or '',
            table_prefix=options.get('table_prefix') or '',
            datacatalog_name='datacatalog',
            region=options.get('region') or 'us-east-1'
        )
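# The option helpers called above (get_options, validate_options_in_mode) are
# defined elsewhere in this codebase and are not shown here. The sketches
# below are assumptions inferred from the call sites, not the actual
# implementations: get_options() parses argv into a plain dict, and
# validate_options_in_mode() enforces per-mode required/forbidden flags.


def get_options(parser, args):
    # Minimal sketch: parse known args only, so extra Glue job arguments are
    # tolerated, and return the result as a dict keyed by option name.
    parsed, _unknown = parser.parse_known_args(args[1:])
    return vars(parsed)


def validate_options_in_mode(options, mode, required_options, not_allowed_options):
    # Minimal sketch: fail fast with a clear message when the flag
    # combination does not match the chosen --mode.
    for opt in required_options:
        if options.get(opt) is None:
            raise AssertionError('Option %s is required for mode %s' % (opt, mode))
    for opt in not_allowed_options:
        if options.get(opt) is not None:
            raise AssertionError('Option %s is not allowed in mode %s' % (opt, mode))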
def main():
    to_s3 = 'to-s3'
    to_jdbc = 'to-jdbc'
    parser = argparse.ArgumentParser(prog=sys.argv[0])
    parser.add_argument('-m', '--mode', required=True, choices=[to_s3, to_jdbc],
                        help='Choose to migrate from datacatalog to s3 or to metastore')
    parser.add_argument('--database-names', required=True,
                        help='Semicolon-separated list of database names in Datacatalog to export')
    parser.add_argument('-o', '--output-path', required=False,
                        help='Output path, either local directory or S3 path')
    parser.add_argument('-c', '--connection-name', required=False,
                        help='Glue Connection name for Hive metastore JDBC connection')
    parser.add_argument('-R', '--region', required=False,
                        help='AWS region of source Glue DataCatalog, default to "us-east-1"')

    options = get_options(parser, sys.argv)
    if options['mode'] == to_s3:
        validate_options_in_mode(
            options=options, mode=to_s3,
            required_options=['output_path'],
            not_allowed_options=['connection_name']
        )
    elif options['mode'] == to_jdbc:
        validate_options_in_mode(
            options=options, mode=to_jdbc,
            required_options=['connection_name'],
            not_allowed_options=['output_path']
        )
    else:
        raise AssertionError('unknown mode ' + options['mode'])

    validate_aws_regions(options['region'])

    # spark env
    (conf, sc, sql_context) = get_spark_env()
    glue_context = GlueContext(sc)

    # extract from datacatalog reader
    database_arr = options['database_names'].split(';')

    (databases, tables, partitions) = read_databases_from_catalog(
        sql_context=sql_context,
        glue_context=glue_context,
        datacatalog_name='datacatalog',
        database_arr=database_arr,
        region=options.get('region') or 'us-east-1'
    )

    if options['mode'] == to_s3:
        output_path = get_output_dir(options['output_path'])
        datacatalog_migrate_to_s3(
            databases=databases,
            tables=tables,
            partitions=partitions,
            output_path=output_path
        )
    elif options['mode'] == to_jdbc:
        connection_name = options['connection_name']
        datacatalog_migrate_to_hive_metastore(
            sc=sc,
            sql_context=sql_context,
            databases=databases,
            tables=tables,
            partitions=partitions,
            connection=glue_context.extract_jdbc_conf(connection_name)
        )
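# Example invocations for the exporter above. The script name, bucket, and
# connection name are placeholders; the flags themselves come from the
# argparse definitions in main():
#
#   spark-submit export_script.py --mode to-s3 \
#       --database-names 'db1;db2' \
#       --output-path s3://my-bucket/metastore-export/ \
#       --region us-east-1
#
#   spark-submit export_script.py --mode to-jdbc \
#       --database-names 'db1' \
#       --connection-name my-hive-metastore-connection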
def main():
    # arguments
    from_s3 = 'from-s3'
    from_jdbc = 'from-jdbc'
    parser = argparse.ArgumentParser(prog=sys.argv[0])
    parser.add_argument('-m', '--mode', required=True, choices=[from_s3, from_jdbc],
                        help='Choose to migrate metastore either from JDBC or from S3')
    parser.add_argument('-c', '--connection-name', required=False,
                        help='Glue Connection name for Hive metastore JDBC connection')
    parser.add_argument('-d', '--database-prefix', required=False,
                        help='Optional prefix for database names in Glue DataCatalog')
    parser.add_argument('-t', '--table-prefix', required=False,
                        help='Optional prefix for table names in Glue DataCatalog')
    parser.add_argument('-D', '--database-input-path', required=False,
                        help='An S3 path containing json files of metastore database entities')
    parser.add_argument('-T', '--table-input-path', required=False,
                        help='An S3 path containing json files of metastore table entities')
    parser.add_argument('-P', '--partition-input-path', required=False,
                        help='An S3 path containing json files of metastore partition entities')

    options = get_options(parser, sys.argv)
    if options['mode'] == from_s3:
        validate_options_in_mode(
            options=options, mode=from_s3,
            required_options=['database_input_path', 'table_input_path', 'partition_input_path'],
            not_allowed_options=['database_prefix', 'table_prefix']
        )
    elif options['mode'] == from_jdbc:
        validate_options_in_mode(
            options=options, mode=from_jdbc,
            required_options=['connection_name'],
            not_allowed_options=['database_input_path', 'table_input_path', 'partition_input_path']
        )
    else:
        raise AssertionError('unknown mode ' + options['mode'])

    # spark env
    (conf, sc, sql_context) = get_spark_env()
    glue_context = GlueContext(sc)

    # launch job
    if options['mode'] == from_s3:
        metastore_import_from_s3(
            sql_context=sql_context,
            glue_context=glue_context,
            db_input_dir=options['database_input_path'],
            tbl_input_dir=options['table_input_path'],
            parts_input_dir=options['partition_input_path'],
            datacatalog_name='datacatalog'
        )
    elif options['mode'] == from_jdbc:
        # dict.has_key() is Python 2 only; use .get() so this also runs on Python 3
        metastore_full_migration(
            sc=sc,
            sql_context=sql_context,
            glue_context=glue_context,
            connection=glue_context.extract_jdbc_conf(options['connection_name']),
            db_prefix=options.get('database_prefix') or '',
            table_prefix=options.get('table_prefix') or '',
            datacatalog_name='datacatalog'
        )
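# For reference, GlueContext.extract_jdbc_conf(connection_name) resolves the
# named Glue Connection into a plain dict of JDBC settings. The exact keys can
# vary by Glue version; a typical result looks roughly like the illustrative
# sketch below (values are made up):
#
#   {
#       'url': 'jdbc:mysql://host:3306',
#       'user': 'hive',
#       'password': '...',
#       'vendor': 'mysql',
#       'enforceSSL': 'false',
#   }
#
# which is why metastore_full_migration() above can consume the result
# directly as its connection argument.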
def main():
    to_s3 = 'to-s3'
    to_jdbc = 'to-jdbc'
    parser = argparse.ArgumentParser(prog=sys.argv[0])
    parser.add_argument('-m', '--mode', required=True, choices=[to_s3, to_jdbc],
                        help='Choose to migrate from datacatalog to s3 or to metastore')
    parser.add_argument('--database-names', required=True,
                        help='Comma-separated list of database names in Datacatalog to export, '
                             'or ALL to export every database')
    parser.add_argument('-o', '--output-path', required=False,
                        help='Output path, either local directory or S3 path')
    parser.add_argument('-c', '--connection-name', required=False,
                        help='Glue Connection name for Hive metastore JDBC connection')
    parser.add_argument('-R', '--region', required=False,
                        help='AWS region of source Glue DataCatalog, default to "us-east-1"')
    parser.add_argument('-l', '--latest', required=False, action='store_true',
                        help='Copy the export folder to a latest/ folder (overwriting)')

    options = get_options(parser, sys.argv)
    if options['mode'] == to_s3:
        validate_options_in_mode(
            options=options, mode=to_s3,
            required_options=['output_path'],
            not_allowed_options=['connection_name']
        )
    elif options['mode'] == to_jdbc:
        validate_options_in_mode(
            options=options, mode=to_jdbc,
            required_options=['connection_name'],
            not_allowed_options=['output_path']
        )
    else:
        raise AssertionError('unknown mode ' + options['mode'])

    validate_aws_regions(options['region'])
    client = boto3.client('glue', region_name=options['region'])

    # spark env
    (conf, sc, sql_context) = get_spark_env()
    glue_context = GlueContext(sc)

    # extract from datacatalog reader
    database_arr = options['database_names'].split(',')

    if database_arr[0] == 'ALL':
        # get the database names from glue
        resp = client.get_databases()
        if resp.get('DatabaseList'):
            database_arr = [db['Name'] for db in resp['DatabaseList']]
        else:
            # fall back to the default database when no databases are returned
            database_arr = ['default']

    (databases, tables, partitions) = read_databases_from_catalog(
        sql_context=sql_context,
        glue_context=glue_context,
        datacatalog_name='datacatalog',
        database_arr=database_arr,
        region=options.get('region') or 'us-east-1'
    )

    if options['mode'] == to_s3:
        output_path = get_output_dir(options['output_path'])
        datacatalog_migrate_to_s3(
            databases=databases,
            tables=tables,
            partitions=partitions,
            output_path=output_path
        )
        if options['latest']:
            output_path = get_output_dir(options['output_path'], 'latest')
            datacatalog_migrate_to_s3(
                databases=databases,
                tables=tables,
                partitions=partitions,
                output_path=output_path
            )
    elif options['mode'] == to_jdbc:
        connection_name = options['connection_name']
        datacatalog_migrate_to_hive_metastore(
            sc=sc,
            sql_context=sql_context,
            databases=databases,
            tables=tables,
            partitions=partitions,
            connection=glue_context.extract_jdbc_conf(connection_name)
        )
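# Note: client.get_databases() in the ALL branch above returns a single page
# of results, so a catalog with many databases may be exported only partially.
# A minimal sketch of a paginated listing with boto3's built-in paginator,
# assuming the same `client` object (list_all_database_names is a hypothetical
# helper introduced here for illustration):


def list_all_database_names(client):
    # Walk every page of the GetDatabases API and collect the names.
    names = []
    paginator = client.get_paginator('get_databases')
    for page in paginator.paginate():
        names.extend(db['Name'] for db in page['DatabaseList'])
    return names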