# Imports assumed by these excerpts (module-level in the original file;
# boto_util and hive_mysql_connector are local helper modules, and
# g_logger is the module-level logger).
import datetime
import sys
import time

import boto

import boto_util
import hive_mysql_connector


def wait_for_data(wait_for_config, options):
    """Wait for data before kicking off hive jobs"""
    # Step 1 - read metadata.
    hive_mysql_connector.configure(options.hive_masternode,
                                   options.ssh_keyfile)
    if options.hive_init:
        hive_mysql_connector.run_hive_init()

    # Step 2 - wait until all the data partitions are available
    boto_util.initialize_creds_from_file()
    s3conn = boto.connect_s3()
    s3bucket = s3conn.get_bucket('ka-mapreduce')
    max_wait = datetime.timedelta(hours=options.max_wait)
    start = datetime.datetime.now()
    for d in wait_for_config:
        table = d['table_name']
        table_location = hive_mysql_connector.get_table_location(table)
        for p in d['partitions']:
            partition_location = table_location + '/' + p
            # TODO(yunfang): abstract the following to wait_for_partition
            #                for boto_util
            while True:
                if partition_available(s3bucket, partition_location):
                    g_logger.info("%s is available" % partition_location)
                    break
                if (datetime.datetime.now() - start) > max_wait:
                    # We've already waited too long. Give up.
                    g_logger.fatal("Waited too long and the data is "
                                   "still not available. Exiting...")
                    sys.exit(1)
                # Wait a minute before checking again
                g_logger.info("Waiting for %s to be available..." %
                              partition_location)
                time.sleep(60)
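
# The partition_available() helper called above isn't shown in this
# excerpt. Below is a minimal sketch of what it plausibly does, assuming a
# partition counts as ready once any key exists under its S3 prefix (the
# prefix-stripping and readiness test are assumptions, not the original
# implementation):
def partition_available(s3bucket, partition_location):
    """Return True if any S3 key exists under the partition's prefix."""
    # partition_location is a full s3://ka-mapreduce/... URL, but boto's
    # bucket.list() expects only the key prefix within the bucket.
    prefix = partition_location.split('s3://ka-mapreduce/')[-1]
    # bucket.list() returns a lazy iterator; stop at the first key found.
    return any(True for _ in s3bucket.list(prefix=prefix))


# For reference, wait_for_config is expected to look something like this
# (hypothetical table and partition values):
#
# wait_for_config = [
#     {'table_name': 'user_daily_activity',
#      'partitions': ['dt=2013-05-01', 'dt=2013-05-02']},
# ]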
if __name__ == '__main__':
    start_dt = datetime.datetime.now()
    # Named arguments appear as properties on the options object;
    # unnamed arguments appear as elements in the args list.
    options, args = parse_command_line_args()

    # Step 1 - read metadata.
    hive_masternode = args[0]  # Generally 'ka-hive'
    hive_mysql_connector.configure(hive_masternode, options.ssh_keyfile)
    if options.hive_init:
        hive_mysql_connector.run_hive_init()

    table_name = args[1]  # Generally the same as target_collection (args[3])
    print "Fetching table info..."
    table_location = hive_mysql_connector.get_table_location(table_name)
    if not table_location:
        raise Exception("Can't read info about %s in Hive master %s"
                        % (table_name, hive_masternode))
    if not table_location.startswith('s3://ka-mapreduce/'):
        raise Exception("Can only import from s3://ka-mapreduce for now")
    column_info = hive_mysql_connector.get_table_columns(table_name)

    target_db = args[2]          # Always 'report'
    target_collection = args[3]  # Generally the same as table_name (args[1])
    partition_cols = args[4:]    # Something like ['dt=2013-05', ...]
    # TODO(benkomalo): prompt/dry-run flags?

    # Step 2 - print locations
    print_locations(table_location, column_info, partition_cols, target_db,
                    target_collection)
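
# A hypothetical invocation of the script above, assuming it's saved as
# hive_to_mongo.py (the actual filename isn't shown in this excerpt). The
# positional args are hive_masternode, table_name, target_db,
# target_collection, then any partition columns:
#
#   python hive_to_mongo.py ka-hive user_daily_activity report \
#       user_daily_activity dt=2013-05-01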