import datetime
import sys
import time

import boto

# boto_util, hive_mysql_connector, and the module-level logger g_logger are
# project helpers defined elsewhere in this repository.


def wait_for_data(wait_for_config, options):
    """Wait for data to land in S3 before kicking off Hive jobs."""
    # Step 1 - read the table metadata from the Hive master.
    hive_mysql_connector.configure(options.hive_masternode,
                                   options.ssh_keyfile)

    if options.hive_init:
        hive_mysql_connector.run_hive_init()
    # Step 2 - wait until all the data partitions are available.
    boto_util.initialize_creds_from_file()
    s3conn = boto.connect_s3()
    s3bucket = s3conn.get_bucket('ka-mapreduce')
    max_wait = datetime.timedelta(hours=options.max_wait)
    start = datetime.datetime.now()
    for d in wait_for_config:
        table = d['table_name']
        table_location = hive_mysql_connector.get_table_location(table)
        for p in d['partitions']:
            partition_location = table_location + '/' + p
            #TODO(yunfang): abstract the following polling loop into a
            #               wait_for_partition helper in boto_util
            #               (a sketch of one follows this function).
            while True:
                if partition_available(s3bucket, partition_location):
                    g_logger.info("%s is available" % (partition_location))
                    break
                if (datetime.datetime.now() - start) > max_wait:
                    # We've already waited long enough; give up.
                    g_logger.fatal("Waited too long and the data is still "
                                   "not available. Exiting...")
                    sys.exit(1)
                # Wait a minute before checking again.
                g_logger.info("Waiting for %s to be available..." % (
                              partition_location))
                time.sleep(60)
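
The TODO above suggests factoring the polling loop into boto_util. Below is a minimal sketch of what such a helper could look like; the name wait_for_partition, the poll_seconds and logger parameters, and this stand-in partition_available() (the repo defines its own) are illustrative assumptions, not the actual boto_util API.

import datetime
import time


def partition_available(s3bucket, partition_location):
    """Return True if at least one key exists under the partition prefix.

    Hypothetical stand-in for the repo's own partition_available().
    partition_location is an s3://ka-mapreduce/... URL, so strip the scheme
    and bucket before listing keys relative to the bucket.
    """
    prefix = partition_location.split('ka-mapreduce/', 1)[-1]
    for _key in s3bucket.list(prefix=prefix):
        return True
    return False


def wait_for_partition(s3bucket, partition_location, max_wait,
                       poll_seconds=60, logger=None):
    """Block until the partition shows up in S3 or max_wait elapses.

    Returns True if the partition became available, False on timeout.
    """
    start = datetime.datetime.now()
    while not partition_available(s3bucket, partition_location):
        if (datetime.datetime.now() - start) > max_wait:
            return False
        if logger:
            logger.info("Waiting for %s to be available..." %
                        partition_location)
        time.sleep(poll_seconds)
    return True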
Example #3
        options.report_db_host,
        options.report_db_port or '[default_mongo_port]',
        target_db, target_collection)
    print "\nOutput:"
    print "\t%s" % mongo_path


if __name__ == '__main__':
    start_dt = datetime.datetime.now()

    # Named arguments appear as properties on the options object;
    # unnamed arguments appear as elements in the args list.
    # (A hypothetical sketch of this parser follows the snippet.)
    options, args = parse_command_line_args()

    # Step 1 - read table metadata from the Hive master.
    hive_masternode = args[0]  # Generally, 'ka-hive'
    hive_mysql_connector.configure(hive_masternode, options.ssh_keyfile)

    if options.hive_init:
        hive_mysql_connector.run_hive_init()

    table_name = args[1]  # Generally the same as target_collection (args[3])
    print "Fetching table info..."
    table_location = hive_mysql_connector.get_table_location(table_name)

    if not table_location:
        raise Exception("Can't read info about %s in Hive master %s" %
                        (hive_masternode, table_name))
    if not table_location.startswith('s3://ka-mapreduce/'):
        raise Exception("Can only import from s3://ka-mapreduce for now")
    column_info = hive_mysql_connector.get_table_columns(table_name)
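
The parse_command_line_args() helper used at the top of the __main__ block is not shown in this snippet. Below is a minimal optparse-based sketch consistent with the options and positional arguments referenced above; the flag spellings, defaults, and the assumption that args[2] is the target database are guesses, not the repo's documented interface.

import optparse


def parse_command_line_args():
    """Hypothetical parser; option names mirror the attributes used above."""
    parser = optparse.OptionParser(
        usage="%prog [options] <hive_masternode> <table_name> "
              "<target_db> <target_collection>")
    parser.add_option("--ssh_keyfile", default=None,
                      help="ssh key used to reach the Hive master node")
    parser.add_option("--hive_init", action="store_true", default=False,
                      help="run the Hive init script before doing anything")
    parser.add_option("--report_db_host", default="localhost",
                      help="MongoDB host to import the table into")
    parser.add_option("--report_db_port", default=None,
                      help="MongoDB port (falls back to a default if unset)")
    options, args = parser.parse_args()
    if len(args) < 4:
        parser.error("expected <hive_masternode> <table_name> "
                     "<target_db> <target_collection>")
    return options, args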