# Set a longer timeout for reading Lambda results and a larger connection pool
lambda_config = Config(read_timeout=lambda_read_timeout,
                       max_pool_connections=boto_max_connections)
lambda_client = boto3.client('lambda', config=lambda_config)

# Fetch all the keys that match the prefix
all_keys = []
for obj in s3.Bucket(bucket).objects.filter(Prefix=config["prefix"]).all():
    all_keys.append(obj)

print(all_keys)

bsize = lambdautils.compute_batch_size(all_keys, lambda_memory,
                                       concurrent_lambdas)
#batches = lambdautils.batch_creator(all_keys, bsize)
batches = lambdautils.batch_creator_for_ny_trip(all_keys, lambda_memory,
                                                concurrent_lambdas)
print(batches)
n_mappers = len(batches)
document = xray_recorder.current_subsegment()
document.put_metadata("Batch size: ", bsize, "Processing initialization")
document.put_metadata("Mappers: ", n_mappers, "Processing initialization")
xray_recorder.end_subsegment()  # end the "Get all keys to be processed" subsegment


def get_reducer_batch_size(keys):
    # TODO: Parameterize memory size
    batch_size = lambdautils.compute_batch_size(keys, 1536)
    return max(batch_size, 2) # At least 2 in a batch - Condition for termination


# 1. Get all keys to be processed
# init
bucket = config["bucket"]
job_bucket = config["jobBucket"]
region = config["region"]
lambda_memory = config["lambdaMemory"]
concurrent_lambdas = config["concurrentLambdas"]

#all_keys = s3_client.list_objects(Bucket=bucket, Prefix=config["prefix"])["Contents"]

# Fetch all the keys that match the prefix
all_keys = []
for obj in s3.Bucket(bucket).objects.filter(Prefix=config["prefix"]).all():
    all_keys.append(obj)

bsize = lambdautils.compute_batch_size(all_keys, lambda_memory)
batches = lambdautils.batch_creator(all_keys, bsize)
n_mappers = len(batches)

# 2. Create the lambda functions

L_PREFIX = "BL"

# Lambda functions
mapper_lambda_name = L_PREFIX + "-mapper-" + job_id
reducer_lambda_name = L_PREFIX + "-reducer-" + job_id
rc_lambda_name = L_PREFIX + "-rc-" + job_id

# write job config
write_job_config(job_id, job_bucket, n_mappers, reducer_lambda_name,
                 config["reducer"]["handler"])
Example #4
def get_reducer_batch_size(keys):
    # TODO: Parameterize memory size
    batch_size = lambdautils.compute_batch_size(keys, 1536)
    return max(batch_size,
               2)  # At least 2 in a batch - Condition for termination


def get_reducer_batch_size(keys):
    # TODO: Parameterize memory size
    batch_size = lambdautils.compute_batch_size(keys, 1536, 1000)
    return max(batch_size, 2)  # At least 2 in a batch - condition for termination
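
# Why the floor of 2 matters: the reduce stage repeatedly groups the current
# reducer outputs into batches and runs one reducer per batch, so each round
# must strictly shrink the number of keys; a batch size of 1 would never make
# progress. A self-contained toy illustration (no AWS calls; this is not part
# of the original code):
def _toy_reduction_rounds(num_outputs, step):
    """Count the rounds needed to collapse num_outputs keys into one."""
    rounds = 0
    while num_outputs > 1:
        num_outputs = -(-num_outputs // step)  # ceil division: one output per batch
        rounds += 1
    return rounds

# e.g. _toy_reduction_rounds(100, 2) == 7, while step == 1 would loop forever.
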
Example #6
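# This handler is excerpted from a larger module; it relies on imports and
# helpers defined elsewhere in that module: time, json, logging, boto3,
# botocore, lambdautils, functools.partial, a ThreadPool (typically
# multiprocessing.pool.ThreadPool or multiprocessing.dummy.Pool), plus
# write_to_s3, invoke_lambda and invoke_lambda_sync.
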
def handler(event, context):
    entry = time.time() * 1000
    logger = logging.getLogger()
    logger.setLevel(logging.WARN)
    if not event: #requires arguments
        print('No event was passed to the handler, exiting...')
        return

    if 'mapper' not in event or 'reducer' not in event:
        print('No mapper or reducer function names given, unable to proceed, exiting...')
        return

    # create an S3 session
    if not context: #calling from main
        boto3.setup_default_session(profile_name='cjk1')
    s3 = boto3.resource('s3')
    config = botocore.client.Config(connect_timeout=50, read_timeout=200)
    s3_client = boto3.client('s3',config=config)

    JOB_INFO = 'jobinfo.json'

    # 1. Get all keys to be processed  
    # init 
    endearly = 0
    if 'endearly' in event:
        endearly = int(event['endearly'])
    bucket = event["bucket"]
    dryrun = True if "dryrun" in event else False
    lambda_memory = 1536

    # Fetch all the keys that match the prefix
    all_keys = []
    for obj in s3.Bucket(bucket).objects.filter(Prefix=event["prefix"]).all():
        all_keys.append(obj)
    
    bsize = lambdautils.compute_batch_size(all_keys, lambda_memory)
    batches = lambdautils.batch_creator(all_keys, bsize)
    n_mappers = len(batches)
    if endearly > 0 and endearly < n_mappers:
        n_mappers = endearly
    print("Num. of Mappers (and Reducers) ", n_mappers)

    if dryrun: #don't go any further
        delta = (time.time() * 1000) - entry
        me_str = 'TIMER:CALL:{}:dryrun:0'.format(delta)
        logger.warning(me_str)
        return me_str

    #process the remaining arguments
    job_id = event["job_id"]
    job_bucket = event["jobBucket"]
    region = event["region"]
    run_async = True if "full_async" in event else False  # renamed: "async" is a reserved word in Python 3.7+
    reducer_lambda_name = event["reducer"]
    mapper_lambda_name = event["mapper"]
    
    # Write Jobdata to S3
    j_key = job_id + "/jobdata"
    data = json.dumps({
        "mapCount": n_mappers, 
        "totalS3Files": len(all_keys),
        "startTime": time.time()
        })
    write_to_s3(s3, job_bucket, j_key, data, {})
    data = json.dumps({
        "jobId": job_id,
        "jobBucket": job_bucket,
        "mapCount": n_mappers,
        "reducerFunction": reducer_lambda_name,
        "reducerHandler": "{}.handler".format(reducer_lambda_name)
        }, indent=4)
    j_key = job_id + "/" + JOB_INFO
    write_to_s3(s3, job_bucket, j_key, data, {})

    ### Execute ###
    total_lambda_secs = 0
    reducer_lambda_time = 0
    mapper_outputs = []

    if run_async:  # asynchronous invocation of mappers
        for i in range(n_mappers):
            invoke_lambda(mapper_lambda_name, batches, bucket, job_bucket, job_id, i)

    else:  # synchronous invocation of mappers on parallel threads
        pool = ThreadPool(n_mappers)
        Ids = [i + 1 for i in range(n_mappers)]
        invoke_lambda_partial = partial(invoke_lambda_sync, mapper_lambda_name,
                                        batches, mapper_outputs, bucket,
                                        job_bucket, job_id)

        # Burst request handling
        mappers_executed = 0
        concurrent_lambdas = 100  # only used by the synchronous run (use --dryrun to see how many mappers are actually needed)
        while mappers_executed < n_mappers:
            nm = min(concurrent_lambdas, n_mappers - mappers_executed)
            results = pool.map(invoke_lambda_partial,
                               Ids[mappers_executed:mappers_executed + nm])
            mappers_executed += nm
    
        pool.close()
        pool.join()
    
    for output in mapper_outputs:
        if 'body' in output:
            total_lambda_secs += float(output['body'][2])
        else:
            total_lambda_secs += float(output[2])
    
    if not run_async:
        # Note: wait for the job to complete so that we can compute total cost; poll every 5 secs
        # Get all reducer keys
        reducer_keys = []
        # Total execution time for reducers
    
        while True:
            job_keys = s3_client.list_objects(Bucket=job_bucket, Prefix=job_id)["Contents"]
            keys = [jk["Key"] for jk in job_keys]
            total_s3_size = sum([jk["Size"] for jk in job_keys])
            
            logger.info("checking if job is done")
        
            # check job done
            if job_id + "/result" in keys:
                reducer_lambda_time += float(s3.Object(job_bucket, job_id + "/result").metadata['processingtime'])
                for key in keys:
                    if "task/reducer" in key:
                        reducer_lambda_time += float(s3.Object(job_bucket, key).metadata['processingtime'])
                        reducer_keys.append(key)
                break
            time.sleep(5)
        
    delta = (time.time() * 1000) - entry
    me_str = 'TIMER:CALL:{}:mappers:{}:reducer:{}'.format(delta,total_lambda_secs,reducer_lambda_time)
    logger.warning(me_str)
    return me_str
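
# A local smoke test for the handler above (illustrative; the bucket, prefix and
# function names below are placeholders, not values from the original code).
# Passing context=None makes the handler set up its own boto3 session (the
# 'cjk1' profile, per the code above), and the presence of "dryrun" stops it
# right after sizing the job, so no Lambdas are actually invoked.
if __name__ == '__main__':
    sample_event = {
        "bucket": "my-input-bucket",        # placeholder
        "prefix": "input/",                 # placeholder
        "job_id": "job-0001",               # placeholder
        "jobBucket": "my-job-bucket",       # placeholder
        "region": "us-west-2",              # placeholder
        "mapper": "my-mapper-function",     # placeholder
        "reducer": "my-reducer-function",   # placeholder
        "dryrun": True                      # presence alone triggers the dry run
    }
    print(handler(sample_event, None))
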
Example #8
def get_reducer_batch_size(keys, lambda_memory, concurrent_lambdas):
    batch_size = lambdautils.compute_batch_size(keys, lambda_memory,
                                                concurrent_lambdas)
    return max(batch_size,
               2)  # At least 2 in a batch - Condition for termination
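
# lambdautils.compute_batch_size is used throughout these examples but its body
# is not shown. The sketch below is an assumption about what such a helper could
# look like, not the actual lambdautils code: size batches so that each mapper's
# share of the S3 input fits within a fraction of the Lambda's memory, optionally
# capping the total number of mappers at the allowed concurrency.
def compute_batch_size_sketch(keys, lambda_memory_mb, concurrent_lambdas=None):
    # keys are boto3 ObjectSummary items; .size is the object size in bytes
    total_bytes = sum(obj.size for obj in keys)
    avg_object_bytes = max(total_bytes / max(len(keys), 1), 1)
    usable_bytes = 0.6 * lambda_memory_mb * 1024 * 1024  # leave headroom for the runtime
    batch_size = max(int(usable_bytes // avg_object_bytes), 1)
    if concurrent_lambdas:
        # never create more batches (mappers) than we are willing to run at once
        min_batch_for_concurrency = -(-len(keys) // concurrent_lambdas)  # ceil
        batch_size = max(batch_size, min_batch_for_concurrency)
    return batch_size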