Code Example #1
def lambda_handler(event, context):
    id = int(event['id'])
    n = num_workers = int(event['n'])
    bucket_name = str(event['bucket_name'])
    path = str(event['path'])
    n_tasks = n

    t0 = time.time()

    t1 = time.time()

    t2 = time.time()

    #write to output files: shuffle<id 0> shuffle<id 1> ... shuffle<id num_workers-1>
    p = crail.launch_dispatcher_from_lambda()
    socket = crail.connect()

    ticket = 1001
    file_tmp = '/tmp/input_tmp'

    t3 = time.time()

    # upload log
    log = {'id': id, 't0': t0, 't1': t1, 't2': t2, 't3': t3}
    file_tmp = '/tmp/tmp'
    with open(file_tmp, "w") as f:
        pickle.dump(log, f)
    src_filename = file_tmp
    dst_filename = '/invoke-logs-' + str(n) + '-' + str(id)
    r = crail.put(socket, src_filename, dst_filename, ticket)
    if r[-1] != u'\u0000':
        crail.close(socket, ticket, p)
        raise Exception("put failed: " + dst_filename)

    os.remove(file_tmp)

    crail.close(socket, ticket, p)

    #return time spent (in sec) writing intermediate files
    #return [t1-t0, t2-t1, t3-t2, t2-t2, [len(x)*100 for x in p_list]] #read input, compute, write shuffle

    r = 'lambda finished ' + str(id)
    print r
    return r
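
Every handler in these examples reads the same fields from the invocation event. A representative test payload might look like the dict below; the bucket name and key prefix are placeholders, and n = 250 follows the 250-task / 100 GB setup mentioned in the comments of the later examples.

# Hypothetical test event for the handlers in these examples.
# bucket_name and path are placeholders, not values from the original project.
test_event = {
    'id': 0,                         # worker index, 0 .. n-1
    'n': 250,                        # number of workers (= number of partitions)
    'bucket_name': 'my-sort-bucket', # S3 bucket with the input/output data (placeholder)
    'path': 'input/',                # key prefix of the 100 MB input files (placeholder)
}
# e.g. lambda_handler(test_event, None)
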
Code Example #2
def handler(event, context):
    crail.launch_dispatcher_from_lambda()

    call(["cp", "/var/task/lambda_java.py", "/tmp/lambda"])
    call(["cp", "/var/task/jars/crail-reflex-1.0.jar", "/tmp/crail-reflex"])

    time.sleep(20)

    socket = crail.connect()

    print "Talk to dispatcher..."
    src_filename = "/tmp/crail-reflex"
    dst_filename = "/dsttest-test-reflex2.data"
    ticket = 1001
    print "Try PUT..."
    start = time.time()
    crail.put(socket, src_filename, dst_filename, ticket)
    end = time.time()
    print "Execution time for single PUT: ", (end - start) * 1000000, " us\n"

    time.sleep(1)
    src_filename = "/dsttest-test-reflex2.data"
    dst_filename = "/tmp/crail-reflex-2"
    print "Now GET..."
    start = time.time()
    crail.get(socket, src_filename, dst_filename, ticket)
    end = time.time()
    print "Execution time for single GET: ", (end - start) * 1000000, " us\n"

    time.sleep(1)
    call(["ls", "-al", "/tmp/"])

    src_filename = "/dsttest-test-reflex2.data"
    print "Now DEL..."
    start = time.time()
    crail.delete(socket, src_filename, ticket)
    end = time.time()
    print "Execution time for single GET: ", (end - start) * 1000000, " us\n"

    return
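
The crail helper module used throughout these examples is not shown; everything about it has to be inferred from the call sites (launch_dispatcher_from_lambda, connect, put, get, delete, close, and a response string whose last character is NUL on success). The stub below is a minimal sketch of that assumed interface, useful only for exercising the handlers locally as crail.py; it is not the real module, which talks to a Crail dispatcher process.

# Minimal local stand-in for the (unshown) crail helper module, with names and
# signatures inferred from the call sites in these examples.
import os
import shutil

_STORE = '/tmp/crail_stub'           # local directory standing in for the Crail namespace

def _local(name):
    return os.path.join(_STORE, name.lstrip('/').replace('/', '_'))

def launch_dispatcher_from_lambda():
    if not os.path.isdir(_STORE):
        os.makedirs(_STORE)
    return None                      # real version launches a dispatcher and returns a handle

def connect():
    return object()                  # real version returns a socket to the dispatcher

def put(socket, src_filename, dst_filename, ticket):
    shutil.copy(src_filename, _local(dst_filename))
    return u'\u0000'                 # handlers treat a trailing NUL character as success

def get(socket, src_filename, dst_filename, ticket):
    shutil.copy(_local(src_filename), dst_filename)
    return u'\u0000'

def delete(socket, src_filename, ticket):
    os.remove(_local(src_filename))
    return u'\u0000'

def close(socket, ticket, dispatcher):
    return None                      # real version shuts down the dispatcher connection
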
Code Example #3
def lambda_handler(event, context):
    id = int(event['id'])
    n = num_workers = int(event['n'])
    bucket_name = str(event['bucket_name'])
    n_tasks = n

    t0=time.time()

    p = crail.launch_dispatcher_from_lambda()
    socket = crail.connect()
    ticket = 1001

    #read from input files: shuffle<0 id> shuffle<1 id> ... shuffle<num_workers-1 id>
    #'''
    file_tmp = '/tmp/tmp'
    all_lines = []
    for i in xrange(n_tasks):
        key = 'shuffle' + str(i) +'-'+ str(id)
        src_filename = '/' + key
        dst_filename = file_tmp
        r = crail.get(socket, src_filename, dst_filename, ticket)
        if r[-1] != u'\u0000':
            crail.close(socket, ticket, p)
            raise Exception("get failed: "+ src_filename)
        with open(file_tmp, "r") as f:
            all_lines+=f.readlines()
    os.remove(file_tmp)
    #'''
    
    t1 = time.time()


    #'''
    #merge & sort 
    for i in xrange(len(all_lines)):
        all_lines[i] = (all_lines[i][:10], all_lines[i][12:])
    all_lines.sort(key=lambda x: x[0])


    for i in xrange(len(all_lines)):
        all_lines[i] = all_lines[i][0]+"  "+all_lines[i][1]
    #'''
    t2=time.time()


    #[s3] write to output file: output<id>  
    s3 = boto3.resource('s3')
    file_name = 'output/sorted_output'
    m = 1000/n_tasks
    size = len(all_lines)/m
    for i in xrange(m):
        with open(file_tmp, "w+") as f:
            start = size*i
            end = start + size
            f.writelines(all_lines[start:end])
            f.seek(0)
            body = f.read()
        key = file_name + str(id*m+i)
        s3.Bucket(bucket_name).upload_file(file_tmp, key)

        os.remove(file_tmp)
    t3=time.time()

    # upload log
    log = {'id': id, 't0': t0, 't1': t1, 't2': t2, 't3': t3}
    file_tmp = '/tmp/tmp'
    with open(file_tmp, "w") as f:
        pickle.dump(log, f)
    src_filename = file_tmp
    dst_filename = '/reduce-logs-100GB-'+str(n)+'-'+str(id)
    ## new file
    r = crail.put(socket, src_filename, dst_filename, ticket)
    if r[-1] != u'\u0000':
        crail.close(socket, ticket, p)
        raise Exception("put failed: "+ dst_filename)
 
    log = [t1-t0, t2-t1, t3-t2, t1-t1]
    file_tmp = '/tmp/tmp'
    with open(file_tmp, "w") as f:
        pickle.dump(log, f)
    src_filename = file_tmp
    dst_filename = '/reduce-results-100GB-'+str(n)+'-'+str(id)
    ## new file
    r = crail.put(socket, src_filename, dst_filename, ticket)
    if r[-1] != u'\u0000':
        crail.close(socket, ticket, p)
        raise Exception("put failed: "+ dst_filename)



    crail.close(socket, ticket, p)
    #return time (in sec) spent reading intermediate files
    #return [t1-t0, t2-t1, t3-t2, t1-t1] #read shuffle, compute, write output 

    r = 'reduce finished ' + str(id)
    print r
    return r
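
The slicing in the merge & sort step above implies a fixed record layout: a 10-character sort key, a 2-character separator, and the remainder of the 100-byte line as the value. A tiny illustration of that split and re-join (the sample records here are fabricated):

# Illustration of the key/value split used by the reduce step above.
lines = [
    "BBBBBBBBBB  0000000000000000000000000000\n",
    "AAAAAAAAAA  1111111111111111111111111111\n",
]

pairs = [(line[:10], line[12:]) for line in lines]   # (10-char key, rest of record)
pairs.sort(key=lambda x: x[0])                       # sort by key
merged = [k + "  " + v for k, v in pairs]            # re-join with the 2-space separator

print(merged[0][:10])   # -> AAAAAAAAAA (smallest key now first)
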
Code Example #4
File: map_lambda_crail.py  Project: asqasq/lambdasort
def lambda_handler(event, context):
    id = int(event['id'])
    n = num_workers = int(event['n'])
    bucket_name = str(event['bucket_name'])
    path = str(event['path'])
    n_tasks = n

    t0=time.time()

    #[s3] read from input file: input<id> 
    s3 = boto3.resource('s3')
    file_local = '/tmp/input_tmp'
    lines = []
    # read 4 100MB files
    m = 1000/n_tasks
    for i in xrange(m):
        i += id*m
        key = path + 'input' + str(i)
        s3.Bucket(bucket_name).download_file(key, file_local)
        with open(file_local, "r") as f:
            lines += f.readlines() #each line contains a 100b record
        os.remove(file_local)

    t1=time.time()

    #partition 
    p_list = [[] for x in xrange(n_tasks)]  #list of n partitions  #hardcode
    for line in lines:
        key1 = ord(line[0])-32 # key range 32-126
        key2 = ord(line[1])-32
        #126-32+1=95
        #p = n/95 # 2500/(126-32+1) ~ 26.3 = 26
        #index = int(26.3*(key1+key2/95.0))  
        p = n_tasks/95.0 # total of 250 tasks 
        index = int(p*(key1+key2/95.0))
        p_list[index].append(line)

    t2=time.time()

    #write to output files: shuffle<id 0> shuffle<id 1> ... shuffle<id num_workers-1>
    p = crail.launch_dispatcher_from_lambda()
    socket = crail.connect()

    ticket = 1001
    file_tmp = file_local
    for i in xrange(n_tasks):
        with open(file_tmp, "w") as f:
            f.writelines(p_list[i])
        key = 'shuffle' + str(id) +'-'+ str(i)
        src_filename = file_tmp
        dst_filename = '/' + key
        r = crail.put(socket, src_filename, dst_filename, ticket)
        if r[-1] != u'\u0000':
            crail.close(socket, ticket, p)
            raise Exception("put failed: "+ dst_filename)

    t3=time.time()

    # upload log
    log = {'id': id, 't0': t0, 't1': t1, 't2': t2, 't3': t3, 'file_size': [len(x)*100 for x in p_list]}
    file_tmp = '/tmp/tmp'
    with open(file_tmp, "w") as f:
        pickle.dump(log, f)
    src_filename = file_tmp
    dst_filename = '/map-logs-100GB-'+str(n)+'-'+str(id)
    r = crail.put(socket, src_filename, dst_filename, ticket)
    if r[-1] != u'\u0000':
        crail.close(socket, ticket, p)
        raise Exception("put failed: "+ dst_filename)

    log = [t1-t0, t2-t1, t3-t2, t2-t2]
    file_tmp = '/tmp/tmp'
    with open(file_tmp, "w") as f:
        pickle.dump(log, f)
    src_filename = file_tmp
    dst_filename = '/map-results-100GB-'+str(n)+'-'+str(id)
    r = crail.put(socket, src_filename, dst_filename, ticket)
    if r[-1] != u'\u0000':
        crail.close(socket, ticket, p)
        raise Exception("put failed: "+ dst_filename)


    os.remove(file_tmp)

    crail.close(socket, ticket, p)


    #return time spent (in sec) writing intermediate files 
    #return [t1-t0, t2-t1, t3-t2, t2-t2, [len(x)*100 for x in p_list]] #read input, compute, write shuffle 

    r = 'map finished ' + str(id)
    print r
    return r
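
The partitioning step in this map handler maps the first two characters of each record (printable ASCII, codes 32-126, i.e. 95 possible values per character) onto one of n_tasks buckets in key order. The same index computation, pulled out and checked over the full key range:

# Standalone version of the partition-index computation from the map step above.
def partition_index(line, n_tasks):
    key1 = ord(line[0]) - 32         # first key character, shifted to 0..94
    key2 = ord(line[1]) - 32         # second key character, shifted to 0..94
    p = n_tasks / 95.0
    return int(p * (key1 + key2 / 95.0))

n_tasks = 250
# every possible two-character key must land in a valid bucket 0 .. n_tasks-1
indices = [partition_index(chr(a) + chr(b), n_tasks)
           for a in range(32, 127) for b in range(32, 127)]
assert min(indices) >= 0 and max(indices) < n_tasks
print(max(indices))   # 249 for n_tasks = 250
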
Code Example #5
def lambda_handler(event, context):
    id = int(event['id'])
    n = num_workers = int(event['n'])
    bucket_name = str(event['bucket_name'])
    path = str(event['path'])
    n_tasks = n

    STOP = threading.Event()
    LOGS_PATH = 'map-logs-' + str(n)

    class TimeLog:
        def __init__(self, enabled=True):
            self.enabled = enabled
            self.start = time.time()
            self.prev = self.start
            self.points = []
            self.sizes = []

        def add_point(self, title):
            if not self.enabled:
                return
            now = time.time()
            self.points += [(title, now - self.prev)]
            self.prev = now

    def upload_net_bytes(rclient, rxbytes_per_s, txbytes_per_s, timelogger,
                         reqid):
        #rclient = redis.Redis(host=REDIS_HOSTADDR_PRIV, port=6379, db=0)
        netstats = LOGS_PATH + '/netstats-' + reqid
        rclient.set(
            netstats,
            str({
                'lambda': reqid,
                'started': timelogger.start,
                'rx': rxbytes_per_s,
                'tx': txbytes_per_s
            }).encode('utf-8'))
        print "wrote netstats"
        return

    def get_net_bytes(rxbytes, txbytes, rxbytes_per_s, txbytes_per_s):
        SAMPLE_INTERVAL = 1.0
        # schedule the function to execute every SAMPLE_INTERVAL seconds
        if STOP.is_set():
            threading.Timer(
                SAMPLE_INTERVAL, get_net_bytes,
                [rxbytes, txbytes, rxbytes_per_s, txbytes_per_s]).start()
            rxbytes.append(int(ifcfg.default_interface()['rxbytes']))
            txbytes.append(int(ifcfg.default_interface()['txbytes']))
            rxbytes_per_s.append((rxbytes[-1] - rxbytes[-2]) / SAMPLE_INTERVAL)
            txbytes_per_s.append((txbytes[-1] - txbytes[-2]) / SAMPLE_INTERVAL)

    t0 = time.time()

    #[s3] read from input file: input<id>
    s3 = boto3.resource('s3')
    file_local = '/tmp/input_tmp'
    lines = []
    # read 4 100MB files
    m = 1000 / n_tasks
    for i in xrange(m):
        i += id * m
        key = path + 'input' + str(i)
        s3.Bucket(bucket_name).download_file(key, file_local)
        with open(file_local, "r") as f:
            lines += f.readlines()  #each line contains a 100b record
        os.remove(file_local)

    t1 = time.time()

    #partition
    p_list = [[] for x in xrange(n_tasks)]  #list of n partitions  #hardcode
    for line in lines:
        key1 = ord(line[0]) - 32  # key range 32-126
        key2 = ord(line[1]) - 32
        #126-32+1=95
        #p = n/95 # 2500/(126-32+1) ~ 26.3 = 26
        #index = int(26.3*(key1+key2/95.0))
        p = n_tasks / 95.0  # total of 250 tasks
        index = int(p * (key1 + key2 / 95.0))
        p_list[index].append(line)

    # start collecting network data
    iface = ifcfg.default_interface()
    rxbytes = [int(iface['rxbytes'])]
    txbytes = [int(iface['txbytes'])]
    rxbytes_per_s = []
    txbytes_per_s = []
    STOP.set()
    get_net_bytes(rxbytes, txbytes, rxbytes_per_s, txbytes_per_s)

    t2 = time.time()

    #write to output files: shuffle<id 0> shuffle<id 1> ... shuffle<id num_workers-1>
    p = crail.launch_dispatcher_from_lambda()
    socket = crail.connect()

    ticket = 1001
    file_tmp = file_local
    for i in xrange(n_tasks):
        with open(file_tmp, "w") as f:
            f.writelines(p_list[i])
        key = 'shuffle' + str(id) + '-' + str(i)
        src_filename = file_tmp
        dst_filename = '/' + key
        r = crail.put(socket, src_filename, dst_filename, ticket)
        if r[-1] != u'\u0000':
            crail.close(socket, ticket, p)
            raise Exception("put failed: " + dst_filename)

    t3 = time.time()

    #upload network data
    timelogger = TimeLog(enabled=True)
    startup_nodes = [{
        "host": "rediscluster.a9ith3.clustercfg.usw2.cache.amazonaws.com",
        "port": "6379"
    }]
    redis_client = StrictRedisCluster(startup_nodes=startup_nodes,
                                      skip_full_coverage_check=True)
    rclient = redis_client
    STOP.clear()
    upload_net_bytes(rclient, rxbytes_per_s, txbytes_per_s, timelogger,
                     str(id))

    # upload log
    log = {
        'id': id,
        't0': t0,
        't1': t1,
        't2': t2,
        't3': t3,
        'file_size': [len(x) * 100 for x in p_list]
    }
    file_tmp = '/tmp/tmp'
    with open(file_tmp, "w") as f:
        pickle.dump(log, f)
    src_filename = file_tmp
    dst_filename = '/map-logs-100GB-' + str(n) + '-' + str(id)
    r = crail.put(socket, src_filename, dst_filename, ticket)
    if r[-1] != u'\u0000':
        crail.close(socket, ticket, p)
        raise Exception("put failed: " + dst_filename)

    log = [t1 - t0, t2 - t1, t3 - t2, t2 - t2]
    file_tmp = '/tmp/tmp'
    with open(file_tmp, "w") as f:
        pickle.dump(log, f)
    src_filename = file_tmp
    dst_filename = '/map-results-100GB-' + str(n) + '-' + str(id)
    r = crail.put(socket, src_filename, dst_filename, ticket)
    if r[-1] != u'\u0000':
        crail.close(socket, ticket, p)
        raise Exception("put failed: " + dst_filename)

    os.remove(file_tmp)

    crail.close(socket, ticket, p)

    #return time spent (in sec) writing intermediate files
    #return [t1-t0, t2-t1, t3-t2, t2-t2, [len(x)*100 for x in p_list]] #read input, compute, write shuffle

    r = 'map finished ' + str(id)
    print r
    return r
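
The network sampling in this example is easy to misread: STOP.set() actually starts the once-per-second sampling loop and STOP.clear() stops it, because get_net_bytes only reschedules itself while the event is set. Below is a standalone sketch of the same pattern, using the same ifcfg counters but with the event renamed so the semantics are explicit.

# Periodic rx/tx byte sampler, extracted from the handler above.
# RUN.set() starts the sampling loop, RUN.clear() stops it.
import threading
import time
import ifcfg

RUN = threading.Event()
SAMPLE_INTERVAL = 1.0

def sample(rxbytes, txbytes, rx_per_s, tx_per_s):
    if not RUN.is_set():
        return                       # stop rescheduling once RUN is cleared
    threading.Timer(SAMPLE_INTERVAL, sample,
                    [rxbytes, txbytes, rx_per_s, tx_per_s]).start()
    iface = ifcfg.default_interface()
    rxbytes.append(int(iface['rxbytes']))
    txbytes.append(int(iface['txbytes']))
    rx_per_s.append((rxbytes[-1] - rxbytes[-2]) / SAMPLE_INTERVAL)
    tx_per_s.append((txbytes[-1] - txbytes[-2]) / SAMPLE_INTERVAL)

iface = ifcfg.default_interface()
rxbytes, txbytes = [int(iface['rxbytes'])], [int(iface['txbytes'])]
rx_per_s, tx_per_s = [], []
RUN.set()
sample(rxbytes, txbytes, rx_per_s, tx_per_s)
time.sleep(5)        # ... do the actual work here ...
RUN.clear()          # stop sampling; per-second rates are in rx_per_s / tx_per_s
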
Code Example #6
def handler(id):
    p = crail.launch_dispatcher_from_lambda()

    call(["cp", "/var/task/lambda_java.py", "/tmp/lambda"])
    call(["cp", "/var/task/jars/crail-reflex-1.0.jar", "/tmp/crail-reflex"])
    #call(["cp", "/var/task/jars/crail-client-1.0.jar", "/tmp/crail-reflex"])

    socket = crail.connect()

    result = []
    result.append("Talk to dispatcher...")
    src_filename = "/tmp/crail-reflex"
    #dst_filename = "/dsttest-test-reflex2.data"
    dst_filename = "/data" + str(id)
    ticket = 1001
    result.append("Try PUT...")
    start = time.time()
    crail.put(socket, src_filename, dst_filename, ticket)
    '''
    for i in range(100):
        dst_filename = "/id" + str(id) + str(i)
        crail.put(socket, src_filename, dst_filename, ticket)
    '''
    end = time.time()
    result.append("Execution time for single PUT: " +
                  str((end - start) * 1000000) + " us")

    crail.close(socket, ticket, p)
    return 0
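    # NOTE: everything below this early return is leftover scratch/debug code and
    # never executes (it also references an undefined variable `n`).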

    print "storing logs"
    result.append("storing logs")
    log = {'id': 1, 's3read': 2, 'compute': 3, 'write': 4}
    file_tmp = '/tmp/tmp'
    with open(file_tmp, "w") as f:
        f.write(json.dumps(log))
    src_filename = file_tmp
    dst_filename = '/map-logs-100GB' + str(n)
    crail.put(socket, src_filename, dst_filename, ticket)

    crail.close(socket, ticket, p)
    return result

    time.sleep(1)
    #src_filename = "/dsttest-test-reflex2.data"
    src_filename = dst_filename
    dst_filename = "/tmp/crail-reflex-2"
    result.append("Now GET...")
    start = time.time()
    crail.get(socket, src_filename, dst_filename, ticket)
    end = time.time()
    result.append("Execution time for single GET: " +
                  str((end - start) * 1000000) + " us")
    '''
    time.sleep(1)
    call(["ls", "-al", "/tmp/"])
    
    src_filename = "/dsttest-test-reflex2.data"
    print "Now DEL..."
    start = time.time()
    crail.delete(socket, src_filename, ticket)
    end = time.time()
    print "Execution time for single GET: ", (end-start) * 1000000, " us\n"
    '''

    return result
Code Example #7
def lambda_handler(event, context):
    id = int(event['id'])
    n = num_workers = int(event['n'])
    bucket_name = str(event['bucket_name'])
    n_tasks = n

    STOP = threading.Event()
    LOGS_PATH = 'reduce-logs-' + str(n)

    class TimeLog:
        def __init__(self, enabled=True):
            self.enabled = enabled
            self.start = time.time()
            self.prev = self.start
            self.points = []
            self.sizes = []

        def add_point(self, title):
            if not self.enabled:
                return
            now = time.time()
            self.points += [(title, now - self.prev)]
            self.prev = now

    def upload_net_bytes(rclient, rxbytes_per_s, txbytes_per_s, timelogger,
                         reqid):
        #rclient = redis.Redis(host=REDIS_HOSTADDR_PRIV, port=6379, db=0)
        netstats = LOGS_PATH + '/netstats-' + reqid
        rclient.set(
            netstats,
            str({
                'lambda': reqid,
                'started': timelogger.start,
                'rx': rxbytes_per_s,
                'tx': txbytes_per_s
            }).encode('utf-8'))
        print "wrote netstats"
        return

    def get_net_bytes(rxbytes, txbytes, rxbytes_per_s, txbytes_per_s):
        SAMPLE_INTERVAL = 1.0
        # schedule the function to execute every SAMPLE_INTERVAL seconds
        if STOP.is_set():
            threading.Timer(
                SAMPLE_INTERVAL, get_net_bytes,
                [rxbytes, txbytes, rxbytes_per_s, txbytes_per_s]).start()
            rxbytes.append(int(ifcfg.default_interface()['rxbytes']))
            txbytes.append(int(ifcfg.default_interface()['txbytes']))
            rxbytes_per_s.append((rxbytes[-1] - rxbytes[-2]) / SAMPLE_INTERVAL)
            txbytes_per_s.append((txbytes[-1] - txbytes[-2]) / SAMPLE_INTERVAL)

    # start collecting network data
    iface = ifcfg.default_interface()
    rxbytes = [int(iface['rxbytes'])]
    txbytes = [int(iface['txbytes'])]
    rxbytes_per_s = []
    txbytes_per_s = []
    STOP.set()
    get_net_bytes(rxbytes, txbytes, rxbytes_per_s, txbytes_per_s)

    t0 = time.time()

    p = crail.launch_dispatcher_from_lambda()
    socket = crail.connect()
    ticket = 1001

    #read from input files: shuffle<0 id> shuffle<1 id> ... shuffle<num_workers-1 id>
    #'''
    file_tmp = '/tmp/tmp'
    all_lines = []
    for i in xrange(n_tasks):
        key = 'shuffle' + str(i) + '-' + str(id)
        src_filename = '/' + key
        dst_filename = file_tmp
        r = crail.get(socket, src_filename, dst_filename, ticket)
        if r[-1] != u'\u0000':
            crail.close(socket, ticket, p)
            raise Exception("get failed: " + src_filename)
        with open(file_tmp, "r") as f:
            all_lines += f.readlines()
    os.remove(file_tmp)
    #'''

    t1 = time.time()

    #upload network data
    timelogger = TimeLog(enabled=True)
    startup_nodes = [{
        "host": "rediscluster.a9ith3.clustercfg.usw2.cache.amazonaws.com",
        "port": "6379"
    }]
    redis_client = StrictRedisCluster(startup_nodes=startup_nodes,
                                      skip_full_coverage_check=True)
    rclient = redis_client
    STOP.clear()
    upload_net_bytes(rclient, rxbytes_per_s, txbytes_per_s, timelogger,
                     str(id))

    t1_2 = time.time()

    #'''
    #merge & sort
    for i in xrange(len(all_lines)):
        all_lines[i] = (all_lines[i][:10], all_lines[i][12:])
    all_lines.sort(key=lambda x: x[0])

    for i in xrange(len(all_lines)):
        all_lines[i] = all_lines[i][0] + "  " + all_lines[i][1]
    #'''
    t2 = time.time()

    #[s3] write to output file: output<id>
    s3 = boto3.resource('s3')
    file_name = 'output/sorted_output'
    m = 1000 / n_tasks
    size = len(all_lines) / m
    for i in xrange(m):
        with open(file_tmp, "w+") as f:
            start = size * i
            end = start + size
            f.writelines(all_lines[start:end])
            f.seek(0)
            body = f.read()
        key = file_name + str(id * m + i)
        s3.Bucket(bucket_name).upload_file(file_tmp, key)

        os.remove(file_tmp)
    t3 = time.time()

    # upload log
    log = {'id': id, 't0': t0, 't1': t1_2, 't2': t2, 't3': t3}
    file_tmp = '/tmp/tmp'
    with open(file_tmp, "w") as f:
        pickle.dump(log, f)
    src_filename = file_tmp
    dst_filename = '/reduce-logs-100GB-' + str(n) + '-' + str(id)
    ## new file
    r = crail.put(socket, src_filename, dst_filename, ticket)
    if r[-1] != u'\u0000':
        crail.close(socket, ticket, p)
        raise Exception("put failed: " + dst_filename)

    log = [t1 - t0, t2 - t1_2, t3 - t2, t1_2 - t1]
    file_tmp = '/tmp/tmp'
    with open(file_tmp, "w") as f:
        pickle.dump(log, f)
    src_filename = file_tmp
    dst_filename = '/reduce-results-100GB-' + str(n) + '-' + str(id)
    ## new file
    r = crail.put(socket, src_filename, dst_filename, ticket)
    if r[-1] != u'\u0000':
        crail.close(socket, ticket, p)
        raise Exception("put failed: " + dst_filename)

    crail.close(socket, ticket, p)
    #return time (in sec) spent reading intermediate files
    #return [t1-t0, t1_2-t1, t3-t2, t2-t1_2] #read shuffle, compute, write output

    r = 'reduce finished ' + str(id)
    print r
    return r
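
None of the examples show the driver side. Assuming the map and reduce handlers above are deployed as separate Lambda functions (the function name and bucket below are placeholders, not names from the original project), a minimal driver that fans out n asynchronous invocations could look roughly like this:

# Hypothetical driver: asynchronously invoke n copies of the map Lambda with the
# event fields the handlers above expect. Function name and bucket are placeholders.
import json
import boto3

client = boto3.client('lambda')

n = 250                              # number of map/reduce workers, as in the examples
bucket_name = 'my-sort-bucket'       # placeholder
path = 'input/'                      # placeholder key prefix for the input files

for i in range(n):
    event = {'id': i, 'n': n, 'bucket_name': bucket_name, 'path': path}
    client.invoke(FunctionName='crail-sort-map',   # placeholder function name
                  InvocationType='Event',          # asynchronous invocation
                  Payload=json.dumps(event))

# Once every shuffle<i>-<j> file has been written, the reduce Lambdas can be
# invoked the same way (their event omits 'path').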