Example 1
import os
import pickle
import random
import time

import boto3
import pocket  # Pocket ephemeral-storage client
from rediscluster import StrictRedisCluster


def pocket_read(p, jobid, iter, src_filename):
    # fetch src_filename from Pocket iter times, each into a uniquely named copy
    for i in range(iter):
        dst_filename = 'tmp' + str(random.randint(
            1, 100000000000000)) + '-' + str(i)
        r = pocket.get(p, dst_filename, src_filename, jobid)
        if r != 0:
            raise Exception("get failed: " + dst_filename)
def lambda_handler(event, context):
    id = int(event['id'])
    n = num_workers = int(event['n'])
    bucket_name = str(event['bucket_name'])
    n_tasks = n

    log_file = []

    t0 = time.time()

    # connect to crail
    #p = pocket.connect("10.1.12.156", 9070)
    p = pocket.connect("10.1.0.10", 9070)
    print "connected"

    jobid = ""
    #jobid = str(event['id'])

    # read shuffle input: shuffle<0>-<id>, shuffle<1>-<id>, ..., shuffle<n_tasks-1>-<id>
    file_tmp = '/tmp/tmp'
    all_lines = []
    for i in xrange(n_tasks):
        #key = 'shuffle' + str(id) +'-'+ str(i) # wrong one just for testing
        key = 'shuffle' + str(i) + '-' + str(id)

        src_filename = key
        dst_filename = file_tmp
        #print src_filename
        r = pocket.get(p, src_filename, dst_filename, jobid)
        if r != 0:
            raise Exception("get failed: " + src_filename)
        #log_file.append((key, time.time()))
        with open(dst_filename, "r") as f:
            all_lines += f.readlines()
        #print src_filename + " read success"
    os.remove(file_tmp)

    t1 = time.time()
    #print "read all from pocket"

    # merge & sort: each line is a 10-byte key, a 2-byte separator,
    # then the payload; sort by key
    for i in xrange(len(all_lines)):
        all_lines[i] = (all_lines[i][:10], all_lines[i][12:])
    all_lines.sort(key=lambda x: x[0])

    for i in xrange(len(all_lines)):
        all_lines[i] = all_lines[i][0] + "  " + all_lines[i][1]
    t2 = time.time()

    # [s3] write sorted output in m chunks per worker
    # (1000 chunks in total): output/sorted_output<id*m + i>
    s3 = boto3.resource('s3')
    file_name = 'output/sorted_output'
    m = 1000 / n_tasks           # chunks per worker (assumes n_tasks divides 1000)
    size = len(all_lines) / m    # lines per chunk
    for i in xrange(m):
        # write this chunk to a temp file, then upload it to s3
        start = size * i
        end = start + size
        with open(file_tmp, "w") as f:
            f.writelines(all_lines[start:end])
        key = file_name + str(id * m + i)
        s3.Bucket(bucket_name).upload_file(file_tmp, key)
        os.remove(file_tmp)
    t3 = time.time()

    # upload log
    startup_nodes = [{
        "host": "rediscluster-log.a9ith3.clustercfg.usw2.cache.amazonaws.com",
        "port": "6379"
    }]
    redis_client = StrictRedisCluster(startup_nodes=startup_nodes,
                                      skip_full_coverage_check=True)

    log = {'id': id, 't0': t0, 't1': t1, 't2': t2, 't3': t3}
    log_str = pickle.dumps(log)
    key = '/reduce-log' + '-' + '100GB' + '-' + str(n) + '-' + str(id)
    redis_client.set(key, log_str)
    print key + " logged"
    '''
    log_file_str = pickle.dumps(log_file)
    key = '/reduce-log-time'+'-'+'100GB'+'-'+str(n)+'-'+str(id)
    redis_client.set(key, log_file_str)
    print key + " logged" 
    '''
    #crail.close(socket, ticket, p)

    r = 'reduce finished ' + str(id)
    print r
    return r
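
The handler above expects an event with 'id', 'n', and 'bucket_name' fields. A minimal driver sketch for fanning out all n reducers might look like the following; the Lambda function name sort-reduce is a placeholder, not part of the original source:

import json
import boto3

def launch_reducers(n, bucket_name, function_name='sort-reduce'):
    # asynchronously invoke one reducer per partition id 0..n-1
    client = boto3.client('lambda')
    for i in range(n):
        client.invoke(FunctionName=function_name,
                      InvocationType='Event',  # fire-and-forget
                      Payload=json.dumps({'id': i, 'n': n,
                                          'bucket_name': bucket_name}))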
Example 3
import pocket

def pocket_read(p, jobid, iter, src_filename, id):
    for i in xrange(iter):
        dst_filename = '/tmp' + str(id) + '-' + str(i)
        r = pocket.get(p, dst_filename, src_filename, jobid)
        if r != 0:
            raise Exception("get failed: " + dst_filename)
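
A usage sketch for this helper; the controller address is copied from Example 1, and the source filename is a hypothetical placeholder:

p = pocket.connect("10.1.0.10", 9070)
pocket_read(p, jobid="", iter=10, src_filename='/input-part0', id=0)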
Example 4
import json
import os
import pickle
import time

import boto3
import pocket
from rediscluster import StrictRedisCluster


def lambda_handler(event, context):
    rid = int(event['rid'])
    n = num_mapper = int(event['num_mapper'])

    t0 = time.time()
    # read shuffle files
    # connect to crail
    p = pocket.connect("10.1.129.91", 9070)
    jobid = ""
    #jobid = str(event['id'])

    word_count_list = []
    for i in xrange(num_mapper):
        #shuffle_file = 'shuffle/shuffle-' + str(i) + '-' + str(rid)
        #body = pickle.loads(redis_client.get(shuffle_file))
        #word_count_list += body
        key = 'shuffle-' + str(i) + '-' + str(rid)
        src_filename = '/tmp/shuffle'   # local destination path
        dst_filename = '/' + key        # pocket-side source path
        r = pocket.get(p, dst_filename, src_filename, jobid)
        if r != 0:
            raise Exception("get failed: " + dst_filename)
        with open(src_filename, 'r') as f:
            word_count_list += json.load(f)

    os.remove(src_filename)

    t1 = time.time()
    # add up word count
    word_count = {}
    for (word, count) in word_count_list:
        if word in word_count:
            word_count[word] += count
        else:
            word_count[word] = count

    t2 = time.time()
    # write output to s3
    s3 = boto3.resource('s3')
    file_tmp = '/tmp/output'
    with open(file_tmp, "w+") as f:
        for k, v in word_count.items():
            f.write(str(k) + ' ' + str(v) + '\n')
    key = 'output/output' + str(rid)
    bucket_name = 'wordcount-yawen'
    s3.Bucket(bucket_name).upload_file(file_tmp, key)

    os.remove(file_tmp)

    t3 = time.time()

    # upload log
    startup_nodes = [{
        "host": "rediscluster-log.a9ith3.clustercfg.usw2.cache.amazonaws.com",
        "port": "6379"
    }]
    redis_client = StrictRedisCluster(startup_nodes=startup_nodes,
                                      skip_full_coverage_check=True)
    # t1-t0: intermediate read
    # t2-t1: adding word count
    # t3-t2: s3 write
    log = {'id': rid, 't0': t0, 't1': t1, 't2': t2, 't3': t3}
    key = 'reduce-log-' + str(n) + '-' + str(rid)
    redis_client.set(key, pickle.dumps(log))

    #print "reducer"+str(rid)+' finished'
    print key + ' logged'
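
The per-reducer timings logged above can be read back from the same Redis cluster to compute the three phase durations noted in the comments. A sketch, assuming all n reducers have finished:

import pickle
from rediscluster import StrictRedisCluster

def fetch_reduce_logs(n):
    startup_nodes = [{
        "host": "rediscluster-log.a9ith3.clustercfg.usw2.cache.amazonaws.com",
        "port": "6379"
    }]
    client = StrictRedisCluster(startup_nodes=startup_nodes,
                                skip_full_coverage_check=True)
    for rid in range(n):
        log = pickle.loads(client.get('reduce-log-' + str(n) + '-' + str(rid)))
        print('reducer %d: read %.2fs, count %.2fs, s3 write %.2fs'
              % (rid, log['t1'] - log['t0'],
                 log['t2'] - log['t1'],
                 log['t3'] - log['t2']))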
Example 5
import pocket

def pocket_read(p, jobid, iter, src_filename):
    for i in xrange(iter):
        dst_filename = '/tmp' + '-' + str(i)
        r = pocket.get(p, dst_filename, src_filename, jobid)
        if r != 0:
            raise Exception("get failed: " + dst_filename)
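
All of the examples above drive Pocket through just pocket.connect and pocket.get, with pocket.get returning 0 on success. For running the handlers without a Pocket deployment, a local stub matching those observed call shapes might look like the following; treating the first path argument as the source and the second as the destination is an assumption (Examples 1 and 4 pass the remote path first, while the pocket_read helpers name their arguments the other way around):

import shutil

def connect(host, port):
    # stand-in for pocket.connect: return an opaque connection handle
    return (host, port)

def get(p, src_filename, dst_filename, jobid):
    # stand-in for pocket.get: copy src to dst, 0 on success, nonzero on error
    try:
        shutil.copyfile(src_filename, dst_filename)
        return 0
    except (IOError, OSError):
        return 1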