Example #1
import urllib.parse


def clear_bucket(bucket_name):
    # Delete every object currently stored in the bucket.
    objects = list_bucket_objects(bucket_name)
    if objects is not None:
        file_names = []
        for obj in objects:
            # Decode the URL-encoded object key.
            file_key = urllib.parse.unquote_plus(obj["Key"], encoding='utf-8')
            file_names.append(file_key)
        if file_names:
            delete_objects(bucket_name, file_names)
    return True
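
These examples rely on project-level S3 helpers (list_bucket_objects, delete_objects, and friends) that are not shown. A minimal sketch of the two used above, assuming boto3 and the standard S3 list/delete APIs (the helper names come from the snippets; the bodies are an assumption):

import boto3

s3_client = boto3.client("s3")


def list_bucket_objects(bucket_name):
    # Return the object metadata list, or None for an empty bucket,
    # matching the `if objects is not None` checks in the examples.
    resp = s3_client.list_objects_v2(Bucket=bucket_name)
    return resp.get("Contents")  # each entry is a dict with a "Key" field


def delete_objects(bucket_name, file_names):
    # S3 batch delete accepts at most 1000 keys per request.
    for i in range(0, len(file_names), 1000):
        chunk = file_names[i:i + 1000]
        s3_client.delete_objects(
            Bucket=bucket_name,
            Delete={"Objects": [{"Key": key} for key in chunk]})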
Example #2
def delete_expired_merged_epoch(bucket_name, cur_epoch):
    # Remove merged results from earlier epochs, keeping the current one.
    objects = list_bucket_objects(bucket_name)
    if objects is not None:
        file_names = []
        for obj in objects:
            file_key = urllib.parse.unquote_plus(obj["Key"], encoding='utf-8')
            # Keys are expected to end in "_<epoch>"; parse the epoch number.
            key_epoch = int(file_key.split("_")[-1])
            if key_epoch < cur_epoch:
                file_names.append(file_key)
        if file_names:
            delete_objects(bucket_name, file_names)
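
For illustration only, a hypothetical call assuming keys end in "_<epoch>":

# Suppose "merged-bucket" holds the keys "w0_1", "w1_2", and "w0_3".
delete_expired_merged_epoch("merged-bucket", cur_epoch=3)
# Deletes "w0_1" and "w1_2" (epochs 1 and 2 are < 3); keeps "w0_3".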
Example #3
import urllib.parse

import numpy as np


def reduce_batch(vector, tmp_bucket, merged_bucket, num_workers, worker_index,
                 postfix):
    # All-reduce a 1-d numpy array across workers, using S3 as the medium:
    # every worker uploads its chunk to tmp_bucket, worker 0 sums the chunks
    # and publishes the result to merged_bucket, and the others poll for it.
    vec_shape = vector.shape
    vec_dtype = vector.dtype
    merged_vec = np.zeros(vec_shape, dtype=vec_dtype)

    # postfix encodes the current position as "<epoch>_<batch>".
    postfix_splits = postfix.split("_")
    curr_epoch = int(postfix_splits[0])
    curr_batch = int(postfix_splits[1])

    # Put this worker's vector to S3; key format: workerID_epoch_batch.
    key = "{}_{}".format(worker_index, postfix)
    put_object(tmp_bucket, key, vector.tobytes())

    if worker_index == 0:
        # Worker 0 polls tmp_bucket until it has aggregated one chunk from
        # each of the num_workers workers for this epoch/batch.
        num_files = 0
        while num_files < num_workers:
            objects = list_bucket_objects(tmp_bucket)
            if objects is not None:
                delete_list = []
                for obj in objects:
                    file_key = urllib.parse.unquote_plus(obj["Key"],
                                                         encoding='utf-8')
                    key_splits = file_key.split("_")
                    key_epoch = key_splits[1]
                    key_batch = key_splits[2]
                    if (key_epoch == str(curr_epoch)
                            and key_batch == str(curr_batch)):
                        data = get_object(tmp_bucket, file_key).read()
                        tmp_vec = np.frombuffer(
                            data, dtype=vec_dtype).reshape(vec_shape)
                        merged_vec += tmp_vec
                        num_files += 1
                        delete_list.append(file_key)
                if delete_list:  # skip the call when nothing matched
                    delete_objects(tmp_bucket, delete_list)
        # Write the merged result back to S3 and drop stale merged batches.
        merged_file_name = 'merged_' + postfix
        put_object(merged_bucket, merged_file_name, merged_vec.tobytes())
        delete_expired_merged_batch(merged_bucket, curr_epoch, curr_batch)
    else:
        # Other workers block until worker 0 has published the merged vector.
        merged_file_name = 'merged_' + postfix
        merged_data = get_object_or_wait(merged_bucket, merged_file_name,
                                         0.1).read()
        merged_vec = np.frombuffer(merged_data,
                                   dtype=vec_dtype).reshape(vec_shape)

    return merged_vec
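
reduce_batch additionally assumes put, get, and polling helpers. A minimal boto3-based sketch, under the assumption that get_object returns the streaming body (the code above calls .read() on it) and that get_object_or_wait polls until the key appears:

import time

import boto3

s3_client = boto3.client("s3")


def put_object(bucket_name, key, data):
    s3_client.put_object(Bucket=bucket_name, Key=key, Body=data)


def get_object(bucket_name, key):
    # Return the streaming body so the caller can .read() it.
    return s3_client.get_object(Bucket=bucket_name, Key=key)["Body"]


def get_object_or_wait(bucket_name, key, sleep_time):
    # Poll until the object exists, sleeping sleep_time seconds between tries.
    while True:
        try:
            return s3_client.get_object(Bucket=bucket_name, Key=key)["Body"]
        except s3_client.exceptions.NoSuchKey:
            time.sleep(sleep_time)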