Example #1
def main(args):
    imgnt = imagenet.ImageNetData()
    with open(args.flickr_api_key_filename, 'r') as f:
        flickr_api_keys = json.load(f)
        api_key = flickr_api_keys[0]
        api_secret = flickr_api_keys[1]

    with open(args.wnids, 'r') as f:
        wnids = json.load(f)
    print('processing {} wnids'.format(len(wnids)))

    if not args.parallel:
        all_results = []
        for wnid in wnids:
            print("Flickr search for wnid {}".format(wnid))
            res = flickr_search_synset(imgnt, [wnid], api_key, api_secret,
                                       args)
            all_results += res
    else:
        pywren_config = wc.default()
        pywren_config["runtime"]["s3_bucket"] = "imagenet2datav2"
        pywren_config["runtime"][
            "s3_key"] = "pywren.runtime/pywren_runtime-3.6-imagenet2.tar.gz"
        pwex = pywren.default_executor(config=pywren_config)
        pywren_func = lambda x: flickr_search_synset(imgnt, x, api_key,
                                                     api_secret, args)
        pywren_args = list(
            utils.chunks(wnids,
                         int(np.ceil(len(wnids) / args.num_serial_tasks))))
        num_images_per_wnid = {}
        with open(
                '../data/metadata/flickr_' + args.min_date_uploaded + '_' +
                args.max_date_uploaded + '.json', 'r') as fp:
            num_images_per_wnid = json.load(fp)

        all_results = []
        for ii, lst in enumerate(pywren_args):
            print("Map {} over {} wnids ".format(ii, len(lst)))
            unfinished_wnids = []
            for wnid in lst:
                if wnid not in num_images_per_wnid:
                    unfinished_wnids.append(wnid)
            print("Executing pywren call for {} wnids".format(
                len(unfinished_wnids)))
            futures = pwex.map(pywren_func, [[x] for x in unfinished_wnids])
            pywren.wait(futures)
            results = [f.result()[0] for f in futures]
            num_images = [f.result()[1] for f in futures]
            for jj, wnid in enumerate(unfinished_wnids):
                num_images_per_wnid[wnid] = num_images[jj]
            # accumulate results across all chunks; all_results is initialized
            # before this loop so earlier chunks are not discarded
            for res in results:
                all_results += res
            with open(
                    '../data/metadata/flickr_' + args.min_date_uploaded + '_' +
                    args.max_date_uploaded + '.json', 'w') as fp:
                json.dump(num_images_per_wnid, fp, indent=2)
    print('Got {} results'.format(len(all_results)))
    current_date = datetime.datetime.today().strftime('%Y-%m-%d-%H-%M-%S')
    out_file = ('../data/search_results/' + current_date + '_' +
                getpass.getuser() + '.json')
    with open(out_file, 'w+') as fp:
        json.dump(all_results, fp, indent=2)
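
# Illustrative sketch (not part of the original script): a minimal `chunks`
# helper with the call signature used above, utils.chunks(seq, chunk_size),
# assuming it simply yields consecutive slices of at most chunk_size items.
def chunks(seq, chunk_size):
    """Yield consecutive slices of `seq` of length at most `chunk_size`."""
    for i in range(0, len(seq), chunk_size):
        yield seq[i:i + chunk_size]

# e.g. list(chunks(['n01440764', 'n01443537', 'n01484850'], 2))
# -> [['n01440764', 'n01443537'], ['n01484850']]
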
Example #2
    def result(self,
               timeout=None,
               check_only=False,
               throw_except=True,
               storage_handler=None):
        """


        From the python docs:

        Return the value returned by the call. If the call hasn't yet
        completed then this method will wait up to timeout seconds. If
        the call hasn't completed in timeout seconds then a
        TimeoutError will be raised. timeout can be an int or float.If
        timeout is not specified or None then there is no limit to the
        wait time.

        If the future is cancelled before completing then CancelledError will be raised.

        If the call raised then this method will raise the same exception.

        """
        if self._state == JobState.new:
            raise ValueError("job not yet invoked")

        if self._state == JobState.success:
            return self._return_val

        if self._state == JobState.error:
            if throw_except:
                raise self._exception
            else:
                return None

        if storage_handler is None:
            storage_config = wrenconfig.extract_storage_config(
                wrenconfig.default())
            storage_handler = storage.Storage(storage_config)

        storage_utils.check_storage_path(storage_handler.get_storage_config(),
                                         self.storage_path)

        call_status = storage_handler.get_call_status(self.callset_id,
                                                      self.call_id)

        self.status_query_count += 1

        ## FIXME implement timeout
        if timeout is not None:
            raise NotImplementedError()

        if check_only is True:
            if call_status is None:
                return None

        while call_status is None:
            time.sleep(self.GET_RESULT_SLEEP_SECS)
            call_status = storage_handler.get_call_status(
                self.callset_id, self.call_id)

            self.status_query_count += 1
        self._invoke_metadata['status_done_timestamp'] = time.time()
        self._invoke_metadata['status_query_count'] = self.status_query_count

        self.run_status = call_status  # this is the remote status information
        self.invoke_status = self._invoke_metadata  # local status information

        if call_status['exception'] is not None:
            # the wrenhandler had an exception
            exception_str = call_status['exception']
            print(call_status)
            exception_args = call_status['exception_args']
            if exception_args[0] == "WRONGVERSION":
                if throw_except:
                    raise Exception("Pywren version mismatch: remote " + \
                        "expected version {}, local library is version {}".format(
                            exception_args[2], exception_args[3]))
                return None
            elif exception_args[0] == "OUTATIME":
                if throw_except:
                    raise Exception("process ran out of time")
                return None
            else:
                if throw_except:
                    if 'exception_traceback' in call_status:
                        logger.error(call_status['exception_traceback'])
                    raise Exception(exception_str, *exception_args)
                return None

        call_output_time = time.time()
        call_invoker_result = pickle.loads(
            storage_handler.get_call_output(self.callset_id, self.call_id))

        call_output_time_done = time.time()
        self._invoke_metadata[
            'download_output_time'] = call_output_time_done - call_output_time

        self._invoke_metadata[
            'download_output_timestamp'] = call_output_time_done
        call_success = call_invoker_result['success']
        logger.info("ResponseFuture.result() {} {} call_success {}".format(
            self.callset_id, self.call_id, call_success))

        self._call_invoker_result = call_invoker_result

        if call_success:

            self._return_val = call_invoker_result['result']
            self._state = JobState.success
            return self._return_val

        elif throw_except:

            self._exception = call_invoker_result['result']
            self._traceback = (call_invoker_result['exc_type'],
                               call_invoker_result['exc_value'],
                               call_invoker_result['exc_traceback'])

            self._state = JobState.error
            if call_invoker_result.get('pickle_fail', False):
                logging.warning(
                    "there was an error pickling. The original exception: " + \
                        "{}\nThe pickling exception: {}".format(
                            call_invoker_result['exc_value'],
                            str(call_invoker_result['pickle_exception'])))

                reraise(Exception, call_invoker_result['exc_value'],
                        call_invoker_result['exc_traceback'])
            else:
                # reraise the exception
                reraise(*self._traceback)
        else:
            return None  # nothing, don't raise, no value
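
# Hedged usage sketch (not part of the original snippet): how result() is
# typically consumed after a map() call, following the patterns used in the
# other examples here. Assumes a working pywren configuration.
import pywren

pwex = pywren.default_executor()
futures = pwex.map(lambda x: x * x, range(4))
pywren.wait(futures)
# throw_except=False makes result() return None instead of re-raising an
# exception from the remote call.
print([f.result(throw_except=False) for f in futures])
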
Example #3
def run_experiment(problem_size, shard_size, pipeline, num_priorities, lru,
                   eager, truncate, max_cores, start_cores, trial,
                   launch_granularity, timeout, log_granularity,
                   autoscale_policy, standalone, warmup, verify, matrix_exists,
                   read_limit, write_limit, compute_threads_per_worker):
    # set up logging
    invoke_executor = fs.ThreadPoolExecutor(1)
    logger = logging.getLogger()
    region = wc.default()["account"]["aws_region"]

    print("REGION", region)
    for key in logging.Logger.manager.loggerDict:
        logging.getLogger(key).setLevel(logging.CRITICAL)
    logger.setLevel(logging.DEBUG)
    arg_bytes = pickle.dumps(
        (problem_size, shard_size, pipeline, num_priorities, lru, eager,
         truncate, max_cores, start_cores, trial, launch_granularity, timeout,
         log_granularity, autoscale_policy, read_limit, write_limit))
    arg_hash = hashlib.md5(arg_bytes).hexdigest()
    log_file = "{0}.log".format(arg_hash)
    fh = logging.FileHandler(log_file)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    ch.setFormatter(formatter)
    logger.addHandler(fh)
    logger.addHandler(ch)
    logger.info("Logging to {0}".format(log_file))
    if standalone:
        extra_env = {
            "AWS_ACCESS_KEY_ID": os.environ["AWS_ACCESS_KEY_ID"].strip(),
            "AWS_SECRET_ACCESS_KEY":
            os.environ["AWS_SECRET_ACCESS_KEY"].strip(),
            "OMP_NUM_THREADS": "1",
            "AWS_DEFAULT_REGION": region
        }
        config = wc.default()
        config['runtime']['s3_bucket'] = 'numpywrenpublic'
        key = "pywren.runtime/pywren_runtime-3.6-numpywren.tar.gz"
        config['runtime']['s3_key'] = key
        pwex = pywren.standalone_executor(config=config)
    else:
        extra_env = {"AWS_DEFAULT_REGION": region}
        config = wc.default()
        config['runtime']['s3_bucket'] = 'numpywrenpublic'
        key = "pywren.runtime/pywren_runtime-3.6-numpywren.tar.gz"
        config['runtime']['s3_key'] = key
        print(config)
        pwex = pywren.default_executor(config=config)

    if (not matrix_exists):
        X = np.random.randn(problem_size, 1)
        shard_sizes = [shard_size, 1]
        X_sharded = BigMatrix("qr_test_{0}_{1}".format(problem_size,
                                                       shard_size),
                              shape=X.shape,
                              shard_sizes=shard_sizes,
                              write_header=True,
                              autosqueeze=False,
                              bucket="numpywrennsdi")
        shard_matrix(X_sharded, X)
        print("Generating PSD matrix...")
        t = time.time()
        print(X_sharded.shape)
        XXT_sharded = binops.gemm(pwex,
                                  X_sharded,
                                  X_sharded.T,
                                  overwrite=False)
        e = time.time()
        print("GEMM took {0}".format(e - t))
    else:
        X_sharded = BigMatrix("qr_test_{0}_{1}".format(problem_size,
                                                       shard_size),
                              autosqueeze=False,
                              bucket="numpywrennsdi")
        key_name = binops.generate_key_name_binop(X_sharded, X_sharded.T,
                                                  "gemm")
        XXT_sharded = BigMatrix(key_name,
                                hash_keys=False,
                                bucket="numpywrensdi2")
    XXT_sharded.lambdav = problem_size * 10
    t = time.time()
    program, meta = bdfac(XXT_sharded, truncate=truncate)
    pipeline_width = pipeline
    if (lru):
        cache_size = 5
    else:
        cache_size = 0
    pywren_config = pwex.config
    e = time.time()
    print("Program compile took {0} seconds".format(e - t))
    print("program.hash", program.hash)
    REDIS_CLIENT = program.control_plane.client
    done_counts = []
    ready_counts = []
    post_op_counts = []
    not_ready_counts = []
    running_counts = []
    sqs_invis_counts = []
    sqs_vis_counts = []
    up_workers_counts = []
    busy_workers_counts = []
    read_objects = []
    write_objects = []
    all_read_timeouts = []
    all_write_timeouts = []
    all_redis_timeouts = []
    times = [time.time()]
    flops = [0]
    reads = [0]
    writes = [0]
    print("LRU", lru)
    print("eager", eager)
    exp = {}
    exp["redis_done_counts"] = done_counts
    exp["redis_ready_counts"] = ready_counts
    exp["redis_post_op_counts"] = post_op_counts
    exp["redis_not_ready_counts"] = not_ready_counts
    exp["redis_running_counts"] = running_counts
    exp["sqs_invis_counts"] = sqs_invis_counts
    exp["sqs_vis_counts"] = sqs_vis_counts
    exp["busy_workers"] = busy_workers_counts
    exp["up_workers"] = up_workers_counts
    exp["times"] = times
    exp["lru"] = lru
    exp["priority"] = num_priorities
    exp["eager"] = eager
    exp["truncate"] = truncate
    exp["max_cores"] = max_cores
    exp["problem_size"] = problem_size
    exp["shard_size"] = shard_size
    exp["pipeline"] = pipeline
    exp["flops"] = flops
    exp["reads"] = reads
    exp["writes"] = writes
    exp["read_objects"] = read_objects
    exp["write_objects"] = write_objects
    exp["read_timeouts"] = all_read_timeouts
    exp["write_timeouts"] = all_write_timeouts
    exp["redis_timeouts"] = all_redis_timeouts
    exp["trial"] = trial
    exp["launch_granularity"] = launch_granularity
    exp["log_granularity"] = log_granularity
    exp["autoscale_policy"] = autoscale_policy
    exp["standalone"] = standalone
    exp["program"] = program
    exp["time_steps"] = 1
    exp["failed"] = False

    program.start()
    t = time.time()
    logger.info("Starting with {0} cores".format(start_cores))
    all_futures = pwex.map(
        lambda x: job_runner.lambdapack_run(program,
                                            pipeline_width=pipeline_width,
                                            cache_size=cache_size,
                                            timeout=timeout),
        range(start_cores),
        extra_env=extra_env)
    start_time = time.time()
    last_run_time = start_time
    print(program.program_status())
    print("QUEUE URLS", len(program.queue_urls))
    total_lambda_epochs = start_cores
    try:
        while (program.program_status() == lp.PS.RUNNING):
            time.sleep(log_granularity)
            curr_time = int(time.time() - start_time)
            p = program.get_progress()
            if (p is None):
                print("no progress...")
                continue
            else:
                p = int(p)
            times.append(int(time.time()))
            max_pc = p
            waiting = 0
            running = 0
            for i, queue_url in enumerate(program.queue_urls):
                client = boto3.client('sqs')
                attrs = client.get_queue_attributes(
                    QueueUrl=queue_url,
                    AttributeNames=[
                        'ApproximateNumberOfMessages',
                        'ApproximateNumberOfMessagesNotVisible'
                    ])['Attributes']
                waiting += int(attrs["ApproximateNumberOfMessages"])
                running += int(attrs["ApproximateNumberOfMessagesNotVisible"])
            sqs_invis_counts.append(running)
            sqs_vis_counts.append(waiting)
            busy_workers = REDIS_CLIENT.get("{0}_busy".format(program.hash))

            repeated_compute = parse_int(
                REDIS_CLIENT.get("{0}_repeated_compute".format(program.hash)))
            repeated_post_op = parse_int(
                REDIS_CLIENT.get("{0}_repeated_post_op".format(program.hash)))
            repeated_finish = parse_int(
                REDIS_CLIENT.get("{0}_repeated_finish".format(program.hash)))
            not_ready = parse_int(
                REDIS_CLIENT.get("{0}_not_ready".format(program.hash)))
            if (busy_workers is None):
                busy_workers = 0
            else:
                busy_workers = int(busy_workers)
            up_workers = program.get_up()

            if (up_workers is None):
                up_workers = 0
            else:
                up_workers = int(up_workers)
            up_workers_counts.append(up_workers)
            busy_workers_counts.append(busy_workers)

            logger.debug("{2}: Up Workers: {0}, Busy Workers: {1}".format(
                up_workers, busy_workers, curr_time))
            if ((curr_time % INFO_FREQ) == 0):
                logger.info("Waiting: {0}, Currently Processing: {1}".format(
                    waiting, running))
                logger.info("{2}: Up Workers: {0}, Busy Workers: {1}".format(
                    up_workers, busy_workers, curr_time))

            current_gflops = program.get_flops()
            if (current_gflops is None):
                current_gflops = 0
            else:
                current_gflops = int(current_gflops) / 1e9

            flops.append(current_gflops)
            current_gbytes_read = program.get_read()
            if (current_gbytes_read is None):
                current_gbytes_read = 0
            else:
                current_gbytes_read = int(current_gbytes_read) / 1e9

            reads.append(current_gbytes_read)
            current_gbytes_write = program.get_write()
            if (current_gbytes_write is None):
                current_gbytes_write = 0
            else:
                current_gbytes_write = int(current_gbytes_write) / 1e9
            writes.append(current_gbytes_write)

            gflops_rate = flops[-1] / (times[-1] - times[0])
            greads_rate = reads[-1] / (times[-1] - times[0])
            gwrites_rate = writes[-1] / (times[-1] - times[0])
            b = XXT_sharded.shard_sizes[0]
            current_objects_read = (current_gbytes_read * 1e9) / (b * b * 8)
            current_objects_write = (current_gbytes_write * 1e9) / (b * b * 8)
            read_objects.append(current_objects_read)
            write_objects.append(current_objects_write)
            read_rate = read_objects[-1] / (times[-1] - times[0])
            write_rate = write_objects[-1] / (times[-1] - times[0])

            avg_workers = np.mean(up_workers_counts)
            smooth_len = 10
            if (len(flops) > smooth_len + 5):
                gflops_rate_5_min_window = (flops[-1] - flops[-smooth_len]) / (
                    times[-1] - times[-smooth_len])
                gread_rate_5_min_window = (reads[-1] - reads[-smooth_len]) / (
                    times[-1] - times[-smooth_len])
                gwrite_rate_5_min_window = (
                    writes[-1] - writes[-smooth_len]) / (times[-1] -
                                                         times[-smooth_len])
                read_rate_5_min_window = (read_objects[-1] -
                                          read_objects[-smooth_len]) / (
                                              times[-1] - times[-smooth_len])
                write_rate_5_min_window = (write_objects[-1] -
                                           write_objects[-smooth_len]) / (
                                               times[-1] - times[-smooth_len])
                workers_5_min_window = np.mean(up_workers_counts[-smooth_len:])
            else:
                gflops_rate_5_min_window = "N/A"
                gread_rate_5_min_window = "N/A"
                gwrite_rate_5_min_window = "N/A"
                workers_5_min_window = "N/A"
                read_rate_5_min_window = "N/A"
                write_rate_5_min_window = "N/A"

            read_timeouts = int(parse_int(
                REDIS_CLIENT.get("s3.timeouts.read")))
            write_timeouts = int(
                parse_int(REDIS_CLIENT.get("s3.timeouts.write")))
            redis_timeouts = int(parse_int(REDIS_CLIENT.get("redis.timeouts")))
            all_read_timeouts.append(read_timeouts)
            all_write_timeouts.append(write_timeouts)
            all_redis_timeouts.append(redis_timeouts)
            read_timeouts_fraction = read_timeouts / (current_objects_read +
                                                      1e-8)
            write_timeouts_fraction = write_timeouts / \
                (current_objects_write+1e-8)
            print("=======================================")
            print(
                f"Progress is {p}, Repeated Compute is {repeated_compute}, Repeated POST OP is {repeated_post_op}, Repeated Finishes is {repeated_finish}, Not ready Nodes scheduled are {not_ready}"
            )
            print("Max PC is {0}".format(max_pc))
            print("Waiting: {0}, Currently Processing: {1}".format(
                waiting, running))
            print("{2}: Up Workers: {0}, Busy Workers: {1}".format(
                up_workers, busy_workers, curr_time))
            print(
                "{0}: Total GFLOPS {1}, Total GBytes Read {2}, Total GBytes Write {3}"
                .format(curr_time, current_gflops, current_gbytes_read,
                        current_gbytes_write))
            print(
                "{0}: Average GFLOPS rate {1}, Average GBytes Read rate {2}, Average GBytes Write  rate {3}, Average Worker Count {4}"
                .format(curr_time, gflops_rate, greads_rate, gwrites_rate,
                        avg_workers))
            print("{0}: Average read txns/s {1}, Average write txns/s {2}".
                  format(curr_time, read_rate, write_rate))
            print(
                "{0}: smoothed GFLOPS rate {1}, smoothed GBytes Read rate {2}, smoothed GBytes Write  rate {3}, smoothed Worker Count {4}"
                .format(curr_time, gflops_rate_5_min_window,
                        gread_rate_5_min_window, gwrite_rate_5_min_window,
                        workers_5_min_window))
            print("{0}: smoothed read txns/s {1}, smoothed write txns/s {2}".
                  format(curr_time, read_rate_5_min_window,
                         write_rate_5_min_window))
            print(
                "{0}: Read timeouts: {1}, Write timeouts: {2}, Redis timeouts: {3}  "
                .format(curr_time, read_timeouts, write_timeouts,
                        redis_timeouts))
            print(
                "{0}: Read timeouts fraction: {1}, Write timeouts fraction: {2}"
                .format(curr_time, read_timeouts_fraction,
                        write_timeouts_fraction))
            print("=======================================")

            time_since_launch = time.time() - last_run_time
            if (time_since_launch > (0.85 * timeout)):
                cores_to_launch = max_cores
                logger.info(
                    "launching {0} new tasks....".format(cores_to_launch))
                new_futures = pwex.map(lambda x: job_runner.lambdapack_run(
                    program,
                    pipeline_width=pipeline_width,
                    cache_size=cache_size,
                    timeout=timeout),
                                       range(cores_to_launch),
                                       extra_env=extra_env)
                #print("waiting for second result")
                #print("result..", new_futures[0].result())
                #print([x.result() for x in new_futures])

                last_run_time = time.time()
                all_futures.extend(new_futures)
            exp["time_steps"] += 1
    except KeyboardInterrupt:
        exp["failed"] = True
        program.stop()
    except Exception as e:
        traceback.print_exc()
        exp["failed"] = True
        program.stop()
        raise
    print(program.program_status())
    exp["all_futures"] = all_futures
    exp_bytes = dill.dumps(exp)
    client = boto3.client('s3')
    client.put_object(Key="lambdapack/{0}/runtime.pickle".format(program.hash),
                      Body=exp_bytes,
                      Bucket=program.bucket)
    print("=======================")
    print("=======================")
    print("Execution Summary:")
    print("Executed Program ID: {0}".format(program.hash))
    print("Program Success: {0}".format((not exp["failed"])))
    print("Problem Size: {0}".format(exp["problem_size"]))
    print("Shard Size: {0}".format(exp["shard_size"]))
    print("Total Execution time: {0}".format(times[-1] - times[0]))
    print("Average Flop Rate (GFlop/s): {0}".format(exp["flops"][-1] /
                                                    (times[-1] - times[0])))
    with open("/tmp/last_run", "w+") as f:
        f.write(program.hash)
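
# Hedged sketch (not part of the original script): a `parse_int` helper with
# the behavior the monitoring loop above appears to rely on -- Redis GET
# returns a byte string or None, and missing counters should count as 0.
# This is an assumption, not the library's actual implementation.
def parse_int(value):
    """Convert a Redis byte string (or None) to an int, defaulting to 0."""
    if value is None:
        return 0
    return int(value)
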
Example #4
import pywren
from pywren import wrenconfig as wc
import candidate_data
import utils

pywren_config = wc.default()
pywren_config["runtime"]["s3_bucket"] = "imagenet2pywren"
pywren_config["runtime"][
    "s3_key"] = "pywren.runtime/pywren_runtime-3.6-imagenet2pywren.meta.json"
pwex = pywren.default_executor(config=pywren_config)
print("pywren config", pwex.config)

c_data = candidate_data.CandidateData()

all_cs = c_data.all_candidates

chunked_cs = list(utils.chunks(list(all_cs.keys()), 100))


def return_not_exists(lst):
    ret_lst = []
    for e in lst:
        key = "{0}/{1}.jpg".format("imagenet2candidates_scaled", e)
        exists = utils.key_exists(bucket="imagenet2datav2", key=key)
        print(exists, key)
        if (not exists):
            ret_lst.append(e)
    return ret_lst
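

# Hedged usage sketch (not part of the original module): how the chunked
# candidate lists above are presumably dispatched, following the pwex.map /
# pywren.wait pattern from the other examples.
def find_missing_scaled_images():
    futures = pwex.map(return_not_exists, chunked_cs)
    pywren.wait(futures)
    # flatten the per-chunk lists of missing candidate ids
    return [cid for f in futures for cid in f.result()]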


def return_not_exists_encrypted(lst):
Example #5
def interactive_setup(ctx):
    '''
    Performs the following setup:
    1) First check that pywren works by running the pywren test
    2) Create the config file
    3) Create the auxiliary "cron lambda"
       a) Check for an "unused redis"
    '''
    def ds(key):
        """
        Debug suffix for defaults. For automated testing,
        automatically adds a suffix to each default
        """
        return "{}{}".format(key, suffix)

    ok = click.confirm(NUMPYWREN_SETUP, default=True)
    if (not ok):
        return
    click.echo("Testing pywren is correctly installed...")
    try:
        test_pywren()
    except Exception as e:
        click.echo(
            "Looks like there is something wrong with your pywren setup. "
            "Please make sure the command `pywren test_function` returns successfully")
        raise
    pywren_config = wrenconfig.default()
    pywren_bucket = pywren_config["s3"]["bucket"]
    # if config file exists, ask before overwriting
    config_filename = click_validate_prompt(
        "Location for config file: ",
        default=npw.config.get_default_home_filename())

    overwrite = check_overwrite_function(config_filename)
    config_filename = os.path.expanduser(config_filename)

    s3_bucket = click_validate_prompt(
        "numpywren requires an s3 bucket to store all data. " + \
            "What s3 bucket would you like to use?",
        default=pywren_bucket,
        validate_func=check_valid_bucket_name)
    create_bucket = False
    if not check_bucket_exists(s3_bucket):
        create_bucket = click.confirm(
            "Bucket does not currently exist, would you like to create it?",
            default=True)

    click.echo(
        "numpywren prefixes every object it puts in S3 with a particular prefix."
    )
    prefix = click_validate_prompt("numpywren s3 prefix: ",
                                   default=npw.config.AWS_S3_PREFIX_DEFAULT)
    if (overwrite):
        default_yaml = yaml.safe_load(
            open(os.path.join(SOURCE_DIR, "../default_config.yaml")))
    else:
        default_yaml = yaml.safe_load(open(config_filename))

    default_yaml["s3"]["bucket"] = s3_bucket
    default_yaml["s3"]["prefix"] = prefix
    default_yaml["iam"]["role_name"] = npw.config.AWS_ROLE_DEFAULT
    default_yaml["iam"][
        "instance_profile_name"] = npw.config.AWS_INSTANCE_PROFILE_DEFAULT
    try:
        ec2_client = boto3.client('ec2')
        response = ec2_client.describe_key_pairs()
        key_pairs = [x['KeyName'] for x in response["KeyPairs"]]
        key_pair = key_pairs[0]
    except Exception:
        click.echo(
            "Error in acquiring ec2 key pair, perhaps you don't have any setup?"
        )
        raise

    default_yaml["control_plane"]["ec2_ssh_key"] = key_pair
    config_advanced = click.confirm(
        "Would you like to configure advanced numpywren properties?",
        default=False)
    if (config_advanced):
        lifespan = int(
            click_validate_prompt(
                "How many days would you like numpywren to temporarily store data on S3 (default is 1 day, which translates to roughly $0.72 per TB)",
                default=default_yaml["s3"]["lifespan"],
                validate_func=check_valid_lifespan))
        default_yaml["s3"]["lifespan"] = lifespan

        runtime_bucket = click_validate_prompt(
            "Which bucket would you like pywren to load the python runtime from",
            default=default_yaml["runtime"]["bucket"],
            validate_func=check_valid_bucket_name)
        runtime_key = click_validate_prompt(
            "What is the runtime key in above bucket",
            default=default_yaml["runtime"]["s3_key"])
        default_yaml["runtime"]["bucket"] = runtime_bucket
        default_yaml["runtime"]["s3_key"] = runtime_key
        role_name = click_validate_prompt(
            "What would you like to name the numpywren iam role which will allow numpywren executors to access your AWS resources",
            default=default_yaml["iam"]["role_name"])
        default_yaml["iam"]["role_name"] = role_name
        instance_profile_name = click_validate_prompt(
            "What would you like to name the numpywren iam instance profile which will allow numpywren executors to access your AWS resources",
            default=default_yaml["iam"]["instance_profile_name"])
        default_yaml["iam"]["instance_profile_name"] = instance_profile_name
        ec2_ssh_key = click_validate_prompt(
            "Pick a valid ec2 ssh key pair",
            default=default_yaml["control_plane"]["ec2_ssh_key"])
        default_yaml["control_plane"]["ec2_ssh_key"] = ec2_ssh_key
    else:
        role_name = default_yaml["iam"]["role_name"]
        instance_profile_name = default_yaml["iam"]["instance_profile_name"]

    create_role(default_yaml, role_name)
    create_instance_profile(default_yaml, instance_profile_name)
    lifespan = default_yaml["s3"]["lifespan"]
    s3Client = boto3.client('s3')
    s3Client.put_bucket_lifecycle_configuration(
        Bucket=s3_bucket,
        LifecycleConfiguration={
            'Rules': [{
                'Status': 'Enabled',
                'Expiration': {'Days': lifespan},
                'Filter': {'Prefix': prefix}
            }]
        })
    with open(config_filename, "w+") as f:
        f.write(yaml.dump(default_yaml, default_flow_style=False))
Example #6
import logging
import multiprocessing

import botocore
import cloudpickle
import numpy as np
import pywren.wrenconfig as wc
import dill
from collections import defaultdict

from . import matrix_utils
from .matrix_utils import list_all_keys, block_key_to_block, get_local_matrix, key_exists_async
from . import utils

cpu_count = multiprocessing.cpu_count()
logger = logging.getLogger('numpywren')

try:
    DEFAULT_BUCKET = wc.default()['s3']['bucket']
    DEFAULT_REGION = wc.default()['account']['aws_region']
except Exception as e:
    DEFAULT_BUCKET = ""
    DEFAULT_REGION = ""


class BigMatrix(object):
    """
    A multidimensional array stored in S3, sharded in blocks of a given size.

    Parameters
    ----------
    key : string
        The S3 key to store this matrix at.
    shape : tuple of int, optional
Example #7
def _wait(fs, return_early_n, max_direct_query_n,
          random_query=False, THREADPOOL_SIZE=16):
    """
    internal function that performs the majority of the WAIT task
    work.

    For the list of futures fn, we will check at a minimum `max_direct_query_n`
    futures at least once. Internally we :
    1. use list() to quickly get a list of which ones are done (but
    list can be behind due to eventual consistency issues)
    2. then individually call get_status on at most `max_direct_query_n` returning
       early if we have found at least `return_early_n`

    This can mitigate the stragglers.

    random_query decides whether we get the fs in the order they are presented
    or in a random order.
    """


    # get all the futures that are not yet done
    not_done_futures = [f for f in fs if f._state not in [JobState.success,
                                                          JobState.error]]
    if len(not_done_futures) == 0:
        return fs, []


    storage_config = wrenconfig.extract_storage_config(wrenconfig.default())
    storage_handler = storage.Storage(storage_config)

    ### Callset optimization via object store convenience functions:
    # check if the not-done ones have the same callset_id
    present_callsets = {f.callset_id for f in not_done_futures}

    # get the list of all objects in this callset
    still_not_done_futures = []
    while present_callsets:
        callset_id = present_callsets.pop()

        # note this returns everything done, so we have to figure out
        # the intersection of those that are done
        callids_done_in_callset = set(storage_handler.get_callset_status(callset_id))

        not_done_call_ids = {f.call_id for f in not_done_futures} 

        done_call_ids = not_done_call_ids.intersection(callids_done_in_callset)
        not_done_call_ids = not_done_call_ids - done_call_ids

        still_not_done_futures += [f for f in not_done_futures if (f.call_id in not_done_call_ids)]

    def fetch_future_status(f):
        return storage_handler.get_call_status(f.callset_id, f.call_id)


    pool = ThreadPool(THREADPOOL_SIZE)

    # now try up to max_direct_query_n direct status queries, quitting once
    # we have return_n done.
    query_count = 0
    max_queries = min(max_direct_query_n, len(still_not_done_futures))

    if random_query:
        random.shuffle(still_not_done_futures)

    while query_count < max_queries:

        if len(done_call_ids) >= return_early_n:
            break
        num_to_query_at_once = THREADPOOL_SIZE
        fs_to_query = still_not_done_futures[query_count:query_count + num_to_query_at_once]

        fs_statuses = pool.map(fetch_future_status, fs_to_query)

        callids_found = [fs_to_query[i].call_id for i in range(len(fs_to_query))
                         if (fs_statuses[i] is not None)]
        done_call_ids = done_call_ids.union(set(callids_found))

        # # update done call_ids
        # callids_done.update(callids_found)

        # # break if not all N tasks completed
        # if (len(callids_found) < len(fs_samples)):
        #     break
        # # calculate new still_not_done_futures
        # still_not_done_futures = [f for f in not_done_futures if (f.call_id not in callids_done)]
        query_count += len(fs_to_query)


    # now we walk through all the original queries and get
    # the ones that are actually done.
    fs_dones = []
    fs_notdones = []

    f_to_wait_on = []
    for f in fs:
        if f._state in [JobState.success, JobState.error]:
            # done, don't need to do anything
            fs_dones.append(f)
        else:
            if f.call_id in done_call_ids:
                f_to_wait_on.append(f)
                fs_dones.append(f)
            else:
                fs_notdones.append(f)
    def get_result(f):
        f.result(throw_except=False, storage_handler=storage_handler)

    pool.map(get_result, f_to_wait_on)

    pool.close()
    pool.join()

    return fs_dones, fs_notdones
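
# Hedged usage sketch (not part of the original module): one way a wait()
# wrapper might drive _wait() until every future has finished. The
# ALL_COMPLETED-style loop here is an assumption, not the library's actual
# wait() implementation.
def wait_all(futures):
    done, not_done = _wait(futures,
                           return_early_n=len(futures),
                           max_direct_query_n=len(futures))
    while not_done:
        # in practice you would sleep briefly between polls
        done, not_done = _wait(futures,
                               return_early_n=len(futures),
                               max_direct_query_n=len(futures))
    return done
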
Example #8
from enum import Enum

import pywren
from pywren.serialize import serialize
import pywren.wrenconfig as wc
import sympy
import redis
import scipy.linalg
import dill
import redis.exceptions
import logging
from .matrix import BigMatrix
from .matrix_utils import load_mmap, chunk, generate_key_name_uop, generate_key_name_binop, constant_zeros
from . import control_plane, matrix
from . import utils

try:
    DEFAULT_CONFIG = wc.default()
except Exception:
    DEFAULT_CONFIG = {}

logger = logging.getLogger(__name__)


class RemoteInstructionOpCodes(Enum):
    S3_LOAD = 0
    S3_WRITE = 1
    GENERIC = 3
    RET = 4


class NodeStatus(Enum):
    NOT_READY = 0
Example #9
def dummy_executor(config=None, job_max_runtime=300):
    if config is None:
        config = wrenconfig.default()

    invoker = invokers.DummyInvoker()
    return Executor(invoker, config, job_max_runtime)
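
# Hedged usage sketch (not from the source): a dummy executor is constructed
# like any other pywren Executor, just with a DummyInvoker behind it. Whether
# the DummyInvoker actually executes the calls is not shown in this snippet,
# so this is illustrative only.
if __name__ == "__main__":
    pwex = dummy_executor(job_max_runtime=60)
    futures = pwex.map(lambda x: x + 1, range(3))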
Example #10
def run_experiment(problem_size, shard_size, pipeline, priority, lru, eager,
                   truncate, max_cores, start_cores, trial, launch_granularity,
                   timeout, log_granularity, autoscale_policy, standalone):
    # set up logging
    logger = logging.getLogger()
    for key in logging.Logger.manager.loggerDict:
        logging.getLogger(key).setLevel(logging.CRITICAL)
    logger.setLevel(logging.DEBUG)
    arg_bytes = pickle.dumps(
        (problem_size, shard_size, pipeline, priority, lru, eager, truncate,
         max_cores, start_cores, trial, launch_granularity, timeout,
         log_granularity, autoscale_policy))
    arg_hash = hashlib.md5(arg_bytes).hexdigest()
    log_file = "optimization_experiments/{0}.log".format(arg_hash)
    fh = logging.FileHandler(log_file)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    ch.setFormatter(formatter)
    logger.addHandler(fh)
    logger.addHandler(ch)
    logger.info("Logging to {0}".format(log_file))

    X = np.random.randn(problem_size, 1)
    if standalone:
        redis_env = {
            "REDIS_ADDR": os.environ.get("REDIS_ADDR", ""),
            "REDIS_PASS": os.environ.get("REDIS_PASS", ""),
            "AWS_ACCESS_KEY_ID": "AKIAIV3ENRQOI3FET2YA",
            "AWS_SECRET_ACCESS_KEY":
            "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",
            "OMP_NUM_THREADS": "1"
        }
        config = wc.default()
        config['runtime']['s3_bucket'] = 'pictureweb'
        config['runtime'][
            's3_key'] = 'pywren.runtime/pywren_runtime-3.6-numpywren_avx512.tar.gz'
        pwex = pywren.standalone_executor(config=config)
    else:
        redis_env = {
            "REDIS_ADDR": os.environ.get("REDIS_ADDR", ""),
            "REDIS_PASS": os.environ.get("REDIS_PASS", "")
        }
        config = wc.default()
        config['runtime']['s3_bucket'] = 'pictureweb'
        config['runtime'][
            's3_key'] = 'pywren.runtime/pywren_runtime-3.6-numpywren.tar.gz'
        pwex = pywren.default_executor(config=config)

    shard_sizes = [shard_size, 1]
    X_sharded = BigMatrix("cholesky_test_{0}_{1}".format(
        problem_size, shard_size),
                          shape=X.shape,
                          shard_sizes=shard_sizes,
                          write_header=True)
    shard_matrix(X_sharded, X)
    print("Generating PSD matrix...")
    t = time.time()
    XXT_sharded = binops.gemm(pwex, X_sharded, X_sharded.T, overwrite=False)
    e = time.time()
    print("GEMM took {0}".format(e - t))
    XXT_sharded.lambdav = problem_size * 10
    instructions, L_sharded, trailing = lp._chol(XXT_sharded)
    pipeline_width = pipeline
    if (priority):
        num_priorities = 5
    else:
        num_priorities = 1
    if (lru):
        cache_size = 5
    else:
        cache_size = 0

    REDIS_CLIENT = redis.StrictRedis(REDIS_ADDR,
                                     port=REDIS_PORT,
                                     password=REDIS_PASS,
                                     db=0,
                                     socket_timeout=5)

    if (truncate is not None):
        instructions = instructions[:truncate]
    config = pwex.config

    program = lp.LambdaPackProgram(instructions,
                                   executor=pywren.lambda_executor,
                                   pywren_config=config,
                                   num_priorities=num_priorities,
                                   eager=eager)

    done_counts = []
    ready_counts = []
    post_op_counts = []
    not_ready_counts = []
    running_counts = []
    sqs_invis_counts = []
    sqs_vis_counts = []
    up_workers_counts = []
    busy_workers_counts = []
    times = []
    flops = []
    reads = []
    writes = []
    print("LRU", lru)
    print("eager", eager)
    exp = {}
    exp["redis_done_counts"] = done_counts
    exp["redis_ready_counts"] = ready_counts
    exp["redis_post_op_counts"] = post_op_counts
    exp["redis_not_ready_counts"] = not_ready_counts
    exp["redis_running_counts"] = running_counts
    exp["sqs_invis_counts"] = sqs_invis_counts
    exp["sqs_vis_counts"] = sqs_vis_counts
    exp["busy_workers"] = busy_workers_counts
    exp["up_workers"] = up_workers_counts
    exp["times"] = times
    exp["lru"] = lru
    exp["priority"] = priority
    exp["eager"] = eager
    exp["truncate"] = truncate
    exp["max_cores"] = max_cores
    exp["problem_size"] = problem_size
    exp["shard_size"] = shard_size
    exp["pipeline"] = pipeline
    exp["flops"] = flops
    exp["reads"] = reads
    exp["writes"] = writes
    exp["trial"] = trial
    exp["launch_granularity"] = launch_granularity
    exp["log_granularity"] = log_granularity
    exp["autoscale_policy"] = autoscale_policy
    exp["standalone"] = standalone

    logger.info("Longest Path: {0}".format(program.longest_path))
    program.start()
    t = time.time()
    logger.info("Starting with {0} cores".format(start_cores))
    all_futures = pwex.map(
        lambda x: job_runner.lambdapack_run(program,
                                            pipeline_width=pipeline_width,
                                            cache_size=cache_size,
                                            timeout=timeout),
        range(start_cores),
        extra_env=redis_env)
    # print([f.result() for f in all_futures])
    start_time = time.time()
    last_run_time = start_time

    while (program.program_status() == lp.PS.RUNNING):
        curr_time = int(time.time() - start_time)
        max_pc = program.get_max_pc()
        times.append(int(time.time()))
        time.sleep(log_granularity)
        waiting = 0
        running = 0
        for i, queue_url in enumerate(program.queue_urls):
            client = boto3.client('sqs')
            attrs = client.get_queue_attributes(
                QueueUrl=queue_url,
                AttributeNames=[
                    'ApproximateNumberOfMessages',
                    'ApproximateNumberOfMessagesNotVisible'
                ])['Attributes']
            waiting += int(attrs["ApproximateNumberOfMessages"])
            running += int(attrs["ApproximateNumberOfMessagesNotVisible"])
        sqs_invis_counts.append(running)
        sqs_vis_counts.append(waiting)
        busy_workers = REDIS_CLIENT.get("{0}_busy".format(program.hash))
        if (busy_workers is None):
            busy_workers = 0
        else:
            busy_workers = int(busy_workers)
        up_workers = program.get_up()

        if (up_workers is None):
            up_workers = 0
        else:
            up_workers = int(up_workers)
        up_workers_counts.append(up_workers)
        busy_workers_counts.append(busy_workers)

        logger.debug("Waiting: {0}, Currently Processing: {1}".format(
            waiting, running))
        logger.debug("{2}: Up Workers: {0}, Busy Workers: {1}".format(
            up_workers, busy_workers, curr_time))
        if ((curr_time % INFO_FREQ) == 0):
            logger.info("Max PC is {0}".format(max_pc))
            logger.info("Waiting: {0}, Currently Processing: {1}".format(
                waiting, running))
            logger.info("{2}: Up Workers: {0}, Busy Workers: {1}".format(
                up_workers, busy_workers, curr_time))

        #print("{5}: Not Ready: {0}, Ready: {1}, Running: {4}, Post OP: {2},  Done: {3}".format(not_ready_count, ready_count, post_op_count, done_count, running_count, curr_time))
        current_gflops = program.get_flops()
        if (current_gflops is None):
            current_gflops = 0
        else:
            current_gflops = int(current_gflops) / 1e9

        flops.append(current_gflops)
        current_gbytes_read = program.get_read()
        if (current_gbytes_read is None):
            current_gbytes_read = 0
        else:
            current_gbytes_read = int(current_gbytes_read) / 1e9

        reads.append(current_gbytes_read)
        current_gbytes_write = program.get_write()
        if (current_gbytes_write is None):
            current_gbytes_write = 0
        else:
            current_gbytes_write = int(current_gbytes_write) / 1e9
        writes.append(current_gbytes_write)
        #print("{0}: Total GFLOPS {1}, Total GBytes Read {2}, Total GBytes Write {3}".format(curr_time, current_gflops, current_gbytes_read, current_gbytes_write))

        time_since_launch = time.time() - last_run_time
        if (autoscale_policy == "dynamic"):
            if (time_since_launch > launch_granularity
                    and up_workers < np.ceil(waiting * 0.5 / pipeline_width)
                    and up_workers < max_cores):
                cores_to_launch = int(
                    min(
                        np.ceil(waiting / pipeline_width) - up_workers,
                        max_cores - up_workers))
                logger.info(
                    "launching {0} new tasks....".format(cores_to_launch))
                new_futures = pwex.map(lambda x: job_runner.lambdapack_run(
                    program,
                    pipeline_width=pipeline_width,
                    cache_size=cache_size,
                    timeout=timeout),
                                       range(cores_to_launch),
                                       extra_env=redis_env)
                last_run_time = time.time()
                # check if we OOM-erred
                # [x.result() for x in all_futures]
                all_futures.extend(new_futures)
        elif (autoscale_policy == "constant_timeout"):
            if (time_since_launch > (0.99 * timeout)):
                cores_to_launch = max_cores
                logger.info(
                    "launching {0} new tasks....".format(cores_to_launch))
                new_futures = pwex.map(lambda x: job_runner.lambdapack_run(
                    program,
                    pipeline_width=pipeline_width,
                    cache_size=cache_size,
                    timeout=timeout),
                                       range(cores_to_launch),
                                       extra_env=redis_env)
                last_run_time = time.time()
                # check if we OOM-erred
                # [x.result() for x in all_futures]
                all_futures.extend(new_futures)
        else:
            raise Exception("unknown autoscale policy")

    exp["all_futures"] = all_futures
    doubles = 0

    for pc in range(program.num_inst_blocks):
        run_count = REDIS_CLIENT.get("{0}_{1}_start".format(program.hash, pc))
        if (run_count is None):
            run_count = 0
        else:
            run_count = int(run_count)

        if (run_count != 1):
            logger.warning("PC: {0}, Run Count: {1}".format(pc, run_count))
            doubles += 1

    print("Number of repeats: {0}".format(doubles))
    e = time.time()
    time.sleep(10)
    logger.info(program.program_status())
    logger.info("PROGRAM STATUS " + str(program.program_status()))
    logger.info("PROGRAM HASH " + str(program.hash))
    logger.info("Took {0} seconds".format(e - t))
    # collect in
    executor = fs.ThreadPoolExecutor(72)
    futures = []
    for i in range(0, program.num_inst_blocks, 1):
        futures.append(executor.submit(program.get_profiling_info, i))
    res = fs.wait(futures)
    profiled_blocks = [f.result() for f in futures]
    serializer = serialize.SerializeIndependent()
    byte_string = serializer([profiled_blocks])[0][0]
    exp["profiled_block_pickle_bytes"] = byte_string

    read, write, total_flops, bins, instructions, runtimes = lp.perf_profile(
        profiled_blocks, num_bins=100)
    flop_rate = sum(total_flops) / max(bins)
    exp["flop_rate"] = flop_rate
    print("Average Flop rate of {0}".format(flop_rate))
    # save other stuff
    try:
        os.mkdir("optimization_experiments/")
    except FileExistsError:
        pass
    exp_bytes = pickle.dumps(exp)
    dump_path = "optimization_experiments/{0}.pickle".format(arg_hash)
    print("Dumping experiment pickle to {0}".format(dump_path))
    with open(dump_path, "wb+") as f:
        f.write(exp_bytes)
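
# Hedged sketch (not part of run_experiment): reading back the experiment
# pickle written above for offline analysis; "<arg_hash>" stands in for the
# actual md5 hash of the argument tuple.
#
# with open("optimization_experiments/<arg_hash>.pickle", "rb") as f:
#     exp = pickle.load(f)
# print(exp["flop_rate"], exp["times"][-1] - exp["times"][0])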
Example #11
def launch_and_provision_redis(config=None):
    if (config is None):
        config = npw.config.default()
    pywren_config = wc.default()
    rc = config["control_plane"]
    port = rc["port"]
    spot_price = rc["spot_price"]
    password = rc["password"]
    ipn = config["iam"]["instance_profile_name"]
    ami = rc["target_ami"]
    instance_type = rc["ec2_instance_type"]
    # TODO fix
    key_name = config["control_plane"]["ec2_ssh_key"]
    aws_region = pywren_config['account']['aws_region']
    availability_zone = rc.get("availability_zone", None)
    redis_conf = open(sd("redis.conf")).read()
    template_file = sd("redis.cloudinit.template")
    user_data = open(template_file, 'r').read()
    cloud_agent_conf = open(sd("cloudwatch-agent.config"), 'r').read()
    cloud_agent_conf_64 = b64s(cloud_agent_conf)
    redis_conf_b64 = b64s(redis_conf.format(port=port, password=password))
    redis_init_b64 = b64s(
        open(sd("redis_init_script")).read().format(port=port))
    user_data = user_data.format(redis_init=redis_init_b64,
                                 cloud_agent_conf=cloud_agent_conf_64,
                                 redis_conf=redis_conf_b64,
                                 aws_region=aws_region)
    iam = boto3.resource('iam')
    instance_profile = iam.InstanceProfile(ipn)
    instance_profile_dict = {'Name': instance_profile.name}
    group_id = create_security_group()
    instances = _create_instances(1,
                                  aws_region,
                                  spot_price,
                                  ami=ami,
                                  instance_type=instance_type,
                                  block_device_mappings=None,
                                  security_group_ids=[group_id],
                                  ebs_optimized=True,
                                  availability_zone=availability_zone,
                                  instance_profile=instance_profile_dict,
                                  user_data=user_data,
                                  key_name=key_name)
    inst = instances[0]
    inst.reload()
    inst.create_tags(Resources=[inst.instance_id],
                     Tags=[
                         {
                             'Key': 'Name',
                             'Value': 'numpywren.control_plane'
                         },
                     ])
    host = inst.public_ip_address
    info = {
        'id': inst.id,
        'type': inst.instance_type,
        'private_ip': inst.private_ip_address,
        'public_ip': inst.public_ip_address,
    }
    set_control_plane(info, config)
    return info
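
# Hedged usage sketch (not part of the original function): connecting to the
# control-plane Redis that launch_and_provision_redis() brings up, using the
# returned info dict and the same config section read above. Assumes
# `import redis` and that the instance has finished booting.
def connect_control_plane(config=None):
    if config is None:
        config = npw.config.default()
    rc = config["control_plane"]
    info = launch_and_provision_redis(config)
    return redis.StrictRedis(host=info["public_ip"],
                             port=rc["port"],
                             password=rc["password"],
                             socket_timeout=5)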
Example #12
    def result(self, timeout=None, check_only=False,
               throw_except=True, storage_handler=None):
        """

        check_only = True implies we only check if the job is completed.

        # FIXME check_only is the worst API and should be refactored
        # out to be part of done()

        From the python docs:

        Return the value returned by the call. If the call hasn't yet
        completed then this method will wait up to timeout seconds. If
        the call hasn't completed in timeout seconds then a
        TimeoutError will be raised. timeout can be an int or float. If
        timeout is not specified or None then there is no limit to the
        wait time.

        Return the value returned by the call.
        If the call raised an exception, this method will raise the same exception
        If the future is cancelled before completing then CancelledError will be raised.

        :param timeout: This method will wait up to timeout seconds before raising
            a TimeoutError if function hasn't completed. If None, wait indefinitely. Default None.
        :param check_only: Return None immediately if job is not complete. Default False.
        :param throw_except: Reraise exception if call raised. Default true.
        :param storage_handler: Storage handler to poll cloud storage. Default None.
        :return: Result of the call.
        :raises CancelledError: If the job is cancelled before completed.
        :raises TimeoutError: If job is not complete after `timeout` seconds.

        """
        if self._state == JobState.new:
            raise ValueError("job not yet invoked")

        if check_only:
            if self._state == JobState.success or self._state == JobState.error:
                return True

        if self._state == JobState.success:
            return self._return_val

        if self._state == JobState.error:
            if throw_except:
                print("Encountered exception: {}".format(self._exception))
                raise self._exception
            else:
                return None

        if storage_handler is None:
            storage_config = wrenconfig.extract_storage_config(wrenconfig.default())
            storage_handler = storage.Storage(storage_config)

        storage_utils.check_storage_path(storage_handler.get_storage_config(), self.storage_path)


        call_status = storage_handler.get_call_status(self.callset_id, self.call_id)

        self.status_query_count += 1

        ## FIXME implement timeout
        if timeout is not None:
            raise NotImplementedError()

        if check_only:
            if call_status is None:
                return False
            else:
                return True

        while call_status is None:
            time.sleep(self.GET_RESULT_SLEEP_SECS)
            call_status = storage_handler.get_call_status(self.callset_id, self.call_id)

            self.status_query_count += 1
        self._invoke_metadata['status_done_timestamp'] = time.time()
        self._invoke_metadata['status_query_count'] = self.status_query_count

        self.run_status = call_status # this is the remote status information
        self.invoke_status = self._invoke_metadata # local status information

        if call_status['exception'] is not None:
            # the wrenhandler had an exception
            exception_str = call_status['exception']

            exception_args = call_status['exception_args']
            if exception_args[0] == "WRONGVERSION":
                if throw_except:
                    raise Exception("Pywren version mismatch: remote " + \
                        "expected version {}, local library is version {}".format(
                            exception_args[2], exception_args[3]))
                return None
            elif exception_args[0] == "OUTATIME":
                if throw_except:
                    raise Exception("process ran out of time")
                return None
            elif exception_args[0] == "CANCELLED":
                if throw_except:
                    raise Exception("job was cancelled")
            elif exception_args[0] == "RETCODE":
                if throw_except:
                    raise Exception("python process failed, returned a non-zero return code"
                                    "(check stdout for information)")
                return None
            else:
                if throw_except:
                    if 'exception_traceback' in call_status:
                        logger.error(call_status['exception_traceback'])
                    raise Exception(exception_str, *exception_args)
                return None

        # FIXME this shouldn't be called if check_only is True
        call_output_time = time.time()
        call_invoker_result = pickle.loads(storage_handler.get_call_output(
            self.callset_id, self.call_id))

        call_output_time_done = time.time()
        self._invoke_metadata['download_output_time'] = call_output_time_done - call_output_time

        self._invoke_metadata['download_output_timestamp'] = call_output_time_done
        call_success = call_invoker_result['success']
        logger.info("ResponseFuture.result() {} {} call_success {}".format(self.callset_id,
                                                                           self.call_id,
                                                                           call_success))



        self._call_invoker_result = call_invoker_result

        if call_success:

            self._return_val = call_invoker_result['result']
            self._set_state(JobState.success)
            return self._return_val
        else:
            self._set_state(JobState.error)
            self._exception = call_invoker_result['result']
            self._traceback = (call_invoker_result['exc_type'],
                               call_invoker_result['exc_value'],
                               call_invoker_result['exc_traceback'])
            print("Exception: {}\nTraceback: {}\n{}\n{}".format(self._exception, str(self._traceback[0]),
                                                                                 str(self._traceback[1]),
                                                                                 str(call_invoker_result['exc_traceback'])))
            if "exec_traceback_formatted" in call_invoker_result:
                print("exec_traceback_formatted:\n{}".format(call_invoker_result["exec_traceback_formatted"]))
            print("call_invoker_result: {}".format(call_invoker_result))                                                                           
            if throw_except:
                if call_invoker_result.get('pickle_fail', False):
                    logging.warning(
                        "there was an error pickling. The original exception: " + \
                            "{}\nThe pickling exception: {}".format(
                                call_invoker_result['exc_value'],
                                str(call_invoker_result['pickle_exception'])))

                    reraise(Exception, call_invoker_result['exc_value'],
                            call_invoker_result['exc_traceback'])
                else:
                    # reraise the exception
                    reraise(*self._traceback)
            else:
                return None  # nothing, don't raise, no value
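
# Hedged usage sketch (not from the source): in this version check_only
# returns a boolean, so a caller can poll without blocking on the output,
# e.g.
#
# if future.result(check_only=True, storage_handler=handler):
#     value = future.result(storage_handler=handler)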