Example #1
def copydata_prepareEnvironment(clusters, src, localcopy=False):
    """
    This function checks if the dest cluster contains the data from the src cluster.
    If so, no data is copied. Otherwise, the data is copied and verified by digetst

    clusters is a tuple with (cluster_from_data_is_copied, cluster_to_data_is_copied)
    namenodes is a tuple with (namenode_from_data_is_copied, namenode_to_data_is_copied)
    src is from directory
    dest is to directory
    """

    # check that the source directory exists, either locally or on the source cluster
    if localcopy:
        result1 = exists(src)
    else:
        from_cluster = clusters[0]
        f = my_apply_async(exists, queue=from_cluster, args=(src,))
        result1 = my_get(f)

    if not result1:
        raise Exception("Source directory doesn't exist")

    # check that the destination directory does not already exist on the target cluster
    to_cluster = clusters[1]
    logging.info("Preparing environment in cluster %s: %s", to_cluster, src)
    f = my_apply_async(exists, queue=to_cluster, args=(src,))
    result2 = my_get(f)

    if result2:
        raise Exception("Target directory exists")

    return result1, result2
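This example, like the ones that follow, relies on my_apply_async and my_get helpers whose definitions are not shown. A minimal sketch of what such wrappers could look like, assuming Celery tasks routed by per-cluster queue names (the signatures below are an assumption, not the project's actual code):

# Hypothetical sketch: thin wrappers around Celery task dispatch. It assumes each
# remote operation (exists, generate_digests, distCp, ...) is a registered Celery
# task and that every cluster is addressed through its own queue name.
def my_apply_async(task, queue, args=()):
    # route the task to the queue of the target cluster
    return task.apply_async(args=args, queue=queue)


def my_get(async_result, timeout=None):
    # block until the remote task finishes and return its result
    return async_result.get(timeout=timeout)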
Example #2
def get_prediction_metrics(host_spec, pinput):
    """
    rank jobs using linear regression

    :param host_spec tuple with included_cluster and excluded_clusters
    :param pinput (string) input path

    """
    included_clusters, excluded_clusters = host_spec

    included_lat_list = []
    included_jobs_list = []
    """ get metrics related to job """
    for cluster in included_clusters:
        f_list = [
            my_apply_async(get_total_size,
                           queue=cluster.cluster,
                           args=(path, )) for path in pinput
        ]
        input_size = sum(f.get() for f in f_list)

        # included_lat_list += get_prediction_on_network(cluster.cluster, excluded_clusters)
        predicted_time = get_prediction_metrics_on_job(cluster.cluster,
                                                       input_size)
        penalization = json.loads(
            my_apply_async(load_penalization, queue=cluster.cluster).get())
        # the statistics refer to the whole set of input paths whose sizes were summed above
        path_statistics = PathStatistics(
            cluster.cluster, predicted_time + penalization["error"], pinput)
        logging.info("Prediction: %s", path_statistics)
        included_jobs_list.append(path_statistics)

    # ascending order: the best (lowest) predicted time comes first
    included_jobs_list.sort(key=lambda x: x.rank)
    excluded_jobs_list = []
    for cluster in excluded_clusters:
        # note: input_size is reused from the last included cluster processed above
        predicted_time = get_prediction_metrics_on_job(cluster.cluster,
                                                       input_size)
        excluded_jobs_list.append(
            PathStatistics(cluster.cluster, predicted_time, pinput))

    logging.info("Prediction on network: %s" % included_lat_list)

    logging.info("Prediction on job (included): %s" %
                 "; ".join(str(job) for job in included_jobs_list))
    logging.info("Prediction on job (excluded): %s" %
                 "; ".join(str(job) for job in excluded_jobs_list))

    # rank = _get_prediction_metrics_on_all_clusters(included_clusters, excluded_clusters,
    #                                                included_lat_list, included_jobs_list, excluded_jobs_list)
    rank_list = (included_jobs_list, excluded_jobs_list)

    return rank_list
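get_prediction_metrics only requires PathStatistics to accept (cluster, rank, path) in that order and to expose a rank attribute for sorting. A minimal stand-in under that assumption (the real class may carry more fields or logic):

from collections import namedtuple

# Hypothetical stand-in for PathStatistics; field names follow the constructor
# order and the .rank attribute used by the sort above.
PathStatistics = namedtuple("PathStatistics", ["cluster", "rank", "path"])

stats = PathStatistics("cluster-a", 42.5, ["/data/input"])
print(stats.rank)  # 42.5 -- the value included_jobs_list is sorted by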
Example #3
def copydata_getDigests(cluster, path):
    """ generate digests from the given path and returns the value

    :param cluster (string)
    :param path (string) source path

    :return dict with (filename: digest)
    """

    # dispatch digest generation to the cluster's queue and poll until it finishes
    f = my_apply_async(generate_digests, queue=cluster, args=(path, medusa_settings.digest_command,))
    while not f.ready():
        time.sleep(5)

    digests = f.get()

    return digests
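A typical use of copydata_getDigests is to verify a copy by comparing the digests computed on the source cluster with those computed on the destination. A minimal sketch, assuming both clusters report the same file names and that the queue names below are placeholders:

# Hypothetical verification step; "cluster-src" and "cluster-dst" are placeholder queue names.
src_digests = copydata_getDigests("cluster-src", "/data/input")
dst_digests = copydata_getDigests("cluster-dst", "/data/input")

if src_digests == dst_digests:
    logging.info("Copy verified: all %s digests match", len(src_digests))
else:
    mismatched = [f for f in src_digests if dst_digests.get(f) != src_digests[f]]
    logging.error("Digest mismatch for: %s", mismatched)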
Example #4
def _copydata_distcp(clusters, src, dest):
    """
    Copy the data between HDFS instances.

    :param clusters: (tuple) with source and dest hosts
    :param src: (string) source path
    :param dest: (string) dest path
    :return: (float) the time, in seconds, that the copy took.
    """
    start = time.time()
    f = my_apply_async(distCp, queue=clusters[0], args=(clusters, src, dest,))

    while not f.ready():
        logging.debug("Waiting for response from %s", clusters[0])
        time.sleep(10)

    f.get()
    end = time.time()

    span = end - start

    return span
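The distCp task dispatched above is defined elsewhere. A minimal sketch of what it might look like on the worker side, assuming it shells out to Hadoop's distcp and that src and dest are already fully-qualified HDFS URIs (both assumptions, not the project's actual implementation):

import subprocess

# Hypothetical worker-side task; the real implementation may resolve namenode
# addresses from the clusters tuple and pass extra distcp options.
def distCp(clusters, src, dest):
    # e.g. src = "hdfs://namenode-a:8020/data", dest = "hdfs://namenode-b:8020/data"
    cmd = ["hadoop", "distcp", src, dest]
    return subprocess.check_output(cmd, stderr=subprocess.STDOUT)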
Example #5
def run_job(execution_param, queue=None):
    """
    execute a job.

    :param execution_param (ExecutionJob) object used to prepare the execution of the job.
    """

    clusters = execution_param.clusters
    how_many_runs = execution_param.how_many_runs
    command = execution_param.command
    output_path = execution_param.output_path

    executors = []
    execution_time = defaultdict(int)
    try:
        sstart = time.time()
        for idx, cluster in enumerate(clusters):
            if idx < how_many_runs:
                # launch at most how_many_runs jobs (typically 2, enough for a majority)
                execution_time[cluster] = time.time()

                # execute the job
                logging.info("Executing job at %s " % cluster)
                g1 = group(
                    executeCommand.s(
                        command,
                        1,
                    ).set(queue=cluster),
                    get_queue_info.s().set(queue=cluster))
                executors.append(JobExecution(g1(), cluster, sstart))
    except Exception:
        logging.error(str(traceback.format_exc()))

    json_results = []
    failed_exec = []
    for executor in executors:
        _exec = executor.get_execution()
        cluster = executor.get_cluster()

        try:
            """ waiting for a task to finish """
            waiting(cluster, _exec)
            _output = _exec.get()
        except Exception:
            failed_exec.append(execution_param)
            continue

        job_output, queue_info = _output

        makespan = time.time() - executor.get_start_time()
        logging.info("Job executed in %s seconds", makespan)
        dstart = time.time()

        files = my_apply_async_with_waiting(ls,
                                            queue=cluster,
                                            args=(output_path, ))

        tasks = []
        for _file in files:
            tasks.append(
                generate_one_digest.s(
                    _file, medusa_settings.digest_command).set(queue=cluster))
        g1 = group(tasks)()

        while g1.waiting():
            time.sleep(2)

        digests = g1.get()
        # f.append(my_apply_async_without_waiting(
        #     generate_one_digest, queue=cluster, args=(_file, medusa_settings.digest_command,)))

        # digests = {}
        # for _f in tasks:
        #     digests.update(_f.get())

        logging.info("Digests generated in %s seconds", time.time() - dstart)

        # very slow
        # dstart = time.time()
        # digests2 = my_apply_async_with_waiting(generate_digests, queue=cluster, args=(output_path, medusa_settings.digest_command, ))
        # logging.info("Digests2 generated in %s seconds", time.time() - dstart)

        if not "FileAlreadyExistsException" in job_output:
            json_out = getJobJSON(job_output, cluster, queue_info, makespan,
                                  digests)
            json_out = json_out.replace("\n", "").replace("\'", "\"")

            logging.debug("Got result from %s" % cluster)
            json_results.append(json_out)

        if medusa_settings.ranking_scheduler == "prediction":
            f = my_apply_async(load_prediction, queue=cluster)

            value = f.get()
            prediction_value = json.loads(value)["total_time"]
            error = makespan - prediction_value
            penalization_params = set_penalization_params(
                makespan, prediction_value, error)

            logging.info(
                "Job executed in %s seconds; predicted: %s seconds; error %s seconds",
                makespan, prediction_value, error)

            my_apply_async(save_penalization,
                           queue=cluster,
                           args=(json.dumps(
                               penalization_params._asdict()), )).get()

    if queue is not None:
        queue.put(json_results)
        return

    return json_results
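run_job only reads JobExecution back through three accessors, so a plain value holder is enough to satisfy it. A minimal sketch under that assumption (the real class may do more):

# Hypothetical value holder for a dispatched job group.
class JobExecution(object):
    def __init__(self, execution, cluster, start_time):
        self._execution = execution    # Celery GroupResult of the dispatched tasks
        self._cluster = cluster        # queue name of the cluster running the job
        self._start_time = start_time  # wall-clock start used to compute the makespan

    def get_execution(self):
        return self._execution

    def get_cluster(self):
        return self._cluster

    def get_start_time(self):
        return self._start_time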
Example #6
def copy_data(order, reference_digests):
    """
    Prepares the environment for the copy and actually copies the data between HDFS instances.

    :param order (Order) object with info about the copy. The order object contains:
            from_cluster cluster that contains the data
            to_cluster is the cluster that the data will be copied
            pinput is the source path
            poutput is the dest path where data will be copied
    :param reference_digests (RefDigetsts) object with the reference digests

    :return (string) the cluster that was copied the data
    """

    logging.info("Starting to copy data (%s): %s" % (str(time.time()), str(order)))

    from_cluster = order.from_cluster
    to_cluster = order.to_cluster
    pinput = order.src_path
    poutput = order.dst_path

    if isinstance(pinput, list):
        pinput = ' '.join(pinput)

    if isinstance(poutput, list):
        poutput = ' '.join(poutput)

    f = my_apply_async(getNamenodeAddress, queue=to_cluster)
    toNamenodeAddress = my_get(f)

    if medusa_settings.local_copy:
        port = medusa_settings.httpfs_port if medusa_settings.httpfs_used else medusa_settings.hdfs_port
        fromNamenodeAddress = "%s:%s" % (medusa_settings.namenode_address, port)
    else:
        f = my_apply_async(getNamenodeAddress, queue=from_cluster)
        fromNamenodeAddress = my_get(f)

    # tuple with queues from the cluster of origin and destination
    t = (from_cluster, to_cluster)

    # tuple with namenode address from the cluster of origin and destination
    tn = (fromNamenodeAddress, toNamenodeAddress)

    # pinput has already been collapsed to a single string above, so in practice
    # this pairs exactly one (source, destination) tuple
    if isinstance(pinput, list):
        pzipped = zip(pinput, poutput)
    else:
        pzipped = zip([pinput], [poutput])

    copy_done = False
    while not copy_done:
        retrylist = []
        for _input, _output in pzipped:
            copydata_prepareEnvironment(t, _input, medusa_settings.local_copy)

            if not copydata_runEnvironment(t, tn, _input, _output, reference_digests):
                # copy failed
                logging.error("Copy failed: %s -> %s" % (from_cluster, to_cluster))
                copydata_rmr(to_cluster, _output)
                retrylist.append((_input, _output))

        if len(retrylist) > 0:
            pzipped = retrylist
        else:
            copy_done = True

    return to_cluster
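copy_data reads only four attributes from its order argument, so a simple record is enough to drive it. A minimal usage sketch, assuming the Order fields below and placeholder queue names and paths (the reference digests would normally be wrapped in the project's RefDigests type; a plain digest dict is used here only for illustration):

from collections import namedtuple

# Hypothetical stand-in for the Order class; field names follow the attributes read above.
Order = namedtuple("Order", ["from_cluster", "to_cluster", "src_path", "dst_path"])

order = Order(from_cluster="cluster-src",
              to_cluster="cluster-dst",
              src_path="/data/input",
              dst_path="/data/input")

# digests computed on the source cluster (see Example #3); the real code wraps
# them in a RefDigests object, a plain dict is used here only for illustration
reference_digests = copydata_getDigests("cluster-src", "/data/input")

copied_to = copy_data(order, reference_digests)
logging.info("Data copied to %s", copied_to)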