def process_spark_partitions(partition):
    ctx = TaskContext()
    nltk.download('punkt')
    logger.info("start_processing_partition partitionId=" +
                str(ctx.partitionId()))
    all_records = []
    for entry in partition:
        if entry["From"] is not None:
            all_records.extend(process_line_spark(entry))

    final_dict = dict()
    for key, rec in all_records:
        if key not in final_dict:
            final_dict[key] = rec
            continue
        current_rec = final_dict[key]
        final_dict[key] = reduceByKeyAndCombine(current_rec, rec)

    all_final_records = []
    for key, rec in final_dict.items():
        all_final_records.append(((key, 1), rec))
    logger.info(
        f"end_processing_partition partitionId={str(ctx.partitionId())}. processed: {len(all_final_records)} records"
    )
    return all_final_records
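A minimal, hypothetical sketch of how a partition-level function like the one above could be wired into an RDD pipeline. The names records_rdd and the output path are placeholders; process_line_spark and reduceByKeyAndCombine are helpers from the original project.

# Hypothetical driver-side wiring; process_spark_partitions emits ((key, 1), rec)
# pairs, so a cluster-wide reduceByKey with the same combiner finishes the merge.
combined = (
    records_rdd
    .mapPartitions(process_spark_partitions)   # per-partition pre-aggregation
    .reduceByKey(reduceByKeyAndCombine)        # merge records sharing a key across partitions
)
combined.saveAsTextFile("/tmp/processed_records")  # placeholder sink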
Example #2
    def _process_data(filename_list):
        filename_set = set(filename_list)

        tar = tarfile.open(training_data_tar_file)
        tar_info_list = tar.getmembers()
        filename_to_object = {}
        for tar_info in tar_info_list:
            if tar_info.name in filename_set:
                f = tar.extractfile(tar_info)
                assert f is not None
                filename_to_object[tar_info.name] = f

        partition = TaskContext().partitionId()
        counter = 0
        data_list = []
        for filename in glob.glob(output_dir + "/data-%s*" % partition):
            os.remove(filename)
        for filename in filename_set:
            data = single_file_preparation_func(filename_to_object[filename],
                                                filename)
            data_list.append(data)
            if len(data_list) == records_per_file:
                out_filename = output_dir + "/data-%s-%04d" % (partition, counter)
                counter += 1

                write_to_recordio(out_filename, data_list)
                data_list.clear()

        if data_list:
            out_filename = output_dir + "/data-%s-%04d" % (partition, counter)
            write_to_recordio(out_filename, data_list)
        return filename_list
Example #3
def task_info(*_):
    ctx = TaskContext()
    return [
        "Stage: {0}, Partition: {1}, Host: {2}".format(ctx.stageId(),
                                                       ctx.partitionId(),
                                                       socket.gethostname())
    ]
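A quick, hypothetical way to exercise task_info and print one line per partition; it assumes an active SparkContext named sc and that socket is imported in the same module as task_info.

# Each partition yields one string such as "Stage: 1, Partition: 0, Host: worker-1".
for line in sc.parallelize(range(12), 4).mapPartitions(task_info).collect():
    print(line)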
def process_spark_partitions(partition):
    """

    :param partition:
    :type partition:
    :return:
    :rtype:
    """
    ctx = TaskContext()
    logger.info("start_processing_partition partitionId=" + str(ctx.partitionId()))

    big_taxo = TaxonomyWrapper.get(args, SERVICE_PRINCIPAL_SECRET, logger)
    gensim_model = GensimMagic.get(args, SERVICE_PRINCIPAL_SECRET, logger)  # move this to process_partitions
    de_vocab = gensim_model["vocab"]  # move this to process_partitions
    de_model = gensim_model["model"]  # move this to process_partitions

    words_list = set(de_vocab.keys())
    for domain, domain_dict in big_taxo.items():
        words_list = words_list.union(set(domain_dict.keys()))

    all_records = []
    for entry in partition:
        all_records.extend(process_line_spark(entry, big_taxo, de_model, de_vocab, words_list))
    logger.info(f"end_processing_partition partitionId={str(ctx.partitionId())}. processed: {len(all_records)} records")
    return all_records
Example #5
    def test_resources(self):
        """Test the resources are empty by default."""
        rdd = self.sc.parallelize(range(10))
        resources1 = rdd.map(lambda x: TaskContext.get().resources()).take(1)[0]
        # Test using the constructor directly rather than the get()
        resources2 = rdd.map(lambda x: TaskContext().resources()).take(1)[0]
        self.assertEqual(len(resources1), 0)
        self.assertEqual(len(resources2), 0)
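For resources() to return anything, the application has to request task-level resources up front. A minimal configuration sketch for Spark 3.0+, where the discovery-script path is a placeholder:

from pyspark import SparkConf, SparkContext

# Spark 3.0+ resource scheduling; /opt/spark/getGpus.sh stands in for a
# discovery script that prints the GPU addresses in Spark's expected JSON format.
conf = (SparkConf()
        .set("spark.executor.resource.gpu.amount", "1")
        .set("spark.task.resource.gpu.amount", "1")
        .set("spark.executor.resource.gpu.discoveryScript", "/opt/spark/getGpus.sh"))
sc = SparkContext(conf=conf)
# Inside a task, TaskContext.get().resources()["gpu"].addresses then lists the assigned GPU(s).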
Example #6
    def test_stage_id(self):
        """Test the stage ids are available and incrementing as expected."""
        rdd = self.sc.parallelize(range(10))
        stage1 = rdd.map(lambda x: TaskContext.get().stageId()).take(1)[0]
        stage2 = rdd.map(lambda x: TaskContext.get().stageId()).take(1)[0]
        # Test using the constructor directly rather than the get()
        stage3 = rdd.map(lambda x: TaskContext().stageId()).take(1)[0]
        self.assertEqual(stage1 + 1, stage2)
        self.assertEqual(stage1 + 2, stage3)
        self.assertEqual(stage2 + 1, stage3)
def process_spark_partitions(partition):
    """
    :param partition:
    :type partition:
    :return:
    :rtype:
    """
    ctx = TaskContext()
    logger.info("start_processing_partitionId=" + str(ctx.partitionId()))
    all_records = []
    for entry in partition:
        all_records.extend(process_line_spark(entry))
    logger.info(
        f"end_processing_partition partitionId={str(ctx.partitionId())}. processed: {len(all_records)} records"
    )
    return all_records
Example #8
def save_spark_pandas_to_parquet(output, out_dir):
    from pyspark import TaskContext

    ctx = TaskContext()
    name = f"part_{ctx.partitionId()}"
    # print("Stage: {0}, Partition: {1}, Host: {2}".format(
    #     ctx.stageId(), ctx.partitionId(), socket.gethostname()))

    for ds in output.dataset.unique():
        df = output[output.dataset == ds]
        if df.shape[0] == 0:
            continue  # skip an empty group instead of abandoning the remaining datasets
        mkdir(f"{out_dir}/{ds}")
        path = f"{out_dir}/{ds}/{name}.parquet"
        df.to_parquet(path=path)
        print(f"Saved to {path}")
Example #9
    def detect_objects(iterator):

        print("detecting objects in the image..")
        ctx = TaskContext()
        partition_id = ctx.partitionId()

        from app.model_pool import ModelPool
        model = ModelPool.get_model(model_name)
        print("partition_id : ", partition_id, "  model : ", model)

        for img_row in iterator:

            row_dict = img_row.asDict()
            filepath = row_dict['origin']
            height = row_dict['height']
            width = row_dict['width']
            nChannels = row_dict['nChannels']

            import os
            filename = os.path.basename(filepath)
            print("filename", filename)

            data = row_dict['data']
            shape = (height, width, nChannels)
            image_np_array = np.ndarray(shape, np.uint8, data)
            resized_image, scale = resize_image(image_np_array)

            boxes, scores, labels = model.predict_on_batch(
                np.expand_dims(resized_image, axis=0))

            for box, score, label in zip(boxes[0], scores[0], labels[0]):
                # scores are sorted so we can break

                if score < 0.5:
                    break
                color = label_color(label)
                b = box.astype(int)
                draw_box(resized_image, b, color=color)
                caption = "{} {:.3f}".format(labels_to_names[label], score)
                draw_caption(resized_image, b, caption)

            cv2.imwrite(output_images_dir + "/" + filename, resized_image)
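A hedged sketch of feeding this partition function from Spark's built-in image data source; spark, the input path, and output_images_dir are assumptions, and the flattened column names match the keys row_dict reads above.

# The "image" source yields rows with an 'image' struct containing
# origin, height, width, nChannels, mode and data; flatten it so that
# row.asDict() exposes exactly the keys detect_objects expects.
images_df = spark.read.format("image").load("/data/images/")
images_df.select("image.*").rdd.foreachPartition(detect_objects)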
Example #10
    def _mapfn(iter):
        import pyspark

        # Note: consuming the input iterator helps Pyspark re-use this worker.
        for i in iter:
            executor_id = i

        # check that there are enough available GPUs (if using tensorflow-gpu) before committing reservation on this node
        # note: for Spark 3+ w/ GPU allocation, the required number of GPUs should be guaranteed by the resource manager
        if version.parse(pyspark.__version__).base_version < version.parse(
                '3.0.0').base_version:
            if gpu_info.is_gpu_available():
                num_gpus = tf_args.num_gpus if 'num_gpus' in tf_args else 1
                gpus_to_use = gpu_info.get_gpus(num_gpus)

        # assign TF job/task based on provided cluster_spec template (or use default/null values)
        job_name = 'default'
        task_index = -1
        cluster_id = cluster_meta['id']
        cluster_template = cluster_meta['cluster_template']
        for jobtype in cluster_template:
            nodes = cluster_template[jobtype]
            if executor_id in nodes:
                job_name = jobtype
                task_index = nodes.index(executor_id)
                break

        # get unique key (hostname, executor_id) for this executor
        host = util.get_ip_address()
        util.write_executor_id(executor_id)
        port = 0

        # check for existing TFManagers
        if TFSparkNode.mgr is not None and str(
                TFSparkNode.mgr.get('state')) != "'stopped'":
            if TFSparkNode.cluster_id == cluster_id:
                # raise an exception to force Spark to retry this "reservation" task on another executor
                raise Exception(
                    "TFManager already started on {0}, executor={1}, state={2}"
                    .format(host, executor_id,
                            str(TFSparkNode.mgr.get("state"))))
            else:
                # old state, just continue with creating new manager
                logger.warn(
                    "Ignoring old TFManager with cluster_id {0}, requested cluster_id {1}"
                    .format(TFSparkNode.cluster_id, cluster_id))

        # start a TFManager and get a free port
        # use a random uuid as the authkey
        authkey = uuid.uuid4().bytes
        addr = None
        if job_name in ('ps', 'evaluator'):
            # PS nodes must be remotely accessible in order to shut down from the Spark driver.
            TFSparkNode.mgr = TFManager.start(authkey, ['control', 'error'],
                                              'remote')
            addr = (host, TFSparkNode.mgr.address[1])
        else:
            # worker nodes only need to be locally accessible within the executor for data feeding
            TFSparkNode.mgr = TFManager.start(authkey, queues)
            addr = TFSparkNode.mgr.address

        # initialize mgr state
        TFSparkNode.mgr.set('state', 'running')
        TFSparkNode.cluster_id = cluster_id

        # expand Hadoop classpath wildcards for JNI (Spark 2.x)
        if 'HADOOP_PREFIX' in os.environ:
            classpath = os.environ['CLASSPATH']
            hadoop_path = os.path.join(os.environ['HADOOP_PREFIX'], 'bin',
                                       'hadoop')
            hadoop_classpath = subprocess.check_output(
                [hadoop_path, 'classpath', '--glob']).decode()
            logger.debug("CLASSPATH: {0}".format(hadoop_classpath))
            os.environ['CLASSPATH'] = classpath + os.pathsep + hadoop_classpath

        # start TensorBoard if requested, on 'worker:0' if available (for backwards-compatibility), otherwise on 'chief:0' or 'master:0'
        job_names = sorted([
            k for k in cluster_template.keys()
            if k in ['chief', 'master', 'worker']
        ])
        tb_job_name = 'worker' if 'worker' in job_names else job_names[0]
        tb_pid = 0
        tb_port = 0
        if tensorboard and job_name == tb_job_name and task_index == 0:
            tb_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            tb_sock.bind(('', 0))
            tb_port = tb_sock.getsockname()[1]
            tb_sock.close()
            logdir = log_dir if log_dir else "tensorboard_%d" % executor_id

            # search for tensorboard in python/bin, PATH, and PYTHONPATH
            pypath = sys.executable
            pydir = os.path.dirname(pypath)
            sys_path = os.pathsep.join(sys.path)
            search_path = os.pathsep.join([
                pydir, sys_path, os.environ['PATH'], os.environ['PYTHONPATH']
            ])

            tb_path = util.find_in_path(search_path,
                                        'tensorboard')  # executable in PATH
            if not tb_path:
                tb_path = util.find_in_path(search_path,
                                            'tensorboard/main.py')  # TF 1.3+
            if not tb_path:
                tb_path = util.find_in_path(
                    search_path,
                    'tensorflow/tensorboard/__main__.py')  # TF 1.2-
            if not tb_path:
                raise Exception(
                    "Unable to find 'tensorboard' in: {}".format(search_path))

            # launch tensorboard
            if version.parse(TF_VERSION) >= version.parse('2.0.0'):
                tb_proc = subprocess.Popen([
                    pypath, tb_path, "--reload_multifile=True",
                    "--logdir=%s" % logdir,
                    "--port=%d" % tb_port
                ],
                                           env=os.environ)
            else:
                tb_proc = subprocess.Popen([
                    pypath, tb_path,
                    "--logdir=%s" % logdir,
                    "--port=%d" % tb_port
                ],
                                           env=os.environ)

            tb_pid = tb_proc.pid

        # check server to see if this task is being retried (i.e. already reserved)
        client = reservation.Client(cluster_meta['server_addr'])
        cluster_info = client.get_reservations()
        tmp_sock = None
        node_meta = None
        for node in cluster_info:
            (nhost, nexec) = (node['host'], node['executor_id'])
            if nhost == host and nexec == executor_id:
                node_meta = node
                port = node['port']

        # if not already done, register everything we need to set up the cluster
        if node_meta is None:
            # first, find a free port for TF
            tmp_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            tmp_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            tmp_sock.bind(('', port))
            port = tmp_sock.getsockname()[1]

            node_meta = {
                'executor_id': executor_id,
                'host': host,
                'job_name': job_name,
                'task_index': task_index,
                'port': port,
                'tb_pid': tb_pid,
                'tb_port': tb_port,
                'addr': addr,
                'authkey': authkey
            }
            # register node metadata with server
            logger.info("TFSparkNode.reserve: {0}".format(node_meta))
            client.register(node_meta)
            # wait for other nodes to finish reservations
            cluster_info = client.await_reservations()
            client.close()

        # construct a TensorFlow clusterspec from cluster_info
        sorted_cluster_info = sorted(cluster_info,
                                     key=lambda k: k['executor_id'])
        cluster_spec = {}
        last_executor_id = -1
        for node in sorted_cluster_info:
            if (node['executor_id'] == last_executor_id):
                raise Exception("Duplicate worker/task in cluster_info")
            last_executor_id = node['executor_id']
            logger.info("node: {0}".format(node))
            (njob, nhost, nport) = (node['job_name'], node['host'],
                                    node['port'])
            hosts = [] if njob not in cluster_spec else cluster_spec[njob]
            hosts.append("{0}:{1}".format(nhost, nport))
            cluster_spec[njob] = hosts

        # update TF_CONFIG if cluster spec has a 'master' node (i.e. tf.estimator)
        if 'master' in cluster_spec or 'chief' in cluster_spec:
            tf_config = json.dumps({
                'cluster': cluster_spec,
                'task': {
                    'type': job_name,
                    'index': task_index
                },
                'environment': 'cloud'
            })
            logger.info("export TF_CONFIG: {}".format(tf_config))
            os.environ['TF_CONFIG'] = tf_config

        # reserve GPU(s) again, just before launching TF process (in case situation has changed)
        # and setup CUDA_VISIBLE_DEVICES accordingly
        if gpu_info.is_gpu_available():

            gpus_to_use = None
            # For Spark 3+, try to get GPU resources from TaskContext first
            if version.parse(
                    pyspark.__version__).base_version >= version.parse(
                        "3.0.0").base_version:
                from pyspark import TaskContext
                context = TaskContext()
                if 'gpu' in context.resources():
                    # use ALL GPUs assigned by resource manager
                    gpus = context.resources()['gpu'].addresses
                    num_gpus = len(gpus)
                    gpus_to_use = ','.join(gpus)

            if not gpus_to_use:
                # compute my index relative to other nodes on the same host (for GPU allocation)
                my_addr = cluster_spec[job_name][task_index]
                my_host = my_addr.split(':')[0]
                flattened = [
                    v for sublist in cluster_spec.values() for v in sublist
                ]
                local_peers = [p for p in flattened if p.startswith(my_host)]
                my_index = local_peers.index(my_addr)

                # default to one GPU if not specified explicitly
                num_gpus = tf_args.num_gpus if 'num_gpus' in tf_args else 1
                gpus_to_use = gpu_info.get_gpus(num_gpus, my_index)

            gpu_str = "GPUs" if num_gpus > 1 else "GPU"
            logger.info(
                "Requested {} {}, setting CUDA_VISIBLE_DEVICES={}".format(
                    num_gpus, gpu_str, gpus_to_use))
            os.environ['CUDA_VISIBLE_DEVICES'] = gpus_to_use

        # create a context object to hold metadata for TF
        ctx = TFNodeContext(executor_id, job_name, task_index, cluster_spec,
                            cluster_meta['default_fs'],
                            cluster_meta['working_dir'], TFSparkNode.mgr)

        # release port reserved for TF as late as possible
        if tmp_sock is not None:
            tmp_sock.close()

        # Background mode relies on reuse of the Python worker in Spark.
        if background:
            # However, Python worker reuse does not work on Windows, so check
            # whether the current script is running on Windows.
            if os.name == 'nt' or platform.system() == 'Windows':
                raise Exception("Background mode is not supported on Windows.")
            # Check whether Python worker reuse is enabled in the Spark config.
            if not os.environ.get("SPARK_REUSE_WORKER"):
                raise Exception(
                    "Background mode relies on reuse of the Python worker in Spark, but 'spark.python.worker.reuse' is not enabled. Please enable it before using background mode."
                )

        def wrapper_fn(args, context):
            """Wrapper function that sets the sys.argv of the executor."""
            if isinstance(args, list):
                sys.argv = args
            fn(args, context)

        def wrapper_fn_background(args, context):
            """Wrapper function that signals exceptions to foreground process."""
            errq = TFSparkNode.mgr.get_queue('error')
            try:
                wrapper_fn(args, context)
            except Exception:
                errq.put(traceback.format_exc())

        if job_name in ('ps', 'evaluator') or background:
            # invoke the TensorFlow main function in a background thread
            logger.info(
                "Starting TensorFlow {0}:{1} as {2} on cluster node {3} on background process"
                .format(job_name, task_index, job_name, executor_id))

            p = multiprocessing.Process(target=wrapper_fn_background,
                                        args=(tf_args, ctx))
            if job_name in ('ps', 'evaluator'):
                p.daemon = True
            p.start()

            # for ps and evaluator nodes, wait indefinitely in foreground thread for a "control" event (None == "stop")
            if job_name in ('ps', 'evaluator'):
                queue = TFSparkNode.mgr.get_queue('control')
                equeue = TFSparkNode.mgr.get_queue('error')
                done = False
                while not done:
                    while (queue.empty() and equeue.empty()):
                        time.sleep(1)
                    if (not equeue.empty()):
                        e_str = equeue.get()
                        raise Exception("Exception in " + job_name + ":\n" +
                                        e_str)
                    msg = queue.get(block=True)
                    logger.info("Got msg: {0}".format(msg))
                    if msg is None:
                        logger.info("Terminating {}".format(job_name))
                        TFSparkNode.mgr.set('state', 'stopped')
                        done = True
                    queue.task_done()
        else:
            # otherwise, just run TF function in the main executor/worker thread
            logger.info(
                "Starting TensorFlow {0}:{1} on cluster node {2} on foreground thread"
                .format(job_name, task_index, executor_id))
            wrapper_fn(tf_args, ctx)
            logger.info(
                "Finished TensorFlow {0}:{1} on cluster node {2}".format(
                    job_name, task_index, executor_id))
Example #11
        def _get_gpus(cluster_spec=None):
            gpus = []
            is_k8s = 'SPARK_EXECUTOR_POD_IP' in os.environ

            # handle explicitly configured tf_args.num_gpus
            if 'num_gpus' in tf_args:
                requested_gpus = tf_args.num_gpus
                user_requested = True
            else:
                requested_gpus = 0
                user_requested = False

            # first, try Spark 3 resources API, returning all visible GPUs
            # note: num_gpus arg is only used (if supplied) to limit/truncate visible devices
            if _has_spark_resource_api():
                from pyspark import TaskContext
                context = TaskContext()
                resources = context.resources()
                if resources and 'gpu' in resources:
                    # get all GPUs assigned by resource manager
                    gpus = context.resources()['gpu'].addresses
                    logger.info("Spark gpu resources: {}".format(gpus))
                    if user_requested:
                        if requested_gpus < len(gpus):
                            # override/truncate list, if explicitly configured
                            logger.warn(
                                "Requested {} GPU(s), but {} available".format(
                                    requested_gpus, len(gpus)))
                            gpus = gpus[:requested_gpus]
                    else:
                        # implicitly requested by Spark 3
                        requested_gpus = len(gpus)

            # if not in K8s pod and GPUs available, just use original allocation code (defaulting to 1 GPU if available)
            # Note: for K8s, there is a bug with the Nvidia device_plugin which can show GPUs for non-GPU pods
            # that are hosted on GPU nodes
            if not is_k8s and gpu_info.is_gpu_available() and not gpus:
                # default to one GPU if not specified explicitly
                requested_gpus = max(
                    1,
                    requested_gpus) if not user_requested else requested_gpus
                if requested_gpus > 0:
                    if cluster_spec:
                        # compute my index relative to other nodes on the same host (for GPU allocation)
                        my_addr = cluster_spec[job_name][task_index]
                        my_host = my_addr.split(':')[0]
                        flattened = [
                            v for sublist in cluster_spec.values()
                            for v in sublist
                        ]
                        local_peers = [
                            p for p in flattened if p.startswith(my_host)
                        ]
                        my_index = local_peers.index(my_addr)
                    else:
                        my_index = 0

                    # try to allocate a GPU
                    gpus = gpu_info.get_gpus(requested_gpus,
                                             my_index,
                                             format=gpu_info.AS_LIST)

            if user_requested and len(gpus) < requested_gpus:
                raise Exception(
                    "Unable to allocate {} GPU(s) from available GPUs: {}".
                    format(requested_gpus, gpus))

            gpus_to_use = ','.join(gpus)
            if gpus:
                logger.info(
                    "Requested {} GPU(s), setting CUDA_VISIBLE_DEVICES={}".
                    format(requested_gpus if user_requested else len(gpus),
                           gpus_to_use))
            os.environ['CUDA_VISIBLE_DEVICES'] = gpus_to_use
from pyspark import SparkContext
from pyspark import TaskContext

if __name__ == '__main__':
    sc = SparkContext()
    tc = TaskContext()

    rdd = sc.parallelize(["这", "是", "一", "首", "简", "单", "的", "小", "情", "歌"], 3)

    # Similar to map: map operates on each element, while mapPartitions operates on each partition.
    # The function passed to mapPartitions should both take and return an iterator.
    def f(iter):
        yield "".join(iter) + str(tc.partitionId())

    mapPartitions_rdd = rdd.mapPartitions(f)

    print(mapPartitions_rdd.collect())  # ['这是一0', '首简单1', '的小情歌2']
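An equivalent variant that is arguably safer fetches the context inside the function via TaskContext.get(), so nothing has to be captured from the driver; a small sketch reusing the same rdd:

def g(it):
    # TaskContext.get() returns the context of the task executing this partition.
    from pyspark import TaskContext
    yield "".join(it) + str(TaskContext.get().partitionId())

print(rdd.mapPartitions(g).collect())  # same grouping, with the partition id appended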