Example #1
def add_shuffle_key(split, iterator):
    # Group this split's (key, value) pairs into buckets keyed by the target
    # shuffle partition, then emit each bucket as its partition id followed
    # by a pickled batch. partitionFunc, numPartitions, dump_pickle and Batch
    # come from the enclosing PySpark code; iteritems() marks this as
    # Python 2 source.
    buckets = defaultdict(list)
    for (k, v) in iterator:
        buckets[partitionFunc(k) % numPartitions].append((k, v))
    for (split, items) in buckets.iteritems():
        yield str(split)
        yield dump_pickle(Batch(items))
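A minimal, self-contained sketch of the same idea, with plain hash, a fixed partition count, pickle.dumps and a bare list standing in for partitionFunc, numPartitions, dump_pickle and Batch (all stand-ins, not the real PySpark helpers):

import pickle
from collections import defaultdict

def bucket_by_partition(pairs, num_partitions=4, partition_func=hash):
    # Same shape of output as add_shuffle_key: partition id as a string,
    # then the pickled batch of (key, value) pairs for that partition.
    buckets = defaultdict(list)
    for k, v in pairs:
        buckets[partition_func(k) % num_partitions].append((k, v))
    for partition_id, items in buckets.items():
        yield str(partition_id)
        yield pickle.dumps(items)

output = list(bucket_by_partition([("a", 1), ("b", 2), ("a", 3)]))
print(output[::2])  # the partition ids, as strings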
Example #3
def main():
    split_index = read_int(sys.stdin)
    spark_files_dir = load_pickle(read_with_length(sys.stdin))
    SparkFiles._root_directory = spark_files_dir
    SparkFiles._is_running_on_worker = True
    sys.path.append(spark_files_dir)
    num_broadcast_variables = read_int(sys.stdin)
    for _ in range(num_broadcast_variables):
        bid = read_long(sys.stdin)
        value = read_with_length(sys.stdin)
        _broadcastRegistry[bid] = Broadcast(bid, load_pickle(value))
    func = load_obj()
    bypassSerializer = load_obj()
    if bypassSerializer:
        dumps = lambda x: x
    else:
        dumps = dump_pickle
    iterator = read_from_pickle_file(sys.stdin)
    try:
        for obj in func(split_index, iterator):
            write_with_length(dumps(obj), old_stdout)
    except Exception as e:
        write_int(-2, old_stdout)
        write_with_length(traceback.format_exc(), old_stdout)
        sys.exit(-1)
    # Mark the beginning of the accumulators section of the output
    write_int(-1, old_stdout)
    for aid, accum in _accumulatorRegistry.items():
        write_with_length(dump_pickle((aid, accum._value)), old_stdout)
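main() above communicates with the JVM over stdin/stdout using a handful of helpers (read_int, read_long, read_with_length, write_with_length, write_int) from pyspark.serializers. A rough, self-contained sketch of the framing they imply (4-byte big-endian integers and length-prefixed byte frames are an assumption here, not a quote of the real implementation):

import struct
from io import BytesIO

def write_int(value, stream):
    # Plain and negative "marker" integers are written as 4 big-endian bytes.
    stream.write(struct.pack("!i", value))

def read_int(stream):
    return struct.unpack("!i", stream.read(4))[0]

def write_with_length(payload, stream):
    # Length-prefixed frame: 4-byte length, then the raw bytes.
    write_int(len(payload), stream)
    stream.write(payload)

def read_with_length(stream):
    return stream.read(read_int(stream))

buf = BytesIO()
write_with_length(b"pickled record", buf)
write_int(-1, buf)            # section marker, as main() uses for accumulators
buf.seek(0)
assert read_with_length(buf) == b"pickled record"
assert read_int(buf) == -1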
Example #5
def broadcast(self, value):
    """
    Broadcast a read-only variable to the cluster, returning a C{Broadcast}
    object for reading it in distributed functions. The variable will be
    sent to each cluster only once.
    """
    jbroadcast = self._jsc.broadcast(bytearray(dump_pickle(value)))
    return Broadcast(jbroadcast.id(), value, jbroadcast,
                     self._pickled_broadcast_vars)
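For context, typical driver-side usage of this method: the returned Broadcast object is captured in closures, and tasks read the shipped value through its value attribute, so the data travels to each executor once instead of with every task. A small sketch assuming a local SparkContext:

from pyspark import SparkContext

sc = SparkContext("local", "broadcast-demo")
lookup = sc.broadcast({"a": 1, "b": 2})              # pickled and shipped once
rdd = sc.parallelize(["a", "b", "a"])
print(rdd.map(lambda k: lookup.value[k]).collect())  # [1, 2, 1]
sc.stop()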
Example #7
def main(infile, outfile):
    boot_time = time.time()
    split_index = read_int(infile)
    if split_index == -1:  # for unit tests
        return

    # fetch name of workdir
    spark_files_dir = load_pickle(read_with_length(infile))
    SparkFiles._root_directory = spark_files_dir
    SparkFiles._is_running_on_worker = True

    # fetch names and values of broadcast variables
    num_broadcast_variables = read_int(infile)
    for _ in range(num_broadcast_variables):
        bid = read_long(infile)
        value = read_with_length(infile)
        _broadcastRegistry[bid] = Broadcast(bid, load_pickle(value))

    # fetch names of includes (*.zip and *.egg files) and construct PYTHONPATH
    sys.path.append(
        spark_files_dir)  # *.py files that were added will be copied here
    num_python_includes = read_int(infile)
    for _ in range(num_python_includes):
        sys.path.append(
            os.path.join(spark_files_dir,
                         load_pickle(read_with_length(infile))))

    # now load function
    func = load_obj(infile)
    bypassSerializer = load_obj(infile)
    if bypassSerializer:
        dumps = lambda x: x
    else:
        dumps = dump_pickle
    init_time = time.time()
    iterator = read_from_pickle_file(infile)
    try:
        for obj in func(split_index, iterator):
            write_with_length(dumps(obj), outfile)
    except Exception as e:
        write_int(-2, outfile)
        write_with_length(traceback.format_exc(), outfile)
        sys.exit(-1)
    finish_time = time.time()
    report_times(outfile, boot_time, init_time, finish_time)
    # Mark the beginning of the accumulators section of the output
    write_int(-1, outfile)
    for aid, accum in _accumulatorRegistry.items():
        write_with_length(dump_pickle((aid, accum._value)), outfile)
    write_int(-1, outfile)
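The includes loop above relies on the fact that Python can import modules straight from .zip and .egg archives once their paths are on sys.path, so the worker only needs to append each shipped archive. A stand-alone illustration of that mechanism (the archive and module names are made up for the demo):

import os
import sys
import tempfile
import zipfile

# Build a throwaway .zip containing one module, then import from it the same
# way the worker handles shipped .zip/.egg includes: append its path to sys.path.
workdir = tempfile.mkdtemp()
archive = os.path.join(workdir, "dep.zip")
with zipfile.ZipFile(archive, "w") as zf:
    zf.writestr("shipped_module.py",
                'def greet():\n    return "imported from a zip include"\n')

sys.path.append(archive)
import shipped_module
print(shipped_module.greet())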
Example #9
def parallelize(self, c, numSlices=None):
    """
    Distribute a local Python collection to form an RDD.
    """
    numSlices = numSlices or self.defaultParallelism
    # Calling the Java parallelize() method with an ArrayList is too slow,
    # because it sends O(n) Py4J commands.  As an alternative, serialized
    # objects are written to a file and loaded through textFile().
    tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir)
    if self.batchSize != 1:
        c = batched(c, self.batchSize)
    for x in c:
        write_with_length(dump_pickle(x), tempFile)
    tempFile.close()
    readRDDFromPickleFile = self._jvm.PythonRDD.readRDDFromPickleFile
    jrdd = readRDDFromPickleFile(self._jsc, tempFile.name, numSlices)
    return RDD(jrdd, self)
Example #10
def parallelize(self, c, numSlices=None):
    """
    Distribute a local Python collection to form an RDD.
    """
    numSlices = numSlices or self.defaultParallelism
    # Calling the Java parallelize() method with an ArrayList is too slow,
    # because it sends O(n) Py4J commands.  As an alternative, serialized
    # objects are written to a file and loaded through textFile().
    tempFile = NamedTemporaryFile(delete=False)
    atexit.register(lambda: os.unlink(tempFile.name))
    if self.batchSize != 1:
        c = batched(c, self.batchSize)
    for x in c:
        write_with_length(dump_pickle(x), tempFile)
    tempFile.close()
    jrdd = self._readRDDFromPickleFile(self._jsc, tempFile.name, numSlices)
    return RDD(jrdd, self)
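Both versions of parallelize above write the data in batches via batched(), so that one length-prefixed record carries many elements rather than one. The real helper wraps each chunk in a Batch object from pyspark.serializers; the plain stand-in below shows just the chunking behavior:

def batched(iterator, batch_size):
    # Yield lists of up to batch_size consecutive items; the real helper
    # wraps each chunk in a Batch object before it is pickled.
    batch = []
    for item in iterator:
        batch.append(item)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        yield batch

print(list(batched(range(7), 3)))  # [[0, 1, 2], [3, 4, 5], [6]]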
Example #11
    def parallelize(self, c, numSlices=None):
        """
        Distribute a local Python collection to form an RDD.

        >>> sc.parallelize(range(5), 5).glom().collect()
        [[0], [1], [2], [3], [4]]
        """
        numSlices = numSlices or self.defaultParallelism
        # Calling the Java parallelize() method with an ArrayList is too slow,
        # because it sends O(n) Py4J commands.  As an alternative, serialized
        # objects are written to a file and loaded through textFile().
        tempFile = NamedTemporaryFile(delete=False, dir=self._temp_dir)
        # Make sure we distribute data evenly if it's smaller than self.batchSize
        if "__len__" not in dir(c):
            c = list(c)    # Make it a list so we can compute its length
        batchSize = min(len(c) // numSlices, self.batchSize)
        if batchSize > 1:
            c = batched(c, batchSize)
        for x in c:
            write_with_length(dump_pickle(x), tempFile)
        tempFile.close()
        readRDDFromPickleFile = self._jvm.PythonRDD.readRDDFromPickleFile
        jrdd = readRDDFromPickleFile(self._jsc, tempFile.name, numSlices)
        return RDD(jrdd, self)
Example #12
def main(infile, outfile):
    boot_time = time.time()
    split_index = read_int(infile)
    if split_index == -1:  # for unit tests
        return
    spark_files_dir = load_pickle(read_with_length(infile))
    SparkFiles._root_directory = spark_files_dir
    SparkFiles._is_running_on_worker = True
    sys.path.append(spark_files_dir)
    num_broadcast_variables = read_int(infile)
    for _ in range(num_broadcast_variables):
        bid = read_long(infile)
        value = read_with_length(infile)
        _broadcastRegistry[bid] = Broadcast(bid, load_pickle(value))
    func = load_obj(infile)
    bypassSerializer = load_obj(infile)
    if bypassSerializer:
        dumps = lambda x: x
    else:
        dumps = dump_pickle
    init_time = time.time()
    iterator = read_from_pickle_file(infile)
    try:
        for obj in func(split_index, iterator):
            write_with_length(dumps(obj), outfile)
    except Exception as e:
        write_int(-2, outfile)
        write_with_length(traceback.format_exc(), outfile)
        sys.exit(-1)
    finish_time = time.time()
    report_times(outfile, boot_time, init_time, finish_time)
    # Mark the beginning of the accumulators section of the output
    write_int(-1, outfile)
    for aid, accum in _accumulatorRegistry.items():
        write_with_length(dump_pickle((aid, accum._value)), outfile)
    write_int(-1, outfile)
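The loop over _accumulatorRegistry at the end of main() is the worker half of the accumulator protocol: every accumulator the task touched is pickled back to the driver after the results. Typical driver-side usage looks like this (a sketch assuming a local SparkContext):

from pyspark import SparkContext

sc = SparkContext("local", "accumulator-demo")
errors = sc.accumulator(0)

def count_errors(line):
    if "ERROR" in line:
        errors.add(1)      # buffered on the worker, shipped back with the task

sc.parallelize(["ok", "ERROR x", "ok"]).foreach(count_errors)
print(errors.value)        # 1, visible on the driver once the job completes
sc.stop()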